RDKit
Open-source cheminformatics and machine learning.
python_streambuf.h
Go to the documentation of this file.
1 //
2 // This file is part of the CCTBX distribution:
3 // http://cctbx.sourceforge.net/
4 // Downloaded from here:
5 // http://cctbx.svn.sourceforge.net/viewvc/cctbx/trunk/boost_adaptbx/python_streambuf.h?revision=13619
6 //
7 // Copyright (c) 2006, The Regents of the University of
8 // California, through Lawrence Berkeley National Laboratory (subject to
9 // receipt of any required approvals from the U.S. Dept. of Energy). All
10 // rights reserved.
11 //
12 // The license is here:
13 // http://cctbx.svn.sourceforge.net/viewvc/cctbx/trunk/boost_adaptbx/LICENSE_2_0.txt?revision=5148
14 //
15 #include <RDGeneral/export.h>
16 #ifndef BOOST_ADAPTBX_PYTHON_STREAMBUF_H
17 #define BOOST_ADAPTBX_PYTHON_STREAMBUF_H
19 #include <boost/python/object.hpp>
20 #include <boost/python/str.hpp>
21 #include <boost/python/extract.hpp>
22 
23 #include <boost/optional.hpp>
24 #include <boost/utility/typed_in_place_factory.hpp>
26 
27 //#include <tbxx/error_utils.hpp>
28 #include <RDGeneral/Invariant.h>
29 #include <RDGeneral/Exceptions.h>
30 
31 #include <streambuf>
32 #include <iostream>
33 
34 namespace boost_adaptbx {
35 namespace python {
36 
37 namespace bp = boost::python;
38 
39 /// A stream buffer getting data from and putting data into a Python file object
40 /** The aims are as follows:
41 
42  - Given a C++ function acting on a standard stream, e.g.
43 
44  \code
45  void read_inputs(std::istream& input) {
46  ...
47  input >> something >> something_else;
48  }
49  \endcode
50 
51  and given a piece of Python code which creates a file-like object,
52  to be able to pass this file object to that C++ function, e.g.
53 
54  \code
55  import gzip
56  gzip_file_obj = gzip.GzipFile(...)
57  read_inputs(gzip_file_obj)
58  \endcode
59 
60  and have the standard stream pull data from and put data into the Python
61  file object.
62 
63  - When Python \c read_inputs() returns, the Python object is able to
64  continue reading or writing where the C++ code left off.
65 
66  - Operations in C++ on mere files should be competitively fast compared
67  to the direct use of \c std::fstream.
68 
69 
70  \b Motivation
71 
72  - the standard Python library offer of file-like objects (files,
73  compressed files and archives, network, ...) is far superior to the
74  offer of streams in the C++ standard library and Boost C++ libraries.
75 
76  - i/o code involves a fair amount of text processing which is more
77  efficiently prototyped in Python but then one may need to rewrite
78  a time-critical part in C++, in as seamless a manner as possible.
79 
80  \b Usage
81 
82  This is 2-step:
83 
84  - a trivial wrapper function
85 
86  \code
87  using boost_adaptbx::python::streambuf;
88  void read_inputs_wrapper(streambuf& input)
89  {
90  streambuf::istream is(input);
91  read_inputs(is);
92  }
93 
94  def("read_inputs", read_inputs_wrapper);
95  \endcode
96 
97  which has to be written every time one wants a Python binding for
98  such a C++ function.
99 
100  - the Python side
101 
102  \code
103  from boost.python import streambuf
104  read_inputs(streambuf(python_file_obj=obj, buffer_size=1024))
105  \endcode
106 
107  \c buffer_size is optional. See also: \c default_buffer_size
108 
109  Note: references are to the C++ standard (the numbers between parentheses
110  at the end of references are margin markers).
111 */
112 class streambuf : public std::basic_streambuf<char> {
113  private:
114  typedef std::basic_streambuf<char> base_t;
115 
116  public:
117  /* The syntax
118  using base_t::char_type;
119  would be nicer but Visual Studio C++ 8 chokes on it
120  */
121  typedef base_t::char_type char_type;
122  typedef base_t::int_type int_type;
123  typedef base_t::pos_type pos_type;
124  typedef base_t::off_type off_type;
125  typedef base_t::traits_type traits_type;
126 
127  // work around Visual C++ 7.1 problem
128  inline static int traits_type_eof() { return traits_type::eof(); }
129 
130  /// The default size of the read and write buffer.
131  /** They are respectively used to buffer data read from and data written to
132  the Python file object. It is a compile-time constant (1024).
133  */
134  const static std::size_t default_buffer_size = 1024;
135 
136  /// Construct from a Python file object
137  /** if buffer_size is 0 the current default_buffer_size is used.
138  */
139  streambuf(bp::object& python_file_obj, std::size_t buffer_size_ = 0)
140  : py_read(getattr(python_file_obj, "read", bp::object())),
141  py_write(getattr(python_file_obj, "write", bp::object())),
142  py_seek(getattr(python_file_obj, "seek", bp::object())),
143  py_tell(getattr(python_file_obj, "tell", bp::object())),
144  buffer_size(buffer_size_ != 0 ? buffer_size_ : default_buffer_size),
145  write_buffer(nullptr),
146  pos_of_read_buffer_end_in_py_file(0),
147  pos_of_write_buffer_end_in_py_file(buffer_size),
148  farthest_pptr(nullptr) {
149  TEST_ASSERT(buffer_size != 0);
150  /* Some Python file objects (e.g. sys.stdout and sys.stdin)
151  have non-functional seek and tell. If so, assign None to
152  py_tell and py_seek.
153  */
154  if (py_tell != bp::object()) {
155  try {
156  off_type py_pos = bp::extract<off_type>(py_tell());
157  if (py_seek != bp::object()) {
158  /* Make sure we can actually seek.
159  bzip2 readers from python have a seek method, but it fails
160  when they are in write mode.
161  */
162  py_seek(py_pos);
163  }
164  } catch (bp::error_already_set&) {
165  py_tell = bp::object();
166  py_seek = bp::object();
167  /* Boost.Python does not do any Python exception handling whatsoever
168  So we need to catch it by hand like so.
169  */
170  PyErr_Clear();
171  }
172  }
173 
174  if (py_write != bp::object()) {
175  // C-like string to make debugging easier
176  write_buffer = new char[buffer_size + 1];
177  write_buffer[buffer_size] = '\0';
178  setp(write_buffer, write_buffer + buffer_size); // 27.5.2.4.5 (5)
179  farthest_pptr = pptr();
180  } else {
181  // The first attempt at output will result in a call to overflow
182  setp(nullptr, nullptr);
183  }
184 
185  if (py_tell != bp::object()) {
186  off_type py_pos = bp::extract<off_type>(py_tell());
187  pos_of_read_buffer_end_in_py_file = py_pos;
188  pos_of_write_buffer_end_in_py_file = py_pos;
189  }
190  }
191 
192  /// constructor to enforce a mode (binary or text)
193  streambuf(bp::object& python_file_obj, char mode,
194  std::size_t buffer_size_ = 0)
195  : streambuf(python_file_obj, buffer_size_) {
196 #if 1
197  bp::object io_mod = bp::import("io");
198  CHECK_INVARIANT(io_mod, "module not found");
199  bp::object iobase = io_mod.attr("TextIOBase");
200  CHECK_INVARIANT(iobase, "base class not found");
201 #else
202  // using statics to save an undetermined amount of time results in
203  // alarming seg faults on windows. so we don't do it. Keep this here
204  // for the moment though in case someone manages to figure that out in
205  // the future
206  static bp::object io_mod = bp::object();
207  static bp::object iobase = bp::object();
208  if (!io_mod) io_mod = bp::import("io");
209  if (io_mod && !iobase) iobase = io_mod.attr("TextIOBase");
210  CHECK_INVARIANT(io_mod, "module not found");
211  CHECK_INVARIANT(iobase, "base class not found");
212 #endif
213 
214  df_isTextMode = PyObject_IsInstance(python_file_obj.ptr(), iobase.ptr());
215  switch (mode) {
216  case 's': /// yeah, is redundant, but it is somehow natural to do "s"
217  case 't':
218  if (!df_isTextMode)
219  throw ValueErrorException(
220  "Need a text mode file object like StringIO or a file opened "
221  "with mode 't'");
222  break;
223  case 'b':
224  if (df_isTextMode)
225  throw ValueErrorException(
226  "Need a binary mode file object like BytesIO or a file opened "
227  "with mode 'b'");
228  break;
229  default:
230  throw std::invalid_argument("bad mode character");
231  }
232  }
233 
234  /// Mundane destructor freeing the allocated resources
235  virtual ~streambuf() {
236  if (write_buffer) delete[] write_buffer;
237  }
238 
239  /// C.f. C++ standard section 27.5.2.4.3
240  /** It is essential to override this virtual function for the stream
241  member function readsome to work correctly (c.f. 27.6.1.3, paragraph 30)
242  */
243  virtual std::streamsize showmanyc() {
244  int_type const failure = traits_type::eof();
245  int_type status = underflow();
246  if (status == failure) return -1;
247  return egptr() - gptr();
248  }
249 
250  /// C.f. C++ standard section 27.5.2.4.3
251  virtual int_type underflow() {
252  int_type const failure = traits_type::eof();
253  if (py_read == bp::object()) {
254  throw std::invalid_argument(
255  "That Python file object has no 'read' attribute");
256  }
257  read_buffer = py_read(buffer_size);
258  char* read_buffer_data;
259  bp::ssize_t py_n_read;
260  if (PyBytes_AsStringAndSize(read_buffer.ptr(), &read_buffer_data,
261  &py_n_read) == -1) {
262  setg(nullptr, nullptr, nullptr);
263  throw std::invalid_argument(
264  "The method 'read' of the Python file object "
265  "did not return a string.");
266  }
267  off_type n_read = (off_type)py_n_read;
268  pos_of_read_buffer_end_in_py_file += n_read;
269  setg(read_buffer_data, read_buffer_data, read_buffer_data + n_read);
270  // ^^^27.5.2.3.1 (4)
271  if (n_read == 0) return failure;
272  return traits_type::to_int_type(read_buffer_data[0]);
273  }
274 
275  /// C.f. C++ standard section 27.5.2.4.5
277  if (py_write == bp::object()) {
278  throw std::invalid_argument(
279  "That Python file object has no 'write' attribute");
280  }
281  farthest_pptr = std::max(farthest_pptr, pptr());
282  off_type n_written = (off_type)(farthest_pptr - pbase());
283  off_type orig_n_written = n_written;
284  const unsigned int STD_ASCII = 0x7F;
285  if (df_isTextMode && c > STD_ASCII) {
286  // we're somewhere in the middle of a utf8 block. If we
287  // only write part of it we'll end up with an exception,
288  // so push everything that could be utf8 into the next block
289  while (n_written > 0 &&
290  static_cast<unsigned int>(write_buffer[n_written - 1]) > STD_ASCII) {
291  --n_written;
292  }
293  }
294  bp::str chunk(pbase(), pbase() + n_written);
295  py_write(chunk);
296 
297  if ((!df_isTextMode || c <= STD_ASCII) &&
298  !traits_type::eq_int_type(c, traits_type::eof())) {
299  py_write(traits_type::to_char_type(c));
300  n_written++;
301  }
302 
303  setp(pbase(), epptr());
304  // ^^^ 27.5.2.4.5 (5)
305  farthest_pptr = pptr();
306  if (n_written) {
307  pos_of_write_buffer_end_in_py_file += n_written;
308  if (df_isTextMode && c > STD_ASCII &&
309  !traits_type::eq_int_type(c, traits_type::eof())) {
310  size_t n_to_copy = orig_n_written - n_written;
311 
312  for (size_t i = 0; i < n_to_copy; ++i) {
313  sputc(write_buffer[n_written + i]);
314  ++farthest_pptr;
315  }
316  sputc(c);
317  ++farthest_pptr;
318  }
319  }
320  return traits_type::eq_int_type(c, traits_type::eof())
321  ? traits_type::not_eof(c)
322  : c;
323  }
324 
325  /// Update the python file to reflect the state of this stream buffer
326  /** Empty the write buffer into the Python file object and set the seek
327  position of the latter accordingly (C++ standard section 27.5.2.4.2).
328  If there is no write buffer or it is empty, but there is a non-empty
329  read buffer, set the Python file object seek position to the
330  seek position in that read buffer.
331  */
332  virtual int sync() {
333  int result = 0;
334  farthest_pptr = std::max(farthest_pptr, pptr());
335  if (farthest_pptr && farthest_pptr > pbase()) {
336  off_type delta = pptr() - farthest_pptr;
337  int_type status = overflow();
338  if (traits_type::eq_int_type(status, traits_type::eof())) result = -1;
339  if (py_seek != bp::object()) py_seek(delta, 1);
340  } else if (gptr() && gptr() < egptr()) {
341  if (py_seek != bp::object()) py_seek(gptr() - egptr(), 1);
342  }
343  return result;
344  }
345 
346  /// C.f. C++ standard section 27.5.2.4.2
347  /** This implementation is optimised to look whether the position is within
348  the buffers, so as to avoid calling Python seek or tell. It is
349  important for many applications that the overhead of calling into Python
350  is avoided as much as possible (e.g. parsers which may do a lot of
351  backtracking)
352  */
353  virtual pos_type seekoff(off_type off, std::ios_base::seekdir way,
354  std::ios_base::openmode which = std::ios_base::in |
355  std::ios_base::out) {
356  /* In practice, "which" is either std::ios_base::in or out
357  since we end up here because either seekp or seekg was called
358  on the stream using this buffer. That simplifies the code
359  in a few places.
360  */
361  int const failure = off_type(-1);
362 
363  if (py_seek == bp::object()) {
364  throw std::invalid_argument(
365  "That Python file object has no 'seek' attribute");
366  }
367 
368  // we need the read buffer to contain something!
369  if (which == std::ios_base::in && !gptr()) {
370  if (traits_type::eq_int_type(underflow(), traits_type::eof())) {
371  return failure;
372  }
373  }
374 
375  // compute the whence parameter for Python seek
376  int whence;
377  switch (way) {
378  case std::ios_base::beg:
379  whence = 0;
380  break;
381  case std::ios_base::cur:
382  whence = 1;
383  break;
384  case std::ios_base::end:
385  whence = 2;
386  break;
387  default:
388  return failure;
389  }
390 
391  // Let's have a go
392  boost::optional<off_type> result =
393  seekoff_without_calling_python(off, way, which);
394  if (!result) {
395  // we need to call Python
396  if (which == std::ios_base::out) overflow();
397  if (way == std::ios_base::cur) {
398  if (which == std::ios_base::in)
399  off -= egptr() - gptr();
400  else if (which == std::ios_base::out)
401  off += pptr() - pbase();
402  }
403  py_seek(off, whence);
404  result = off_type(bp::extract<off_type>(py_tell()));
405  if (which == std::ios_base::in) underflow();
406  }
407  return *result;
408  }
409 
410  /// C.f. C++ standard section 27.5.2.4.2
412  std::ios_base::openmode which = std::ios_base::in |
413  std::ios_base::out) {
414  return streambuf::seekoff(sp, std::ios_base::beg, which);
415  }
416 
417  private:
418  bp::object py_read, py_write, py_seek, py_tell;
419 
420  std::size_t buffer_size;
421 
422  /* This is actually a Python string and the actual read buffer is
423  its internal data, i.e. an array of characters. We use a Boost.Python
424  object so as to hold on it: as a result, the actual buffer can't
425  go away.
426  */
427  bp::object read_buffer;
428 
429  /* A mere array of char's allocated on the heap at construction time and
430  de-allocated only at destruction time.
431  */
432  char* write_buffer;
433  bool df_isTextMode;
434 
435  off_type pos_of_read_buffer_end_in_py_file,
436  pos_of_write_buffer_end_in_py_file;
437 
438  // the farthest place the buffer has been written into
439  char* farthest_pptr;
440 
441  boost::optional<off_type> seekoff_without_calling_python(
442  off_type off, std::ios_base::seekdir way, std::ios_base::openmode which) {
443  boost::optional<off_type> const failure;
444 
445  // Buffer range and current position
446  off_type buf_begin, buf_end, buf_cur, upper_bound;
447  off_type pos_of_buffer_end_in_py_file;
448  if (which == std::ios_base::in) {
449  pos_of_buffer_end_in_py_file = pos_of_read_buffer_end_in_py_file;
450  buf_begin = reinterpret_cast<std::streamsize>(eback());
451  buf_cur = reinterpret_cast<std::streamsize>(gptr());
452  buf_end = reinterpret_cast<std::streamsize>(egptr());
453  upper_bound = buf_end;
454  } else if (which == std::ios_base::out) {
455  pos_of_buffer_end_in_py_file = pos_of_write_buffer_end_in_py_file;
456  buf_begin = reinterpret_cast<std::streamsize>(pbase());
457  buf_cur = reinterpret_cast<std::streamsize>(pptr());
458  buf_end = reinterpret_cast<std::streamsize>(epptr());
459  farthest_pptr = std::max(farthest_pptr, pptr());
460  upper_bound = reinterpret_cast<std::streamsize>(farthest_pptr) + 1;
461  } else {
462  CHECK_INVARIANT(0, "unreachable code");
463  }
464 
465  // Sought position in "buffer coordinate"
466  off_type buf_sought;
467  if (way == std::ios_base::cur) {
468  buf_sought = buf_cur + off;
469  } else if (way == std::ios_base::beg) {
470  buf_sought = buf_end + (off - pos_of_buffer_end_in_py_file);
471  } else if (way == std::ios_base::end) {
472  return failure;
473  } else {
474  CHECK_INVARIANT(0, "unreachable code");
475  }
476 
477  // if the sought position is not in the buffer, give up
478  if (buf_sought < buf_begin || buf_sought >= upper_bound) return failure;
479 
480  // we are in wonderland
481  if (which == std::ios_base::in)
482  gbump(buf_sought - buf_cur);
483  else if (which == std::ios_base::out)
484  pbump(buf_sought - buf_cur);
485  return pos_of_buffer_end_in_py_file + (buf_sought - buf_end);
486  }
487 
488  public:
489  class istream : public std::istream {
490  public:
491  istream(streambuf& buf) : std::istream(&buf) {
492  exceptions(std::ios_base::badbit);
493  }
494 
496  // do nothing.
497  // This used to do:
498  // if (this->good()) this->sync();
499  // but that caused problems if the underlying file had been closed
500  // (see github #579) and really doesn't seem necessary for what we're
501  // doing.
502  }
503  };
504 
505  class ostream : public std::ostream {
506  public:
507  ostream(streambuf& buf) : std::ostream(&buf) {
508  exceptions(std::ios_base::badbit);
509  }
510 
512  if (this->good()) this->flush();
513  }
514  };
515 };
516 
517 // std::size_t streambuf::default_buffer_size = 1024;
518 
521 
522  streambuf_capsule(bp::object& python_file_obj, std::size_t buffer_size = 0)
523  : python_streambuf(python_file_obj, buffer_size) {}
524 };
525 
527  ostream(bp::object& python_file_obj, std::size_t buffer_size = 0)
528  : streambuf_capsule(python_file_obj, buffer_size),
530 
531  ~ostream() noexcept {
532  if (this->good()) {
533  this->flush();
534  }
535  }
536 };
537 } // namespace python
538 } // namespace boost_adaptbx
539 
540 #endif // GUARD
#define TEST_ASSERT(expr)
Definition: Invariant.h:152
#define CHECK_INVARIANT(expr, mess)
Definition: Invariant.h:101
Class to allow us to throw a ValueError from C++ and have it make it back to Python.
Definition: Exceptions.h:39
A stream buffer getting data from and putting data into a Python file object.
virtual int sync()
Update the python file to reflect the state of this stream buffer.
static const std::size_t default_buffer_size
The default size of the read and write buffer.
virtual std::streamsize showmanyc()
C.f. C++ standard section 27.5.2.4.3.
virtual pos_type seekoff(off_type off, std::ios_base::seekdir way, std::ios_base::openmode which=std::ios_base::in|std::ios_base::out)
C.f. C++ standard section 27.5.2.4.2.
virtual pos_type seekpos(pos_type sp, std::ios_base::openmode which=std::ios_base::in|std::ios_base::out)
C.f. C++ standard section 27.5.2.4.2.
streambuf(bp::object &python_file_obj, char mode, std::size_t buffer_size_=0)
constructor to enforce a mode (binary or text)
virtual int_type underflow()
C.f. C++ standard section 27.5.2.4.3.
streambuf(bp::object &python_file_obj, std::size_t buffer_size_=0)
Construct from a Python file object.
virtual int_type overflow(int_type c=traits_type_eof())
C.f. C++ standard section 27.5.2.4.5.
virtual ~streambuf()
Mundane destructor freeing the allocated resources.
ostream(bp::object &python_file_obj, std::size_t buffer_size=0)
streambuf_capsule(bp::object &python_file_obj, std::size_t buffer_size=0)