RDKit
Open-source cheminformatics and machine learning.
FPBReader.h
Go to the documentation of this file.
1 //
2 // Copyright (c) 2016 Greg Landrum
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 #include <RDGeneral/export.h>
11 #ifndef RD_FPBREADER_H_DEC2015
12 #define RD_FPBREADER_H_DEC2015
13 /*! \file FPBReader.h
14 
15  \brief contains a simple class for reading and searching FPB files
16 
17  \b Note that this functionality is experimental and the API may change
18  in future releases.
19 */
20 
21 #include <iostream>
22 #include <fstream>
23 #include <sstream>
24 #include <string>
27 
28 #include <cstdint>
29 #include <boost/shared_ptr.hpp>
30 #include <boost/shared_array.hpp>
31 
32 namespace RDKit {
33 namespace detail {
34 struct FPBReader_impl;
35 }
36 
37 //! class for reading and searching FPB files
38 /*!
39  basic usage:
40  \code
41  FPBReader reader("foo.fpb");
42  reader.init();
43  boost::shared_ptr<ExplicitBitVect> ebv = reader.getFP(95);
44  std::vector<std::pair<double, unsigned int> > nbrs =
45  reader.getTanimotoNeighbors(*ebv.get(), 0.70);
46  \endcode
47 
48  \b Note: this functionality is experimental and the API may change
49  in future releases.
50 
51  <b>Note on thread safety</b>
52  Operations that involve reading from the FPB file are not thread safe.
53  This means that the \c init() method is not thread safe and none of the
54  search operations are thread safe when an \c FPBReader is initialized in
55  \c lazyRead mode.
56 
57 */
59  public:
61 
62  {};
63  //! ctor for reading from a named file
64  /*!
65  \param fname the name of the file to read (it is opened in binary mode)
66  \param lazyRead if set to \c false all fingerprints from the file will be read
67  into memory when \c init() is called.
68  */
69  FPBReader(const char *fname, bool lazyRead = false) {
70  _initFromFilename(fname, lazyRead);  // opens the file; throws BadFileException if the stream is not usable
71  };
72  //! \overload
73  FPBReader(const std::string &fname, bool lazyRead = false) {
74  _initFromFilename(fname.c_str(), lazyRead);  // same setup path as the const char * ctor
75  };
76  //! ctor for reading from an open istream
77  /*!
78  \param inStream the stream to read from
79  \param takeOwnership if set, we will take over ownership of the stream pointer
80  \param lazyRead if set to \c false all fingerprints from the file will be read
81  into memory when \c init() is called.
82 
83  Some additional notes:
84  - if \c lazyRead is set, \c inStream must support the \c seekg() and \c
85  tellg() operations.
86 
87  */
88  FPBReader(std::istream *inStream, bool takeOwnership = true,
89  bool lazyRead = false)
90  : dp_istrm(inStream),  // stored as-is; nothing is read from the stream here
91  dp_impl(nullptr),  // no implementation object is created by this ctor
92  df_owner(takeOwnership),  // records whether this reader is responsible for deleting inStream
93  df_init(false),  // init() still has to be called before the reader is usable
94  df_lazyRead(lazyRead){};
96  destroy();
97  if (df_owner) delete dp_istrm;
98  dp_istrm = nullptr;
99  df_init = false;
100  };
101 
102  //! Read the data from the file and initialize internal data structures
103  /*!
104  This must be called before most of the other methods of this class.
105 
106  Some notes:
107  \li if \c lazyRead is not set, all fingerprints will be read into memory. This
108  can require substantial amounts of memory for large files.
109  \li For large files, this can take a long time.
110  \li If \c lazyRead and \c takeOwnership are both \c false it is safe to close
111  and delete inStream after calling \c init()
112  */
113  void init();
114  //! cleanup
115  /*!
116  Cleans up whatever memory was allocated during init(). Does nothing if the reader is not currently initialized, so repeated calls are harmless.
117  */
118  void cleanup() {
119  if (!df_init) return;  // not initialized: nothing to release
120  destroy();
121  df_init = false;  // mark the reader as uninitialized again
122  };
123  //! returns the requested fingerprint as an \c ExplicitBitVect
124  boost::shared_ptr<ExplicitBitVect> getFP(unsigned int idx) const;
125  //! returns the requested fingerprint as an array of bytes
126  boost::shared_array<std::uint8_t> getBytes(unsigned int idx) const;
127 
128  //! returns the id of the requested fingerprint
129  std::string getId(unsigned int idx) const;
130  //! returns the (fingerprint, id) pair for the fingerprint at index \c idx
131  std::pair<boost::shared_ptr<ExplicitBitVect>, std::string> operator[](
132  unsigned int idx) const {
133  return std::make_pair(getFP(idx), getId(idx));  // .first comes from getFP(idx), .second from getId(idx)
134  };
135 
136  //! returns beginning and end indices of fingerprints having on-bit counts
137  //! within the range (including end points)
138  std::pair<unsigned int, unsigned int> getFPIdsInCountRange(
139  unsigned int minCount, unsigned int maxCount);
140 
141  //! returns the number of fingerprints
142  unsigned int length() const;
143  //! returns the number of bits in our fingerprints
144  unsigned int nBits() const;
145 
146  //! returns the tanimoto similarity between the specified fingerprint and the
147  //! provided fingerprint
148  double getTanimoto(unsigned int idx, const std::uint8_t *bv) const;
149  //! \overload
150  double getTanimoto(unsigned int idx,
151  boost::shared_array<std::uint8_t> bv) const {
152  return getTanimoto(idx, bv.get());  // forwards the raw byte pointer to the const std::uint8_t * overload
153  };
154  //! \overload
155  double getTanimoto(unsigned int idx, const ExplicitBitVect &ebv) const;
156 
157  //! returns tanimoto neighbors that are within a similarity threshold
158  /*!
159  The result vector of (similarity,index) pairs is sorted in order
160  of decreasing similarity
161 
162  \param bv the query fingerprint
163  \param threshold the minimum similarity to return
164  \param usePopcountScreen if this is true (the default) the popcount of the
165  neighbors will be used to reduce the number of calculations that need
166  to be done
167 
168  */
169  std::vector<std::pair<double, unsigned int>> getTanimotoNeighbors(
170  const std::uint8_t *bv, double threshold = 0.7,
171  bool usePopcountScreen = true) const;
172  //! \overload
173  std::vector<std::pair<double, unsigned int>> getTanimotoNeighbors(
174  boost::shared_array<std::uint8_t> bv, double threshold = 0.7,
175  bool usePopcountScreen = true) const {
176  return getTanimotoNeighbors(bv.get(), threshold, usePopcountScreen);  // forwards to the const std::uint8_t * overload with the same options
177  };
178  //! \overload
179  std::vector<std::pair<double, unsigned int>> getTanimotoNeighbors(
180  const ExplicitBitVect &ebv, double threshold = 0.7,
181  bool usePopcountScreen = true) const;
182 
183  //! returns the Tversky similarity between the specified fingerprint and the
184  //! provided fingerprint
185  /*!
186 
187  \param idx the fingerprint to compare to
188  \param bv the query fingerprint
189  \param ca the Tversky a coefficient
190  \param cb the Tversky b coefficient
191 
192  */
193  double getTversky(unsigned int idx, const std::uint8_t *bv, double ca,
194  double cb) const;
195  //! \overload
196  double getTversky(unsigned int idx, boost::shared_array<std::uint8_t> bv,
197  double ca, double cb) const {
198  return getTversky(idx, bv.get(), ca, cb);  // forwards the raw byte pointer to the const std::uint8_t * overload
199  };
200  //! \overload
201  double getTversky(unsigned int idx, const ExplicitBitVect &ebv, double ca,
202  double cb) const;
203 
204  //! returns Tversky neighbors that are within a similarity threshold
205  /*!
206  The result vector of (similarity,index) pairs is sorted in order
207  of decreasing similarity
208 
209  \param bv the query fingerprint
210  \param ca the Tversky a coefficient
211  \param cb the Tversky b coefficient
212  \param threshold the minimum similarity to return
213  \param usePopcountScreen if this is true (the default) the popcount of the
214  neighbors will be used to reduce the number of calculations that need
215  to be done
216 
217  */
218  std::vector<std::pair<double, unsigned int>> getTverskyNeighbors(
219  const std::uint8_t *bv, double ca, double cb, double threshold = 0.7,
220  bool usePopcountScreen = true) const;
221  //! \overload
222  std::vector<std::pair<double, unsigned int>> getTverskyNeighbors(
223  boost::shared_array<std::uint8_t> bv, double ca, double cb,
224  double threshold = 0.7, bool usePopcountScreen = true) const {
225  return getTverskyNeighbors(bv.get(), ca, cb, threshold, usePopcountScreen);  // forwards to the const std::uint8_t * overload with the same options
226  };
227  //! \overload
228  std::vector<std::pair<double, unsigned int>> getTverskyNeighbors(
229  const ExplicitBitVect &ebv, double ca, double cb, double threshold = 0.7,
230  bool usePopcountScreen = true) const;
231 
232  //! returns indices of all fingerprints that completely contain this one
233  /*! (i.e. where all the bits set in the query are also set in the db
234  molecule)
235  */
236  std::vector<unsigned int> getContainingNeighbors(
237  const std::uint8_t *bv) const;
238  //! \overload
239  std::vector<unsigned int> getContainingNeighbors(
240  boost::shared_array<std::uint8_t> bv) const {
241  return getContainingNeighbors(bv.get());  // forwards the raw byte pointer to the const std::uint8_t * overload
242  };
243  //! \overload
244  std::vector<unsigned int> getContainingNeighbors(
245  const ExplicitBitVect &ebv) const;
246 
247  private:
248  std::istream *dp_istrm{nullptr};
249  detail::FPBReader_impl *dp_impl{nullptr}; // implementation details
250  bool df_owner{false};
251  bool df_init{false};
252  bool df_lazyRead{false};
253 
254  // disable automatic copy constructors and assignment operators
255  // for this class and its subclasses. They will likely be
256  // carrying around stream pointers and copying those is a recipe
257  // for disaster.
258  FPBReader(const FPBReader &);
259  FPBReader &operator=(const FPBReader &);
260  void destroy();
261  void _initFromFilename(const char *fname, bool lazyRead) {  // shared setup for the file-name ctors: opens fname and records the stream; throws BadFileException on failure
262  std::istream *tmpStream = static_cast<std::istream *>(
263  new std::ifstream(fname, std::ios_base::binary));  // binary mode
264  if (!(*tmpStream) || (tmpStream->bad())) {  // the stream reports an error state after the open attempt
265  std::ostringstream errout;
266  errout << "Bad input file " << fname;
267  delete tmpStream;  // release the stream before throwing so it is not leaked
268  throw BadFileException(errout.str());
269  }
270  dp_istrm = tmpStream;
271  dp_impl = nullptr;
272  df_owner = true;  // the stream was allocated here, so this reader owns it
273  df_init = false;  // nothing has been read yet; init() still has to be called
274  df_lazyRead = lazyRead;
275  }
276 };
277 } // namespace RDKit
278 #endif
a class for bit vectors that are densely occupied
class for reading and searching FPB files
Definition: FPBReader.h:58
void cleanup()
cleanup
Definition: FPBReader.h:118
double getTversky(unsigned int idx, const std::uint8_t *bv, double ca, double cb) const
std::vector< std::pair< double, unsigned int > > getTanimotoNeighbors(const ExplicitBitVect &ebv, double threshold=0.7, bool usePopcountScreen=true) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
std::pair< unsigned int, unsigned int > getFPIdsInCountRange(unsigned int minCount, unsigned int maxCount)
unsigned int length() const
returns the number of fingerprints
double getTanimoto(unsigned int idx, const std::uint8_t *bv) const
boost::shared_ptr< ExplicitBitVect > getFP(unsigned int idx) const
returns the requested fingerprint as an ExplicitBitVect
boost::shared_array< std::uint8_t > getBytes(unsigned int idx) const
returns the requested fingerprint as an array of bytes
double getTanimoto(unsigned int idx, boost::shared_array< std::uint8_t > bv) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: FPBReader.h:150
double getTversky(unsigned int idx, boost::shared_array< std::uint8_t > bv, double ca, double cb) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: FPBReader.h:196
std::vector< std::pair< double, unsigned int > > getTverskyNeighbors(const ExplicitBitVect &ebv, double ca, double cb, double threshold=0.7, bool usePopcountScreen=true) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
double getTversky(unsigned int idx, const ExplicitBitVect &ebv, double ca, double cb) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
FPBReader(std::istream *inStream, bool takeOwnership=true, bool lazyRead=false)
ctor for reading from an open istream
Definition: FPBReader.h:88
std::vector< unsigned int > getContainingNeighbors(const ExplicitBitVect &ebv) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
FPBReader(const char *fname, bool lazyRead=false)
ctor for reading from a named file
Definition: FPBReader.h:69
FPBReader(const std::string &fname, bool lazyRead=false)
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: FPBReader.h:73
std::vector< unsigned int > getContainingNeighbors(boost::shared_array< std::uint8_t > bv) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: FPBReader.h:239
std::vector< std::pair< double, unsigned int > > getTverskyNeighbors(const std::uint8_t *bv, double ca, double cb, double threshold=0.7, bool usePopcountScreen=true) const
returns Tversky neighbors that are within a similarity threshold
unsigned int nBits() const
returns the number of bits in our fingerprints
std::vector< unsigned int > getContainingNeighbors(const std::uint8_t *bv) const
returns indices of all fingerprints that completely contain this one
double getTanimoto(unsigned int idx, const ExplicitBitVect &ebv) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
std::string getId(unsigned int idx) const
returns the id of the requested fingerprint
std::pair< boost::shared_ptr< ExplicitBitVect >, std::string > operator[](unsigned int idx) const
returns the fingerprint and id of the requested fingerprint
Definition: FPBReader.h:131
std::vector< std::pair< double, unsigned int > > getTanimotoNeighbors(boost::shared_array< std::uint8_t > bv, double threshold=0.7, bool usePopcountScreen=true) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: FPBReader.h:173
void init()
Read the data from the file and initialize internal data structures.
std::vector< std::pair< double, unsigned int > > getTanimotoNeighbors(const std::uint8_t *bv, double threshold=0.7, bool usePopcountScreen=true) const
returns tanimoto neighbors that are within a similarity threshold
std::vector< std::pair< double, unsigned int > > getTverskyNeighbors(boost::shared_array< std::uint8_t > bv, double ca, double cb, double threshold=0.7, bool usePopcountScreen=true) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: FPBReader.h:222
#define RDKIT_DATASTRUCTS_EXPORT
Definition: export.h:138
Std stuff.
Definition: Abbreviations.h:17
RDKIT_FINGERPRINTS_EXPORT ExplicitBitVect * getFP(const ROMol &mol, FPType fPType)