RDKit
Open-source cheminformatics and machine learning.
SubstructLibrary.h
Go to the documentation of this file.
1 // Copyright (c) 2017-2021, Novartis Institutes for BioMedical Research Inc.
2 // and other RDKit contributors
3 //
4 // All rights reserved.
5 //
6 // Redistribution and use in source and binary forms, with or without
7 // modification, are permitted provided that the following conditions are
8 // met:
9 //
10 // * Redistributions of source code must retain the above copyright
11 // notice, this list of conditions and the following disclaimer.
12 // * Redistributions in binary form must reproduce the above
13 // copyright notice, this list of conditions and the following
14 // disclaimer in the documentation and/or other materials provided
15 // with the distribution.
16 // * Neither the name of Novartis Institutes for BioMedical Research Inc.
17 // nor the names of its contributors may be used to endorse or promote
18 // products derived from this software without specific prior written
19 // permission.
20 //
21 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 //
33 #ifndef RDK_SUBSTRUCT_LIBRARY
34 #define RDK_SUBSTRUCT_LIBRARY
35 #include <utility>
36 
37 #include <RDGeneral/export.h>
38 #include <GraphMol/RDKitBase.h>
39 #include <GraphMol/MolPickler.h>
40 #include <GraphMol/MolBundle.h>
46 #include <DataStructs/BitOps.h>
47 #include <GraphMol/MolOps.h>
49 
50 #include <algorithm>
51 #include <string>
52 #include <boost/lexical_cast.hpp>
53 
54 namespace RDKit {
55 
57 
58 //! Base class API for holding molecules to substructure search.
59 /*!
60  This is an API that hides the implementation details used for
61  indexing molecules for substructure searching. It simply
62  provides an API for adding and getting molecules from a set.
63  */
65  public:
66  virtual ~MolHolderBase() {}
67 
68  //! Add a new molecule to the substructure search library
69  //! Returns the molecules index in the library
70  virtual unsigned int addMol(const ROMol &m) = 0;
71 
72  // implementations should throw IndexError on out of range
73  virtual boost::shared_ptr<ROMol> getMol(unsigned int) const = 0;
74 
75  //! Get the current library size
76  virtual unsigned int size() const = 0;
77 };
78 
79 //! Concrete class that holds molecules in memory
80 /*!
81  This is currently one of the faster implementations.
82  However it is very memory intensive.
83 */
85  std::vector<boost::shared_ptr<ROMol>> mols;
86 
87  public:
88  MolHolder() : MolHolderBase(), mols() {}
89 
90  unsigned int addMol(const ROMol &m) override {
91  mols.push_back(boost::make_shared<ROMol>(m));
92  return size() - 1;
93  }
94 
95  boost::shared_ptr<ROMol> getMol(unsigned int idx) const override {
96  if (idx >= mols.size()) throw IndexErrorException(idx);
97  return mols[idx];
98  }
99 
100  unsigned int size() const override {
101  return rdcast<unsigned int>(mols.size());
102  }
103 
104  std::vector<boost::shared_ptr<ROMol>> &getMols() { return mols; }
105  const std::vector<boost::shared_ptr<ROMol>> &getMols() const { return mols; }
106 };
107 
108 //! Concrete class that holds binary cached molecules in memory
109 /*!
110  This implementation uses quite a bit less memory than the
111  non cached implementation. However, due to the reduced speed
112  it should be used in conjunction with a pattern fingerprinter.
113 
114  See RDKit::FPHolder
115 */
117  std::vector<std::string> mols;
118 
119  public:
121 
122  unsigned int addMol(const ROMol &m) override {
123  mols.emplace_back();
124  MolPickler::pickleMol(m, mols.back());
125  return size() - 1;
126  }
127 
128  //! Adds a pickled binary molecule, no validity checking of the input
129  //! is done.
130  unsigned int addBinary(const std::string &pickle) {
131  mols.push_back(pickle);
132  return size() - 1;
133  }
134 
135  boost::shared_ptr<ROMol> getMol(unsigned int idx) const override {
136  if (idx >= mols.size()) throw IndexErrorException(idx);
137  boost::shared_ptr<ROMol> mol(new ROMol);
138  MolPickler::molFromPickle(mols[idx], mol.get());
139  return mol;
140  }
141 
142  unsigned int size() const override {
143  return rdcast<unsigned int>(mols.size());
144  }
145 
146  std::vector<std::string> &getMols() { return mols; }
147  const std::vector<std::string> &getMols() const { return mols; }
148 };
149 
150 //! Concrete class that holds smiles strings in memory
151 /*!
152  This implementation uses quite a bit less memory than the
153  cached binary or uncached implementation. However, due to the
154  reduced speed it should be used in conjunction with a pattern
155  fingerprinter.
156 
157  See RDKit::FPHolder
158 */
160  : public MolHolderBase {
161  std::vector<std::string> mols;
162 
163  public:
165 
166  unsigned int addMol(const ROMol &m) override {
167  bool doIsomericSmiles = true;
168  mols.push_back(MolToSmiles(m, doIsomericSmiles));
169  return size() - 1;
170  }
171 
172  //! Add a smiles to the dataset, no validation is done
173  //! to the inputs.
174  unsigned int addSmiles(const std::string &smiles) {
175  mols.push_back(smiles);
176  return size() - 1;
177  }
178 
179  boost::shared_ptr<ROMol> getMol(unsigned int idx) const override {
180  if (idx >= mols.size()) throw IndexErrorException(idx);
181 
182  boost::shared_ptr<ROMol> mol(SmilesToMol(mols[idx]));
183  return mol;
184  }
185 
186  unsigned int size() const override {
187  return rdcast<unsigned int>(mols.size());
188  }
189 
190  std::vector<std::string> &getMols() { return mols; }
191  const std::vector<std::string> &getMols() const { return mols; }
192 };
193 
194 //! Concrete class that holds trusted smiles strings in memory
195 /*!
196  A trusted smiles is essentially a smiles string that
197  RDKit has generated. This indicates that fewer
198  sanitization steps are required. See
199  http://rdkit.blogspot.com/2016/09/avoiding-unnecessary-work-and.html
200 
201  This implementation uses quite a bit less memory than the
202  cached binary or uncached implementation. However, due to the
203  reduced speed it should be used in conjunction with a pattern
204  fingerprinter.
205 
206  See RDKit::FPHolder
207 */
209  : public MolHolderBase {
210  std::vector<std::string> mols;
211 
212  public:
214 
215  unsigned int addMol(const ROMol &m) override {
216  bool doIsomericSmiles = true;
217  mols.push_back(MolToSmiles(m, doIsomericSmiles));
218  return size() - 1;
219  }
220 
221  //! Add a smiles to the dataset, no validation is done
222  //! to the inputs.
223  unsigned int addSmiles(const std::string &smiles) {
224  mols.push_back(smiles);
225  return size() - 1;
226  }
227 
228  boost::shared_ptr<ROMol> getMol(unsigned int idx) const override {
229  if (idx >= mols.size()) throw IndexErrorException(idx);
230 
231  RWMol *m = SmilesToMol(mols[idx], 0, false);
232  if (m) {
233  m->updatePropertyCache();
234  }
235  return boost::shared_ptr<ROMol>(m);
236  }
237 
238  unsigned int size() const override {
239  return rdcast<unsigned int>(mols.size());
240  }
241 
242  std::vector<std::string> &getMols() { return mols; }
243  const std::vector<std::string> &getMols() const { return mols; }
244 };
245 
246 //! Base FPI for the fingerprinter used to rule out impossible matches
248  std::vector<ExplicitBitVect *> fps;
249 
250  public:
251  virtual ~FPHolderBase() {
252  for (size_t i = 0; i < fps.size(); ++i) delete fps[i];
253  }
254 
255  virtual unsigned int size() const { return rdcast<unsigned int>(fps.size()); }
256 
257  //! Adds a molecule to the fingerprinter
258  unsigned int addMol(const ROMol &m) {
259  fps.push_back(makeFingerprint(m));
260  return rdcast<unsigned int>(fps.size() - 1);
261  }
262 
263  //! Adds a raw bit vector pointer to the fingerprinter, which takes ownership
264  //! PLEASE NOTE: make sure that the passed ExplicitBitVect
265  //! is compatible with the one generated by makeFingerprint()
266  unsigned int addFingerprint(ExplicitBitVect *v) {
267  fps.push_back(v);
268  return rdcast<unsigned int>(fps.size() - 1);
269  }
270 
271  //! Adds a raw bit vector to the fingerprinter
272  //! PLEASE NOTE: make sure that the passed ExplicitBitVect
273  //! is compatible with the one generated by makeFingerprint()
274  unsigned int addFingerprint(const ExplicitBitVect &v) {
275  return addFingerprint(new ExplicitBitVect(v));
276  }
277 
278  //! Return false if a substructure search can never match the molecule
279  bool passesFilter(unsigned int idx, const ExplicitBitVect &query) const {
280  if (idx >= fps.size()) throw IndexErrorException(idx);
281 
282  return AllProbeBitsMatch(query, *fps[idx]);
283  }
284 
285  //! Get the bit vector at the specified index (throws IndexError if out of
286  //! range)
287  const ExplicitBitVect &getFingerprint(unsigned int idx) const {
288  if (idx >= fps.size()) throw IndexErrorException(idx);
289  return *fps[idx];
290  }
291 
292  //! make the query vector
293  //! Caller owns the vector!
294  virtual ExplicitBitVect *makeFingerprint(const ROMol &m) const = 0;
295 
296  std::vector<ExplicitBitVect *> &getFingerprints() { return fps; }
297  const std::vector<ExplicitBitVect *> &getFingerprints() const { return fps; }
298 };
299 
300 //! Uses the pattern fingerprinter with a user-defined number of bits (default:
301 //! 2048) to rule out matches
303  unsigned int numBits;
304 
305  public:
306  PatternHolder() : FPHolderBase(), numBits(defaultNumBits()) {}
307  PatternHolder(unsigned int numBits) : FPHolderBase(), numBits(numBits) {}
308  //! Caller owns the vector!
309  ExplicitBitVect *makeFingerprint(const ROMol &m) const override {
310  return PatternFingerprintMol(m, numBits);
311  }
312  const unsigned int &getNumBits() const { return numBits; };
313  unsigned int &getNumBits() { return numBits; };
//! Number of fingerprint bits used by the default constructor.
static unsigned int defaultNumBits() {
  constexpr unsigned int DEFAULT_NUM_BITS = 2048;
  return DEFAULT_NUM_BITS;
}
318 };
319 
321  : public PatternHolder {
322  public:
324  TautomerPatternHolder(unsigned int numBits) : PatternHolder(numBits) {}
325  ExplicitBitVect *makeFingerprint(const ROMol &m) const override {
326  std::vector<unsigned int> *atomCounts = nullptr;
327  ExplicitBitVect *setOnlyBits = nullptr;
328  const bool tautomericFingerprint = true;
329  return PatternFingerprintMol(m, getNumBits(), atomCounts, setOnlyBits,
330  tautomericFingerprint);
331  }
332 };
333 
335 public:
336  virtual ~KeyHolderBase() {}
337 
338  //! Add a key to the database getting it from the molecule
339  virtual unsigned int addMol(const ROMol &m) = 0;
340 
341  //! Add a key to the database, this needs to be in the same order
342  //! as the molecule, no validation is done
343  virtual unsigned int addKey(const std::string &) = 0;
344 
345 //! Get the key at the requested index
346  // implementations should throw IndexError on out of range
347  virtual const std::string & getKey(unsigned int) const = 0;
348 
349 //! Get the keys for a list of indices
350  virtual std::vector<std::string> getKeys(const std::vector<unsigned int> &indices) const = 0;
351 //! Get the current key holder size
352  virtual unsigned int size() const = 0;
353 };
354 
356  std::string propname;
357  std::vector<std::string> keys;
358  const std::string empty_string = {};
359 
360 public:
361  KeyFromPropHolder(const std::string &propname = "_Name") : propname(propname) {
362  }
363 
364  std::string &getPropName() { return propname; }
365  const std::string &getPropName() const { return propname; }
366 
367  std::vector<std::string> &getKeys() { return keys; }
368  const std::vector<std::string> &getKeys() const { return keys; }
369 
370  unsigned int addMol(const ROMol &m) override {
371  std::string key;
372  if (m.getPropIfPresent(propname, key)) {
373  keys.push_back(std::move(key));
374  } else {
375  // XXX is this a warning? it could be verbose. Should we push back the string repr of the
376  // numeric index?
377  const static std::string prefix("LIBIDX-");
378  keys.emplace_back(prefix + boost::lexical_cast<std::string>(keys.size()));
379  }
380  return keys.size() - 1u;
381  };
382 
383  unsigned int addKey(const std::string &key) override {
384  keys.push_back(key);
385  return keys.size() - 1u;
386  }
387 
388  const std::string & getKey(unsigned int idx) const override {
389  if (idx >= keys.size()) throw IndexErrorException(idx);
390  return keys[idx];
391  }
392 
393  std::vector<std::string> getKeys(const std::vector<unsigned int> &indices) const override{
394  std::vector<std::string> res;
395  std::transform(indices.begin(), indices.end(), std::back_inserter(res),
396  [=](unsigned idx){return keys.at(idx);});
397  return res;
398  }
399  unsigned int size() const override {
400  return keys.size();
401  }
402 
403 };
404 
405 //! Substructure Search a library of molecules
406 /*! This class allows for multithreaded substructure searches of
407  large datasets.
408 
409  The implementations can use fingerprints to speed up searches
410  and have molecules cached as binary forms to reduce memory
411  usage.
412 
413  basic usage:
414  \code
415  SubstructLibrary lib;
416  lib.addMol(mol);
417  std::vector<unsigned int> results = lib.getMatches(query);
418  for(std::vector<unsigned int>::const_iterator matchIndex=results.begin();
419  matchIndex != results.end();
420  ++matchIndex) {
421  boost::shared_ptr<ROMol> match = lib.getMol(*matchIndex);
422  }
423  \endcode
424 
425  Using different mol holders and pattern fingerprints.
426 
427  \code
428  boost::shared_ptr<CachedTrustedSmilesMolHolder> molHolder = \
429  boost::make_shared<CachedTrustedSmilesMolHolder>();
430  boost::shared_ptr<PatternHolder> patternHolder = \
431  boost::make_shared<PatternHolder>();
432 
433  SubstructLibrary lib(molHolder, patternHolder);
434  lib.addMol(mol);
435  \endcode
436 
437  Cached molecule holders create molecules on demand. There are currently
438  three styles of cached molecules.
439 
440  CachedMolHolder: stores molecules in the rdkit binary format.
441  CachedSmilesMolHolder: stores molecules in smiles format.
442  CachedTrustedSmilesMolHolder: stores molecules in smiles format.
443 
444  The CachedTrustedSmilesMolHolder is made to add molecules from
445  a trusted source. This makes the basic assumption that RDKit was
446  used to sanitize and canonicalize the smiles string. In practice
447  this is considerably faster than using arbitrary smiles strings since
448  certain assumptions can be made. Molecules generated from trusted
449  smiles do not have ring information (although this is created
450  in the molecule being searched if necessary).
451 
452  When loading from external data, as opposed to using the "addMol" API,
453  care must be taken to ensure that the pattern fingerprints and smiles
454  are synchronized.
455 
456  Each pattern holder has an API point for making its fingerprint. This
457  is useful to ensure that the pattern stored in the database will be
458  compatible with the patterns made when analyzing queries.
459 
460  \code
461  boost::shared_ptr<CachedTrustedSmilesMolHolder> molHolder = \
462  boost::make_shared<CachedTrustedSmilesMolHolder>();
463  boost::shared_ptr<PatternHolder> patternHolder = \
464  boost::make_shared<PatternHolder>();
465 
466  // the PatternHolder instance is able to make fingerprints.
467  // These, of course, can be read from a file. For demonstration
468  // purposes we construct them here.
469  const std::string trustedSmiles = "c1ccccc1";
470  ROMol *m = SmilesToMol(trustedSmiles);
471  const ExplicitBitVect *bitVector = patternHolder->makeFingerprint(*m);
472 
473  // The trusted smiles and bitVector can be read from any source.
474  // This is the fastest way to load a substruct library.
475  molHolder->addSmiles( trustedSmiles );
476  patternHolder->addFingerprint( *bitVector );
477  SubstructLibrary lib(molHolder, patternHolder);
478  delete m;
479  delete bitVector;
480  \endcode
481 
482  Finally, using the KeyFromPropHolder will store user ids or keys.
483  By default, it uses RDKit's default _Name prop, but can be changed
484  to any property.
485 
486  \code
487  boost::shared_ptr<CachedTrustedSmilesMolHolder> molHolder = \
488  boost::make_shared<CachedTrustedSmilesMolHolder>();
489  boost::shared_ptr<KeyFromPropHolder> keyHolder = \
490  boost::make_shared<KeyFromPropHolder>();
491  SubstructLibrary lib(molHolder, keyHolder);
492  ...
493 
494 // Keys can be retrieved one at a time or in bulk through the key holder
495 auto key = lib.getKeys().getKey(idx);
496 auto keys = lib.getKeys().getKeys(lib.getMatches(query));
497  \endcode
498 
499 */
501  boost::shared_ptr<MolHolderBase> molholder;
502  boost::shared_ptr<FPHolderBase> fpholder;
503  boost::shared_ptr<KeyHolderBase> keyholder;
504 
505  MolHolderBase *mols; // used for a small optimization
506  FPHolderBase *fps{nullptr};
507  bool is_tautomerquery = false;
508  std::vector<unsigned int> searchOrder;
509 
510  public:
512  : molholder(new MolHolder), fpholder(), keyholder(), mols(molholder.get()) {}
513 
514  SubstructLibrary(boost::shared_ptr<MolHolderBase> molecules)
515  : molholder(std::move(molecules)),
516  fpholder(),
517  keyholder(),
518  mols(molholder.get()),
519  fps(nullptr) {}
520 
521  SubstructLibrary(boost::shared_ptr<MolHolderBase> molecules,
522  boost::shared_ptr<FPHolderBase> fingerprints)
523  : molholder(std::move(molecules)),
524  fpholder(std::move(fingerprints)),
525  keyholder(),
526  mols(molholder.get()),
527  fps(fpholder.get()) {
528  if (fpholder.get() &&
529  dynamic_cast<TautomerPatternHolder *>(fpholder.get()) != nullptr) {
530  is_tautomerquery = true;
531  }
532  }
533 
534  SubstructLibrary(boost::shared_ptr<MolHolderBase> molecules,
535  boost::shared_ptr<KeyHolderBase> keys)
536  : molholder(std::move(molecules)),
537  fpholder(),
538  keyholder(std::move(keys)),
539  mols(molholder.get()),
540  fps(nullptr) {
541  if (fpholder.get() &&
542  dynamic_cast<TautomerPatternHolder *>(fpholder.get()) != nullptr) {
543  is_tautomerquery = true;
544  }
545  }
546 
547  SubstructLibrary(boost::shared_ptr<MolHolderBase> molecules,
548  boost::shared_ptr<FPHolderBase> fingerprints,
549  boost::shared_ptr<KeyHolderBase> keys)
550  : molholder(std::move(molecules)),
551  fpholder(std::move(fingerprints)),
552  keyholder(std::move(keys)),
553  mols(molholder.get()),
554  fps(fpholder.get()) {
555  if (fpholder.get() &&
556  dynamic_cast<TautomerPatternHolder *>(fpholder.get()) != nullptr) {
557  is_tautomerquery = true;
558  }
559  }
560 
561  SubstructLibrary(const std::string &pickle)
562  : molholder(new MolHolder),
563  fpholder(),
564  mols(molholder.get()),
565  fps(nullptr) {
566  initFromString(pickle);
567  if (fpholder.get() &&
568  dynamic_cast<TautomerPatternHolder *>(fpholder.get()) != nullptr) {
569  is_tautomerquery = true;
570  }
571  }
572 
573  //! Get the underlying molecule holder implementation
574  boost::shared_ptr<MolHolderBase> &getMolHolder() { return molholder; }
575 
576  const boost::shared_ptr<MolHolderBase> &getMolHolder() const {
577  return molholder;
578  }
579 
580 //! Get the underlying fingerprint holder implementation
581  boost::shared_ptr<FPHolderBase> &getFpHolder() { return fpholder; }
582 
583 //! Get the underlying fingerprint holder implementation
584  const boost::shared_ptr<FPHolderBase> &getFpHolder() const {
585  return fpholder;
586  }
587 
588 //! Get the underlying key holder implementation
589  boost::shared_ptr<KeyHolderBase> &getKeyHolder() { return keyholder; }
590 
591 //! Get the underlying key holder implementation
592  const boost::shared_ptr<KeyHolderBase> &getKeyHolder() const {
593  return keyholder;
594  }
595 
596  const MolHolderBase &getMolecules() const {
597  PRECONDITION(mols, "Molecule holder NULL in SubstructLibrary");
598  return *mols;
599  }
600 
601  //! Get the underlying fingerprint implementation.
602  /*! Throws a value error if no fingerprints have been set */
604  if (!fps)
605  throw ValueErrorException("Substruct Library does not have fingerprints");
606  return *fps;
607  }
608 
609  const FPHolderBase &getFingerprints() const {
610  if (!fps)
611  throw ValueErrorException("Substruct Library does not have fingerprints");
612  return *fps;
613  }
614 
615  //! Get the underlying key holder implementation.
616 /*! Throws a value error if no key holder has been set */
618  if(!keyholder.get())
619  throw ValueErrorException("Substruct Library does not have fingerprints");
620  return *keyholder.get();
621  }
622 
623  //! Get the underlying key holder implementation.
624  /*! Throws a value error if no keyholder have been set */
625  const KeyHolderBase &getKeys() const {
626  if(!keyholder.get())
627  throw ValueErrorException("Substruct Library does not have fingerprints");
628  return *keyholder.get();
629  }
630 
631  //! Add a molecule to the library
632  /*!
633  \param mol Molecule to add
634 
635  returns index for the molecule in the library
636  */
637  unsigned int addMol(const ROMol &mol);
638 
639  //! Get the matching indices for the query
640  /*!
641  \param query Query or Tautomer Query to match against molecules
642  \param recursionPossible flags whether or not recursive matches are allowed
643  [default true]
644  \param useChirality use atomic CIP codes as part of the comparison
645  [default true]
646  \param useQueryQueryMatches if set, the contents of atom and bond queries
647  will be used as part of the matching
648  [default false]
649  \param numThreads If -1 use all available processors [default -1]
650  \param maxResults Maximum results to return, -1 means return all
651  [default -1]
652  */
653  template <class Query>
654  std::vector<unsigned int> getMatches(const Query &query,
655  bool recursionPossible = true,
656  bool useChirality = true,
657  bool useQueryQueryMatches = false,
658  int numThreads = -1,
659  int maxResults = -1) const {
661  params.recursionPossible = recursionPossible;
662  params.useChirality = useChirality;
663  params.useQueryQueryMatches = useQueryQueryMatches;
664  return getMatches(query, 0, size(), params, numThreads, maxResults);
665  }
666  //! overload
667  template <class Query>
668  std::vector<unsigned int> getMatches(const Query &query,
669  const SubstructMatchParameters &params,
670  int numThreads = -1,
671  int maxResults = -1) const {
672  return getMatches(query, 0, size(), params, numThreads, maxResults);
673  }
674  //! Get the matching indices for the query between the given indices
675  /*!
676  \param query Query to match against molecules
677  \param startIdx Start index of the search
678  \param endIdx Ending idx (non-inclusive) of the search.
679  \param recursionPossible flags whether or not recursive matches are allowed
680  [default true]
681  \param useChirality use atomic CIP codes as part of the comparison
682  [default true]
683  \param useQueryQueryMatches if set, the contents of atom and bond queries
684  will be used as part of the matching
685  [default false]
686  \param numThreads If -1 use all available processors [default -1]
687  \param maxResults Maximum results to return, -1 means return all
688  [default -1]
689  */
690  template <class Query>
691  std::vector<unsigned int> getMatches(
692  const Query &query, unsigned int startIdx, unsigned int endIdx,
693  bool recursionPossible = true, bool useChirality = true,
694  bool useQueryQueryMatches = false, int numThreads = -1,
695  int maxResults = -1) const {
697  params.recursionPossible = recursionPossible;
698  params.useChirality = useChirality;
699  params.useQueryQueryMatches = useQueryQueryMatches;
700  return getMatches(query, startIdx, endIdx, params, numThreads, maxResults);
701  };
702  //! overload
703  std::vector<unsigned int> getMatches(const ROMol &query,
704  unsigned int startIdx,
705  unsigned int endIdx,
706  const SubstructMatchParameters &params,
707  int numThreads = -1,
708  int maxResults = -1) const;
709  //! overload
710  std::vector<unsigned int> getMatches(const MolBundle &query,
711  unsigned int startIdx,
712  unsigned int endIdx,
713  const SubstructMatchParameters &params,
714  int numThreads = -1,
715  int maxResults = -1) const;
716  //! overload
717  std::vector<unsigned int> getMatches(const TautomerQuery &query,
718  unsigned int startIdx,
719  unsigned int endIdx,
720  const SubstructMatchParameters &params,
721  int numThreads = -1,
722  int maxResults = -1) const;
723 
724  //! Return the number of matches for the query
725  /*!
726  \param query Molecule or Tautomer Query to match against molecules
727  \param recursionPossible flags whether or not recursive matches are allowed
728  [default true]
729  \param useChirality use atomic CIP codes as part of the comparison
730  [default true]
731  \param useQueryQueryMatches if set, the contents of atom and bond queries
732  will be used as part of the matching
733  [default false]
734  \param numThreads If -1 use all available processors [default -1]
735  */
736  template <class Query>
737  unsigned int countMatches(const Query &query, bool recursionPossible = true,
738  bool useChirality = true,
739  bool useQueryQueryMatches = false,
740  int numThreads = -1) const {
742  params.recursionPossible = recursionPossible;
743  params.useChirality = useChirality;
744  params.useQueryQueryMatches = useQueryQueryMatches;
745  return countMatches(query, 0, size(), params, numThreads);
746  }
747  //! overload
748  template <class Query>
749  unsigned int countMatches(const Query &query,
750  const SubstructMatchParameters &params,
751  int numThreads = -1) const {
752  return countMatches(query, 0, size(), params, numThreads);
753  }
754 
755  //! Return the number of matches for the query
756 
757  //! Return the number of matches for the query between the given indices
758  /*!
759  \param query Query to match against molecules
760  \param startIdx Start index of the search
761  \param endIdx Ending idx (non-inclusive) of the search.
762  \param recursionPossible flags whether or not recursive matches are allowed
763  [default true]
764  \param useChirality use atomic CIP codes as part of the comparison
765  [default true]
766  \param useQueryQueryMatches if set, the contents of atom and bond queries
767  will be used as part of the matching
768  [default false]
769  \param numThreads If -1 use all available processors [default -1]
770  */
771  template <class Query>
772  unsigned int countMatches(const Query &query, unsigned int startIdx,
773  unsigned int endIdx, bool recursionPossible = true,
774  bool useChirality = true,
775  bool useQueryQueryMatches = false,
776  int numThreads = -1) const {
778  params.recursionPossible = recursionPossible;
779  params.useChirality = useChirality;
780  params.useQueryQueryMatches = useQueryQueryMatches;
781  return countMatches(query, startIdx, endIdx, params, numThreads);
782  };
783 
784  //! overload
785  unsigned int countMatches(const ROMol &query, unsigned int startIdx,
786  unsigned int endIdx,
787  const SubstructMatchParameters &params,
788  int numThreads = -1) const;
789  //! overload
790  unsigned int countMatches(const TautomerQuery &query, unsigned int startIdx,
791  unsigned int endIdx,
792  const SubstructMatchParameters &params,
793  int numThreads = -1) const;
794  //! overload
795  unsigned int countMatches(const MolBundle &query, unsigned int startIdx,
796  unsigned int endIdx,
797  const SubstructMatchParameters &params,
798  int numThreads = -1) const;
799 
800  //! Returns true if any match exists for the query
801  /*!
802  \param query Molecule or Tautomer Query to match against molecules
803  \param recursionPossible flags whether or not recursive matches are allowed
804  [default true]
805  \param useChirality use atomic CIP codes as part of the comparison
806  [default true]
807  \param useQueryQueryMatches if set, the contents of atom and bond queries
808  will be used as part of the matching
809  [default false]
810  \param numThreads If -1 use all available processors [default -1]
811  */
812  template <class Query>
813  bool hasMatch(const Query &query, bool recursionPossible = true,
814  bool useChirality = true, bool useQueryQueryMatches = false,
815  int numThreads = -1) const {
817  params.recursionPossible = recursionPossible;
818  params.useChirality = useChirality;
819  params.useQueryQueryMatches = useQueryQueryMatches;
820  return hasMatch(query, 0, size(), params, numThreads);
821  }
822  //! overload
823  template <class Query>
824  bool hasMatch(const Query &query, const SubstructMatchParameters &params,
825  int numThreads = -1) const {
826  return hasMatch(query, 0, size(), params, numThreads);
827  }
828  //! Returns true if any match exists for the query between the specified
829  //! indices
830  /*!
831  \param query Query to match against molecules
832  \param startIdx Start index of the search
833 \param endIdx Ending idx (non-inclusive) of the search.
834 \param recursionPossible flags whether or not recursive matches are allowed [default true]
835 \param useChirality use atomic CIP codes as part of the comparison [default true]
836 \param useQueryQueryMatches if set, the contents of atom and bond queries
837 will be used as part of the matching
838 [default false]
839  \param numThreads If -1 use all available processors [default -1]
840  */
841  template <class Query>
842  bool hasMatch(const Query &query, unsigned int startIdx, unsigned int endIdx,
843  bool recursionPossible = true, bool useChirality = true,
844  bool useQueryQueryMatches = false, int numThreads = -1) const {
846  params.recursionPossible = recursionPossible;
847  params.useChirality = useChirality;
848  params.useQueryQueryMatches = useQueryQueryMatches;
849  return hasMatch(query, startIdx, endIdx, params, numThreads);
850  };
851  //! overload
852  bool hasMatch(const ROMol &query, unsigned int startIdx, unsigned int endIdx,
853  const SubstructMatchParameters &params,
854  int numThreads = -1) const;
855  //! overload
856  bool hasMatch(const TautomerQuery &query, unsigned int startIdx,
857  unsigned int endIdx, const SubstructMatchParameters &params,
858  int numThreads = -1) const;
859  //! overload
860  bool hasMatch(const MolBundle &query, unsigned int startIdx,
861  unsigned int endIdx, const SubstructMatchParameters &params,
862  int numThreads = -1) const;
863  //! Returns the molecule at the given index
864  /*!
865  \param idx Index of the molecule in the library (n.b. could contain
866  null)
867  */
868  boost::shared_ptr<ROMol> getMol(unsigned int idx) const {
869  // expects implementation to throw IndexError if out of range
870  PRECONDITION(mols, "molholder is null in SubstructLibrary");
871  return mols->getMol(idx);
872  }
873 
874  //! Returns the molecule at the given index
875  /*!
876  \param idx Index of the molecule in the library (n.b. could contain
877  null)
878  */
879  boost::shared_ptr<ROMol> operator[](unsigned int idx) {
880  // expects implementation to throw IndexError if out of range
881  PRECONDITION(mols, "molholder is null in SubstructLibrary");
882  return mols->getMol(idx);
883  }
884 
885  //! return the number of molecules in the library
886  unsigned int size() const {
887  PRECONDITION(mols, "molholder is null in SubstructLibrary");
888  return rdcast<unsigned int>(molholder->size());
889  }
890 
891  //! does error checking
892  void setSearchOrder(const std::vector<unsigned int> &order) {
893  for (const auto idx : order) {
894  if (idx >= mols->size()) {
895  throw IndexErrorException(idx);
896  }
897  }
898  searchOrder = order;
899  }
900 
901  const std::vector<unsigned int> &getSearchOrder() const {
902  return searchOrder;
903  }
904 
905  std::vector<unsigned int> &getSearchOrder() { return searchOrder; }
906  //! access required for serialization
907  void resetHolders() {
908  is_tautomerquery = false;
909  mols = molholder.get();
910  fps = fpholder.get();
911  if (fps && dynamic_cast<TautomerPatternHolder *>(fps) != nullptr) {
912  is_tautomerquery = true;
913  }
914  }
915 
916  //! serializes (pickles) to a stream
917  void toStream(std::ostream &ss) const;
918  //! returns a string with a serialized (pickled) representation
919  std::string Serialize() const;
920  //! initializes from a stream pickle
921  void initFromStream(std::istream &ss);
922  //! initializes from a string pickle
923  void initFromString(const std::string &text);
924 };
925 } // namespace RDKit
926 
928 #endif
Contains general bit-comparison and similarity operations.
RDKIT_DATASTRUCTS_EXPORT bool AllProbeBitsMatch(const char *probe, const char *ref)
#define PRECONDITION(expr, mess)
Definition: Invariant.h:109
Defines a class for managing bundles of molecules.
pulls in the core RDKit functionality
a class for bit vectors that are densely occupied
Class to allow us to throw an IndexError from C++ and have it make it back to Python.
Definition: Exceptions.h:20
Concrete class that holds binary cached molecules in memory.
unsigned int size() const override
Get the current library size.
const std::vector< std::string > & getMols() const
unsigned int addMol(const ROMol &m) override
std::vector< std::string > & getMols()
unsigned int addBinary(const std::string &pickle)
boost::shared_ptr< ROMol > getMol(unsigned int idx) const override
Concrete class that holds smiles strings in memory.
std::vector< std::string > & getMols()
unsigned int addSmiles(const std::string &smiles)
const std::vector< std::string > & getMols() const
boost::shared_ptr< ROMol > getMol(unsigned int idx) const override
unsigned int addMol(const ROMol &m) override
unsigned int size() const override
Get the current library size.
Concrete class that holds trusted smiles strings in memory.
boost::shared_ptr< ROMol > getMol(unsigned int idx) const override
std::vector< std::string > & getMols()
unsigned int addSmiles(const std::string &smiles)
unsigned int addMol(const ROMol &m) override
unsigned int size() const override
Get the current library size.
const std::vector< std::string > & getMols() const
Base FPI for the fingerprinter used to rule out impossible matches.
const ExplicitBitVect & getFingerprint(unsigned int idx) const
unsigned int addMol(const ROMol &m)
Adds a molecule to the fingerprinter.
virtual unsigned int size() const
std::vector< ExplicitBitVect * > & getFingerprints()
virtual ExplicitBitVect * makeFingerprint(const ROMol &m) const =0
bool passesFilter(unsigned int idx, const ExplicitBitVect &query) const
Return false if a substructure search can never match the molecule.
unsigned int addFingerprint(ExplicitBitVect *v)
const std::vector< ExplicitBitVect * > & getFingerprints() const
unsigned int addFingerprint(const ExplicitBitVect &v)
const std::vector< std::string > & getKeys() const
KeyFromPropHolder(const std::string &propname="_Name")
std::vector< std::string > & getKeys()
unsigned int addKey(const std::string &key) override
unsigned int size() const override
Get the current keyholder size.
std::vector< std::string > getKeys(const std::vector< unsigned int > &indices) const override
unsigned int addMol(const ROMol &m) override
Add a key to the database getting it from the molecule.
const std::string & getKey(unsigned int idx) const override
const std::string & getPropName() const
virtual const std::string & getKey(unsigned int) const =0
virtual std::vector< std::string > getKeys(const std::vector< unsigned int > &indices) const =0
virtual unsigned int addMol(const ROMol &m)=0
Add a key to the database getting it from the molecule.
virtual unsigned int size() const =0
Get the current keyholder size.
virtual unsigned int addKey(const std::string &)=0
MolBundle contains a collection of related ROMols.
Definition: MolBundle.h:39
Base class API for holding molecules to substructure search.
virtual unsigned int addMol(const ROMol &m)=0
virtual unsigned int size() const =0
Get the current library size.
virtual boost::shared_ptr< ROMol > getMol(unsigned int) const =0
Concrete class that holds molecules in memory.
const std::vector< boost::shared_ptr< ROMol > > & getMols() const
unsigned int addMol(const ROMol &m) override
std::vector< boost::shared_ptr< ROMol > > & getMols()
unsigned int size() const override
Get the current library size.
boost::shared_ptr< ROMol > getMol(unsigned int idx) const override
static void molFromPickle(const std::string &pickle, ROMol *mol, unsigned int propertyFlags)
constructs a molecule from a pickle stored in a string
static void pickleMol(const ROMol *mol, std::ostream &ss)
pickles a molecule and sends the results to stream ss
ExplicitBitVect * makeFingerprint(const ROMol &m) const override
Caller owns the vector!
PatternHolder(unsigned int numBits)
unsigned int & getNumBits()
static unsigned int defaultNumBits()
const unsigned int & getNumBits() const
RWMol is a molecule class that is intended to be edited.
Definition: RWMol.h:32
Substructure Search a library of molecules.
unsigned int countMatches(const Query &query, bool recursionPossible=true, bool useChirality=true, bool useQueryQueryMatches=false, int numThreads=-1) const
Return the number of matches for the query.
unsigned int addMol(const ROMol &mol)
Add a molecule to the library.
boost::shared_ptr< ROMol > getMol(unsigned int idx) const
Returns the molecule at the given index.
void initFromStream(std::istream &ss)
initializes from a stream pickle
const MolHolderBase & getMolecules() const
const FPHolderBase & getFingerprints() const
bool hasMatch(const Query &query, bool recursionPossible=true, bool useChirality=true, bool useQueryQueryMatches=false, int numThreads=-1) const
Returns true if any match exists for the query.
SubstructLibrary(boost::shared_ptr< MolHolderBase > molecules, boost::shared_ptr< FPHolderBase > fingerprints, boost::shared_ptr< KeyHolderBase > keys)
unsigned int countMatches(const Query &query, unsigned int startIdx, unsigned int endIdx, bool recursionPossible=true, bool useChirality=true, bool useQueryQueryMatches=false, int numThreads=-1) const
Return the number of matches for the query.
void initFromString(const std::string &text)
initializes from a string pickle
unsigned int countMatches(const Query &query, const SubstructMatchParameters &params, int numThreads=-1) const
overload
const boost::shared_ptr< FPHolderBase > & getFpHolder() const
Get the underlying fingerprint holder implementation.
FPHolderBase & getFingerprints()
Get the underlying fingerprint implementation.
std::vector< unsigned int > getMatches(const Query &query, unsigned int startIdx, unsigned int endIdx, bool recursionPossible=true, bool useChirality=true, bool useQueryQueryMatches=false, int numThreads=-1, int maxResults=-1) const
Get the matching indices for the query between the given indices.
boost::shared_ptr< MolHolderBase > & getMolHolder()
Get the underlying molecule holder implementation.
bool hasMatch(const ROMol &query, unsigned int startIdx, unsigned int endIdx, const SubstructMatchParameters &params, int numThreads=-1) const
overload
boost::shared_ptr< KeyHolderBase > & getKeyHolder()
Get the underlying key holder implementation.
unsigned int countMatches(const MolBundle &query, unsigned int startIdx, unsigned int endIdx, const SubstructMatchParameters &params, int numThreads=-1) const
overload
std::vector< unsigned int > getMatches(const MolBundle &query, unsigned int startIdx, unsigned int endIdx, const SubstructMatchParameters &params, int numThreads=-1, int maxResults=-1) const
overload
void setSearchOrder(const std::vector< unsigned int > &order)
does error checking
bool hasMatch(const Query &query, unsigned int startIdx, unsigned int endIdx, bool recursionPossible=true, bool useChirality=true, bool useQueryQueryMatches=false, int numThreads=-1) const
boost::shared_ptr< FPHolderBase > & getFpHolder()
Get the underlying fingerprint holder implementation.
const std::vector< unsigned int > & getSearchOrder() const
SubstructLibrary(boost::shared_ptr< MolHolderBase > molecules, boost::shared_ptr< KeyHolderBase > keys)
const KeyHolderBase & getKeys() const
Get the underlying key holder implementation.
bool hasMatch(const MolBundle &query, unsigned int startIdx, unsigned int endIdx, const SubstructMatchParameters &params, int numThreads=-1) const
overload
KeyHolderBase & getKeys()
Get the underlying key holder implementation.
unsigned int countMatches(const ROMol &query, unsigned int startIdx, unsigned int endIdx, const SubstructMatchParameters &params, int numThreads=-1) const
overload
bool hasMatch(const Query &query, const SubstructMatchParameters &params, int numThreads=-1) const
overload
SubstructLibrary(boost::shared_ptr< MolHolderBase > molecules, boost::shared_ptr< FPHolderBase > fingerprints)
std::vector< unsigned int > getMatches(const ROMol &query, unsigned int startIdx, unsigned int endIdx, const SubstructMatchParameters &params, int numThreads=-1, int maxResults=-1) const
overload
std::vector< unsigned int > getMatches(const Query &query, const SubstructMatchParameters &params, int numThreads=-1, int maxResults=-1) const
overload
std::vector< unsigned int > & getSearchOrder()
bool hasMatch(const TautomerQuery &query, unsigned int startIdx, unsigned int endIdx, const SubstructMatchParameters &params, int numThreads=-1) const
overload
void resetHolders()
access required for serialization
unsigned int size() const
return the number of molecules in the library
std::vector< unsigned int > getMatches(const TautomerQuery &query, unsigned int startIdx, unsigned int endIdx, const SubstructMatchParameters &params, int numThreads=-1, int maxResults=-1) const
overload
const boost::shared_ptr< KeyHolderBase > & getKeyHolder() const
Get the underlying key holder implementation.
SubstructLibrary(boost::shared_ptr< MolHolderBase > molecules)
SubstructLibrary(const std::string &pickle)
std::string Serialize() const
returns a string with a serialized (pickled) representation
unsigned int countMatches(const TautomerQuery &query, unsigned int startIdx, unsigned int endIdx, const SubstructMatchParameters &params, int numThreads=-1) const
overload
const boost::shared_ptr< MolHolderBase > & getMolHolder() const
void toStream(std::ostream &ss) const
serializes (pickles) to a stream
boost::shared_ptr< ROMol > operator[](unsigned int idx)
Returns the molecule at the given index.
std::vector< unsigned int > getMatches(const Query &query, bool recursionPossible=true, bool useChirality=true, bool useQueryQueryMatches=false, int numThreads=-1, int maxResults=-1) const
Get the matching indices for the query.
ExplicitBitVect * makeFingerprint(const ROMol &m) const override
Caller owns the vector!
TautomerPatternHolder(unsigned int numBits)
Class to allow us to throw a ValueError from C++ and have it make it back to Python.
Definition: Exceptions.h:40
#define RDKIT_SUBSTRUCTLIBRARY_EXPORT
Definition: export.h:465
RDKIT_CHEMREACTIONS_EXPORT void pickle(const boost::shared_ptr< EnumerationStrategyBase > &enumerator, std::ostream &ss)
pickles a EnumerationStrategy and adds the results to a stream ss
Std stuff.
Definition: Abbreviations.h:18
RDKIT_FINGERPRINTS_EXPORT ExplicitBitVect * PatternFingerprintMol(const ROMol &mol, unsigned int fpSize=2048, std::vector< unsigned int > *atomCounts=nullptr, ExplicitBitVect *setOnlyBits=nullptr, bool tautomericFingerprint=false)
Generates a topological fingerprint for a molecule using a series of pre-defined structural patterns.
RDKIT_SMILESPARSE_EXPORT std::string MolToSmiles(const ROMol &mol, const SmilesWriteParams &params)
returns canonical SMILES for a molecule
RDKIT_SUBSTRUCTLIBRARY_EXPORT bool SubstructLibraryCanSerialize()
RDKIT_SMILESPARSE_EXPORT RWMol * SmilesToMol(const std::string &smi, const SmilesParserParams &params)
bool recursionPossible
Allow recursive queries.