RDKit
Open-source cheminformatics and machine learning.
MolStandardize/Tautomer.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2018 Susan H. Leung
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 #include <RDGeneral/export.h>
11 #ifndef RD_TAUTOMER_H
12 #define RD_TAUTOMER_H
13 
14 #include <boost/function.hpp>
15 #include <string>
16 #include <iterator>
17 #include <Catalogs/Catalog.h>
22 #include <boost/dynamic_bitset.hpp>
23 
24 namespace RDKit {
25 class ROMol;
26 class RWMol;
27 
28 namespace MolStandardize {
29 
30 typedef RDCatalog::HierarchCatalog<TautomerCatalogEntry, TautomerCatalogParams,
31  int>
33 
34 namespace TautomerScoringFunctions {
35 const std::string tautomerScoringVersion = "1.0.0";
36 
40 
41 inline int scoreTautomer(const ROMol &mol) {
42  return scoreRings(mol) + scoreSubstructs(mol) + scoreHeteroHs(mol);
43 }
44 } // namespace TautomerScoringFunctions
45 
47  Completed = 0,
50  Canceled
51 };
52 
53 class Tautomer {
54  friend class TautomerEnumerator;
55 
56  public:
57  Tautomer() : d_numModifiedAtoms(0), d_numModifiedBonds(0) {}
58  Tautomer(const ROMOL_SPTR &t, const ROMOL_SPTR &k, size_t a = 0, size_t b = 0)
59  : tautomer(t),
60  kekulized(k),
61  d_numModifiedAtoms(a),
62  d_numModifiedBonds(b) {}
65 
66  private:
67  size_t d_numModifiedAtoms;
68  size_t d_numModifiedBonds;
69 };
70 
71 typedef std::map<std::string, Tautomer> SmilesTautomerMap;
72 typedef std::pair<std::string, Tautomer> SmilesTautomerPair;
73 
74 //! Contains results of tautomer enumeration
76  friend class TautomerEnumerator;
77 
78  public:
80  public:
82  typedef std::ptrdiff_t difference_type;
83  typedef const ROMol *pointer;
84  typedef const ROMOL_SPTR &reference;
85  typedef std::bidirectional_iterator_tag iterator_category;
86 
87  explicit const_iterator(const SmilesTautomerMap::const_iterator &it)
88  : d_it(it) {}
89  reference operator*() const { return d_it->second.tautomer; }
90  pointer operator->() const { return d_it->second.tautomer.get(); }
91  bool operator==(const const_iterator &other) const {
92  return (d_it == other.d_it);
93  }
94  bool operator!=(const const_iterator &other) const {
95  return !(*this == other);
96  }
98  const_iterator copy(d_it);
99  operator++();
100  return copy;
101  }
103  ++d_it;
104  return *this;
105  }
107  const_iterator copy(d_it);
108  operator--();
109  return copy;
110  }
112  --d_it;
113  return *this;
114  }
115 
116  private:
117  SmilesTautomerMap::const_iterator d_it;
118  };
121  : d_tautomers(other.d_tautomers),
122  d_status(other.d_status),
123  d_modifiedAtoms(other.d_modifiedAtoms),
124  d_modifiedBonds(other.d_modifiedBonds) {
125  fillTautomersItVec();
126  }
127  const const_iterator begin() const {
128  return const_iterator(d_tautomers.begin());
129  }
130  const const_iterator end() const { return const_iterator(d_tautomers.end()); }
131  size_t size() const { return d_tautomers.size(); }
132  bool empty() const { return d_tautomers.empty(); }
133  const ROMOL_SPTR &at(size_t pos) const {
134  PRECONDITION(pos < d_tautomers.size(), "index out of bounds");
135  return d_tautomersItVec.at(pos)->second.tautomer;
136  }
137  const ROMOL_SPTR &operator[](size_t pos) const { return at(pos); }
138  const boost::dynamic_bitset<> &modifiedAtoms() const {
139  return d_modifiedAtoms;
140  }
141  const boost::dynamic_bitset<> &modifiedBonds() const {
142  return d_modifiedBonds;
143  }
144  TautomerEnumeratorStatus status() const { return d_status; }
145  std::vector<ROMOL_SPTR> tautomers() const {
146  std::vector<ROMOL_SPTR> tautomerVec;
147  tautomerVec.reserve(d_tautomers.size());
148  std::transform(
149  d_tautomers.begin(), d_tautomers.end(), std::back_inserter(tautomerVec),
150  [](const SmilesTautomerPair &t) { return t.second.tautomer; });
151  return tautomerVec;
152  }
153  std::vector<ROMOL_SPTR> operator()() const { return tautomers(); }
154  std::vector<std::string> smiles() const {
155  std::vector<std::string> smilesVec;
156  smilesVec.reserve(d_tautomers.size());
157  std::transform(d_tautomers.begin(), d_tautomers.end(),
158  std::back_inserter(smilesVec),
159  [](const SmilesTautomerPair &t) { return t.first; });
160  return smilesVec;
161  }
162  const SmilesTautomerMap &smilesTautomerMap() const { return d_tautomers; }
163 
164  private:
165  void fillTautomersItVec() {
166  for (auto it = d_tautomers.begin(); it != d_tautomers.end(); ++it) {
167  d_tautomersItVec.push_back(it);
168  }
169  }
170  // the enumerated tautomers
171  SmilesTautomerMap d_tautomers;
172  // internal; vector of iterators into map items to enable random
173  // access to map items by index
174  std::vector<SmilesTautomerMap::const_iterator> d_tautomersItVec;
175  // status of the enumeration: did it complete? did it hit a limit?
176  // was it canceled?
177  TautomerEnumeratorStatus d_status;
178  // bit vector: flags atoms modified by the transforms
179  boost::dynamic_bitset<> d_modifiedAtoms;
180  // bit vector: flags bonds modified by the transforms
181  boost::dynamic_bitset<> d_modifiedBonds;
182 };
183 
185  public:
188  virtual bool operator()(const ROMol &, const TautomerEnumeratorResult &) = 0;
189 };
190 
192  public:
194  : dp_catalog(tautCat),
195  d_maxTautomers(1000),
196  d_maxTransforms(1000),
197  d_removeSp3Stereo(true),
198  d_removeBondStereo(true),
199  d_removeIsotopicHs(true),
200  d_reassignStereo(true) {}
203  : dp_catalog(other.dp_catalog),
204  d_callback(other.d_callback.get()),
205  d_maxTautomers(other.d_maxTautomers),
206  d_maxTransforms(other.d_maxTransforms),
207  d_removeSp3Stereo(other.d_removeSp3Stereo),
208  d_removeBondStereo(other.d_removeBondStereo),
209  d_removeIsotopicHs(other.d_removeIsotopicHs),
210  d_reassignStereo(other.d_reassignStereo) {}
212  if (this == &other) return *this;
213  dp_catalog = other.dp_catalog;
214  d_callback.reset(other.d_callback.get());
215  d_maxTautomers = other.d_maxTautomers;
216  d_maxTransforms = other.d_maxTransforms;
217  d_removeSp3Stereo = other.d_removeSp3Stereo;
218  d_removeBondStereo = other.d_removeBondStereo;
219  d_removeIsotopicHs = other.d_removeIsotopicHs;
220  d_reassignStereo = other.d_reassignStereo;
221  return *this;
222  }
223  //! \param maxTautomers maximum number of tautomers to be generated
224  void setMaxTautomers(unsigned int maxTautomers) {
225  d_maxTautomers = maxTautomers;
226  }
227  //! \return maximum number of tautomers to be generated
228  unsigned int getMaxTautomers() { return d_maxTautomers; }
229  /*! \param maxTransforms maximum number of transformations to be applied
230  this limit is usually hit earlier than the maxTautomers limit
231  and leads to a more linear scaling of CPU time with increasing
232  number of tautomeric centers (see Sitzmann et al.)
233  */
234  void setMaxTransforms(unsigned int maxTransforms) {
235  d_maxTransforms = maxTransforms;
236  }
237  //! \return maximum number of transformations to be applied
238  unsigned int getMaxTransforms() { return d_maxTransforms; }
239  /*! \param removeSp3Stereo; if set to true, stereochemistry information
240  will be removed from sp3 atoms involved in tautomerism.
241  This means that S-aminoacids will lose their stereochemistry after going
242  through tautomer enumeration because of the amido-imidol tautomerism.
243  This defaults to true in RDKit, false in the workflow described
244  by Sitzmann et al.
245  */
246  void setRemoveSp3Stereo(bool removeSp3Stereo) {
247  d_removeSp3Stereo = removeSp3Stereo;
248  }
249  /*! \return whether stereochemistry information will be removed from
250  sp3 atoms involved in tautomerism
251  */
252  bool getRemoveSp3Stereo() { return d_removeSp3Stereo; }
253  /*! \param removeBondStereo; if set to true, stereochemistry information
254  will be removed from double bonds involved in tautomerism.
255  This means that enols will lose their E/Z stereochemistry after going
256  through tautomer enumeration because of the keto-enolic tautomerism.
257  This defaults to true in RDKit and also in the workflow described
258  by Sitzmann et al.
259  */
260  void setRemoveBondStereo(bool removeBondStereo) {
261  d_removeBondStereo = removeBondStereo;
262  }
263  /*! \return whether stereochemistry information will be removed from
264  double bonds involved in tautomerism
265  */
266  bool getRemoveBondStereo() { return d_removeBondStereo; }
267  /*! \param removeIsotopicHs; if set to true, isotopic Hs
268  will be removed from centers involved in tautomerism.
269  */
270  void setRemoveIsotopicHs(bool removeIsotopicHs) {
271  d_removeIsotopicHs = removeIsotopicHs;
272  }
273  /*! \return whether isotpoic Hs will be removed from
274  centers involved in tautomerism
275  */
276  bool getRemoveIsotopicHs() { return d_removeIsotopicHs; }
277  /*! \param reassignStereo; if set to true, assignStereochemistry
278  will be called on each tautomer generated by the enumerate() method.
279  This defaults to true.
280  */
281  void setReassignStereo(bool reassignStereo) {
282  d_reassignStereo = reassignStereo;
283  }
284  /*! \return whether assignStereochemistry will be called on each
285  tautomer generated by the enumerate() method
286  */
287  bool getReassignStereo() { return d_reassignStereo; }
288  /*! set this to an instance of a class derived from
289  TautomerEnumeratorCallback where operator() is overridden.
290  DO NOT delete the instance as ownership of the pointer is transferred
291  to the TautomerEnumerator
292  */
294  d_callback.reset(callback);
295  }
296  /*! \return pointer to an instance of a class derived from
297  TautomerEnumeratorCallback.
298  DO NOT delete the instance as ownership of the pointer is transferred
299  to the TautomerEnumerator
300  */
301  TautomerEnumeratorCallback *getCallback() const { return d_callback.get(); }
302 
303  //! returns a \c TautomerEnumeratorResult structure for the input molecule
304  /*!
305  The enumeration rules are inspired by the publication:
306  M. Sitzmann et al., “Tautomerism in Large Databases.”, JCAMD 24:521 (2010)
307  https://doi.org/10.1007/s10822-010-9346-4
308 
309  \param mol: the molecule to be enumerated
310 
311  Note: the definitions used here are that the atoms modified during
312  tautomerization are the atoms at the beginning and end of each tautomer
313  transform (the H "donor" and H "acceptor" in the transform) and the bonds
314  modified during transformation are any bonds whose order is changed during
315  the tautomer transform (these are the bonds between the "donor" and the
316  "acceptor")
317 
318  */
320 
321  //! Deprecated, please use the form returning a \c TautomerEnumeratorResult
322  //! instead
323  [
324  [deprecated("please use the form returning a TautomerEnumeratorResult "
325  "instead")]] std::vector<ROMOL_SPTR>
326  enumerate(const ROMol &mol, boost::dynamic_bitset<> *modifiedAtoms,
327  boost::dynamic_bitset<> *modifiedBonds = nullptr) const;
328 
329  //! returns the canonical tautomer from a \c TautomerEnumeratorResult
331  boost::function<int(const ROMol &mol)> scoreFunc =
333 
334  //! returns the canonical tautomer from an iterable of possible tautomers
335  // When Iterable is TautomerEnumeratorResult we use the other non-templated
336  // overload for efficiency (TautomerEnumeratorResult already has SMILES so no
337  // need to recompute them)
338  template <class Iterable,
339  typename std::enable_if<
340  !std::is_same<Iterable, TautomerEnumeratorResult>::value,
341  int>::type = 0>
342  ROMol *pickCanonical(const Iterable &tautomers,
343  boost::function<int(const ROMol &mol)> scoreFunc =
345  ROMOL_SPTR bestMol;
346  if (tautomers.size() == 1) {
347  bestMol = *tautomers.begin();
348  } else {
349  // Calculate score for each tautomer
350  int bestScore = std::numeric_limits<int>::min();
351  std::string bestSmiles = "";
352  for (const auto &t : tautomers) {
353  auto score = scoreFunc(*t);
354 #ifdef VERBOSE_ENUMERATION
355  std::cerr << " " << MolToSmiles(*t) << " " << score << std::endl;
356 #endif
357  if (score > bestScore) {
358  bestScore = score;
359  bestSmiles = MolToSmiles(*t);
360  bestMol = t;
361  } else if (score == bestScore) {
362  auto smiles = MolToSmiles(*t);
363  if (smiles < bestSmiles) {
364  bestSmiles = smiles;
365  bestMol = t;
366  }
367  }
368  }
369  }
370  ROMol *res = new ROMol(*bestMol);
371  static const bool cleanIt = true;
372  static const bool force = true;
373  MolOps::assignStereochemistry(*res, cleanIt, force);
374 
375  return res;
376  }
377 
378  //! returns the canonical tautomer for a molecule
379  /*!
380  Note that the canonical tautomer is very likely not the most stable tautomer
381  for any given conditions. The default scoring rules are designed to produce
382  "reasonable" tautomers, but the primary concern is that the results are
383  canonical: you always get the same canonical tautomer for a molecule
384  regardless of what the input tautomer or atom ordering were.
385 
386  The default scoring scheme is inspired by the publication:
387  M. Sitzmann et al., “Tautomerism in Large Databases.”, JCAMD 24:521 (2010)
388  https://doi.org/10.1007/s10822-010-9346-4
389 
390  */
391  ROMol *canonicalize(const ROMol &mol,
392  boost::function<int(const ROMol &mol)> scoreFunc =
394 
395  private:
396  bool setTautomerStereoAndIsoHs(const ROMol &mol, ROMol &taut,
397  const TautomerEnumeratorResult &res) const;
398  std::shared_ptr<TautomerCatalog> dp_catalog;
399  std::unique_ptr<TautomerEnumeratorCallback> d_callback;
400  unsigned int d_maxTautomers;
401  unsigned int d_maxTransforms;
402  bool d_removeSp3Stereo;
403  bool d_removeBondStereo;
404  bool d_removeIsotopicHs;
405  bool d_reassignStereo;
406 }; // TautomerEnumerator class
407 
408 } // namespace MolStandardize
409 } // namespace RDKit
410 
411 #endif
#define PRECONDITION(expr, mess)
Definition: Invariant.h:109
A Catalog with a hierarchical structure.
Definition: Catalog.h:135
virtual bool operator()(const ROMol &, const TautomerEnumeratorResult &)=0
Contains results of tautomer enumeration.
const ROMOL_SPTR & operator[](size_t pos) const
const boost::dynamic_bitset & modifiedBonds() const
const boost::dynamic_bitset & modifiedAtoms() const
TautomerEnumeratorResult(const TautomerEnumeratorResult &other)
ROMol * canonicalize(const ROMol &mol, boost::function< int(const ROMol &mol)> scoreFunc=TautomerScoringFunctions::scoreTautomer) const
returns the canonical tautomer for a molecule
TautomerEnumerator(const CleanupParameters &params=CleanupParameters())
ROMol * pickCanonical(const TautomerEnumeratorResult &tautRes, boost::function< int(const ROMol &mol)> scoreFunc=TautomerScoringFunctions::scoreTautomer) const
returns the canonical tautomer from a TautomerEnumeratorResult
TautomerEnumerator & operator=(const TautomerEnumerator &other)
TautomerEnumeratorResult enumerate(const ROMol &mol) const
returns a TautomerEnumeratorResult structure for the input molecule
TautomerEnumeratorCallback * getCallback() const
std::vector< ROMOL_SPTR > enumerate(const ROMol &mol, boost::dynamic_bitset<> *modifiedAtoms, boost::dynamic_bitset<> *modifiedBonds=nullptr) const
void setCallback(TautomerEnumeratorCallback *callback)
TautomerEnumerator(const TautomerEnumerator &other)
ROMol * pickCanonical(const Iterable &tautomers, boost::function< int(const ROMol &mol)> scoreFunc=TautomerScoringFunctions::scoreTautomer) const
returns the canonical tautomer from an iterable of possible tautomers
void setMaxTransforms(unsigned int maxTransforms)
void setMaxTautomers(unsigned int maxTautomers)
Tautomer(const ROMOL_SPTR &t, const ROMOL_SPTR &k, size_t a=0, size_t b=0)
#define RDKIT_MOLSTANDARDIZE_EXPORT
Definition: export.h:489
RDKIT_GRAPHMOL_EXPORT void assignStereochemistry(ROMol &mol, bool cleanIt=false, bool force=false, bool flagPossibleStereoCenters=false)
Assign stereochemistry tags to atoms (i.e. R/S) and bonds (i.e. Z/E)
RDKIT_MOLSTANDARDIZE_EXPORT int scoreHeteroHs(const ROMol &mol)
RDKIT_MOLSTANDARDIZE_EXPORT int scoreSubstructs(const ROMol &mol)
RDKIT_MOLSTANDARDIZE_EXPORT int scoreRings(const ROMol &mol)
std::map< std::string, Tautomer > SmilesTautomerMap
RDCatalog::HierarchCatalog< TautomerCatalogEntry, TautomerCatalogParams, int > TautomerCatalog
std::pair< std::string, Tautomer > SmilesTautomerPair
Std stuff.
Definition: Abbreviations.h:17
RDKIT_SMILESPARSE_EXPORT std::string MolToSmiles(const ROMol &mol, bool doIsomericSmiles=true, bool doKekule=false, int rootedAtAtom=-1, bool canonical=true, bool allBondsExplicit=false, bool allHsExplicit=false, bool doRandom=false)
returns canonical SMILES for a molecule
double score(const std::vector< size_t > &permutation, const std::vector< std::vector< RGroupMatch >> &matches, const std::set< int > &labels)
boost::shared_ptr< ROMol > ROMOL_SPTR
The CleanupParameters structure defines the default parameters for the.