RDKit
Open-source cheminformatics and machine learning.
Embedder.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2004-2017 Greg Landrum and Rational Discovery LLC
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 
11 #include <RDGeneral/export.h>
12 #ifndef RD_EMBEDDER_H_GUARD
13 #define RD_EMBEDDER_H_GUARD
14 
15 #include <map>
16 #include <Geometry/point.h>
17 #include <GraphMol/ROMol.h>
18 #include <boost/shared_ptr.hpp>
19 #include <DistGeom/BoundsMatrix.h>
20 
21 namespace RDKit {
22 namespace DGeomHelpers {
23 
24 //! Parameter object for controlling embedding
25 /*!
26  numConfs Number of conformations to be generated
27  numThreads Sets the number of threads to use (more than one thread
28  will only be used if the RDKit was built with multithread
29  support). If set to zero, the max supported by the system will
30  be used.
31  maxIterations Max. number of times the embedding will be tried if
32  coordinates are not obtained successfully. The default
33  value is 10x the number of atoms.
34  randomSeed provides a seed for the random number generator (so that
35  the same coordinates can be obtained for a
36  molecule on multiple runs). If -1, the
37  RNG will not be seeded.
38  clearConfs Clear all existing conformations on the molecule
39  useRandomCoords Start the embedding from random coordinates instead of
40  using eigenvalues of the distance matrix.
41  boxSizeMult Determines the size of the box that is used for
42  random coordinates. If this is a positive number, the
43  side length will equal the largest element of the distance
44  matrix times \c boxSizeMult. If this is a negative number,
45  the side length will equal \c -boxSizeMult (i.e. independent
46  of the elements of the distance matrix).
47  randNegEig Picks coordinates at random when an embedding process produces
48  negative eigenvalues
49  numZeroFail Fail embedding if we find this many or more zero eigenvalues
50  (within a tolerance)
51  pruneRmsThresh Retain only the conformations out of 'numConfs' after
52  embedding that are at least this far apart from each other.
53  RMSD is computed on the heavy atoms.
54  Pruning is greedy; i.e. the first embedded conformation is
55  retained and from then on only those that are at least
56  \c pruneRmsThresh away from already
57  retained conformations are kept. The pruning is done
58  after embedding and bounds violation minimization.
59  No pruning by default.
60  coordMap a map of int to Point3D, between atom IDs and
61  their locations. If this container is provided, the
62  coordinates are used to set distance constraints on the
63  embedding. The resulting conformer(s) should have distances
64  between the specified atoms that reproduce those between the
65  points in \c coordMap. Because the embedding produces a
66  molecule in an arbitrary reference frame, an alignment step
67  is required to actually reproduce the provided coordinates.
68  optimizerForceTol set the tolerance on forces in the DGeom optimizer
69  (this shouldn't normally be altered in client code).
70  ignoreSmoothingFailures try to embed the molecule even if triangle bounds
71  smoothing fails
72  enforceChirality enforce the correct chirality if chiral centers are present
73  useExpTorsionAnglePrefs impose experimental torsion-angle preferences
74  useBasicKnowledge impose "basic knowledge" terms such as flat
75  aromatic rings, ketones, etc.
76  ETversion version of the experimental torsion-angle preferences
77  verbose print output of experimental torsion-angle preferences
78  basinThresh set the basin threshold for the DGeom force field,
79  (this shouldn't normally be altered in client code).
80  onlyHeavyAtomsForRMS only use the heavy atoms when doing RMS filtering
81  boundsMat custom bounds matrix to specify upper and lower bounds of atom
82  pairs
83  embedFragmentsSeparately embed each fragment of the molecule in turn
84  useSmallRingTorsions optional torsions to improve small ring conformer
85  sampling
86  useMacrocycleTorsions optional torsions to improve macrocycle conformer
87  sampling
88  useMacrocycle14config whether to use the 1-4 distance bounds heuristics
89  for macrocycles
90  CPCI custom Coulombic interactions between atom pairs
91 */
93  unsigned int maxIterations{0};
94  int numThreads{1};
95  int randomSeed{-1};
96  bool clearConfs{true};
97  bool useRandomCoords{false};
98  double boxSizeMult{2.0};
99  bool randNegEig{true};
100  unsigned int numZeroFail{1};
101  const std::map<int, RDGeom::Point3D> *coordMap{nullptr};
102  double optimizerForceTol{1e-3};
103  bool ignoreSmoothingFailures{false};
104  bool enforceChirality{true};
105  bool useExpTorsionAnglePrefs{false};
106  bool useBasicKnowledge{false};
107  bool verbose{false};
108  double basinThresh{5.0};
109  double pruneRmsThresh{-1.0};
110  bool onlyHeavyAtomsForRMS{false};
111  unsigned int ETversion{1};
112  boost::shared_ptr<const DistGeom::BoundsMatrix> boundsMat;
113  bool embedFragmentsSeparately{true};
114  bool useSmallRingTorsions{false};
115  bool useMacrocycleTorsions{false};
116  bool useMacrocycle14config{false};
117  std::shared_ptr<std::map<std::pair<unsigned int, unsigned int>, double>> CPCI;
119  :
120  boundsMat(nullptr),
121 
122  CPCI(nullptr){};
124  unsigned int maxIterations, int numThreads, int randomSeed,
125  bool clearConfs, bool useRandomCoords, double boxSizeMult,
126  bool randNegEig, unsigned int numZeroFail,
127  const std::map<int, RDGeom::Point3D> *coordMap, double optimizerForceTol,
128  bool ignoreSmoothingFailures, bool enforceChirality,
129  bool useExpTorsionAnglePrefs, bool useBasicKnowledge, bool verbose,
130  double basinThresh, double pruneRmsThresh, bool onlyHeavyAtomsForRMS,
131  unsigned int ETversion = 1,
132  const DistGeom::BoundsMatrix *boundsMat = nullptr,
133  bool embedFragmentsSeparately = true, bool useSmallRingTorsions = false,
134  bool useMacrocycleTorsions = false, bool useMacrocycle14config = false,
135  std::shared_ptr<std::map<std::pair<unsigned int, unsigned int>, double>>
136  CPCI = nullptr)
137  : maxIterations(maxIterations),
138  numThreads(numThreads),
139  randomSeed(randomSeed),
140  clearConfs(clearConfs),
141  useRandomCoords(useRandomCoords),
142  boxSizeMult(boxSizeMult),
143  randNegEig(randNegEig),
144  numZeroFail(numZeroFail),
145  coordMap(coordMap),
146  optimizerForceTol(optimizerForceTol),
147  ignoreSmoothingFailures(ignoreSmoothingFailures),
148  enforceChirality(enforceChirality),
149  useExpTorsionAnglePrefs(useExpTorsionAnglePrefs),
150  useBasicKnowledge(useBasicKnowledge),
151  verbose(verbose),
152  basinThresh(basinThresh),
153  pruneRmsThresh(pruneRmsThresh),
154  onlyHeavyAtomsForRMS(onlyHeavyAtomsForRMS),
155  ETversion(ETversion),
156  boundsMat(boundsMat),
157  embedFragmentsSeparately(embedFragmentsSeparately),
158  useSmallRingTorsions(useSmallRingTorsions),
159  useMacrocycleTorsions(useMacrocycleTorsions),
160  useMacrocycle14config(useMacrocycle14config),
161  CPCI(CPCI){};
162 };
163 
164 //! Embed multiple conformations for a molecule
166  ROMol &mol, INT_VECT &res, unsigned int numConfs,
167  const EmbedParameters &params);
168 inline INT_VECT EmbedMultipleConfs(ROMol &mol, unsigned int numConfs,
169                                    const EmbedParameters &params) {
170   INT_VECT confIds;  // filled in by the out-parameter overload called below
171   EmbedMultipleConfs(mol, confIds, numConfs, params);
172   return confIds;
173 }
174 
175 //! Embed a single conformer (in 3D) for the specified molecule using Distance
176 //! Geometry; returns the new conformer's id, or -1 if nothing was embedded
177 inline int EmbedMolecule(ROMol &mol, const EmbedParameters &params) {
178   // Request exactly one conformer through the multi-conformer entry point.
179   INT_VECT embeddedIds;
180   EmbedMultipleConfs(mol, embeddedIds, 1, params);
181 
182   // An empty id list is reported to the caller as -1.
183   if (embeddedIds.empty()) {
184     return -1;
185   }
186 
187   return embeddedIds[0];
188 }
189 
190 //! Compute an embedding (in 3D) for the specified molecule using Distance
191 //! Geometry
192 /*!
193  The following operations are performed (in order) here:
194  -# Build a distance bounds matrix based on the topology, including 1-5
195  distances but not VDW scaling
196  -# Triangle smooth this bounds matrix
197  -# If step 2 fails - repeat step 1, this time without 1-5 bounds and with vdW
198  scaling, and repeat step 2
199  -# Pick a distance matrix at random using the bounds matrix
200  -# Compute initial coordinates from the distance matrix
201  -# Repeat steps 4 and 5 until maxIterations is reached or embedding is
202  successful
203  -# Adjust initial coordinates by minimizing a Distance Violation error
204  function
205 
206  **NOTE**: if the molecule has multiple fragments, they will be embedded
207  separately; this means that they will likely occupy the same region of space.
208  \param mol Molecule of interest
209  \param maxIterations Max. number of times the embedding will be tried if
210  coordinates are not obtained successfully. The default
211  value is 10x the number of atoms.
212  \param seed provides a seed for the random number generator (so that
213  the same coordinates can be obtained for a molecule on
214  multiple runs). If negative, the RNG will not be seeded.
215  \param clearConfs Clear all existing conformations on the molecule
216  \param useRandomCoords Start the embedding from random coordinates instead of
217  using eigenvalues of the distance matrix.
218  \param boxSizeMult Determines the size of the box that is used for
219  random coordinates. If this is a positive number, the
220  side length will equal the largest element of the
221  distance matrix times \c boxSizeMult. If this is a
222  negative number, the side length will equal
223  \c -boxSizeMult (i.e. independent of the elements of the
224  distance matrix).
225  \param randNegEig Picks coordinates at random when an embedding process
226  produces negative eigenvalues
227  \param numZeroFail Fail embedding if we find this many or more zero
228  eigenvalues (within a tolerance)
229  \param coordMap a map of int to Point3D, between atom IDs and
230  their locations. If this container is provided, the
231  coordinates are used to set distance constraints on the
232  embedding. The resulting conformer(s) should have distances
233  between the specified atoms that reproduce those between the
234  points in \c coordMap. Because the embedding produces a
235  molecule in an arbitrary reference frame, an alignment step
236  is required to actually reproduce the provided coordinates.
237  \param optimizerForceTol set the tolerance on forces in the distgeom optimizer
238  (this shouldn't normally be altered in client code).
239  \param ignoreSmoothingFailures try to embed the molecule even if triangle
240  bounds smoothing fails
241  \param enforceChirality enforce the correct chirality if chiral centers are
242  present
243  \param useExpTorsionAnglePrefs impose experimental torsion-angle preferences
244  \param useBasicKnowledge impose "basic knowledge" terms such as flat
245  aromatic rings, ketones, etc.
246  \param verbose print output of experimental torsion-angle preferences
247  \param basinThresh set the basin threshold for the DGeom force field,
248  (this shouldn't normally be altered in client code).
249  \param onlyHeavyAtomsForRMS only use the heavy atoms when doing RMS filtering
250  \param ETversion version of torsion preferences to use
251  \param useSmallRingTorsions optional torsions to improve small ring
252  conformer sampling
253  \param useMacrocycleTorsions optional torsions to improve macrocycle
254  conformer sampling
255  \param useMacrocycle14config whether to use the 1-4 distance bounds
256  heuristics for macrocycles
257  \return ID of the conformation added to the molecule, -1 if the embedding failed
258 */
259 inline int EmbedMolecule(
260  ROMol &mol, unsigned int maxIterations = 0, int seed = -1,
261  bool clearConfs = true, bool useRandomCoords = false,
262  double boxSizeMult = 2.0, bool randNegEig = true,
263  unsigned int numZeroFail = 1,
264  const std::map<int, RDGeom::Point3D> *coordMap = nullptr,
265  double optimizerForceTol = 1e-3, bool ignoreSmoothingFailures = false,
266  bool enforceChirality = true, bool useExpTorsionAnglePrefs = false,
267  bool useBasicKnowledge = false, bool verbose = false,
268  double basinThresh = 5.0, bool onlyHeavyAtomsForRMS = false,
269  unsigned int ETversion = 1, bool useSmallRingTorsions = false,
270  bool useMacrocycleTorsions = false, bool useMacrocycle14config = false) {
271  EmbedParameters params(
272  maxIterations, 1, seed, clearConfs, useRandomCoords, boxSizeMult,  // 1 -> numThreads
273  randNegEig, numZeroFail, coordMap, optimizerForceTol,
274  ignoreSmoothingFailures, enforceChirality, useExpTorsionAnglePrefs,
275  useBasicKnowledge, verbose, basinThresh, -1.0, onlyHeavyAtomsForRMS,  // -1.0 -> pruneRmsThresh
276  ETversion, nullptr, true, useSmallRingTorsions, useMacrocycleTorsions,  // nullptr -> boundsMat, true -> embedFragmentsSeparately
277  useMacrocycle14config);
278  return EmbedMolecule(mol, params);
279 };
280 
281 //! Embed multiple conformations for a molecule
282 /*!
283  This is roughly equivalent to calling EmbedMolecule multiple times, except
284  that the bounds matrix is computed only once from the topology.
285 
286  **NOTE**: if the molecule has multiple fragments, they will be embedded
287  separately; this means that they will likely occupy the same region of
288  space.
289  \param mol Molecule of interest
290  \param res Used to return the resulting conformer ids
291  \param numConfs Number of conformations to be generated
292  \param numThreads Sets the number of threads to use (more than one thread
293  will only be used if the RDKit was built with
294  multithread support). If set to zero, the max supported
295  by the system will be used.
296  \param maxIterations Max. number of times the embedding will be tried if
297  coordinates are not obtained successfully. The default
298  value is 10x the number of atoms.
299  NOTE(review): the signature below defaults this to 30;
300  confirm which default is intended.
301  \param seed provides a seed for the random number generator (so that
302  the same coordinates can be obtained for a molecule on
303  multiple runs). If negative, the RNG will not be seeded.
304  \param clearConfs Clear all existing conformations on the molecule
305  \param useRandomCoords Start the embedding from random coordinates instead of
306  using eigenvalues of the distance matrix.
307  \param boxSizeMult Determines the size of the box that is used for
308  random coordinates. If this is a positive number, the
309  side length will equal the largest element of the
310  distance matrix times \c boxSizeMult. If this is a
311  negative number, the side length will equal
312  \c -boxSizeMult (i.e. independent of the elements of the
313  distance matrix).
314  \param randNegEig Picks coordinates at random when an embedding process
315  produces negative eigenvalues
316  \param numZeroFail Fail embedding if we find this many or more zero
317  eigenvalues (within a tolerance)
318  \param pruneRmsThresh Retain only the conformations out of 'numConfs' after
319  embedding that are at least this far apart from each
320  other. RMSD is computed on the heavy atoms.
321  Pruning is greedy; i.e. the first embedded conformation
322  is retained and from then on only those that are at
323  least \c pruneRmsThresh away from already retained
324  conformations are kept. The pruning is done after
325  embedding and bounds violation minimization. No pruning
326  by default.
327  \param coordMap a map of int to Point3D, between atom IDs and
328  their locations. If this container is provided, the
329  coordinates are used to set distance constraints on the
330  embedding. The resulting conformer(s) should have distances
331  between the specified atoms that reproduce those between the
332  points in \c coordMap. Because the embedding produces a
333  molecule in an arbitrary reference frame, an alignment step
334  is required to actually reproduce the provided coordinates.
335  \param optimizerForceTol set the tolerance on forces in the DGeom optimizer
336  (this shouldn't normally be altered in client code).
337  \param ignoreSmoothingFailures try to embed the molecule even if triangle
338  bounds smoothing fails
339  \param enforceChirality enforce the correct chirality if chiral centers are
340  present
341  \param useExpTorsionAnglePrefs impose experimental torsion-angle preferences
342  \param useBasicKnowledge impose "basic knowledge" terms such as flat
343  aromatic rings, ketones, etc.
344  \param verbose print output of experimental torsion-angle preferences
345  \param basinThresh set the basin threshold for the DGeom force field,
346  (this shouldn't normally be altered in client code).
347  \param onlyHeavyAtomsForRMS only use the heavy atoms when doing RMS filtering
348  \param ETversion version of torsion preferences to use
349  \param useSmallRingTorsions optional torsions to improve small ring
350  conformer sampling
351  \param useMacrocycleTorsions optional torsions to improve macrocycle
352  conformer sampling
353  \param useMacrocycle14config whether to use the 1-4 distance bounds
354  heuristics for macrocycles
355 
356 */
357 inline void EmbedMultipleConfs(
358  ROMol &mol, INT_VECT &res, unsigned int numConfs = 10, int numThreads = 1,
359  unsigned int maxIterations = 30, int seed = -1, bool clearConfs = true,
360  bool useRandomCoords = false, double boxSizeMult = 2.0,
361  bool randNegEig = true, unsigned int numZeroFail = 1,
362  double pruneRmsThresh = -1.0,
363  const std::map<int, RDGeom::Point3D> *coordMap = nullptr,
364  double optimizerForceTol = 1e-3, bool ignoreSmoothingFailures = false,
365  bool enforceChirality = true, bool useExpTorsionAnglePrefs = false,
366  bool useBasicKnowledge = false, bool verbose = false,
367  double basinThresh = 5.0, bool onlyHeavyAtomsForRMS = false,
368  unsigned int ETversion = 1, bool useSmallRingTorsions = false,
369  bool useMacrocycleTorsions = false, bool useMacrocycle14config = false) {
370  EmbedParameters params(
371  maxIterations, numThreads, seed, clearConfs, useRandomCoords, boxSizeMult,
372  randNegEig, numZeroFail, coordMap, optimizerForceTol,
373  ignoreSmoothingFailures, enforceChirality, useExpTorsionAnglePrefs,
374  useBasicKnowledge, verbose, basinThresh, pruneRmsThresh,
375  onlyHeavyAtomsForRMS, ETversion, nullptr, true, useSmallRingTorsions,  // nullptr -> boundsMat, true -> embedFragmentsSeparately
376  useMacrocycleTorsions, useMacrocycle14config);
377  EmbedMultipleConfs(mol, res, numConfs, params);
378 };
379 //! \overload
381  ROMol &mol, unsigned int numConfs = 10, unsigned int maxIterations = 30,
382  int seed = -1, bool clearConfs = true, bool useRandomCoords = false,
383  double boxSizeMult = 2.0, bool randNegEig = true,
384  unsigned int numZeroFail = 1, double pruneRmsThresh = -1.0,
385  const std::map<int, RDGeom::Point3D> *coordMap = nullptr,
386  double optimizerForceTol = 1e-3, bool ignoreSmoothingFailures = false,
387  bool enforceChirality = true, bool useExpTorsionAnglePrefs = false,
388  bool useBasicKnowledge = false, bool verbose = false,
389  double basinThresh = 5.0, bool onlyHeavyAtomsForRMS = false,
390  unsigned int ETversion = 1, bool useSmallRingTorsions = false,
391  bool useMacrocycleTorsions = false, bool useMacrocycle14config = false) {
392  EmbedParameters params(
393  maxIterations, 1, seed, clearConfs, useRandomCoords, boxSizeMult,
394  randNegEig, numZeroFail, coordMap, optimizerForceTol,
395  ignoreSmoothingFailures, enforceChirality, useExpTorsionAnglePrefs,
396  useBasicKnowledge, verbose, basinThresh, pruneRmsThresh,
397  onlyHeavyAtomsForRMS, ETversion, nullptr, true, useSmallRingTorsions,
398  useMacrocycleTorsions, useMacrocycle14config);
399  INT_VECT res;
400  EmbedMultipleConfs(mol, res, numConfs, params);
401  return res;
402 };
403 
404 //! Parameters corresponding to Sereina Riniker's KDG approach
405 RDKIT_DISTGEOMHELPERS_EXPORT extern const EmbedParameters KDG;
406 //! Parameters corresponding to Sereina Riniker's ETDG approach
407 RDKIT_DISTGEOMHELPERS_EXPORT extern const EmbedParameters ETDG;
408 //! Parameters corresponding to Sereina Riniker's ETKDG approach
409 RDKIT_DISTGEOMHELPERS_EXPORT extern const EmbedParameters ETKDG;
410 //! Parameters corresponding to Sereina Riniker's ETKDG approach - version 2
411 RDKIT_DISTGEOMHELPERS_EXPORT extern const EmbedParameters ETKDGv2;
412 //! Parameters corresponding to the improved ETKDG by Wang, Witek, Landrum and
413 //! Riniker (10.1021/acs.jcim.0c00025) - the macrocycle part
414 RDKIT_DISTGEOMHELPERS_EXPORT extern const EmbedParameters ETKDGv3;
415 //! Parameters corresponding to the improved ETKDG by Wang, Witek, Landrum and
416 //! Riniker (10.1021/acs.jcim.0c00025) - the small ring part
417 RDKIT_DISTGEOMHELPERS_EXPORT extern const EmbedParameters srETKDGv3;
418 } // namespace DGeomHelpers
419 } // namespace RDKit
420 
421 #endif
Defines the primary molecule class ROMol as well as associated typedefs.
Class to store the distance bound.
Definition: BoundsMatrix.h:28
#define RDKIT_DISTGEOMHELPERS_EXPORT
Definition: export.h:190
RDKIT_DISTGEOMHELPERS_EXPORT const EmbedParameters ETKDGv2
Parameters corresponding to Sereina Riniker's ETKDG approach - version 2.
RDKIT_DISTGEOMHELPERS_EXPORT const EmbedParameters ETDG
Parameters corresponding to Sereina Riniker's ETDG approach.
RDKIT_DISTGEOMHELPERS_EXPORT const EmbedParameters ETKDGv3
int EmbedMolecule(ROMol &mol, const EmbedParameters &params)
Compute an embedding (in 3D) for the specified molecule using Distance Geometry.
Definition: Embedder.h:177
RDKIT_DISTGEOMHELPERS_EXPORT void EmbedMultipleConfs(ROMol &mol, INT_VECT &res, unsigned int numConfs, const EmbedParameters &params)
RDKIT_DISTGEOMHELPERS_EXPORT const EmbedParameters ETKDG
Parameters corresponding to Sereina Riniker's ETKDG approach.
RDKIT_DISTGEOMHELPERS_EXPORT const EmbedParameters srETKDGv3
RDKIT_DISTGEOMHELPERS_EXPORT const EmbedParameters KDG
Parameters corresponding to Sereina Riniker's KDG approach.
const uint32_t seed
Definition: MHFP.h:29
Std stuff.
Definition: Abbreviations.h:17
std::vector< int > INT_VECT
Definition: types.h:271
Parameter object for controlling embedding.
Definition: Embedder.h:92
EmbedParameters(unsigned int maxIterations, int numThreads, int randomSeed, bool clearConfs, bool useRandomCoords, double boxSizeMult, bool randNegEig, unsigned int numZeroFail, const std::map< int, RDGeom::Point3D > *coordMap, double optimizerForceTol, bool ignoreSmoothingFailures, bool enforceChirality, bool useExpTorsionAnglePrefs, bool useBasicKnowledge, bool verbose, double basinThresh, double pruneRmsThresh, bool onlyHeavyAtomsForRMS, unsigned int ETversion=1, const DistGeom::BoundsMatrix *boundsMat=nullptr, bool embedFragmentsSeparately=true, bool useSmallRingTorsions=false, bool useMacrocycleTorsions=false, bool useMacrocycle14config=false, std::shared_ptr< std::map< std::pair< unsigned int, unsigned int >, double >> CPCI=nullptr)
Definition: Embedder.h:123
boost::shared_ptr< const DistGeom::BoundsMatrix > boundsMat
Definition: Embedder.h:112
std::shared_ptr< std::map< std::pair< unsigned int, unsigned int >, double > > CPCI
Definition: Embedder.h:117