RDKit
Open-source cheminformatics and machine learning.
Embedder.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2004-2017 Greg Landrum and Rational Discovery LLC
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 
11 #include <RDGeneral/export.h>
12 #ifndef RD_EMBEDDER_H_GUARD
13 #define RD_EMBEDDER_H_GUARD
14 
15 #include <map>
16 #include <utility>
17 #include <Geometry/point.h>
18 #include <GraphMol/ROMol.h>
19 #include <boost/shared_ptr.hpp>
20 #include <DistGeom/BoundsMatrix.h>
21 
22 namespace RDKit {
23 namespace DGeomHelpers {
24 
25 //! Parameter object for controlling embedding
26 /*!
27  numConfs Number of conformations to be generated
28  numThreads Sets the number of threads to use (more than one thread
29  will only be used if the RDKit was build with multithread
30  support) If set to zero, the max supported by the system will
31  be used.
32  maxIterations Max. number of times the embedding will be tried if
33  coordinates are not obtained successfully. The default
34  value is 10x the number of atoms.
35  randomSeed provides a seed for the random number generator (so that
36  the same coordinates can be obtained for a
37  molecule on multiple runs) If -1, the
38  RNG will not be seeded.
39  clearConfs Clear all existing conformations on the molecule
40  useRandomCoords Start the embedding from random coordinates instead of
41  using eigenvalues of the distance matrix.
42  boxSizeMult Determines the size of the box that is used for
43  random coordinates. If this is a positive number, the
44  side length will equal the largest element of the distance
45  matrix times \c boxSizeMult. If this is a negative number,
46  the side length will equal \c -boxSizeMult (i.e. independent
47  of the elements of the distance matrix).
48  randNegEig Picks coordinates at random when a embedding process produces
49  negative eigenvalues
50  numZeroFail Fail embedding if we find this many or more zero eigenvalues
51  (within a tolerance)
52  pruneRmsThresh Retain only the conformations out of 'numConfs' after
53  embedding that are at least this far apart from each other.
54  RMSD is computed on the heavy atoms.
55  Prunining is greedy; i.e. the first embedded conformation is
56  retained and from then on only those that are at least
57  \c pruneRmsThresh away from already
58  retained conformations are kept. The pruning is done
59  after embedding and bounds violation minimization.
60  No pruning by default.
61  coordMap a map of int to Point3D, between atom IDs and their locations
62  their locations. If this container is provided, the
63  coordinates are used to set distance constraints on the
64  embedding. The resulting conformer(s) should have distances
65  between the specified atoms that reproduce those between the
66  points in \c coordMap. Because the embedding produces a
67  molecule in an arbitrary reference frame, an alignment step
68  is required to actually reproduce the provided coordinates.
69  optimizerForceTol set the tolerance on forces in the DGeom optimizer
70  (this shouldn't normally be altered in client code).
71  ignoreSmoothingFailures try to embed the molecule even if triangle bounds
72  smoothing fails
73  enforceChirality enforce the correct chirality if chiral centers are present
74  useExpTorsionAnglePrefs impose experimental torsion-angle preferences
75  useBasicKnowledge impose "basic knowledge" terms such as flat
76  aromatic rings, ketones, etc.
77  ETversion version of the experimental torsion-angle preferences
78  verbose print output of experimental torsion-angle preferences
79  basinThresh set the basin threshold for the DGeom force field,
80  (this shouldn't normally be altered in client code).
81  onlyHeavyAtomsForRMS only use the heavy atoms when doing RMS filtering
82  boundsMat custom bound matrix to specify upper and lower bounds of atom
83  pairs
84  embedFragmentsSeparately embed each fragment of molecule in turn
85  useSmallRingTorsions optional torsions to improve small ring conformer
86  sampling
87  useMacrocycleTorsions optional torsions to improve macrocycle conformer
88  sampling
89  useMacrocycle14config If 1-4 distances bound heuristics for
90  macrocycles is used
91  CPCI custom columbic interactions between atom pairs
92  callback void pointer to a function for reporting progress,
93  will be called with the current iteration number.
94  forceTransAmides constrain amide bonds to be trans.
95  useSymmetryForPruning use molecule symmetry when doing the RMSD pruning.
96  NOTE that for reasons of computational efficiency,
97  setting this will also set onlyHeavyAtomsForRMS to
98  true.
99 
100 
101 */
103  unsigned int maxIterations{0};
104  int numThreads{1};
105  int randomSeed{-1};
106  bool clearConfs{true};
107  bool useRandomCoords{false};
108  double boxSizeMult{2.0};
109  bool randNegEig{true};
110  unsigned int numZeroFail{1};
111  const std::map<int, RDGeom::Point3D> *coordMap{nullptr};
112  double optimizerForceTol{1e-3};
113  bool ignoreSmoothingFailures{false};
114  bool enforceChirality{true};
115  bool useExpTorsionAnglePrefs{false};
116  bool useBasicKnowledge{false};
117  bool verbose{false};
118  double basinThresh{5.0};
119  double pruneRmsThresh{-1.0};
120  bool onlyHeavyAtomsForRMS{false};
121  unsigned int ETversion{1};
122  boost::shared_ptr<const DistGeom::BoundsMatrix> boundsMat;
123  bool embedFragmentsSeparately{true};
124  bool useSmallRingTorsions{false};
125  bool useMacrocycleTorsions{false};
126  bool useMacrocycle14config{false};
127  std::shared_ptr<std::map<std::pair<unsigned int, unsigned int>, double>> CPCI;
128  void (*callback)(unsigned int);
129  bool forceTransAmides{true};
130  bool useSymmetryForPruning{true};
131  double boundsMatForceScaling{1.0};
132  EmbedParameters() : boundsMat(nullptr), CPCI(nullptr), callback(nullptr) {}
134  unsigned int maxIterations, int numThreads, int randomSeed,
135  bool clearConfs, bool useRandomCoords, double boxSizeMult,
136  bool randNegEig, unsigned int numZeroFail,
137  const std::map<int, RDGeom::Point3D> *coordMap, double optimizerForceTol,
138  bool ignoreSmoothingFailures, bool enforceChirality,
139  bool useExpTorsionAnglePrefs, bool useBasicKnowledge, bool verbose,
140  double basinThresh, double pruneRmsThresh, bool onlyHeavyAtomsForRMS,
141  unsigned int ETversion = 1,
142  const DistGeom::BoundsMatrix *boundsMat = nullptr,
143  bool embedFragmentsSeparately = true, bool useSmallRingTorsions = false,
144  bool useMacrocycleTorsions = false, bool useMacrocycle14config = false,
145  std::shared_ptr<std::map<std::pair<unsigned int, unsigned int>, double>>
146  CPCI = nullptr,
147  void (*callback)(unsigned int) = nullptr)
148  : maxIterations(maxIterations),
149  numThreads(numThreads),
150  randomSeed(randomSeed),
151  clearConfs(clearConfs),
152  useRandomCoords(useRandomCoords),
153  boxSizeMult(boxSizeMult),
154  randNegEig(randNegEig),
155  numZeroFail(numZeroFail),
156  coordMap(coordMap),
157  optimizerForceTol(optimizerForceTol),
158  ignoreSmoothingFailures(ignoreSmoothingFailures),
159  enforceChirality(enforceChirality),
160  useExpTorsionAnglePrefs(useExpTorsionAnglePrefs),
161  useBasicKnowledge(useBasicKnowledge),
162  verbose(verbose),
163  basinThresh(basinThresh),
164  pruneRmsThresh(pruneRmsThresh),
165  onlyHeavyAtomsForRMS(onlyHeavyAtomsForRMS),
166  ETversion(ETversion),
167  boundsMat(boundsMat),
168  embedFragmentsSeparately(embedFragmentsSeparately),
169  useSmallRingTorsions(useSmallRingTorsions),
170  useMacrocycleTorsions(useMacrocycleTorsions),
171  useMacrocycle14config(useMacrocycle14config),
172  CPCI(std::move(CPCI)),
173  callback(callback) {}
174 };
175 
176 //*! update parameters from a JSON string
178  EmbedParameters &params, const std::string &json);
179 
180 //*! Embed multiple conformations for a molecule
182  ROMol &mol, INT_VECT &res, unsigned int numConfs,
183  const EmbedParameters &params);
184 inline INT_VECT EmbedMultipleConfs(ROMol &mol, unsigned int numConfs,
185  const EmbedParameters &params) {
186  INT_VECT res;
187  EmbedMultipleConfs(mol, res, numConfs, params);
188  return res;
189 }
190 
191 //! Compute an embedding (in 3D) for the specified molecule using Distance
192 /// Geometry
193 inline int EmbedMolecule(ROMol &mol, const EmbedParameters &params) {
194  INT_VECT confIds;
195  EmbedMultipleConfs(mol, confIds, 1, params);
196 
197  int res;
198  if (confIds.size()) {
199  res = confIds[0];
200  } else {
201  res = -1;
202  }
203  return res;
204 }
205 
206 //! Compute an embedding (in 3D) for the specified molecule using Distance
207 /// Geometry
208 /*!
209  The following operations are performed (in order) here:
210  -# Build a distance bounds matrix based on the topology, including 1-5
211  distances but not VDW scaling
212  -# Triangle smooth this bounds matrix
213  -# If step 2 fails - repeat step 1, this time without 1-5 bounds and with vdW
214  scaling, and repeat step 2
215  -# Pick a distance matrix at random using the bounds matrix
216  -# Compute initial coordinates from the distance matrix
217  -# Repeat steps 3 and 4 until maxIterations is reached or embedding is
218  successful
219  -# Adjust initial coordinates by minimizing a Distance Violation error
220  function
221  **NOTE**: if the molecule has multiple fragments, they will be embedded
222  separately,
223  this means that they will likely occupy the same region of space.
224  \param mol Molecule of interest
225  \param maxIterations Max. number of times the embedding will be tried if
226  coordinates are not obtained successfully. The default
227  value is 10x the number of atoms.
228  \param seed provides a seed for the random number generator (so that
229  the same coordinates can be obtained for a molecule on
230  multiple runs). If negative, the RNG will not be seeded.
231  \param clearConfs Clear all existing conformations on the molecule
232  \param useRandomCoords Start the embedding from random coordinates instead of
233  using eigenvalues of the distance matrix.
234  \param boxSizeMult Determines the size of the box that is used for
235  random coordinates. If this is a positive number, the
236  side length will equal the largest element of the
237  distance matrix times \c boxSizeMult. If this is a
238  negative number, the side length will equal
239  \c -boxSizeMult (i.e. independent of the elements of the
240  distance matrix).
241  \param randNegEig Picks coordinates at random when a embedding process
242  produces negative eigenvalues
243  \param numZeroFail Fail embedding if we find this many or more zero
244  eigenvalues (within a tolerance)
245  \param coordMap a map of int to Point3D, between atom IDs and their locations
246  their locations. If this container is provided, the
247  coordinates are used to set distance constraints on the
248  embedding. The resulting conformer(s) should have distances
249  between the specified atoms that reproduce those between the
250  points in \c coordMap. Because the embedding produces a
251  molecule in an arbitrary reference frame, an alignment step
252  is required to actually reproduce the provided coordinates.
253  \param optimizerForceTol set the tolerance on forces in the distgeom optimizer
254  (this shouldn't normally be altered in client code).
255  \param ignoreSmoothingFailures try to embed the molecule even if triangle
256  bounds smoothing fails
257  \param enforceChirality enforce the correct chirality if chiral centers are
258  present
259  \param useExpTorsionAnglePrefs impose experimental torsion-angle preferences
260  \param useBasicKnowledge impose "basic knowledge" terms such as flat
261  aromatic rings, ketones, etc.
262  \param verbose print output of experimental torsion-angle preferences
263  \param basinThresh set the basin threshold for the DGeom force field,
264  (this shouldn't normally be altered in client code).
265  \param onlyHeavyAtomsForRMS only use the heavy atoms when doing RMS filtering
266  \param ETversion version of torsion preferences to use
267  \param useSmallRingTorsions optional torsions to improve small ring
268  conformer sampling
269 
270  \param useMacrocycleTorsions optional torsions to improve macrocycle
271  conformer sampling \param useMacrocycle14config If 1-4 distances bound
272  heuristics for macrocycles is used \return ID of the conformations added to
273  the molecule, -1 if the emdedding failed
274 */
275 inline int EmbedMolecule(
276  ROMol &mol, unsigned int maxIterations = 0, int seed = -1,
277  bool clearConfs = true, bool useRandomCoords = false,
278  double boxSizeMult = 2.0, bool randNegEig = true,
279  unsigned int numZeroFail = 1,
280  const std::map<int, RDGeom::Point3D> *coordMap = nullptr,
281  double optimizerForceTol = 1e-3, bool ignoreSmoothingFailures = false,
282  bool enforceChirality = true, bool useExpTorsionAnglePrefs = false,
283  bool useBasicKnowledge = false, bool verbose = false,
284  double basinThresh = 5.0, bool onlyHeavyAtomsForRMS = false,
285  unsigned int ETversion = 1, bool useSmallRingTorsions = false,
286  bool useMacrocycleTorsions = false, bool useMacrocycle14config = false) {
287  EmbedParameters params(
288  maxIterations, 1, seed, clearConfs, useRandomCoords, boxSizeMult,
289  randNegEig, numZeroFail, coordMap, optimizerForceTol,
290  ignoreSmoothingFailures, enforceChirality, useExpTorsionAnglePrefs,
291  useBasicKnowledge, verbose, basinThresh, -1.0, onlyHeavyAtomsForRMS,
292  ETversion, nullptr, true, useSmallRingTorsions, useMacrocycleTorsions,
293  useMacrocycle14config);
294  return EmbedMolecule(mol, params);
295 };
296 
297 //*! Embed multiple conformations for a molecule
298 /*!
299  This is kind of equivalent to calling EmbedMolecule multiple times - just that
300  the bounds
301  matrix is computed only once from the topology
302  **NOTE**: if the molecule has multiple fragments, they will be embedded
303  separately,
304  this means that they will likely occupy the same region of space.
305  \param mol Molecule of interest
306  \param res Used to return the resulting conformer ids
307  \param numConfs Number of conformations to be generated
308  \param numThreads Sets the number of threads to use (more than one thread
309  will only be used if the RDKit was build with
310  multithread
311  support). If set to zero, the max supported by the
312  system
313  will be used.
314  \param maxIterations Max. number of times the embedding will be tried if
315  coordinates are not obtained successfully. The default
316  value is 10x the number of atoms.
317  \param seed provides a seed for the random number generator (so that
318  the same coordinates can be obtained for a molecule on
319  multiple runs). If negative, the RNG will not be seeded.
320  \param clearConfs Clear all existing conformations on the molecule
321  \param useRandomCoords Start the embedding from random coordinates instead of
322  using eigenvalues of the distance matrix.
323  \param boxSizeMult Determines the size of the box that is used for
324  random coordinates. If this is a positive number, the
325  side length will equal the largest element of the
326  distance matrix times \c boxSizeMult. If this is a
327  negative number, the side length will equal
328  \c -boxSizeMult (i.e. independent of the elements of the
329  distance matrix).
330  \param randNegEig Picks coordinates at random when a embedding process
331  produces negative eigenvalues
332  \param numZeroFail Fail embedding if we find this many or more zero
333  eigenvalues (within a tolerance)
334  \param pruneRmsThresh Retain only the conformations out of 'numConfs' after
335  embedding that are at least this far apart from each
336  other. RMSD is computed on the heavy atoms.
337  Pruning is greedy; i.e. the first embedded conformation
338  is retained and from then on only those that are at
339  least
340  pruneRmsThresh away from already retained conformations
341  are kept. The pruning is done after embedding and
342  bounds violation minimization. No pruning by default.
343  \param coordMap a map of int to Point3D, between atom IDs and their locations
344  their locations. If this container is provided, the
345  coordinates are used to set distance constraints on the
346  embedding. The resulting conformer(s) should have distances
347  between the specified atoms that reproduce those between the
348  points in \c coordMap. Because the embedding produces a
349  molecule in an arbitrary reference frame, an alignment step
350  is required to actually reproduce the provided coordinates.
351  \param optimizerForceTol set the tolerance on forces in the DGeom optimizer
352  (this shouldn't normally be altered in client code).
353  \param ignoreSmoothingFailures try to embed the molecule even if triangle
354  bounds smoothing fails
355  \param enforceChirality enforce the correct chirality if chiral centers are
356  present
357  \param useExpTorsionAnglePrefs impose experimental torsion-angle preferences
358  \param useBasicKnowledge impose "basic knowledge" terms such as flat
359  aromatic rings, ketones, etc.
360  \param verbose print output of experimental torsion-angle preferences
361  \param basinThresh set the basin threshold for the DGeom force field,
362  (this shouldn't normally be altered in client code).
363  \param onlyHeavyAtomsForRMS only use the heavy atoms when doing RMS filtering
364  \param ETversion version of torsion preferences to use
365  \param useSmallRingTorsions optional torsions to improve small ring
366  conformer sampling
367 
368  \param useMacrocycleTorsions optional torsions to improve macrocycle
369  conformer sampling \param useMacrocycle14config If 1-4 distances bound
370  heuristics for macrocycles is used
371 
372 */
373 inline void EmbedMultipleConfs(
374  ROMol &mol, INT_VECT &res, unsigned int numConfs = 10, int numThreads = 1,
375  unsigned int maxIterations = 30, int seed = -1, bool clearConfs = true,
376  bool useRandomCoords = false, double boxSizeMult = 2.0,
377  bool randNegEig = true, unsigned int numZeroFail = 1,
378  double pruneRmsThresh = -1.0,
379  const std::map<int, RDGeom::Point3D> *coordMap = nullptr,
380  double optimizerForceTol = 1e-3, bool ignoreSmoothingFailures = false,
381  bool enforceChirality = true, bool useExpTorsionAnglePrefs = false,
382  bool useBasicKnowledge = false, bool verbose = false,
383  double basinThresh = 5.0, bool onlyHeavyAtomsForRMS = false,
384  unsigned int ETversion = 1, bool useSmallRingTorsions = false,
385  bool useMacrocycleTorsions = false, bool useMacrocycle14config = false) {
386  EmbedParameters params(
387  maxIterations, numThreads, seed, clearConfs, useRandomCoords, boxSizeMult,
388  randNegEig, numZeroFail, coordMap, optimizerForceTol,
389  ignoreSmoothingFailures, enforceChirality, useExpTorsionAnglePrefs,
390  useBasicKnowledge, verbose, basinThresh, pruneRmsThresh,
391  onlyHeavyAtomsForRMS, ETversion, nullptr, true, useSmallRingTorsions,
392  useMacrocycleTorsions, useMacrocycle14config);
393  EmbedMultipleConfs(mol, res, numConfs, params);
394 };
395 //! \overload
397  ROMol &mol, unsigned int numConfs = 10, unsigned int maxIterations = 30,
398  int seed = -1, bool clearConfs = true, bool useRandomCoords = false,
399  double boxSizeMult = 2.0, bool randNegEig = true,
400  unsigned int numZeroFail = 1, double pruneRmsThresh = -1.0,
401  const std::map<int, RDGeom::Point3D> *coordMap = nullptr,
402  double optimizerForceTol = 1e-3, bool ignoreSmoothingFailures = false,
403  bool enforceChirality = true, bool useExpTorsionAnglePrefs = false,
404  bool useBasicKnowledge = false, bool verbose = false,
405  double basinThresh = 5.0, bool onlyHeavyAtomsForRMS = false,
406  unsigned int ETversion = 1, bool useSmallRingTorsions = false,
407  bool useMacrocycleTorsions = false, bool useMacrocycle14config = false) {
408  EmbedParameters params(
409  maxIterations, 1, seed, clearConfs, useRandomCoords, boxSizeMult,
410  randNegEig, numZeroFail, coordMap, optimizerForceTol,
411  ignoreSmoothingFailures, enforceChirality, useExpTorsionAnglePrefs,
412  useBasicKnowledge, verbose, basinThresh, pruneRmsThresh,
413  onlyHeavyAtomsForRMS, ETversion, nullptr, true, useSmallRingTorsions,
414  useMacrocycleTorsions, useMacrocycle14config);
415  INT_VECT res;
416  EmbedMultipleConfs(mol, res, numConfs, params);
417  return res;
418 };
419 
420 //! Parameters corresponding to Sereina Riniker's KDG approach
421 RDKIT_DISTGEOMHELPERS_EXPORT extern const EmbedParameters KDG;
422 //! Parameters corresponding to Sereina Riniker's ETDG approach
423 RDKIT_DISTGEOMHELPERS_EXPORT extern const EmbedParameters ETDG;
424 //! Parameters corresponding to Sereina Riniker's ETKDG approach
425 RDKIT_DISTGEOMHELPERS_EXPORT extern const EmbedParameters ETKDG;
426 //! Parameters corresponding to Sereina Riniker's ETKDG approach - version 2
427 RDKIT_DISTGEOMHELPERS_EXPORT extern const EmbedParameters ETKDGv2;
428 //! Parameters corresponding improved ETKDG by Wang, Witek, Landrum and Riniker
429 //! (10.1021/acs.jcim.0c00025) - the macrocycle part
430 RDKIT_DISTGEOMHELPERS_EXPORT extern const EmbedParameters ETKDGv3;
431 //! Parameters corresponding improved ETKDG by Wang, Witek, Landrum and Riniker
432 //! (10.1021/acs.jcim.0c00025) - the small ring part
433 RDKIT_DISTGEOMHELPERS_EXPORT extern const EmbedParameters srETKDGv3;
434 } // namespace DGeomHelpers
435 } // namespace RDKit
436 
437 #endif
Defines the primary molecule class ROMol as well as associated typedefs.
Class to store the distance bound.
Definition: BoundsMatrix.h:28
#define RDKIT_DISTGEOMHELPERS_EXPORT
Definition: export.h:113
RDKIT_DISTGEOMHELPERS_EXPORT const EmbedParameters ETKDGv2
Parameters corresponding to Sereina Riniker's ETKDG approach - version 2.
RDKIT_DISTGEOMHELPERS_EXPORT const EmbedParameters ETDG
Parameters corresponding to Sereina Riniker's ETDG approach.
RDKIT_DISTGEOMHELPERS_EXPORT const EmbedParameters ETKDGv3
RDKIT_DISTGEOMHELPERS_EXPORT void updateEmbedParametersFromJSON(EmbedParameters &params, const std::string &json)
int EmbedMolecule(ROMol &mol, const EmbedParameters &params)
Definition: Embedder.h:193
RDKIT_DISTGEOMHELPERS_EXPORT void EmbedMultipleConfs(ROMol &mol, INT_VECT &res, unsigned int numConfs, const EmbedParameters &params)
RDKIT_DISTGEOMHELPERS_EXPORT const EmbedParameters ETKDG
Parameters corresponding to Sereina Riniker's ETKDG approach.
RDKIT_DISTGEOMHELPERS_EXPORT const EmbedParameters srETKDGv3
RDKIT_DISTGEOMHELPERS_EXPORT const EmbedParameters KDG
Parameters corresponding to Sereina Riniker's KDG approach.
const uint32_t seed
Definition: MHFP.h:29
Std stuff.
Definition: Abbreviations.h:18
std::vector< int > INT_VECT
Definition: types.h:274
Parameter object for controlling embedding.
Definition: Embedder.h:102
boost::shared_ptr< const DistGeom::BoundsMatrix > boundsMat
Definition: Embedder.h:122
std::shared_ptr< std::map< std::pair< unsigned int, unsigned int >, double > > CPCI
Definition: Embedder.h:127
EmbedParameters(unsigned int maxIterations, int numThreads, int randomSeed, bool clearConfs, bool useRandomCoords, double boxSizeMult, bool randNegEig, unsigned int numZeroFail, const std::map< int, RDGeom::Point3D > *coordMap, double optimizerForceTol, bool ignoreSmoothingFailures, bool enforceChirality, bool useExpTorsionAnglePrefs, bool useBasicKnowledge, bool verbose, double basinThresh, double pruneRmsThresh, bool onlyHeavyAtomsForRMS, unsigned int ETversion=1, const DistGeom::BoundsMatrix *boundsMat=nullptr, bool embedFragmentsSeparately=true, bool useSmallRingTorsions=false, bool useMacrocycleTorsions=false, bool useMacrocycle14config=false, std::shared_ptr< std::map< std::pair< unsigned int, unsigned int >, double >> CPCI=nullptr, void(*callback)(unsigned int)=nullptr)
Definition: Embedder.h:133