RDKit
Open-source cheminformatics and machine learning.
SmilesWrite.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2002-2021 Greg Landrum and other RDKit contributors
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 #include <RDGeneral/export.h>
11 #ifndef RD_SMILESWRITE_H_012020
12 #define RD_SMILESWRITE_H_012020
13 
14 #include <string>
15 #include <vector>
16 #include <memory>
17 #include <cstdint>
18 #include <limits>
19 
20 namespace RDKit {
21 class Atom;
22 class Bond;
23 class ROMol;
24 
26  bool doIsomericSmiles =
27  true; /**< include stereochemistry and isotope information */
28  bool doKekule = false; /**< kekulize the molecule before generating the SMILES
29  and output single/double bonds. NOTE that the output
30  is not canonical and that this will throw an
31  exception if the molecule cannot be kekulized. */
32  bool canonical = true; /**< generate canonical SMILES */
33  bool allBondsExplicit = false; /**< include symbols for all bonds */
34  bool allHsExplicit = false; /**< provide hydrogen counts for every atom */
35  bool doRandom = false; /**< randomize the output order. The resulting SMILES
36  is not canonical */
37  int rootedAtAtom = -1; /**< make sure the SMILES starts at the specified
38  atom. The resulting SMILES is not canonical */
39 };
40 namespace SmilesWrite {
41 
//! \brief bit flags naming the individual CXSMILES extension fields
/*!
  Every named field occupies its own bit, so values can be OR-ed together and
  passed through the std::uint32_t \c flags arguments declared in this header.

  NOTE(review): no enumerators are listed here for bits 1 and 6; confirm
  against the upstream header whether entries are missing from this listing.
*/
enum CXSmilesFields : std::uint32_t {
  CX_NONE = 0,
  CX_ATOM_LABELS = 1 << 0,
  CX_COORDS = 1 << 2,
  CX_RADICALS = 1 << 3,
  CX_ATOM_PROPS = 1 << 4,
  CX_LINKNODES = 1 << 5,
  CX_SGROUPS = 1 << 7,
  CX_POLYMER = 1 << 8,
  // NB: std::int32_t is intentional as a non-scoped enum is implicitly cast to
  // int so numbers larger than std::numeric_limits<std::int32_t>::max() will be
  // negative
  CX_ALL = std::numeric_limits<std::int32_t>::max()
};
58 
59 //! \brief returns the cxsmiles data for a molecule
61  const ROMol &mol,
62  std::uint32_t flags = CXSmilesFields::CX_ALL);
63 
64 //! \brief returns true if the atom number is in the SMILES organic subset
66 
67 //! \brief returns the SMILES for an atom
68 /*!
69  \param atom : the atom to work with
70  \param doKekule : we're doing kekulized smiles (e.g. don't use
71  lower case for the atom label)
72  \param bondIn : the bond we came into the atom on (unused)
73  \param allHsExplicit : if true, hydrogen counts will be provided for every
74  atom.
75  \param isomericSmiles : if true, isomeric SMILES will be generated
76 */
78  bool doKekule = false,
79  const Bond *bondIn = nullptr,
80  bool allHsExplicit = false,
81  bool isomericSmiles = true);
82 
83 //! \brief returns the SMILES for a bond
84 /*!
85  \param bond : the bond to work with
86  \param atomToLeftIdx : the index of the atom preceding \c bond
87  in the SMILES
88  \param doKekule : we're doing kekulized smiles (e.g. write out
89  bond orders for aromatic bonds)
90  \param allBondsExplicit : if true, symbols will be included for all bonds.
91 */
93  const Bond *bond, int atomToLeftIdx = -1, bool doKekule = false,
94  bool allBondsExplicit = false);
95 } // namespace SmilesWrite
96 
97 //! \brief returns canonical SMILES for a molecule
99  const ROMol &mol, const SmilesWriteParams &params);
100 
101 //! \brief returns canonical SMILES for a molecule
102 /*!
103  \param mol : the molecule in question.
104  \param doIsomericSmiles : include stereochemistry and isotope information
105  in the SMILES
106 
107  \param doKekule : do Kekule smiles (i.e. don't use aromatic bonds) NOTE that
108  this will throw an exception if the molecule cannot be kekulized.
109 
110  \param rootedAtAtom : make sure the SMILES starts at the specified atom.
111  The resulting SMILES is not, of course, canonical.
112  \param canonical : if false, no attempt will be made to canonicalize the
113  SMILES
114  \param allBondsExplicit : if true, symbols will be included for all bonds.
115  \param allHsExplicit : if true, hydrogen counts will be provided for every
116  atom.
117  */
118 inline std::string MolToSmiles(const ROMol &mol, bool doIsomericSmiles = true,
119  bool doKekule = false, int rootedAtAtom = -1,
120  bool canonical = true,
121  bool allBondsExplicit = false,
122  bool allHsExplicit = false,
123  bool doRandom = false) {
125  ps.doIsomericSmiles = doIsomericSmiles;
126  ps.doKekule = doKekule;
127  ps.rootedAtAtom = rootedAtAtom;
128  ps.canonical = canonical;
129  ps.allBondsExplicit = allBondsExplicit;
130  ps.allHsExplicit = allHsExplicit;
131  ps.doRandom = doRandom;
132  return MolToSmiles(mol, ps);
133 };
134 
135 //! \brief returns a vector of random SMILES for a molecule (may contain
136 //! duplicates)
137 /*!
138  \param mol : the molecule in question.
139  \param numSmiles : the number of SMILES to return
140  \param randomSeed : if >0, will be used to seed the random number generator
141  \param doIsomericSmiles : include stereochemistry and isotope information
142  in the SMILES
143  \param doKekule : do Kekule smiles (i.e. don't use aromatic bonds)
144  \param allBondsExplicit : if true, symbols will be included for all bonds.
145  \param allHsExplicit : if true, hydrogen counts will be provided for every
146  atom.
147  */
149  const ROMol &mol, unsigned int numSmiles, unsigned int randomSeed = 0,
150  bool doIsomericSmiles = true, bool doKekule = false,
151  bool allBondsExplicit = false, bool allHsExplicit = false);
152 
153 //! \brief returns canonical SMILES for part of a molecule
155  const ROMol &mol, const SmilesWriteParams &params,
156  const std::vector<int> &atomsToUse,
157  const std::vector<int> *bondsToUse = nullptr,
158  const std::vector<std::string> *atomSymbols = nullptr,
159  const std::vector<std::string> *bondSymbols = nullptr);
160 
161 //! \brief returns canonical SMILES for part of a molecule
162 /*!
163  \param mol : the molecule in question.
164  \param atomsToUse : indices of the atoms in the fragment
165  \param bondsToUse : indices of the bonds in the fragment. If this is not
166  provided,
167  all bonds between the atoms in atomsToUse will be included
168  \param atomSymbols : symbols to use for the atoms in the output SMILES
169  \param bondSymbols : symbols to use for the bonds in the output SMILES
170  \param doIsomericSmiles : include stereochemistry and isotope information
171  in the SMILES
172  \param doKekule : do Kekule smiles (i.e. don't use aromatic bonds)
173  \param rootedAtAtom : make sure the SMILES starts at the specified atom.
174  The resulting SMILES is not, of course, canonical.
175  \param canonical : if false, no attempt will be made to canonicalize the
176  SMILES
177  \param allBondsExplicit : if true, symbols will be included for all bonds.
178  \param allHsExplicit : if true, hydrogen counts will be provided for every
179  atom.
180  \param doRandom : generate a randomized smiles string by randomly choosing
181  the priority to follow in the DFS traversal. [default false]
182 
183  \b NOTE: the bondSymbols are *not* currently used in the canonicalization.
184 
185  */
186 inline std::string MolFragmentToSmiles(
187  const ROMol &mol, const std::vector<int> &atomsToUse,
188  const std::vector<int> *bondsToUse = nullptr,
189  const std::vector<std::string> *atomSymbols = nullptr,
190  const std::vector<std::string> *bondSymbols = nullptr,
191  bool doIsomericSmiles = true, bool doKekule = false, int rootedAtAtom = -1,
192  bool canonical = true, bool allBondsExplicit = false,
193  bool allHsExplicit = false) {
195  ps.doIsomericSmiles = doIsomericSmiles;
196  ps.doKekule = doKekule;
197  ps.rootedAtAtom = rootedAtAtom;
198  ps.canonical = canonical;
199  ps.allBondsExplicit = allBondsExplicit;
200  ps.allHsExplicit = allHsExplicit;
201  return MolFragmentToSmiles(mol, ps, atomsToUse, bondsToUse, atomSymbols,
202  bondSymbols);
203 }
204 
205 //! \brief returns canonical CXSMILES for a molecule
207  const ROMol &mol, const SmilesWriteParams &ps,
208  std::uint32_t flags = SmilesWrite::CXSmilesFields::CX_ALL);
209 
210 //! \brief returns canonical CXSMILES for a molecule
211 /*!
212  \param mol : the molecule in question.
213  \param doIsomericSmiles : include stereochemistry and isotope information
214  in the SMILES
215  \param doKekule : do Kekule smiles (i.e. don't use aromatic bonds)
216  \param rootedAtAtom : make sure the SMILES starts at the specified atom.
217  The resulting SMILES is not, of course, canonical.
218  \param canonical : if false, no attempt will be made to canonicalize the
219  SMILES
220  \param allBondsExplicit : if true, symbols will be included for all bonds.
221  \param allHsExplicit : if true, hydrogen counts will be provided for every
222  atom.
223  */
224 inline std::string MolToCXSmiles(const ROMol &mol, bool doIsomericSmiles = true,
225  bool doKekule = false, int rootedAtAtom = -1,
226  bool canonical = true,
227  bool allBondsExplicit = false,
228  bool allHsExplicit = false,
229  bool doRandom = false) {
231  ps.doIsomericSmiles = doIsomericSmiles;
232  ps.doKekule = doKekule;
233  ps.rootedAtAtom = rootedAtAtom;
234  ps.canonical = canonical;
235  ps.allBondsExplicit = allBondsExplicit;
236  ps.allHsExplicit = allHsExplicit;
237  ps.doRandom = doRandom;
238  return MolToCXSmiles(mol, ps);
239 };
240 
241 //! \brief returns canonical CXSMILES for part of a molecule
243  const ROMol &mol, const SmilesWriteParams &params,
244  const std::vector<int> &atomsToUse,
245  const std::vector<int> *bondsToUse = nullptr,
246  const std::vector<std::string> *atomSymbols = nullptr,
247  const std::vector<std::string> *bondSymbols = nullptr);
248 
249 //! \brief returns canonical CXSMILES for part of a molecule
250 /*!
251  \param mol : the molecule in question.
252  \param atomsToUse : indices of the atoms in the fragment
253  \param bondsToUse : indices of the bonds in the fragment. If this is not
254  provided,
255  all bonds between the atoms in atomsToUse will be included
256  \param atomSymbols : symbols to use for the atoms in the output SMILES
257  \param bondSymbols : symbols to use for the bonds in the output SMILES
258  \param doIsomericSmiles : include stereochemistry and isotope information
259  in the SMILES
260  \param doKekule : do Kekule smiles (i.e. don't use aromatic bonds)
261  \param rootedAtAtom : make sure the SMILES starts at the specified atom.
262  The resulting SMILES is not, of course, canonical.
263  \param canonical : if false, no attempt will be made to canonicalize the
264  SMILES
265  \param allBondsExplicit : if true, symbols will be included for all bonds.
266  \param allHsExplicit : if true, hydrogen counts will be provided for every
267  atom.
268 
269  \b NOTE: the bondSymbols are *not* currently used in the canonicalization.
270 
271  */
272 inline std::string MolFragmentToCXSmiles(
273  const ROMol &mol, const std::vector<int> &atomsToUse,
274  const std::vector<int> *bondsToUse = nullptr,
275  const std::vector<std::string> *atomSymbols = nullptr,
276  const std::vector<std::string> *bondSymbols = nullptr,
277  bool doIsomericSmiles = true, bool doKekule = false, int rootedAtAtom = -1,
278  bool canonical = true, bool allBondsExplicit = false,
279  bool allHsExplicit = false) {
281  ps.doIsomericSmiles = doIsomericSmiles;
282  ps.doKekule = doKekule;
283  ps.rootedAtAtom = rootedAtAtom;
284  ps.canonical = canonical;
285  ps.allBondsExplicit = allBondsExplicit;
286  ps.allHsExplicit = allHsExplicit;
287  return MolFragmentToCXSmiles(mol, ps, atomsToUse, bondsToUse, atomSymbols,
288  bondSymbols);
289 }
290 
291 } // namespace RDKit
292 #endif
The class for representing atoms.
Definition: Atom.h:68
class for representing a bond
Definition: Bond.h:46
#define RDKIT_SMILESPARSE_EXPORT
Definition: export.h:441
RDKIT_SMILESPARSE_EXPORT bool inOrganicSubset(int atomicNumber)
returns true if the atom number is in the SMILES organic subset
RDKIT_SMILESPARSE_EXPORT std::string GetBondSmiles(const Bond *bond, int atomToLeftIdx=-1, bool doKekule=false, bool allBondsExplicit=false)
returns the SMILES for a bond
RDKIT_SMILESPARSE_EXPORT std::string getCXExtensions(const ROMol &mol, std::uint32_t flags=CXSmilesFields::CX_ALL)
returns the cxsmiles data for a molecule
RDKIT_SMILESPARSE_EXPORT std::string GetAtomSmiles(const Atom *atom, bool doKekule=false, const Bond *bondIn=nullptr, bool allHsExplicit=false, bool isomericSmiles=true)
returns the SMILES for an atom
Std stuff.
Definition: Abbreviations.h:18
RDKIT_SMILESPARSE_EXPORT std::vector< std::string > MolToRandomSmilesVect(const ROMol &mol, unsigned int numSmiles, unsigned int randomSeed=0, bool doIsomericSmiles=true, bool doKekule=false, bool allBondsExplicit=false, bool allHsExplicit=false)
returns a vector of random SMILES for a molecule (may contain duplicates)
RDKIT_SMILESPARSE_EXPORT std::string MolFragmentToSmiles(const ROMol &mol, const SmilesWriteParams &params, const std::vector< int > &atomsToUse, const std::vector< int > *bondsToUse=nullptr, const std::vector< std::string > *atomSymbols=nullptr, const std::vector< std::string > *bondSymbols=nullptr)
returns canonical SMILES for part of a molecule
RDKIT_SMILESPARSE_EXPORT std::string MolToSmiles(const ROMol &mol, const SmilesWriteParams &params)
returns canonical SMILES for a molecule
RDKIT_SMILESPARSE_EXPORT std::string MolToCXSmiles(const ROMol &mol, const SmilesWriteParams &ps, std::uint32_t flags=SmilesWrite::CXSmilesFields::CX_ALL)
returns canonical CXSMILES for a molecule
RDKIT_SMILESPARSE_EXPORT std::string MolFragmentToCXSmiles(const ROMol &mol, const SmilesWriteParams &params, const std::vector< int > &atomsToUse, const std::vector< int > *bondsToUse=nullptr, const std::vector< std::string > *atomSymbols=nullptr, const std::vector< std::string > *bondSymbols=nullptr)
returns canonical CXSMILES for part of a molecule