RDKit
Open-source cheminformatics and machine learning.
SmilesWrite.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2002-2021 Greg Landrum and other RDKit contributors
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 #include <RDGeneral/export.h>
11 #ifndef RD_SMILESWRITE_H_012020
12 #define RD_SMILESWRITE_H_012020
13 
14 #include <string>
15 #include <vector>
16 #include <memory>
17 #include <cstdint>
18 #include <limits>
19 
20 namespace RDKit {
21 class Atom;
22 class Bond;
23 class ROMol;
24 
//! \brief Options controlling how SMILES strings are written.
/*!
  Every field carries a default member initializer, so a default-constructed
  SmilesWriteParams is immediately usable.
*/
struct RDKIT_SMILESPARSE_EXPORT SmilesWriteParams {
  bool doIsomericSmiles =
      true; /**< include stereochemistry and isotope information */
  bool doKekule = false; /**< kekulize the molecule before generating the SMILES
                              and output single/double bonds. NOTE that the
                              output is not canonical and that this will throw
                              an exception if the molecule cannot be
                              kekulized. */
  bool canonical = true;         /**< generate canonical SMILES */
  bool allBondsExplicit = false; /**< include symbols for all bonds */
  bool allHsExplicit = false; /**< provide hydrogen counts for every atom */
  bool doRandom = false; /**< randomize the output order. The resulting SMILES
                              is not canonical */
  int rootedAtAtom = -1; /**< make sure the SMILES starts at the specified
                              atom. The resulting SMILES is not canonical */
};
40 namespace SmilesWrite {
41 
//! \brief Bit flags naming the individual CXSMILES extension fields.
/*!
  Each named field occupies a single bit, so values can be combined into the
  std::uint32_t \c flags mask accepted by the CXSMILES writers.

  NOTE(review): bits 1 and 6 have no enumerator in this listing, and the
  listing skips a line at each of those positions - confirm against the
  upstream header whether two entries were lost.
*/
enum CXSmilesFields : std::uint32_t {
  CX_NONE = 0, /**< no bits set */
  CX_ATOM_LABELS = 1 << 0,
  CX_COORDS = 1 << 2,
  CX_RADICALS = 1 << 3,
  CX_ATOM_PROPS = 1 << 4,
  CX_LINKNODES = 1 << 5,
  CX_SGROUPS = 1 << 7,
  CX_POLYMER = 1 << 8,
  CX_ALL = 0x7fffffff /**< low 31 bits set: covers every named field plus any
                           bit without an enumerator */
};
55 
56 //! \brief returns the cxsmiles data for a molecule
58  const ROMol &mol, std::uint32_t flags = CXSmilesFields::CX_ALL);
59 
60 //! \brief returns true if the atom number is in the SMILES organic subset
62 
63 //! \brief returns the SMILES for an atom
64 /*!
65  \param atom : the atom to work with
66  \param doKekule : we're doing kekulized smiles (e.g. don't use
67  lower case for the atom label)
68  \param bondIn : the bond we came into the atom on (unused)
69  \param allHsExplicit : if true, hydrogen counts will be provided for every
70  atom.
71  \param isomericSmiles : if true, isomeric SMILES will be generated
72 */
74  bool doKekule = false,
75  const Bond *bondIn = nullptr,
76  bool allHsExplicit = false,
77  bool isomericSmiles = true);
78 
79 //! \brief returns the SMILES for a bond
80 /*!
81  \param bond : the bond to work with
82  \param atomToLeftIdx : the index of the atom preceding \c bond
83  in the SMILES
84  \param doKekule : we're doing kekulized smiles (e.g. write out
85  bond orders for aromatic bonds)
86  \param allBondsExplicit : if true, symbols will be included for all bonds.
87 */
89  const Bond *bond, int atomToLeftIdx = -1, bool doKekule = false,
90  bool allBondsExplicit = false);
91 } // namespace SmilesWrite
92 
93 //! \brief returns canonical SMILES for a molecule
95  const ROMol &mol, const SmilesWriteParams &params);
96 
97 //! \brief returns canonical SMILES for a molecule
98 /*!
99  \param mol : the molecule in question.
100  \param doIsomericSmiles : include stereochemistry and isotope information
101  in the SMILES
102 
103  \param doKekule : do Kekule smiles (i.e. don't use aromatic bonds) NOTE that
104  this will throw an exception if the molecule cannot be kekulized.
105 
106  \param rootedAtAtom : make sure the SMILES starts at the specified atom.
107  The resulting SMILES is not, of course, canonical.
108  \param canonical : if false, no attempt will be made to canonicalize the
109  SMILES
110  \param allBondsExplicit : if true, symbols will be included for all bonds.
111  \param allHsExplicit : if true, hydrogen counts will be provided for every
112  atom.
113  */
114 inline std::string MolToSmiles(const ROMol &mol, bool doIsomericSmiles = true,
115  bool doKekule = false, int rootedAtAtom = -1,
116  bool canonical = true,
117  bool allBondsExplicit = false,
118  bool allHsExplicit = false,
119  bool doRandom = false) {
121  ps.doIsomericSmiles = doIsomericSmiles;
122  ps.doKekule = doKekule;
123  ps.rootedAtAtom = rootedAtAtom;
124  ps.canonical = canonical;
125  ps.allBondsExplicit = allBondsExplicit;
126  ps.allHsExplicit = allHsExplicit;
127  ps.doRandom = doRandom;
128  return MolToSmiles(mol, ps);
129 };
130 
131 //! \brief returns a vector of random SMILES for a molecule (may contain
132 //! duplicates)
133 /*!
134  \param mol : the molecule in question.
135  \param numSmiles : the number of SMILES to return
136  \param randomSeed : if >0, will be used to seed the random number generator
137  \param doIsomericSmiles : include stereochemistry and isotope information
138  in the SMILES
139  \param doKekule : do Kekule smiles (i.e. don't use aromatic bonds)
140  \param allBondsExplicit : if true, symbols will be included for all bonds.
141  \param allHsExplicit : if true, hydrogen counts will be provided for every
142  atom.
143  */
145  const ROMol &mol, unsigned int numSmiles, unsigned int randomSeed = 0,
146  bool doIsomericSmiles = true, bool doKekule = false,
147  bool allBondsExplicit = false, bool allHsExplicit = false);
148 
149 //! \brief returns canonical SMILES for part of a molecule
151  const ROMol &mol, const SmilesWriteParams &params,
152  const std::vector<int> &atomsToUse,
153  const std::vector<int> *bondsToUse = nullptr,
154  const std::vector<std::string> *atomSymbols = nullptr,
155  const std::vector<std::string> *bondSymbols = nullptr);
156 
157 //! \brief returns canonical SMILES for part of a molecule
158 /*!
159  \param mol : the molecule in question.
160  \param atomsToUse : indices of the atoms in the fragment
161  \param bondsToUse : indices of the bonds in the fragment. If this is not
162  provided,
163  all bonds between the atoms in atomsToUse will be included
164  \param atomSymbols : symbols to use for the atoms in the output SMILES
165  \param bondSymbols : symbols to use for the bonds in the output SMILES
166  \param doIsomericSmiles : include stereochemistry and isotope information
167  in the SMILES
168  \param doKekule : do Kekule smiles (i.e. don't use aromatic bonds)
169  \param rootedAtAtom : make sure the SMILES starts at the specified atom.
170  The resulting SMILES is not, of course, canonical.
171  \param canonical : if false, no attempt will be made to canonicalize the
172  SMILES
173  \param allBondsExplicit : if true, symbols will be included for all bonds.
174  \param allHsExplicit : if true, hydrogen counts will be provided for every
175  atom.
176  \param doRandom : generate a randomized smiles string by randomly choosing
177  the priority to follow in the DFS traversal. [default false]
178 
179  \b NOTE: the bondSymbols are *not* currently used in the canonicalization.
180 
181  */
182 inline std::string MolFragmentToSmiles(
183  const ROMol &mol, const std::vector<int> &atomsToUse,
184  const std::vector<int> *bondsToUse = nullptr,
185  const std::vector<std::string> *atomSymbols = nullptr,
186  const std::vector<std::string> *bondSymbols = nullptr,
187  bool doIsomericSmiles = true, bool doKekule = false, int rootedAtAtom = -1,
188  bool canonical = true, bool allBondsExplicit = false,
189  bool allHsExplicit = false) {
191  ps.doIsomericSmiles = doIsomericSmiles;
192  ps.doKekule = doKekule;
193  ps.rootedAtAtom = rootedAtAtom;
194  ps.canonical = canonical;
195  ps.allBondsExplicit = allBondsExplicit;
196  ps.allHsExplicit = allHsExplicit;
197  return MolFragmentToSmiles(mol, ps, atomsToUse, bondsToUse, atomSymbols,
198  bondSymbols);
199 }
200 
201 //! \brief returns canonical CXSMILES for a molecule
203  const ROMol &mol, const SmilesWriteParams &ps,
204  std::uint32_t flags = SmilesWrite::CXSmilesFields::CX_ALL);
205 
206 //! \brief returns canonical CXSMILES for a molecule
207 /*!
208  \param mol : the molecule in question.
209  \param doIsomericSmiles : include stereochemistry and isotope information
210  in the SMILES
211  \param doKekule : do Kekule smiles (i.e. don't use aromatic bonds)
212  \param rootedAtAtom : make sure the SMILES starts at the specified atom.
213  The resulting SMILES is not, of course, canonical.
214  \param canonical : if false, no attempt will be made to canonicalize the
215  SMILES
216  \param allBondsExplicit : if true, symbols will be included for all bonds.
217  \param allHsExplicit : if true, hydrogen counts will be provided for every
218  atom.
219  */
220 inline std::string MolToCXSmiles(const ROMol &mol, bool doIsomericSmiles = true,
221  bool doKekule = false, int rootedAtAtom = -1,
222  bool canonical = true,
223  bool allBondsExplicit = false,
224  bool allHsExplicit = false,
225  bool doRandom = false) {
227  ps.doIsomericSmiles = doIsomericSmiles;
228  ps.doKekule = doKekule;
229  ps.rootedAtAtom = rootedAtAtom;
230  ps.canonical = canonical;
231  ps.allBondsExplicit = allBondsExplicit;
232  ps.allHsExplicit = allHsExplicit;
233  ps.doRandom = doRandom;
234  return MolToCXSmiles(mol, ps);
235 };
236 
237 //! \brief returns canonical CXSMILES for part of a molecule
239  const ROMol &mol, const SmilesWriteParams &params,
240  const std::vector<int> &atomsToUse,
241  const std::vector<int> *bondsToUse = nullptr,
242  const std::vector<std::string> *atomSymbols = nullptr,
243  const std::vector<std::string> *bondSymbols = nullptr);
244 
245 //! \brief returns canonical CXSMILES for part of a molecule
246 /*!
247  \param mol : the molecule in question.
248  \param atomsToUse : indices of the atoms in the fragment
249  \param bondsToUse : indices of the bonds in the fragment. If this is not
250  provided,
251  all bonds between the atoms in atomsToUse will be included
252  \param atomSymbols : symbols to use for the atoms in the output SMILES
253  \param bondSymbols : symbols to use for the bonds in the output SMILES
254  \param doIsomericSmiles : include stereochemistry and isotope information
255  in the SMILES
256  \param doKekule : do Kekule smiles (i.e. don't use aromatic bonds)
257  \param rootedAtAtom : make sure the SMILES starts at the specified atom.
258  The resulting SMILES is not, of course, canonical.
259  \param canonical : if false, no attempt will be made to canonicalize the
260  SMILES
261  \param allBondsExplicit : if true, symbols will be included for all bonds.
262  \param allHsExplicit : if true, hydrogen counts will be provided for every
263  atom.
264 
265  \b NOTE: the bondSymbols are *not* currently used in the canonicalization.
266 
267  */
268 inline std::string MolFragmentToCXSmiles(
269  const ROMol &mol, const std::vector<int> &atomsToUse,
270  const std::vector<int> *bondsToUse = nullptr,
271  const std::vector<std::string> *atomSymbols = nullptr,
272  const std::vector<std::string> *bondSymbols = nullptr,
273  bool doIsomericSmiles = true, bool doKekule = false, int rootedAtAtom = -1,
274  bool canonical = true, bool allBondsExplicit = false,
275  bool allHsExplicit = false) {
277  ps.doIsomericSmiles = doIsomericSmiles;
278  ps.doKekule = doKekule;
279  ps.rootedAtAtom = rootedAtAtom;
280  ps.canonical = canonical;
281  ps.allBondsExplicit = allBondsExplicit;
282  ps.allHsExplicit = allHsExplicit;
283  return MolFragmentToCXSmiles(mol, ps, atomsToUse, bondsToUse, atomSymbols,
284  bondSymbols);
285 }
286 
287 } // namespace RDKit
288 #endif
The class for representing atoms.
Definition: Atom.h:68
class for representing a bond
Definition: Bond.h:47
#define RDKIT_SMILESPARSE_EXPORT
Definition: export.h:449
RDKIT_SMILESPARSE_EXPORT bool inOrganicSubset(int atomicNumber)
returns true if the atom number is in the SMILES organic subset
RDKIT_SMILESPARSE_EXPORT std::string GetBondSmiles(const Bond *bond, int atomToLeftIdx=-1, bool doKekule=false, bool allBondsExplicit=false)
returns the SMILES for a bond
RDKIT_SMILESPARSE_EXPORT std::string getCXExtensions(const ROMol &mol, std::uint32_t flags=CXSmilesFields::CX_ALL)
returns the cxsmiles data for a molecule
RDKIT_SMILESPARSE_EXPORT std::string GetAtomSmiles(const Atom *atom, bool doKekule=false, const Bond *bondIn=nullptr, bool allHsExplicit=false, bool isomericSmiles=true)
returns the SMILES for an atom
Std stuff.
Definition: Abbreviations.h:18
RDKIT_SMILESPARSE_EXPORT std::vector< std::string > MolToRandomSmilesVect(const ROMol &mol, unsigned int numSmiles, unsigned int randomSeed=0, bool doIsomericSmiles=true, bool doKekule=false, bool allBondsExplicit=false, bool allHsExplicit=false)
returns a vector of random SMILES for a molecule (may contain duplicates)
RDKIT_SMILESPARSE_EXPORT std::string MolFragmentToSmiles(const ROMol &mol, const SmilesWriteParams &params, const std::vector< int > &atomsToUse, const std::vector< int > *bondsToUse=nullptr, const std::vector< std::string > *atomSymbols=nullptr, const std::vector< std::string > *bondSymbols=nullptr)
returns canonical SMILES for part of a molecule
RDKIT_SMILESPARSE_EXPORT std::string MolToSmiles(const ROMol &mol, const SmilesWriteParams &params)
returns canonical SMILES for a molecule
RDKIT_SMILESPARSE_EXPORT std::string MolToCXSmiles(const ROMol &mol, const SmilesWriteParams &ps, std::uint32_t flags=SmilesWrite::CXSmilesFields::CX_ALL)
returns canonical CXSMILES for a molecule
RDKIT_SMILESPARSE_EXPORT std::string MolFragmentToCXSmiles(const ROMol &mol, const SmilesWriteParams &params, const std::vector< int > &atomsToUse, const std::vector< int > *bondsToUse=nullptr, const std::vector< std::string > *atomSymbols=nullptr, const std::vector< std::string > *bondSymbols=nullptr)
returns canonical CXSMILES for part of a molecule