RDKit
Open-source cheminformatics and machine learning.
Catalog.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2003-2006 Rational Discovery LLC
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 
11 #include <RDGeneral/export.h>
12 #ifndef __RD_CATALOG_H__
13 #define __RD_CATALOG_H__
14 
15 // Boost graph stuff
17 #include <boost/graph/graph_traits.hpp>
18 #include <boost/graph/adjacency_list.hpp>
19 #include <boost/version.hpp>
20 #if BOOST_VERSION >= 104000
21 #include <boost/property_map/property_map.hpp>
22 #else
23 #include <boost/property_map.hpp>
24 #endif
26 
27 // for some typedefs
28 #include <RDGeneral/types.h>
29 #include <RDGeneral/StreamOps.h>
30 
31 namespace RDCatalog {
32 const int versionMajor = 1;
33 const int versionMinor = 0;
34 const int versionPatch = 0;
35 const int endianId = 0xDEADBEEF;
36 
37 //-----------------------------------------------------------------------------
38 //! abstract base class for a catalog object
39 template <class entryType, class paramType>
40 class Catalog {
41  public:
42  typedef entryType entryType_t;
43  typedef paramType paramType_t;
44 
45  //------------------------------------
47 
48  //------------------------------------
49  virtual ~Catalog() { delete dp_cParams; }
50 
51  //------------------------------------
52  //! return a serialized form of the Catalog as an std::string
53  virtual std::string Serialize() const = 0;
54 
55  //------------------------------------
56  //! adds an entry to the catalog
57  /*!
58 
59  \param entry the entry to be added
60  \param updateFPLength (optional) if this is true, our internal
61  fingerprint length will also be updated.
62 
63  */
64  virtual unsigned int addEntry(entryType *entry,
65  bool updateFPLength = true) = 0;
66 
67  //------------------------------------
68  //! returns a particular entry in the Catalog
69  virtual const entryType *getEntryWithIdx(unsigned int idx) const = 0;
70 
71  //------------------------------------
72  //! returns the number of entries
73  virtual unsigned int getNumEntries() const = 0;
74 
75  //------------------------------------
76  //! returns the length of our fingerprint
77  unsigned int getFPLength() const { return d_fpLength; }
78 
79  //------------------------------------
80  //! sets our fingerprint length
81  void setFPLength(unsigned int val) { d_fpLength = val; }
82 
83  //------------------------------------
84  //! sets our parameters by copying the \c params argument
85  virtual void setCatalogParams(const paramType *params) {
86  PRECONDITION(params, "bad parameter object");
87  // if we already have a paramter object throw an exception
89  "A parameter object already exists on the catalog");
90  /*
91  if (dp_cParams) {
92  // we already have parameter object on the catalog
93  // can't overwrite it
94  PRECONDITION(0, "A parameter object already exist on the catalog");
95  }*/
96  dp_cParams = new paramType(*params);
97  }
98 
99  //------------------------------------
100  //! returns a pointer to our parameters
101  const paramType *getCatalogParams() const { return dp_cParams; }
102 
103  protected:
104  // this is the ID that will be assigned to the next entry
105  // added to the catalog - need not be same as the number of entries
106  // in the catalog and does not correspond with the
107  // id of the entry in the catalog.
108  // this is more along the lines of bitId
109  unsigned int d_fpLength; //!< the length of our fingerprint
110  paramType *dp_cParams; //!< our params object
111 };
112 
113 //-----------------------------------------------------------------------------
114 //! A Catalog with a hierarchical structure
115 /*!
116 
117  The entries of a HierarchCatalog are arranged in a directed graph
118 
119  <b>The difference between <i>Indices</i> and <i>Bit Ids</i></b>
120 
121  A HierarchCatalog may contain more entries than the user is actually
122  interested in. For example a HierarchCatalog constructed to contain
123  orders 5 through 8 may well contain information about orders 1-5,
124  in order to facilitate some search optimizations.
125 
126  - <i>Bit Ids</i> refer to the "interesting" bits.
127  So, in the above example, Bit Id \c 0 will be the first entry
128  with order 5.
129  - <i>Indices</i> refer to the underlying structure of the catalog.
130  So, in the above example, the entry with index \c 0 will be
131  the first entry with order 1.
132 
133 */
134 template <class entryType, class paramType, class orderType>
135 class HierarchCatalog : public Catalog<entryType, paramType> {
136  // the entries in the catalog can be traversed using the edges
137  // in a desired order
138  public:
139  //! used by the BGL to set up the node properties in our graph
140  struct vertex_entry_t {
141  enum { num = 1003 };
142  typedef boost::vertex_property_tag kind;
143  };
144  typedef boost::property<vertex_entry_t, entryType *> EntryProperty;
145 
146  //! the type of the graph itself:
147  typedef boost::adjacency_list<
148  boost::vecS,
149  boost::vecS, // FIX: should be using setS for edges so that parallel
150  // edges are never added (page 225 BGL book)
151  // but that seems result in compile errors
152  boost::bidirectionalS, EntryProperty> CatalogGraph;
153 
154  typedef boost::graph_traits<CatalogGraph> CAT_GRAPH_TRAITS;
155  typedef typename CAT_GRAPH_TRAITS::vertex_iterator VER_ITER;
156  typedef std::pair<VER_ITER, VER_ITER> ENT_ITER_PAIR;
157  typedef typename CAT_GRAPH_TRAITS::adjacency_iterator DOWN_ENT_ITER;
158  typedef std::pair<DOWN_ENT_ITER, DOWN_ENT_ITER> DOWN_ENT_ITER_PAIR;
159 
160  //------------------------------------
162 
163  //------------------------------------
164  //! Construct by making a copy of the input \c params object
167  this->setCatalogParams(params);
168  }
169 
170  //------------------------------------
171  //! Construct from a \c pickle (a serialized form of the HierarchCatalog)
173  this->initFromString(pickle);
174  }
175 
176  //------------------------------------
177  ~HierarchCatalog() { destroy(); }
178 
179  //------------------------------------
180  //! serializes this object to a stream
181  void toStream(std::ostream &ss) const {
182  PRECONDITION(this->getCatalogParams(), "NULL parameter object");
183 
184  // the i/o header:
185  RDKit::streamWrite(ss, endianId);
186  RDKit::streamWrite(ss, versionMajor);
187  RDKit::streamWrite(ss, versionMinor);
188  RDKit::streamWrite(ss, versionPatch);
189 
190  // information about the catalog itself:
191  int tmpUInt;
192  tmpUInt = this->getFPLength();
193  RDKit::streamWrite(ss, tmpUInt);
194  tmpUInt = this->getNumEntries();
195  RDKit::streamWrite(ss, tmpUInt);
196 
197  // std::cout << ">>>>-------------------------------" << std::endl;
198  // std::cout << "\tlength: " << getFPLength() << " " << getNumEntries() <<
199  // std::endl;
200 
201  // add the params object:
202  this->getCatalogParams()->toStream(ss);
203  // std::cout << "\tparams: " << getCatalogParams()->getLowerFragLength();
204  // std::cout << " " << getCatalogParams()->getUpperFragLength();
205  // std::cout << " " << getCatalogParams()->getNumFuncGroups();
206  // std::cout << std::endl;
207 
208  // write the entries in order:
209  for (unsigned int i = 0; i < getNumEntries(); i++) {
210  this->getEntryWithIdx(i)->toStream(ss);
211  }
212 
213  // finally the adjacency list:
214  for (unsigned int i = 0; i < getNumEntries(); i++) {
215  RDKit::INT_VECT children = this->getDownEntryList(i);
216  tmpUInt = static_cast<unsigned int>(children.size());
217  RDKit::streamWrite(ss, tmpUInt);
218  for (RDKit::INT_VECT::const_iterator ivci = children.begin();
219  ivci != children.end(); ivci++) {
220  RDKit::streamWrite(ss, *ivci);
221  }
222  }
223  }
224 
225  //------------------------------------
226  //! serializes this object and returns the resulting \c pickle
227  std::string Serialize() const {
228  std::stringstream ss(std::ios_base::binary | std::ios_base::out |
229  std::ios_base::in);
230  this->toStream(ss);
231  return ss.str();
232  }
233 
234  //------------------------------------
235  //! fills the contents of this object from a stream containing a \c pickle
236  void initFromStream(std::istream &ss) {
237  int tmpInt;
238  // FIX: at the moment we ignore the header info:
239  RDKit::streamRead(ss, tmpInt);
240  RDKit::streamRead(ss, tmpInt);
241  RDKit::streamRead(ss, tmpInt);
242  RDKit::streamRead(ss, tmpInt);
243 
244  unsigned int tmpUInt;
245  RDKit::streamRead(ss, tmpUInt); // fp length
246  this->setFPLength(tmpUInt);
247 
248  unsigned int numEntries;
249  RDKit::streamRead(ss, numEntries);
250  // std::cout << "<<<-------------------------------" << std::endl;
251  // std::cout << "\tlength: " << getFPLength() << " " << numEntries <<
252  // std::endl;
253 
254  // grab the params:
255  paramType *params = new paramType();
256  params->initFromStream(ss);
257  this->setCatalogParams(params);
258  delete params;
259 
260  // std::cout << "\tparams: " << getCatalogParams()->getLowerFragLength();
261  // std::cout << " " << getCatalogParams()->getUpperFragLength();
262  // std::cout << " " << getCatalogParams()->getNumFuncGroups();
263  // std::cout << std::endl;
264 
265  // now all of the entries:
266  for (unsigned int i = 0; i < numEntries; i++) {
267  entryType *entry = new entryType();
268  entry->initFromStream(ss);
269  this->addEntry(entry, false);
270  }
271 
272  // and, finally, the adjacency list:
273  for (unsigned int i = 0; i < numEntries; i++) {
274  unsigned int nNeighbors;
275  RDKit::streamRead(ss, nNeighbors);
276  for (unsigned int j = 0; j < nNeighbors; j++) {
277  RDKit::streamRead(ss, tmpInt);
278  this->addEdge(i, tmpInt);
279  }
280  }
281  }
282 
283  //------------------------------------
284  unsigned int getNumEntries() const { return static_cast<unsigned int>(boost::num_vertices(d_graph)); }
285 
286  //------------------------------------
287  //! fills the contents of this object from a string containing a \c pickle
288  void initFromString(const std::string &text) {
289  std::stringstream ss(std::ios_base::binary | std::ios_base::out |
290  std::ios_base::in);
291  // initialize the stream:
292  ss.write(text.c_str(), text.length());
293  // now start reading out values:
294  this->initFromStream(ss);
295  }
296 
297  //------------------------------------
298  //! add a new entry to the catalog
299  /*!
300 
301  \param entry the entry to be added
302  \param updateFPLength (optional) if this is true, our internal
303  fingerprint length will also be updated.
304 
305  */
306  unsigned int addEntry(entryType *entry, bool updateFPLength = true) {
307  PRECONDITION(entry, "bad arguments");
308  if (updateFPLength) {
309  unsigned int fpl = this->getFPLength();
310  entry->setBitId(fpl);
311  fpl++;
312  this->setFPLength(fpl);
313  }
314  unsigned int eid = static_cast<unsigned int>(boost::add_vertex(EntryProperty(entry), d_graph));
315  orderType etype = entry->getOrder();
316  // REVIEW: this initialization is not required: the STL map, in
317  // theory, will create a new object when operator[] is called
318  // for a new item
319  if (d_orderMap.find(etype) == d_orderMap.end()) {
320  RDKit::INT_VECT nets;
321  d_orderMap[etype] = nets;
322  }
323  d_orderMap[etype].push_back(eid);
324  return eid;
325  }
326 
327  //------------------------------------
328  //! adds an edge between two entries in the catalog
329  /*!
330  Since we are using a bidirectional graph - the order in
331  which the ids are supplied here makes a difference
332 
333  \param id1 index of the edge's beginning
334  \param id2 index of the edge's end
335 
336  */
337  void addEdge(unsigned int id1, unsigned int id2) {
338  unsigned int nents = getNumEntries();
339  URANGE_CHECK(id1, nents);
340  URANGE_CHECK(id2, nents);
341  // FIX: if we boost::setS for the edgeList BGL will
342  // do the checking for duplicity (parallel edges)
343  // But for reasons unknown setS results in compile
344  // errors while using adjacent_vertices.
345  typename CAT_GRAPH_TRAITS::edge_descriptor edge;
346  bool found;
347  boost::tie(edge, found) = boost::edge(boost::vertex(id1, d_graph),
348  boost::vertex(id2, d_graph), d_graph);
349  if (!found) {
350  boost::add_edge(id1, id2, d_graph);
351  }
352  }
353 
354  //------------------------------------
355  //! returns a pointer to our entry with a particular index
356  const entryType *getEntryWithIdx(unsigned int idx) const {
357  URANGE_CHECK(idx, getNumEntries());
358  int vd = static_cast<int>(boost::vertex(idx, d_graph));
359  typename boost::property_map<CatalogGraph, vertex_entry_t>::const_type
360  pMap = boost::get(vertex_entry_t(), d_graph);
361  return pMap[vd];
362  }
363 
364  //------------------------------------
365  //! returns a pointer to our entry with a particular bit ID
366  const entryType *getEntryWithBitId(unsigned int idx) const {
367  URANGE_CHECK(idx, this->getFPLength());
368  typename boost::property_map<CatalogGraph, vertex_entry_t>::const_type
369  pMap = boost::get(vertex_entry_t(), d_graph);
370  const entryType *res = NULL;
371  for (unsigned int i = idx; i < this->getNumEntries(); i++) {
372  const entryType *e = pMap[i];
373  if (e->getBitId() == static_cast<int>(idx)) {
374  res = e;
375  break;
376  }
377  }
378  return res;
379  }
380 
381  //------------------------------------
382  //! returns the index of the entry with a particular bit ID
383  int getIdOfEntryWithBitId(unsigned int idx) const {
384  URANGE_CHECK(idx, this->getFPLength());
385  typename boost::property_map<CatalogGraph, vertex_entry_t>::const_type
386  pMap = boost::get(vertex_entry_t(), d_graph);
387  int res = -1;
388  for (unsigned int i = idx; i < this->getNumEntries(); i++) {
389  const entryType *e = pMap[i];
390  if (static_cast<unsigned int>(e->getBitId()) == idx) {
391  res = i;
392  break;
393  }
394  }
395  return res;
396  }
397 
398  //------------------------------------
399  //! returns a list of the indices of entries below the one passed in
400  RDKit::INT_VECT getDownEntryList(unsigned int idx) const {
401  RDKit::INT_VECT res;
402  DOWN_ENT_ITER nbrIdx, endIdx;
403  boost::tie(nbrIdx, endIdx) = boost::adjacent_vertices(idx, d_graph);
404  while (nbrIdx != endIdx) {
405  res.push_back(static_cast<int>(*nbrIdx));
406  nbrIdx++;
407  }
408  // std::cout << res.size() << "\n";
409  return res;
410  }
411 
412  //------------------------------------
413  //! returns a list of the indices that have a particular order
414  const RDKit::INT_VECT &getEntriesOfOrder(orderType ord) {
415  return d_orderMap[ord];
416  }
417 
418  //------------------------------------
419  //! returns a list of the indices that have a particular order
420  /*!
421  \overload
422  */
423  const RDKit::INT_VECT &getEntriesOfOrder(orderType ord) const {
424  typename std::map<orderType, RDKit::INT_VECT>::const_iterator elem;
425  elem = d_orderMap.find(ord);
427  elem != d_orderMap.end(),
428  " catalog does not contain any entries of the order specified");
429  return elem->second;
430  }
431 
432  private:
433  // graphs that store the entries in the catalog in a hierachical manner
434  CatalogGraph d_graph;
435  // a map that maps the order type of entries in the catalog to
436  // a vector of vertex indices in the graphs above
437  // e.g. for a catalog with molecular fragments, the order of a fragment can
438  // simply be the number of bond in it. The list this oder maps to is all the
439  // vertex ids of these fragment in the catalog that have this many bonds in
440  // them
441  std::map<orderType, RDKit::INT_VECT> d_orderMap;
442 
443  //------------------------------------
444  //! clear any memory that we've used
445  void destroy() {
446  ENT_ITER_PAIR entItP = boost::vertices(d_graph);
447  typename boost::property_map<CatalogGraph, vertex_entry_t>::type pMap =
448  boost::get(vertex_entry_t(), d_graph);
449  while (entItP.first != entItP.second) {
450  delete pMap[*(entItP.first++)];
451  }
452  }
453 };
454 
455 //-----------------------------------------------------------------------------
456 //! a linear Catalog (analogous to an std::vector)
457 /*!
458  Here there is no particular hierarchy, simply a
459  collection of entries.
460 */
461 template <class entryType, class orderType>
462 class LinearCatalog : public Catalog<entryType, orderType> {
463  // here there is no particular hierarchy of entries
464  // we simply model it as a vector of entries
465  // FIX: for retrieval purposes a better model map be std::map
466 
467  public:
468  std::string Serialize();
469 
470  unsigned int addEntry(entryType *entry, bool updateFPLength = true);
471 
472  const entryType *getEntryWithIdx(unsigned int idx) const;
473 
474  private:
475  std::vector<entryType *> d_vector;
476 };
477 }
478 
479 #endif
virtual const entryType * getEntryWithIdx(unsigned int idx) const =0
returns a particular entry in the Catalog
const int versionMinor
Definition: Catalog.h:33
void initFromString(const std::string &text)
fills the contents of this object from a string containing a pickle
Definition: Catalog.h:288
CAT_GRAPH_TRAITS::adjacency_iterator DOWN_ENT_ITER
Definition: Catalog.h:157
used by the BGL to set up the node properties in our graph
Definition: Catalog.h:140
paramType paramType_t
Definition: Catalog.h:43
virtual void setCatalogParams(const paramType *params)
sets our parameters by copying the params argument
Definition: Catalog.h:85
int getIdOfEntryWithBitId(unsigned int idx) const
returns the index of the entry with a particular bit ID
Definition: Catalog.h:383
virtual std::string Serialize() const =0
return a serialized form of the Catalog as an std::string
#define CHECK_INVARIANT(expr, mess)
Definition: Invariant.h:100
unsigned int getNumEntries() const
returns the number of entries
Definition: Catalog.h:284
boost::vertex_property_tag kind
Definition: Catalog.h:142
virtual unsigned int addEntry(entryType *entry, bool updateFPLength=true)=0
adds an entry to the catalog
void streamRead(std::istream &ss, T &loc)
does a binary read of an object from a stream
Definition: StreamOps.h:246
boost::graph_traits< CatalogGraph > CAT_GRAPH_TRAITS
Definition: Catalog.h:154
RDKIT_CHEMREACTIONS_EXPORT void pickle(const boost::shared_ptr< EnumerationStrategyBase > &enumerator, std::ostream &ss)
pickles a EnumerationStrategy and adds the results to a stream ss
const int endianId
Definition: Catalog.h:35
virtual unsigned int getNumEntries() const =0
returns the number of entries
std::pair< DOWN_ENT_ITER, DOWN_ENT_ITER > DOWN_ENT_ITER_PAIR
Definition: Catalog.h:158
a linear Catalog (analogous to an std::vector)
Definition: Catalog.h:462
A Catalog with a hierarchical structure.
Definition: Catalog.h:135
std::pair< VER_ITER, VER_ITER > ENT_ITER_PAIR
Definition: Catalog.h:156
void initFromStream(std::istream &ss)
fills the contents of this object from a stream containing a pickle
Definition: Catalog.h:236
const RDKit::INT_VECT & getEntriesOfOrder(orderType ord) const
returns a list of the indices that have a particular order
Definition: Catalog.h:423
abstract base class for a catalog object
Definition: Catalog.h:40
std::string Serialize() const
serializes this object and returns the resulting pickle
Definition: Catalog.h:227
const paramType * getCatalogParams() const
returns a pointer to our parameters
Definition: Catalog.h:101
std::vector< int > INT_VECT
Definition: types.h:253
CAT_GRAPH_TRAITS::vertex_iterator VER_ITER
Definition: Catalog.h:155
RDKit::INT_VECT getDownEntryList(unsigned int idx) const
returns a list of the indices of entries below the one passed in
Definition: Catalog.h:400
virtual ~Catalog()
Definition: Catalog.h:49
const int versionPatch
Definition: Catalog.h:34
boost::property< vertex_entry_t, entryType * > EntryProperty
Definition: Catalog.h:144
const entryType * getEntryWithIdx(unsigned int idx) const
returns a pointer to our entry with a particular index
Definition: Catalog.h:356
#define URANGE_CHECK(x, hi)
Definition: Invariant.h:141
boost::adjacency_list< boost::vecS, boost::vecS, boost::bidirectionalS, EntryProperty > CatalogGraph
the type of the graph itself:
Definition: Catalog.h:152
const int versionMajor
Definition: Catalog.h:32
void streamWrite(std::ostream &ss, const T &val)
does a binary write of an object to a stream
Definition: StreamOps.h:226
#define PRECONDITION(expr, mess)
Definition: Invariant.h:108
RDKIT_RDGENERAL_EXPORT std::ostream & toStream(std::ostream &)
unsigned int d_fpLength
the length of our fingerprint
Definition: Catalog.h:109
const entryType * getEntryWithBitId(unsigned int idx) const
returns a pointer to our entry with a particular bit ID
Definition: Catalog.h:366
paramType * dp_cParams
our params object
Definition: Catalog.h:110
const RDKit::INT_VECT & getEntriesOfOrder(orderType ord)
returns a list of the indices that have a particular order
Definition: Catalog.h:414
void addEdge(unsigned int id1, unsigned int id2)
adds an edge between two entries in the catalog
Definition: Catalog.h:337
unsigned int getFPLength() const
returns the length of our fingerprint
Definition: Catalog.h:77
unsigned int addEntry(entryType *entry, bool updateFPLength=true)
add a new entry to the catalog
Definition: Catalog.h:306
void toStream(std::ostream &ss) const
serializes this object to a stream
Definition: Catalog.h:181
void setFPLength(unsigned int val)
sets our fingerprint length
Definition: Catalog.h:81
entryType entryType_t
Definition: Catalog.h:42