RDKit
Open-source cheminformatics and machine learning.
SparseIntVect.h
Go to the documentation of this file.
1 // $Id$
2 //
3 // Copyright (C) 2007-2008 Greg Landrum
4 //
5 // @@ All Rights Reserved @@
6 // This file is part of the RDKit.
7 // The contents are covered by the terms of the BSD license
8 // which is included in the file license.txt, found at the root
9 // of the RDKit source tree.
10 //
11 #include <RDGeneral/export.h>
12 #ifndef __RD_SPARSE_INT_VECT_20070921__
13 #define __RD_SPARSE_INT_VECT_20070921__
14 
15 #include <map>
16 #include <string>
17 #include <RDGeneral/Invariant.h>
18 #include <sstream>
19 #include <RDGeneral/Exceptions.h>
20 #include <RDGeneral/StreamOps.h>
21 #include <cstdint>
22 
24  0x0001; //!< version number to use in pickles
25 namespace RDKit {
26 //! a class for efficiently storing sparse vectors of ints
27 template <typename IndexType>
29  public:
30  typedef std::map<IndexType, int> StorageType;
31 
32  SparseIntVect() : d_length(0){};
33 
34  //! initialize with a particular length
35  SparseIntVect(IndexType length) : d_length(length){};
36 
37  //! Copy constructor
39  d_length = other.d_length;
40  d_data.insert(other.d_data.begin(), other.d_data.end());
41  }
42 
43  //! constructor from a pickle
44  SparseIntVect(const std::string &pkl) {
45  initFromText(pkl.c_str(), pkl.size());
46  };
47  //! constructor from a pickle
48  SparseIntVect(const char *pkl, const unsigned int len) {
49  initFromText(pkl, len);
50  };
51 
53  if (this == &other) {
54  return *this;
55  }
56  d_length = other.d_length;
57  d_data.insert(other.d_data.begin(), other.d_data.end());
58  return *this;
59  }
60 
61  //! destructor (doesn't need to do anything)
63 
64 #ifdef __clang__
65 #pragma clang diagnostic push
66 #pragma clang diagnostic ignored "-Wtautological-compare"
67 #elif (defined(__GNUC__) || defined(__GNUG__)) && \
68  (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 1))
69 #if (__GNUC__ > 4 || __GNUC_MINOR__ > 5)
70 #pragma GCC diagnostic push
71 #endif
72 #pragma GCC diagnostic ignored "-Wtype-limits"
73 #endif
74  //! return the value at an index
75  int getVal(IndexType idx) const {
76  if (idx < 0 || idx >= d_length) {
77  throw IndexErrorException(static_cast<int>(idx));
78  }
79  int res = 0;
80  typename StorageType::const_iterator iter = d_data.find(idx);
81  if (iter != d_data.end()) {
82  res = iter->second;
83  }
84  return res;
85  };
86 
87  //! set the value at an index
88  void setVal(IndexType idx, int val) {
89  if (idx < 0 || idx >= d_length) {
90  throw IndexErrorException(static_cast<int>(idx));
91  }
92  if (val != 0) {
93  d_data[idx] = val;
94  } else {
95  d_data.erase(idx);
96  }
97  };
98 #ifdef __clang__
99 #pragma clang diagnostic pop
100 #elif (defined(__GNUC__) || defined(__GNUG__)) && \
101  (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 5))
102 #pragma GCC diagnostic pop
103 #endif
104  //! support indexing using []
105  int operator[](IndexType idx) const { return getVal(idx); };
106 
107  //! returns the length
108  IndexType getLength() const { return d_length; };
109 
110  //! returns the sum of all the elements in the vect
111  //! the doAbs argument toggles summing the absolute values of the elements
112  int getTotalVal(bool doAbs = false) const {
113  int res = 0;
114  typename StorageType::const_iterator iter;
115  for (iter = d_data.begin(); iter != d_data.end(); ++iter) {
116  if (!doAbs)
117  res += iter->second;
118  else
119  res += abs(iter->second);
120  }
121  return res;
122  };
123  //! returns the length
124  unsigned int size() const { return getLength(); };
125 
126  //! returns our nonzero elements as a map(IndexType->int)
127  const StorageType &getNonzeroElements() const { return d_data; }
128 
129  //! this is a "fuzzy" intesection, the final value
130  //! of each element is equal to the minimum from
131  //! the two vects.
133  if (other.d_length != d_length) {
134  throw ValueErrorException("SparseIntVect size mismatch");
135  }
136 
137  typename StorageType::iterator iter = d_data.begin();
138  typename StorageType::const_iterator oIter = other.d_data.begin();
139  while (iter != d_data.end()) {
140  // we're relying on the fact that the maps are sorted:
141  while (oIter != other.d_data.end() && oIter->first < iter->first) {
142  ++oIter;
143  }
144  if (oIter != other.d_data.end() && oIter->first == iter->first) {
145  // found it:
146  if (oIter->second < iter->second) {
147  iter->second = oIter->second;
148  }
149  ++oIter;
150  ++iter;
151  } else {
152  // not there; our value is zero, which means
153  // we should remove this value:
154  typename StorageType::iterator tmpIter = iter;
155  ++tmpIter;
156  d_data.erase(iter);
157  iter = tmpIter;
158  }
159  }
160  return *this;
161  };
163  const SparseIntVect<IndexType> &other) const {
164  SparseIntVect<IndexType> res(*this);
165  return res &= other;
166  }
167 
168  //! this is a "fuzzy" union, the final value
169  //! of each element is equal to the maximum from
170  //! the two vects.
172  if (other.d_length != d_length) {
173  throw ValueErrorException("SparseIntVect size mismatch");
174  }
175 
176  typename StorageType::iterator iter = d_data.begin();
177  typename StorageType::const_iterator oIter = other.d_data.begin();
178  while (iter != d_data.end()) {
179  // we're relying on the fact that the maps are sorted:
180  while (oIter != other.d_data.end() && oIter->first < iter->first) {
181  d_data[oIter->first] = oIter->second;
182  ++oIter;
183  }
184  if (oIter != other.d_data.end() && oIter->first == iter->first) {
185  // found it:
186  if (oIter->second > iter->second) {
187  iter->second = oIter->second;
188  }
189  ++oIter;
190  }
191  ++iter;
192  }
193  // finish up the other vect:
194  while (oIter != other.d_data.end()) {
195  d_data[oIter->first] = oIter->second;
196  ++oIter;
197  }
198  return *this;
199  };
201  const SparseIntVect<IndexType> &other) const {
202  SparseIntVect<IndexType> res(*this);
203  return res |= other;
204  }
205 
207  if (other.d_length != d_length) {
208  throw ValueErrorException("SparseIntVect size mismatch");
209  }
210  typename StorageType::iterator iter = d_data.begin();
211  typename StorageType::const_iterator oIter = other.d_data.begin();
212  while (oIter != other.d_data.end()) {
213  while (iter != d_data.end() && iter->first < oIter->first) {
214  ++iter;
215  }
216  if (iter != d_data.end() && oIter->first == iter->first) {
217  // found it:
218  iter->second += oIter->second;
219  if (!iter->second) {
220  typename StorageType::iterator tIter = iter;
221  ++tIter;
222  d_data.erase(iter);
223  iter = tIter;
224  } else {
225  ++iter;
226  }
227  } else {
228  d_data[oIter->first] = oIter->second;
229  }
230  ++oIter;
231  }
232  return *this;
233  };
235  const SparseIntVect<IndexType> &other) const {
236  SparseIntVect<IndexType> res(*this);
237  return res += other;
238  }
239 
241  if (other.d_length != d_length) {
242  throw ValueErrorException("SparseIntVect size mismatch");
243  }
244  typename StorageType::iterator iter = d_data.begin();
245  typename StorageType::const_iterator oIter = other.d_data.begin();
246  while (oIter != other.d_data.end()) {
247  while (iter != d_data.end() && iter->first < oIter->first) {
248  ++iter;
249  }
250  if (iter != d_data.end() && oIter->first == iter->first) {
251  // found it:
252  iter->second -= oIter->second;
253  if (!iter->second) {
254  typename StorageType::iterator tIter = iter;
255  ++tIter;
256  d_data.erase(iter);
257  iter = tIter;
258  } else {
259  ++iter;
260  }
261  } else {
262  d_data[oIter->first] = -oIter->second;
263  }
264  ++oIter;
265  }
266  return *this;
267  };
269  const SparseIntVect<IndexType> &other) const {
270  SparseIntVect<IndexType> res(*this);
271  return res -= other;
272  }
274  typename StorageType::iterator iter = d_data.begin();
275  while (iter != d_data.end()) {
276  iter->second *= v;
277  ++iter;
278  }
279  return *this;
280  };
282  SparseIntVect<IndexType> res(*this);
283  return res *= v;
284  };
286  typename StorageType::iterator iter = d_data.begin();
287  while (iter != d_data.end()) {
288  iter->second /= v;
289  ++iter;
290  }
291  return *this;
292  };
294  SparseIntVect<IndexType> res(*this);
295  return res /= v;
296  };
298  typename StorageType::iterator iter = d_data.begin();
299  while (iter != d_data.end()) {
300  iter->second += v;
301  ++iter;
302  }
303  return *this;
304  };
306  SparseIntVect<IndexType> res(*this);
307  return res += v;
308  };
310  typename StorageType::iterator iter = d_data.begin();
311  while (iter != d_data.end()) {
312  iter->second -= v;
313  ++iter;
314  }
315  return *this;
316  };
318  SparseIntVect<IndexType> res(*this);
319  return res -= v;
320  };
321 
322  bool operator==(const SparseIntVect<IndexType> &v2) const {
323  if (d_length != v2.d_length) {
324  return false;
325  }
326  return d_data == v2.d_data;
327  }
328  bool operator!=(const SparseIntVect<IndexType> &v2) const {
329  return !(*this == v2);
330  }
331 
332  //! returns a binary string representation (pickle)
333  std::string toString() const {
334  std::stringstream ss(std::ios_base::binary | std::ios_base::out |
335  std::ios_base::in);
336  std::uint32_t tInt;
338  streamWrite(ss, tInt);
339  tInt = sizeof(IndexType);
340  streamWrite(ss, tInt);
341  streamWrite(ss, d_length);
342  IndexType nEntries = d_data.size();
343  streamWrite(ss, nEntries);
344 
345  typename StorageType::const_iterator iter = d_data.begin();
346  while (iter != d_data.end()) {
347  streamWrite(ss, iter->first);
348  std::int32_t tInt = iter->second;
349  streamWrite(ss, tInt);
350  ++iter;
351  }
352  return ss.str();
353  };
354 
355  void fromString(const std::string &txt) {
356  initFromText(txt.c_str(), txt.length());
357  }
358 
359  private:
360  IndexType d_length;
361  StorageType d_data;
362 
363  void initFromText(const char *pkl, const unsigned int len) {
364  d_data.clear();
365  std::stringstream ss(std::ios_base::binary | std::ios_base::out |
366  std::ios_base::in);
367  ss.write(pkl, len);
368 
369  std::uint32_t vers;
370  streamRead(ss, vers);
371  if (vers == 0x0001) {
372  std::uint32_t tInt;
373  streamRead(ss, tInt);
374  if (tInt > sizeof(IndexType)) {
375  throw ValueErrorException(
376  "IndexType cannot accommodate index size in SparseIntVect pickle");
377  }
378  switch (tInt) {
379  case sizeof(char):
380  readVals<unsigned char>(ss);
381  break;
382  case sizeof(std::int32_t):
383  readVals<std::uint32_t>(ss);
384  break;
385  case sizeof(boost::int64_t):
386  readVals<boost::uint64_t>(ss);
387  break;
388  default:
389  throw ValueErrorException("unreadable format");
390  }
391  } else {
392  throw ValueErrorException("bad version in SparseIntVect pickle");
393  }
394  };
395  template <typename T>
396  void readVals(std::stringstream &ss) {
397  PRECONDITION(sizeof(T) <= sizeof(IndexType), "invalid size");
398  T tVal;
399  streamRead(ss, tVal);
400  d_length = tVal;
401  T nEntries;
402  streamRead(ss, nEntries);
403  for (T i = 0; i < nEntries; ++i) {
404  streamRead(ss, tVal);
405  std::int32_t val;
406  streamRead(ss, val);
407  d_data[tVal] = val;
408  }
409  }
410 };
411 
412 template <typename IndexType, typename SequenceType>
414  const SequenceType &seq) {
415  typename SequenceType::const_iterator seqIt;
416  for (seqIt = seq.begin(); seqIt != seq.end(); ++seqIt) {
417  // EFF: probably not the most efficient approach
418  IndexType idx = *seqIt;
419  vect.setVal(idx, vect.getVal(idx) + 1);
420  }
421 }
422 
423 namespace {
424 template <typename IndexType>
425 void calcVectParams(const SparseIntVect<IndexType> &v1,
426  const SparseIntVect<IndexType> &v2, double &v1Sum,
427  double &v2Sum, double &andSum) {
428  if (v1.getLength() != v2.getLength()) {
429  throw ValueErrorException("SparseIntVect size mismatch");
430  }
431  v1Sum = v2Sum = andSum = 0.0;
432  // we're doing : (v1&v2).getTotalVal(), but w/o generating
433  // the other vector:
434  typename SparseIntVect<IndexType>::StorageType::const_iterator iter1, iter2;
435  iter1 = v1.getNonzeroElements().begin();
436  if (iter1 != v1.getNonzeroElements().end()) v1Sum += abs(iter1->second);
437  iter2 = v2.getNonzeroElements().begin();
438  if (iter2 != v2.getNonzeroElements().end()) v2Sum += abs(iter2->second);
439  while (iter1 != v1.getNonzeroElements().end()) {
440  while (iter2 != v2.getNonzeroElements().end() &&
441  iter2->first < iter1->first) {
442  ++iter2;
443  if (iter2 != v2.getNonzeroElements().end()) v2Sum += abs(iter2->second);
444  }
445  if (iter2 != v2.getNonzeroElements().end()) {
446  if (iter2->first == iter1->first) {
447  if (abs(iter2->second) < abs(iter1->second)) {
448  andSum += abs(iter2->second);
449  } else {
450  andSum += abs(iter1->second);
451  }
452  ++iter2;
453  if (iter2 != v2.getNonzeroElements().end()) v2Sum += abs(iter2->second);
454  }
455  ++iter1;
456  if (iter1 != v1.getNonzeroElements().end()) v1Sum += abs(iter1->second);
457  } else {
458  break;
459  }
460  }
461  if (iter1 != v1.getNonzeroElements().end()) {
462  ++iter1;
463  while (iter1 != v1.getNonzeroElements().end()) {
464  v1Sum += abs(iter1->second);
465  ++iter1;
466  }
467  }
468  if (iter2 != v2.getNonzeroElements().end()) {
469  ++iter2;
470  while (iter2 != v2.getNonzeroElements().end()) {
471  v2Sum += abs(iter2->second);
472  ++iter2;
473  }
474  }
475 }
476 } // namespace
477 
478 template <typename IndexType>
480  const SparseIntVect<IndexType> &v2,
481  bool returnDistance = false, double bounds = 0.0) {
482  if (v1.getLength() != v2.getLength()) {
483  throw ValueErrorException("SparseIntVect size mismatch");
484  }
485  double v1Sum = 0.0;
486  double v2Sum = 0.0;
487  if (!returnDistance && bounds > 0.0) {
488  v1Sum = v1.getTotalVal(true);
489  v2Sum = v2.getTotalVal(true);
490  double denom = v1Sum + v2Sum;
491  if (fabs(denom) < 1e-6) {
492  // no need to worry about returnDistance here
493  return 0.0;
494  }
495  double minV = v1Sum < v2Sum ? v1Sum : v2Sum;
496  if (2. * minV / denom < bounds) {
497  return 0.0;
498  }
499  v1Sum = 0.0;
500  v2Sum = 0.0;
501  }
502 
503  double numer = 0.0;
504 
505  calcVectParams(v1, v2, v1Sum, v2Sum, numer);
506 
507  double denom = v1Sum + v2Sum;
508  double sim;
509  if (fabs(denom) < 1e-6) {
510  sim = 0.0;
511  } else {
512  sim = 2. * numer / denom;
513  }
514  if (returnDistance) sim = 1. - sim;
515  // std::cerr<<" "<<v1Sum<<" "<<v2Sum<<" " << numer << " " << sim <<std::endl;
516  return sim;
517 }
518 
519 template <typename IndexType>
521  const SparseIntVect<IndexType> &v2, double a, double b,
522  bool returnDistance = false, double bounds = 0.0) {
523  RDUNUSED_PARAM(bounds);
524  if (v1.getLength() != v2.getLength()) {
525  throw ValueErrorException("SparseIntVect size mismatch");
526  }
527  double v1Sum = 0.0;
528  double v2Sum = 0.0;
529  double andSum = 0.0;
530 
531  calcVectParams(v1, v2, v1Sum, v2Sum, andSum);
532 
533  double denom = a * v1Sum + b * v2Sum + (1 - a - b) * andSum;
534  double sim;
535 
536  if (fabs(denom) < 1e-6) {
537  sim = 0.0;
538  } else {
539  sim = andSum / denom;
540  }
541  if (returnDistance) sim = 1. - sim;
542  // std::cerr<<" "<<v1Sum<<" "<<v2Sum<<" " << numer << " " << sim <<std::endl;
543  return sim;
544 }
545 
546 template <typename IndexType>
548  const SparseIntVect<IndexType> &v2,
549  bool returnDistance = false, double bounds = 0.0) {
550  return TverskySimilarity(v1, v2, 1.0, 1.0, returnDistance, bounds);
551 }
552 } // namespace RDKit
553 
554 #endif
#define RDUNUSED_PARAM(x)
Definition: Invariant.h:196
#define PRECONDITION(expr, mess)
Definition: Invariant.h:109
const int ci_SPARSEINTVECT_VERSION
version number to use in pickles
Definition: SparseIntVect.h:23
Class to allow us to throw an IndexError from C++ and have it make it back to Python.
Definition: Exceptions.h:19
a class for efficiently storing sparse vectors of ints
Definition: SparseIntVect.h:28
SparseIntVect< IndexType > & operator+=(int v)
SparseIntVect< IndexType > & operator/(int v)
SparseIntVect(IndexType length)
initialize with a particular length
Definition: SparseIntVect.h:35
unsigned int size() const
returns the length
const SparseIntVect< IndexType > operator+(const SparseIntVect< IndexType > &other) const
SparseIntVect< IndexType > & operator*(int v)
SparseIntVect< IndexType > & operator+=(const SparseIntVect< IndexType > &other)
bool operator==(const SparseIntVect< IndexType > &v2) const
SparseIntVect(const SparseIntVect< IndexType > &other)
Copy constructor.
Definition: SparseIntVect.h:38
SparseIntVect< IndexType > & operator|=(const SparseIntVect< IndexType > &other)
const SparseIntVect< IndexType > operator-(const SparseIntVect< IndexType > &other) const
const SparseIntVect< IndexType > operator|(const SparseIntVect< IndexType > &other) const
SparseIntVect< IndexType > & operator/=(int v)
const SparseIntVect< IndexType > operator&(const SparseIntVect< IndexType > &other) const
SparseIntVect(const char *pkl, const unsigned int len)
constructor from a pickle
Definition: SparseIntVect.h:48
int operator[](IndexType idx) const
support indexing using []
void fromString(const std::string &txt)
SparseIntVect< IndexType > & operator*=(int v)
void setVal(IndexType idx, int val)
set the value at an index
Definition: SparseIntVect.h:88
SparseIntVect< IndexType > & operator&=(const SparseIntVect< IndexType > &other)
SparseIntVect & operator=(const SparseIntVect< IndexType > &other)
Definition: SparseIntVect.h:52
std::string toString() const
returns a binary string representation (pickle)
int getTotalVal(bool doAbs=false) const
SparseIntVect< IndexType > & operator-(int v)
std::map< IndexType, int > StorageType
Definition: SparseIntVect.h:30
SparseIntVect< IndexType > & operator-=(const SparseIntVect< IndexType > &other)
bool operator!=(const SparseIntVect< IndexType > &v2) const
SparseIntVect< IndexType > & operator-=(int v)
SparseIntVect(const std::string &pkl)
constructor from a pickle
Definition: SparseIntVect.h:44
~SparseIntVect()
destructor (doesn't need to do anything)
Definition: SparseIntVect.h:62
SparseIntVect< IndexType > & operator+(int v)
IndexType getLength() const
returns the length
int getVal(IndexType idx) const
return the value at an index
Definition: SparseIntVect.h:75
const StorageType & getNonzeroElements() const
returns our nonzero elements as a map(IndexType->int)
Class to allow us to throw a ValueError from C++ and have it make it back to Python.
Definition: Exceptions.h:39
Std stuff.
Definition: Abbreviations.h:17
double TverskySimilarity(const SparseIntVect< IndexType > &v1, const SparseIntVect< IndexType > &v2, double a, double b, bool returnDistance=false, double bounds=0.0)
void updateFromSequence(SparseIntVect< IndexType > &vect, const SequenceType &seq)
double TanimotoSimilarity(const SparseIntVect< IndexType > &v1, const SparseIntVect< IndexType > &v2, bool returnDistance=false, double bounds=0.0)
double DiceSimilarity(const SparseIntVect< IndexType > &v1, const SparseIntVect< IndexType > &v2, bool returnDistance=false, double bounds=0.0)
void streamRead(std::istream &ss, T &loc)
does a binary read of an object from a stream
Definition: StreamOps.h:274
void streamWrite(std::ostream &ss, const T &val)
does a binary write of an object to a stream
Definition: StreamOps.h:254