RDKit
Open-source cheminformatics and machine learning.
RGroupScore.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2017 Novartis Institutes for BioMedical Research
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 #ifndef RGROUP_SCORE_H
11 #define RGROUP_SCORE_H
12 
13 #include "RGroupMatch.h"
14 #include <vector>
15 #include <deque>
16 #include <set>
17 namespace RDKit {
18 
19 //! iterate through all possible permutations of the rgroups
  // index currently selected in each row; the constructor sizes it to
  // inputSizes.size() and fills it with zeros
21  std::vector<size_t> permutation;
  // number of choices in each row (a copy of the constructor argument)
22  std::vector<size_t> sizes;
  // running products of the sizes, pushed at the front while the constructor
  // walks the rows; value() uses them as per-position weights
23  std::deque<size_t> bases;
26  CartesianProduct(const std::vector<size_t> &inputSizes)
27  : permutation(inputSizes.size(), 0),
28  sizes(inputSizes),
29  permutationCount(0) {
30  maxPermutations = 1;
31  for (unsigned long size : sizes) {
32  bases.push_front(maxPermutations);
33  maxPermutations *= size; // may overflow....
34  }
35  }
36 
  //! move on to the next permutation
  // Returns true straight away, leaving the permutation untouched, when
  // permutationCount equals 1; in every other case the result is whatever
  // increment(0) reports.
  // NOTE(review): listing line 38 is absent from this extract; it presumably
  // holds the statement that updates permutationCount - confirm against the
  // original header.
37  bool next() {
39  if (permutationCount == 1) {
40  return true;
41  }
42 
43  return increment(0);
44  }
45 
46  size_t value(const std::vector<size_t> &p) const {
47  size_t v = 0;
48  for (size_t i = 0; i < p.size(); ++i) {
49  v += bases[i] * p[i];
50  }
51  return v;
52  }
53 
54  size_t value() { return value(permutation); }
55 
  //! step the index of the given row by one, carrying into the next row
  // Returns true once a row has been stepped without running past its last
  // valid index; when the row wraps, the result is that of the recursive
  // call on the following row.
  // NOTE(review): listing line 57 is absent from this extract; it presumably
  // holds the condition guarding the early "return false" below - confirm
  // against the original header.
56  bool increment(size_t rowToIncrement) {
58  return false;
59  }
60 
  // advance this row
61  permutation[rowToIncrement] += 1;
  // last valid index of this row
62  size_t max_index_of_row = sizes[rowToIncrement] - 1;
63  if (permutation[rowToIncrement] > max_index_of_row) {
  // ran past the end: reset this row and carry into the next one
64  permutation[rowToIncrement] = 0;
65  return increment(rowToIncrement + 1);
66  }
67  return true;
68  }
69 };
70 
72  public:
  //! build a scorer from the passed permutations and score
  // (defined out of line; how the arguments are used is not visible here)
74  RGroupScorer(const std::vector<std::vector<size_t>> &permutations,
75  double score);
76  //! score the passed permutation of matches
77  double matchScore(const std::vector<size_t> &permutation,
78  const std::vector<std::vector<RGroupMatch>> &matches,
79  const std::set<int> &labels);
80  //! set the passed permutation and score as the best one
81  void setBestPermutation(const std::vector<size_t> &permutation, double score);
82  //! return the best permutation found so far
  // (this is the permutation held by the saved State, d_saved)
83  const std::vector<size_t> &getBestPermutation() const {
84  return d_saved.permutation;
85  }
86  //! called when process() starts to initialize State
  // NOTE(review): the declaration this comment documents (listing line 87)
  // is absent from this extract; the member index further down the page
  // names it as "void startProcessing()" - confirm against the original
  // header.
88  //! store the passed tied permutation for subsequent processing
89  void pushTieToStore(const std::vector<size_t> &permutation);
90  //! find the best permutation across the tied ones that were stored
  // NOTE(review): t0 and timeout presumably bound the time spent breaking
  // ties; the unit of timeout is not visible here - confirm in the
  // implementation file.
91  void breakTies(const std::vector<std::vector<RGroupMatch>> &matches,
92  const std::set<int> &labels,
93  const std::unique_ptr<CartesianProduct> &iterator,
94  const std::chrono::steady_clock::time_point &t0,
95  double timeout);
96  //! clear all stored tied permutations
97  void clearTieStore();
98  //! number of stored tied permutations
99  size_t tieStoreSize() const { return d_store.size(); }
100  //! return the best score found so far
101  double getBestScore() const { return d_bestScore; }
102 
103  private:
104  void restoreInitialState() { d_current = d_initial; }
  // Per-label bookkeeping; State keeps one of these for each label in its
  // labelDataMap.
105  struct RLabelData {
  // starts at zero; NOTE(review): presumably the number of R-groups seen at
  // this label - confirm in the implementation file
106  int numRGroups = 0;
  // a list of string -> count maps; NOTE(review): the meaning of the string
  // keys and of the list positions is not visible here - confirm
107  std::vector<std::map<std::string, unsigned int>> matchSetVect;
  // a count keyed by a set of ints; NOTE(review): presumably the sets are
  // the attachment points of linker R-groups - confirm
108  std::map<std::set<int>, size_t> linkerMatchSet;
109  };
110  // The State structure stores the state of the RGroupScorer.
111  // This allows more efficient scoring of permutations: the scores
112  // of pruned permutations, which are effectively frozen, are cached
113  // in the State rather than being recomputed on the fly, while only
114  // the permutations in the last chunk are actually scored.
115  struct State {
116  // compute the criteria according to which the best
117  // permutation is found across the tied ones
118  void computeTieBreakingCriteria(
119  const std::vector<std::vector<RGroupMatch>> &matches,
120  const std::vector<int> &orderedLabels, std::vector<int> &heavyCounts) {
121  // heavyCounts is a vector which has the same size of labels
122  // for each label we add an increment if a molecule
123  // bears an R-group at that label
124  PRECONDITION(permutation.size() <= matches.size(),
125  "permutation.size() should be <= matches.size()");
126  size_t offset = matches.size() - permutation.size();
127  // numMatchedUserRGroups counts the total number of user labelled r
128  // groups filled in this permutation. We want to maximize this number
129  size_t i = 0;
130  for (int label : orderedLabels) {
131  for (size_t m = 0; m < permutation.size(); ++m) { // for each molecule
132  // Negative labels are assigned to R-groups that were found along
133  // the way (when onlyMatchAtRGroups=false) rather than being
134  // user-specified. For each molecule, check if we add an R-group at
135  // this negative label; if we do, count it once. So we know how many
136  // different negative labels we have filled: we prefer permutations
137  // which fill less, as it means we have added less groups on different
138  // positions
139  const auto &match = matches[m + offset][permutation[m]];
140  auto rg = match.rgroups.find(label);
141  if (rg != match.rgroups.end() && !rg->second->is_hydrogen) {
142  if (label < 0 && heavyCounts.at(i) == 0) {
143  ++numAddedRGroups;
144  } else if (label > 0) {
145  ++numMatchedUserRGroups;
146  }
147  ++heavyCounts[i];
148  }
149  }
150  ++i;
151  }
152  }
153 
154  int N = 0;
155  int numAddedRGroups = 0;
156  int numMatchedUserRGroups = 0;
157  std::map<int, int> heavyCountPerLabel;
158  std::map<int, RLabelData> labelDataMap;
159  std::vector<size_t> permutation;
160  };
  // the best score found so far (what getBestScore() returns)
161  double d_bestScore = 0.0;
162  // the current State
  // (overwritten with d_initial by restoreInitialState())
163  State d_current;
164  // the initial state when process() is called
165  State d_initial;
166  // the best State found so far
  // (its permutation is what getBestPermutation() returns)
167  State d_saved;
168  // the States associated to each tied permutation
  // (tieStoreSize() reports how many are held)
169  std::deque<State> d_store;
170 };
171 
172 } // namespace RDKit
173 #endif
#define PRECONDITION(expr, mess)
Definition: Invariant.h:109
void pushTieToStore(const std::vector< size_t > &permutation)
store the passed tied permutation for subsequent processing
void startProcessing()
called when process() starts to initialize State
void setBestPermutation(const std::vector< size_t > &permutation, double score)
set the passed permutation and score as the best one
void breakTies(const std::vector< std::vector< RGroupMatch >> &matches, const std::set< int > &labels, const std::unique_ptr< CartesianProduct > &iterator, const std::chrono::steady_clock::time_point &t0, double timeout)
find the best permutation across the tied ones that were stored
void clearTieStore()
clear all stored tied permutations
RGroupScorer(const std::vector< std::vector< size_t >> &permutations, double score)
const std::vector< size_t > & getBestPermutation() const
return the best permutation found so far
Definition: RGroupScore.h:83
double matchScore(const std::vector< size_t > &permutation, const std::vector< std::vector< RGroupMatch >> &matches, const std::set< int > &labels)
score the passed permutation of matches
size_t tieStoreSize() const
number of stored tied permutations
Definition: RGroupScore.h:99
double getBestScore() const
return the best score found so far
Definition: RGroupScore.h:101
#define RDKIT_RGROUPDECOMPOSITION_EXPORT
Definition: export.h:393
Std stuff.
Definition: Abbreviations.h:18
iterate through all possible permutations of the rgroups
Definition: RGroupScore.h:20
std::vector< size_t > sizes
Definition: RGroupScore.h:22
std::deque< size_t > bases
Definition: RGroupScore.h:23
size_t value(const std::vector< size_t > &p) const
Definition: RGroupScore.h:46
CartesianProduct(const std::vector< size_t > &inputSizes)
Definition: RGroupScore.h:26
bool increment(size_t rowToIncrement)
Definition: RGroupScore.h:56
std::vector< size_t > permutation
Definition: RGroupScore.h:21