IsoSpec  1.95
marginalTrek++.cpp
1 /*
2  * Copyright (C) 2015-2019 Mateusz Łącki and Michał Startek.
3  *
4  * This file is part of IsoSpec.
5  *
6  * IsoSpec is free software: you can redistribute it and/or modify
7  * it under the terms of the Simplified ("2-clause") BSD licence.
8  *
9  * IsoSpec is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12  *
13  * You should have received a copy of the Simplified BSD Licence
14  * along with IsoSpec. If not, see <https://opensource.org/licenses/BSD-2-Clause>.
15  */
16 
17 
18 #include <cmath>
19 #include <algorithm>
20 #include <vector>
21 #include <stdlib.h>
22 #include <tuple>
23 #include <unordered_map>
24 #include <unordered_set>
25 #include <queue>
26 #include <utility>
27 #include <iostream>
28 #include <string.h>
29 #include <string>
30 #include <limits>
31 #include <cstdlib>
32 #include <fenv.h>
33 #include "platform.h"
34 #include "marginalTrek++.h"
35 #include "conf.h"
36 #include "allocator.h"
37 #include "operators.h"
38 #include "summator.h"
39 #include "element_tables.h"
40 #include "misc.h"
41 
42 
43 namespace IsoSpec
44 {
45 
47 
55 Conf initialConfigure(const int atomCnt, const int isotopeNo, const double* probs, const double* lprobs)
56 {
61  Conf res = new int[isotopeNo];
62 
63  // This approximates the mode (heuristics: the mean is close to the mode).
64  for(int i = 0; i < isotopeNo; ++i )
65  res[i] = int( atomCnt * probs[i] ) + 1;
66 
67  // The number of assigned atoms above.
68  int s = 0;
69 
70  for(int i = 0; i < isotopeNo; ++i) s += res[i];
71 
72  int diff = atomCnt - s;
73 
74  // Too little: enlarging fist index.
75  if( diff > 0 ){
76  res[0] += diff;
77  }
78  // Too much: trying to redistribute the difference: hopefully the first element is the largest.
79  if( diff < 0 ){
80  diff = abs(diff);
81  int i = 0, coordDiff = 0;
82 
83  while( diff > 0){
84  coordDiff = res[i] - diff;
85 
86  if( coordDiff >= 0 ){
87  res[i] -= diff;
88  diff = 0;
89  } else {
90  res[i] = 0;
91  i++;
92  diff = abs(coordDiff);
93  }
94  }
95  }
96 
97  // What we computed so far will be very close to the mode: hillclimb the rest of the way
98 
99  bool modified = true;
100  double LP = unnormalized_logProb(res, lprobs, isotopeNo);
101  double NLP;
102 
103  while(modified)
104  {
105  modified = false;
106  for(int ii = 0; ii<isotopeNo; ii++)
107  for(int jj = 0; jj<isotopeNo; jj++)
108  if(ii != jj && res[ii] > 0)
109  {
110  res[ii]--;
111  res[jj]++;
112  NLP = unnormalized_logProb(res, lprobs, isotopeNo);
113  if(NLP>LP || (NLP==LP && ii>jj))
114  {
115  modified = true;
116  LP = NLP;
117  }
118  else
119  {
120  res[ii]++;
121  res[jj]--;
122  }
123  }
124 
125 
126  }
127  return res;
128 }
129 
130 
131 
132 #if !ISOSPEC_BUILDING_R
133 void printMarginal( const std::tuple<double*,double*,int*,int>& results, int dim)
134 {
135  for(int i=0; i<std::get<3>(results); i++){
136 
137  std::cout << "Mass = " << std::get<0>(results)[i] <<
138  " log-prob =\t" << std::get<1>(results)[i] <<
139  " prob =\t" << exp(std::get<1>(results)[i]) <<
140  "\tand configuration =\t";
141 
142  for(int j=0; j<dim; j++) std::cout << std::get<2>(results)[i*dim + j] << " ";
143 
144  std::cout << std::endl;
145  }
146 }
147 #endif
148 
149 
150 double* getMLogProbs(const double* probs, int isoNo)
151 {
157  for(int ii = 0; ii < isoNo; ii++)
158  if(probs[ii] <= 0.0 || probs[ii] > 1.0)
159  throw std::invalid_argument("All isotope probabilities p must fulfill: 0.0 < p <= 1.0");
160 
161  int curr_method = fegetround();
162  fesetround(FE_UPWARD);
163  double* ret = new double[isoNo];
164 
165  // here we change the table of probabilities and log it.
166  for(int i = 0; i < isoNo; i++)
167  {
168  ret[i] = log(probs[i]);
169  for(int j=0; j<ISOSPEC_NUMBER_OF_ISOTOPIC_ENTRIES; j++)
170  if(elem_table_probability[j] == probs[i])
171  {
172  ret[i] = elem_table_log_probability[j];
173  break;
174  }
175  }
176  fesetround(curr_method);
177  return ret;
178 }
179 
180 double get_loggamma_nominator(int x)
181 {
182  // calculate log gamma of the nominator calculated in the binomial exression.
183  int curr_method = fegetround();
184  fesetround(FE_UPWARD);
185  double ret = lgamma(x+1);
186  fesetround(curr_method);
187  return ret;
188 }
189 
190 int verify_atom_cnt(int atomCnt)
191 {
192  if(ISOSPEC_G_FACT_TABLE_SIZE-1 <= atomCnt)
193  throw std::length_error("Subisotopologue too large, size limit (that is, the maximum number of atoms of a single element in a molecule) is: " + std::to_string(ISOSPEC_G_FACT_TABLE_SIZE-1));
194  return atomCnt;
195 }
196 
198  const double* _masses,
199  const double* _probs,
200  int _isotopeNo,
201  int _atomCnt
202 ) :
203 disowned(false),
204 isotopeNo(_isotopeNo),
205 atomCnt(verify_atom_cnt(_atomCnt)),
206 atom_lProbs(getMLogProbs(_probs, isotopeNo)),
207 atom_masses(array_copy<double>(_masses, _isotopeNo)),
208 loggamma_nominator(get_loggamma_nominator(_atomCnt)),
209 mode_conf(initialConfigure(atomCnt, isotopeNo, _probs, atom_lProbs)),
210 mode_lprob(loggamma_nominator+unnormalized_logProb(mode_conf, atom_lProbs, isotopeNo)),
211 mode_mass(mass(mode_conf, atom_masses, isotopeNo)),
212 mode_prob(exp(mode_lprob)),
213 smallest_lprob(atomCnt * *std::min_element(atom_lProbs, atom_lProbs+isotopeNo))
214 {}
215 
216 // the move-constructor: used in the specialization of the marginal.
218 disowned(other.disowned),
219 isotopeNo(other.isotopeNo),
220 atomCnt(other.atomCnt),
221 atom_lProbs(other.atom_lProbs),
222 atom_masses(other.atom_masses),
223 loggamma_nominator(other.loggamma_nominator),
224 mode_conf(other.mode_conf),
225 mode_lprob(other.mode_lprob),
226 mode_mass(other.mode_mass),
227 mode_prob(other.mode_prob),
228 smallest_lprob(other.smallest_lprob)
229 {
230  other.disowned = true;
231 }
232 
234 {
235  if(!disowned)
236  {
237  delete[] atom_masses;
238  delete[] atom_lProbs;
239  delete[] mode_conf;
240  }
241 }
242 
243 
245 {
246  double ret_mass = std::numeric_limits<double>::infinity();
247  for(unsigned int ii=0; ii < isotopeNo; ii++)
248  if( ret_mass > atom_masses[ii] )
249  ret_mass = atom_masses[ii];
250  return ret_mass*atomCnt;
251 }
252 
254 {
255  double ret_mass = 0.0;
256  for(unsigned int ii=0; ii < isotopeNo; ii++)
257  if( ret_mass < atom_masses[ii] )
258  ret_mass = atom_masses[ii];
259  return ret_mass*atomCnt;
260 }
261 
263 {
264  double found_prob = -std::numeric_limits<double>::infinity();
265  double found_mass = 0.0; // to avoid uninitialized var warning
266  for(unsigned int ii=0; ii < isotopeNo; ii++)
267  if( found_prob < atom_lProbs[ii] )
268  {
269  found_prob = atom_lProbs[ii];
270  found_mass = atom_masses[ii];
271  }
272  return found_mass*atomCnt;
273 }
274 
276 {
277  double ret = 0.0;
278  for(unsigned int ii = 0; ii < isotopeNo; ii++)
279  ret += exp(atom_lProbs[ii]) * atom_masses[ii];
280  return ret * atomCnt;
281 }
282 
283 // this is roughly an equivalent of IsoSpec-Threshold-Generator
285  Marginal&& m,
286  int tabSize,
287  int hashSize
288 ) :
289 Marginal(std::move(m)),
290 current_count(0),
291 keyHasher(isotopeNo),
292 equalizer(isotopeNo),
293 orderMarginal(atom_lProbs, isotopeNo),
294 visited(hashSize,keyHasher,equalizer),
295 pq(orderMarginal),
296 totalProb(),
297 candidate(new int[isotopeNo]),
298 allocator(isotopeNo, tabSize)
299 {
300  int* initialConf = allocator.makeCopy(mode_conf);
301 
302  pq.push(initialConf);
303  visited[initialConf] = 0;
304 
305  totalProb = Summator();
306 
307  current_count = 0;
308 
309  add_next_conf();
310 }
311 
312 
313 bool MarginalTrek::add_next_conf()
314 {
319  if(pq.size() < 1) return false;
320 
321  Conf topConf = pq.top();
322  pq.pop();
323  ++current_count;
324  visited[topConf] = current_count;
325 
326  _confs.push_back(topConf);
327  _conf_masses.push_back(mass(topConf, atom_masses, isotopeNo));
328  double logprob = logProb(topConf);
329  _conf_lprobs.push_back(logprob);
330 
331 
332  totalProb.add( exp( logprob ) );
333 
334  for( unsigned int i = 0; i < isotopeNo; ++i )
335  {
336  for( unsigned int j = 0; j < isotopeNo; ++j )
337  {
338  // Growing index different than decreasing one AND
339  // Remain on simplex condition.
340  if( i != j && topConf[j] > 0 ){
341  copyConf(topConf, candidate, isotopeNo);
342 
343  ++candidate[i];
344  --candidate[j];
345 
346  // candidate should not have been already visited.
347  if( visited.count( candidate ) == 0 )
348  {
349  Conf acceptedCandidate = allocator.makeCopy(candidate);
350  pq.push(acceptedCandidate);
351 
352  visited[acceptedCandidate] = 0;
353  }
354  }
355  }
356  }
357 
358  return true;
359 }
360 
362 {
363  Summator s;
364  int last_idx = -1;
365  for(unsigned int i=0; i<_conf_lprobs.size(); i++)
366  {
367  s.add(_conf_lprobs[i]);
368  if(s.get() >= cutoff)
369  {
370  last_idx = i;
371  break;
372  }
373  }
374  if(last_idx > -1)
375  return last_idx;
376 
377  while(totalProb.get() < cutoff && add_next_conf()) {}
378  return _conf_lprobs.size();
379 }
380 
381 
382 MarginalTrek::~MarginalTrek()
383 {
384  delete[] candidate;
385 }
386 
387 
388 
389 
391  double lCutOff,
392  bool sort,
393  int tabSize,
394  int hashSize
395 ) : Marginal(std::move(m)),
396 allocator(isotopeNo, tabSize)
397 {
398  const ConfEqual equalizer(isotopeNo);
399  const KeyHasher keyHasher(isotopeNo);
400  const ConfOrderMarginalDescending orderMarginal(atom_lProbs, isotopeNo);
401 
402  std::unordered_set<Conf,KeyHasher,ConfEqual> visited(hashSize,keyHasher,equalizer);
403 
404  Conf currentConf = allocator.makeCopy(mode_conf);
405  if(logProb(currentConf) >= lCutOff)
406  {
407  // create a copy and store a ptr to the *same* copy in both structures
408  // (save some space and time)
409  auto tmp = allocator.makeCopy(currentConf);
410  configurations.push_back(tmp);
411  visited.insert(tmp);
412  }
413 
414  unsigned int idx = 0;
415 
416  while(idx < configurations.size())
417  {
418  memcpy(currentConf, configurations[idx], sizeof(int)*isotopeNo);
419  idx++;
420  for(unsigned int ii = 0; ii < isotopeNo; ii++ )
421  for(unsigned int jj = 0; jj < isotopeNo; jj++ )
422  if( ii != jj && currentConf[jj] > 0)
423  {
424  currentConf[ii]++;
425  currentConf[jj]--;
426 
427  if (visited.count(currentConf) == 0 && logProb(currentConf) >= lCutOff)
428  {
429  // create a copy and store a ptr to the *same* copy in
430  // both structures (save some space and time)
431  auto tmp = allocator.makeCopy(currentConf);
432  visited.insert(tmp);
433  configurations.push_back(tmp);
434  // std::cout << " V: "; for (auto it : visited) std::cout << it << " "; std::cout << std::endl;
435  }
436 
437  currentConf[ii]--;
438  currentConf[jj]++;
439 
440  }
441  }
442 
443  // orderMarginal defines the order of configurations (compares their logprobs)
444  // akin to key in Python sort.
445  if(sort)
446  std::sort(configurations.begin(), configurations.end(), orderMarginal);
447 
448 
449  confs = &configurations[0];
450  no_confs = configurations.size();
451  lProbs = new double[no_confs+1];
452  probs = new double[no_confs];
453  masses = new double[no_confs];
454 
455 
456  for(unsigned int ii=0; ii < no_confs; ii++)
457  {
458  lProbs[ii] = logProb(confs[ii]);
459  probs[ii] = exp(lProbs[ii]);
460  masses[ii] = mass(confs[ii], atom_masses, isotopeNo);
461  }
462  lProbs[no_confs] = -std::numeric_limits<double>::infinity();
463 }
464 
465 
467 {
468  if(lProbs != nullptr)
469  delete[] lProbs;
470  if(masses != nullptr)
471  delete[] masses;
472  if(probs != nullptr)
473  delete[] probs;
474 }
475 
476 
477 
478 
479 
480 
481 
482 LayeredMarginal::LayeredMarginal(Marginal&& m, int tabSize, int _hashSize)
483 : Marginal(std::move(m)), current_threshold(1.0), allocator(isotopeNo, tabSize), sorted_up_to_idx(0),
484 equalizer(isotopeNo), keyHasher(isotopeNo), orderMarginal(atom_lProbs, isotopeNo), hashSize(_hashSize)
485 {
486  fringe.push_back(mode_conf);
487  lProbs.push_back(std::numeric_limits<double>::infinity());
488  lProbs.push_back(-std::numeric_limits<double>::infinity());
489  guarded_lProbs = lProbs.data()+1;
490 }
491 
492 bool LayeredMarginal::extend(double new_threshold)
493 {
494  if(fringe.empty())
495  return false;
496 
497  // TODO: Make sorting optional (controlled by argument?)
498  std::vector<Conf> new_fringe;
499  std::unordered_set<Conf,KeyHasher,ConfEqual> visited(hashSize,keyHasher,equalizer);
500 
501  for(unsigned int ii = 0; ii<fringe.size(); ii++)
502  visited.insert(fringe[ii]);
503 
504  double lpc, opc;
505 
506  Conf currentConf;
507  while(!fringe.empty())
508  {
509  currentConf = fringe.back();
510  fringe.pop_back();
511 
512  opc = logProb(currentConf);
513 
514  if(opc < new_threshold)
515  new_fringe.push_back(currentConf);
516 
517  else
518  {
519  configurations.push_back(currentConf);
520  for(unsigned int ii = 0; ii < isotopeNo; ii++ )
521  for(unsigned int jj = 0; jj < isotopeNo; jj++ )
522  if( ii != jj && currentConf[jj] > 0 )
523  {
524  currentConf[ii]++;
525  currentConf[jj]--;
526 
527  lpc = logProb(currentConf);
528 
529  if (visited.count(currentConf) == 0 && lpc < current_threshold &&
530  (opc > lpc || (opc == lpc && ii > jj)))
531  {
532  Conf nc = allocator.makeCopy(currentConf);
533  currentConf[ii]--;
534  currentConf[jj]++;
535  visited.insert(nc);
536  if(lpc >= new_threshold)
537  fringe.push_back(nc);
538  else
539  new_fringe.push_back(nc);
540  }
541  else
542  {
543  currentConf[ii]--;
544  currentConf[jj]++;
545  }
546 
547  }
548  }
549  }
550 
551  current_threshold = new_threshold;
552  fringe.swap(new_fringe);
553 
554  std::sort(configurations.begin()+sorted_up_to_idx, configurations.end(), orderMarginal);
555 
556  if(lProbs.capacity() * 2 < configurations.size() + 2)
557  {
558  // Reserve space for new values
559  lProbs.reserve(configurations.size()+2);
560  probs.reserve(configurations.size());
561  masses.reserve(configurations.size());
562  } // Otherwise we're growing slowly enough that standard reallocations on push_back work better - we waste some extra memory
563  // but don't reallocate on every call
564 
565  lProbs.pop_back(); // The guardian...
566 
567  for(unsigned int ii=sorted_up_to_idx; ii < configurations.size(); ii++)
568  {
569  lProbs.push_back(logProb(configurations[ii]));
570  probs.push_back(exp(lProbs.back()));
571  masses.push_back(mass(configurations[ii], atom_masses, isotopeNo));
572  }
573 
574  lProbs.push_back(-std::numeric_limits<double>::infinity()); // Restore guardian
575 
576  sorted_up_to_idx = configurations.size();
577  guarded_lProbs = lProbs.data()+1; // Vector might have reallocated its backing storage
578 
579  return true;
580 }
581 
582 
583 
584 } // namespace IsoSpec
585 
IsoSpec::Marginal::getMonoisotopicConfMass
double getMonoisotopicConfMass() const
Get the mass of the monoisotopic subisotopologue.
Definition: marginalTrek++.cpp:262
IsoSpec::PrecalculatedMarginal::PrecalculatedMarginal
PrecalculatedMarginal(Marginal &&m, double lCutOff, bool sort=true, int tabSize=1000, int hashSize=1000)
The move constructor (disowns the Marginal).
Definition: marginalTrek++.cpp:390
IsoSpec::Marginal::Marginal
Marginal(const double *_masses, const double *_probs, int _isotopeNo, int _atomCnt)
Class constructor.
Definition: marginalTrek++.cpp:197
IsoSpec::Marginal
The marginal distribution class (a subisotopologue).
Definition: marginalTrek++.h:44
IsoSpec::LayeredMarginal::extend
bool extend(double new_threshold)
Extend the set of computed subisotopologues to those above the new threshold.
Definition: marginalTrek++.cpp:492
IsoSpec::Marginal::logProb
double logProb(Conf conf) const
Calculate the log-probability of a given subisotopologue.
Definition: marginalTrek++.h:149
IsoSpec::LayeredMarginal::LayeredMarginal
LayeredMarginal(Marginal &&m, int tabSize=1000, int hashSize=1000)
Move constructor: specializes the Marginal class.
Definition: marginalTrek++.cpp:482
IsoSpec::MarginalTrek::processUntilCutoff
int processUntilCutoff(double cutoff)
Calculate subisotopologues with probability above or equal to the cut-off.
Definition: marginalTrek++.cpp:361
IsoSpec::Marginal::getHeaviestConfMass
double getHeaviestConfMass() const
Get the mass of the heaviest subisotopologue.
Definition: marginalTrek++.cpp:253
IsoSpec::initialConfigure
Conf initialConfigure(const int atomCnt, const int isotopeNo, const double *probs, const double *lprobs)
Find one of the most probable subisotopologues.
Definition: marginalTrek++.cpp:55
IsoSpec
Definition: allocator.cpp:21
IsoSpec::PrecalculatedMarginal::~PrecalculatedMarginal
virtual ~PrecalculatedMarginal()
Destructor.
Definition: marginalTrek++.cpp:466
IsoSpec::Marginal::getLightestConfMass
double getLightestConfMass() const
Get the mass of the lightest subisotopologue.
Definition: marginalTrek++.cpp:244
IsoSpec::getMLogProbs
double * getMLogProbs(const double *probs, int isoNo)
Definition: marginalTrek++.cpp:150
IsoSpec::KeyHasher
Definition: operators.h:27
IsoSpec::Marginal::atom_masses
const double *const atom_masses
Definition: marginalTrek++.h:52
IsoSpec::Summator
Definition: summator.h:75
IsoSpec::Marginal::atomCnt
const unsigned int atomCnt
Definition: marginalTrek++.h:50
IsoSpec::ConfEqual
Definition: operators.h:45
IsoSpec::Marginal::getTheoreticalAverageMass
double getTheoreticalAverageMass() const
The theoretical average mass of the molecule.
Definition: marginalTrek++.cpp:275
IsoSpec::Marginal::atom_lProbs
const double *const atom_lProbs
Definition: marginalTrek++.h:51
IsoSpec::Marginal::~Marginal
virtual ~Marginal()
Destructor.
Definition: marginalTrek++.cpp:233
IsoSpec::MarginalTrek::MarginalTrek
MarginalTrek(Marginal &&m, int tabSize=1000, int hashSize=1000)
Move constructor: specializes the Marginal class.
Definition: marginalTrek++.cpp:284
IsoSpec::Marginal::isotopeNo
const unsigned int isotopeNo
Definition: marginalTrek++.h:49
IsoSpec::Marginal::mode_conf
const Conf mode_conf
Definition: marginalTrek++.h:54
IsoSpec::ConfOrderMarginalDescending
Definition: operators.h:92