SeqAn3  3.1.0
The Modern C++ library for sequence analysis.
search_scheme_algorithm.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <type_traits>
16 
24 
25 namespace seqan3::detail
26 {
27 
34 template <typename configuration_t, typename index_t, typename ...policies_t>
36  requires (template_specialisation_of<typename index_t::cursor_type, bi_fm_index_cursor>)
38 class search_scheme_algorithm : protected policies_t...
39 {
40 private:
42  using traits_t = search_traits<configuration_t>;
44  using search_result_type = typename traits_t::search_result_type;
45 
46  static_assert(!std::same_as<search_result_type, empty_type>, "The search result type was not configured.");
47 
48 public:
52  search_scheme_algorithm() = default;
53  search_scheme_algorithm(search_scheme_algorithm const &) = default;
54  search_scheme_algorithm(search_scheme_algorithm &&) = default;
55  search_scheme_algorithm & operator=(search_scheme_algorithm const &) = default;
56  search_scheme_algorithm & operator=(search_scheme_algorithm &&) = default;
57  ~search_scheme_algorithm() = default;
58 
70  search_scheme_algorithm(configuration_t const & cfg, index_t const & index) : policies_t{cfg}...
71  {
72  stratum = cfg.get_or(search_cfg::hit_strata{0}).stratum;
73  index_ptr = std::addressof(index);
74  }
76 
97  template <tuple_like indexed_query_t, typename callback_t>
99  requires (std::tuple_size_v<indexed_query_t> == 2) &&
100  std::ranges::forward_range<std::tuple_element_t<1, indexed_query_t>> &&
101  std::invocable<callback_t, search_result_type>
103  void operator()(indexed_query_t && indexed_query, callback_t && callback)
104  {
105  auto && [query_idx, query] = indexed_query;
106  auto error_state = this->max_error_counts(query); // see policy_max_error
107 
108  // construct internal delegate for collecting hits for later filtering (if necessary)
110  auto on_hit_delegate = [&internal_hits] (auto const & it)
111  {
112  internal_hits.push_back(it);
113  };
114 
115  perform_search_by_hit_strategy(internal_hits, query, error_state, on_hit_delegate);
116 
117  // Invoke the callback on the generated result.
118  this->make_results(std::move(internal_hits), query_idx, callback); // see policy_search_result_builder
119  }
120 
121 private:
123  index_t const * index_ptr{nullptr};
124 
126  uint8_t stratum{};
127 
128  // forward declaration
129  template <bool abort_on_hit, typename query_t, typename delegate_t>
130  inline void search_algo_bi(query_t & query, search_param const error_left, delegate_t && delegate);
131 
139  template <typename query_t, typename delegate_t>
140  void perform_search_by_hit_strategy(std::vector<typename index_t::cursor_type> & internal_hits,
141  query_t & query,
142  search_param error_state,
143  delegate_t const & on_hit_delegate)
144  {
145  if constexpr (!traits_t::search_all_hits)
146  {
147  auto max_total = error_state.total;
148  error_state.total = 0; // start search with less errors
149  while (internal_hits.empty() && error_state.total <= max_total)
150  {
151  // * If you only want the best hit (traits_t::search_single_best_hit), you stop after finding the
152  // first hit, the hit with the least errors (`abort_on_hit` is true).
153  // * If you are in strata mode (traits_t::search_strata_hits), you do the same as with best hits,
154  // but then do the extra step afterwards (`abort_on_hit` is true).
155  // * If you want all best hits (traits_t::search_all_best_hits), you do not stop after the first
156  // hit but continue the current search algorithm/max_error pattern (`abort_on_hit` is true).
157  constexpr bool abort_on_hit = !traits_t::search_all_best_hits;
158  search_algo_bi<abort_on_hit>(query, error_state, on_hit_delegate);
159  ++error_state.total;
160  }
161  if constexpr (traits_t::search_strata_hits)
162  {
163  if (!internal_hits.empty())
164  {
165  internal_hits.clear(); // TODO:don't clear when using Optimum Search Schemes with lower error bounds
166  error_state.total += stratum - 1;
167  search_algo_bi<false>(query, error_state, on_hit_delegate);
168  }
169  }
170  }
171  else // detail::search_mode_all
172  {
173  // If you want to find all hits, you cannot stop once you found any hit (<false>)
174  // since you have to find all paths in the search tree that satisfy the hit condition.
175  search_algo_bi<false>(query, error_state, on_hit_delegate);
176  }
177  }
178 };
179 
193 inline std::vector<search_dyn> compute_ss(uint8_t const min_error, uint8_t const max_error)
194 {
195  // TODO: Replace this at least by the pigeonhole principle or even better by 01*0 schemes.
196  // NOTE: Make sure that the searches are sorted by their asymptotical running time (i.e. upper error bound string),
197  // s.t. easy to compute searches come first. This improves the running time of algorithms that abort after the
198  // first hit (e.g. search strategy: best). Even though it is not guaranteed, this seems to be a good greedy
199  // approach.
200  std::vector<search_dyn> scheme{{{1}, {min_error}, {max_error}}};
201  return scheme;
202 }
203 
221 template <typename search_scheme_t>
222 inline auto search_scheme_block_info(search_scheme_t const & search_scheme, size_t const query_length)
223 {
224  using blocks_length_type = typename search_scheme_t::value_type::blocks_length_type;
225 
226  bool constexpr is_dyn_scheme = std::same_as<search_scheme_t, search_scheme_dyn_type>;
227 
228  // Either store information in an array (for search schemes known at compile time) or in a vector otherwise.
229  using result_type = std::conditional_t<is_dyn_scheme,
232  transformation_trait_or_t<std::tuple_size<search_scheme_t>,
233  std::false_type>::value>>;
234 
235  result_type result;
236  if constexpr (is_dyn_scheme)
237  result.resize(search_scheme.size());
238 
239  uint8_t const blocks {search_scheme[0].blocks()};
240  size_t const block_length{query_length / blocks};
241  uint8_t const rest {static_cast<uint8_t>(query_length % blocks)};
242 
243  blocks_length_type blocks_length;
244  // set all blocks_length values to block_length
245  // resp. block_length + 1 for the first `rest = block_length % blocks` values
246  if constexpr (is_dyn_scheme)
247  blocks_length.resize(blocks, block_length);
248  else
249  blocks_length.fill(block_length);
250 
251  for (uint8_t block_id = 0; block_id < rest; ++block_id)
252  ++blocks_length[block_id];
253 
254  for (uint8_t search_id = 0; search_id < search_scheme.size(); ++search_id)
255  {
256  auto const & search = search_scheme[search_id];
257 
258  auto & [search_blocks_length, start_pos] = result[search_id];
259 
260  // compute cumulative blocks_length and starting position
261  start_pos = 0;
262  if constexpr (is_dyn_scheme)
263  search_blocks_length.resize(blocks);
264  search_blocks_length[0] = blocks_length[search.pi[0] - 1];
265  for (uint8_t i = 1; i < blocks; ++i)
266  {
267  search_blocks_length[i] = blocks_length[search.pi[i] - 1] + search_blocks_length[i - 1];
268  if (search.pi[i] < search.pi[0])
269  start_pos += search_blocks_length[i] - search_blocks_length[i - 1];
270  }
271  }
272 
273  return result;
274 }
275 
277 // forward declaration
278 template <bool abort_on_hit, typename cursor_t, typename query_t, typename search_t, typename blocks_length_t,
279  typename delegate_t>
280 inline bool search_ss(cursor_t cur, query_t & query,
281  typename cursor_t::size_type const lb, typename cursor_t::size_type const rb,
282  uint8_t const errors_spent, uint8_t const block_id, bool const go_right, search_t const & search,
283  blocks_length_t const & blocks_length, search_param const error_left, delegate_t && delegate);
285 
317 template <bool abort_on_hit, typename cursor_t, typename query_t, typename search_t, typename blocks_length_t,
318  typename delegate_t>
319 inline bool search_ss_exact(cursor_t cur, query_t & query,
320  typename cursor_t::size_type const lb, typename cursor_t::size_type const rb,
321  uint8_t const errors_spent, uint8_t const block_id, bool const go_right,
322  search_t const & search, blocks_length_t const & blocks_length,
323  search_param const error_left, delegate_t && delegate)
324 {
325  using size_type = typename cursor_t::size_type;
326 
327  uint8_t const block_id2 = std::min<uint8_t>(block_id + 1, search.blocks() - 1);
328  bool const go_right2 = (block_id < search.blocks() - 1) && (search.pi[block_id + 1] > search.pi[block_id]);
329 
330  if (go_right)
331  {
332  size_type const infix_lb = rb - 1; // inclusive
333  size_type const infix_rb = lb + blocks_length[block_id] - 1; // exclusive
334 
335  if (!cur.extend_right(query | views::slice(infix_lb, infix_rb + 1)))
336  return false;
337 
338  if (search_ss<abort_on_hit>(cur, query, lb, infix_rb + 2, errors_spent, block_id2, go_right2, search,
339  blocks_length, error_left, delegate) && abort_on_hit)
340  {
341  return true;
342  }
343  }
344  else
345  {
346  size_type const infix_lb = rb - blocks_length[block_id] - 1; // inclusive
347  size_type const infix_rb = lb - 1; // inclusive
348 
349  if (!cur.extend_left(query | views::slice(infix_lb, infix_rb + 1)))
350  return false;
351 
352  if (search_ss<abort_on_hit>(cur, query, infix_lb, rb, errors_spent, block_id2, go_right2, search, blocks_length,
353  error_left, delegate) && abort_on_hit)
354  {
355  return true;
356  }
357  }
358  return false;
359 }
360 
367 template <bool abort_on_hit, typename cursor_t, typename query_t, typename search_t, typename blocks_length_t,
368  typename delegate_t>
369 inline bool search_ss_deletion(cursor_t cur, query_t & query,
370  typename cursor_t::size_type const lb, typename cursor_t::size_type const rb,
371  uint8_t const errors_spent, uint8_t const block_id, bool const go_right,
372  search_t const & search, blocks_length_t const & blocks_length,
373  search_param const error_left, delegate_t && delegate)
374 {
375  uint8_t const max_error_left_in_block = search.u[block_id] - errors_spent;
376  uint8_t const min_error_left_in_block = std::max(search.l[block_id] - errors_spent, 0);
377 
378  // Switch to the next block when the min number of errors is reached
379  if (min_error_left_in_block == 0)
380  {
381  uint8_t const block_id2 = std::min<uint8_t>(block_id + 1, search.blocks() - 1);
382  bool const go_right2 = block_id2 == 0 ? true : search.pi[block_id2] > search.pi[block_id2 - 1];
383 
384  if (search_ss<abort_on_hit>(cur, query, lb, rb, errors_spent, block_id2, go_right2, search, blocks_length,
385  error_left, delegate) && abort_on_hit)
386  {
387  return true;
388  }
389  }
390 
391  // Insert deletions into the current block as long as possible
392  // Do not allow deletions at the beginning of the leftmost block
393  // Do not allow deletions at the end of the rightmost block
394  if (!(search.pi[block_id] == 1 && !go_right) &&
395  !(search.pi[block_id] == search.blocks() && go_right) &&
396  max_error_left_in_block > 0 && error_left.total > 0 && error_left.deletion > 0 &&
397  ((go_right && cur.extend_right()) || (!go_right && cur.extend_left())))
398  {
399  search_param error_left2{error_left};
400  error_left2.total--;
401  error_left2.deletion--;
402  do
403  {
404  if (search_ss_deletion<abort_on_hit>(cur, query, lb, rb, errors_spent + 1, block_id, go_right, search,
405  blocks_length, error_left2, delegate) && abort_on_hit)
406  {
407  return true;
408  }
409  } while ((go_right && cur.cycle_back()) || (!go_right && cur.cycle_front()));
410  }
411  return false;
412 }
413 
422 template <bool abort_on_hit, typename cursor_t, typename query_t, typename search_t, typename blocks_length_t,
423  typename delegate_t>
424 inline bool search_ss_children(cursor_t cur, query_t & query,
425  typename cursor_t::size_type const lb, typename cursor_t::size_type const rb,
426  uint8_t const errors_spent, uint8_t const block_id, bool const go_right,
427  uint8_t const min_error_left_in_block, search_t const & search,
428  blocks_length_t const & blocks_length, search_param const error_left,
429  delegate_t && delegate)
430 {
431  using size_type = typename cursor_t::size_type;
432  if ((go_right && cur.extend_right()) || (!go_right && cur.extend_left()))
433  {
434  size_type const chars_left = blocks_length[block_id] - (rb - lb - 1);
435 
436  size_type lb2 = lb - !go_right;
437  size_type rb2 = rb + go_right;
438 
439  do
440  {
441  bool const delta = cur.last_rank() != to_rank(query[(go_right ? rb : lb) - 1]);
442 
443  // skip if there are more min errors left in the current block than characters in the block
444  // i.e. chars_left - 1 < min_error_left_in_block - delta
445  // TODO: move that outside the if / do-while struct
446  // TODO: incorporate error_left.deletion into formula
447  if (error_left.deletion == 0 && chars_left + delta < min_error_left_in_block + 1u)
448  continue;
449 
450  if (!delta || error_left.substitution > 0)
451  {
452  search_param error_left2{error_left};
453  error_left2.total -= delta;
454  error_left2.substitution -= delta;
455 
456  // At the end of the current block
457  if (rb - lb == blocks_length[block_id])
458  {
459  // Leave the possibility for one or multiple deletions at the end of a block.
460  // Thus do not change the direction (go_right) yet.
461  if (error_left.deletion > 0)
462  {
463  if (search_ss_deletion<abort_on_hit>(cur, query, lb2, rb2, errors_spent + delta, block_id,
464  go_right, search, blocks_length, error_left2, delegate) &&
465  abort_on_hit)
466  {
467  return true;
468  }
469  }
470  else
471  {
472  uint8_t const block_id2 = std::min<uint8_t>(block_id + 1, search.blocks() - 1);
473  bool const go_right2 = block_id2 == 0 ? true : search.pi[block_id2] > search.pi[block_id2 - 1];
474 
475  if (search_ss<abort_on_hit>(cur, query, lb2, rb2, errors_spent + delta, block_id2, go_right2,
476  search, blocks_length, error_left2, delegate) &&
477  abort_on_hit)
478  {
479  return true;
480  }
481  }
482  }
483  else
484  {
485  if (search_ss<abort_on_hit>(cur, query, lb2, rb2, errors_spent + delta, block_id, go_right, search,
486  blocks_length, error_left2, delegate) && abort_on_hit)
487  {
488  return true;
489  }
490  }
491  }
492 
493  // Deletion
494  // TODO: check whether the conditions for deletions at the beginning/end of the query are really necessary
495  // No deletion at the beginning of the leftmost block.
496  // No deletion at the end of the rightmost block.
497  if (error_left.deletion > 0 &&
498  !(go_right && (rb == 1 || rb == std::ranges::size(query) + 1)) &&
499  !(!go_right && (lb == 0 || lb == std::ranges::size(query))))
500  {
501  search_param error_left3{error_left};
502  error_left3.total--;
503  error_left3.deletion--;
504  search_ss<abort_on_hit>(cur, query, lb, rb, errors_spent + 1, block_id, go_right, search, blocks_length,
505  error_left3, delegate);
506  }
507  } while ((go_right && cur.cycle_back()) || (!go_right && cur.cycle_front()));
508  }
509  return false;
510 }
511 
517 template <bool abort_on_hit, typename cursor_t, typename query_t, typename search_t,
518  typename blocks_length_t, typename delegate_t>
519 inline bool search_ss(cursor_t cur, query_t & query,
520  typename cursor_t::size_type const lb, typename cursor_t::size_type const rb,
521  uint8_t const errors_spent, uint8_t const block_id, bool const go_right, search_t const & search,
522  blocks_length_t const & blocks_length, search_param const error_left, delegate_t && delegate)
523 {
524  uint8_t const max_error_left_in_block = search.u[block_id] - errors_spent;
525  uint8_t const min_error_left_in_block = std::max(search.l[block_id] - errors_spent, 0); // NOTE: changed
526 
527  // Done.
528  if (min_error_left_in_block == 0 && lb == 0 && rb == std::ranges::size(query) + 1)
529  {
530  delegate(cur);
531  return true;
532  }
533  // Exact search in current block.
534  else if (((max_error_left_in_block == 0) && (rb - lb - 1 != blocks_length[block_id])) ||
535  (error_left.total == 0 && min_error_left_in_block == 0))
536  {
537  if (search_ss_exact<abort_on_hit>(cur, query, lb, rb, errors_spent, block_id, go_right, search, blocks_length,
538  error_left, delegate) && abort_on_hit)
539  {
540  return true;
541  }
542  }
543  // Approximate search in current block.
544  // i.e. blocks_length[block_id] - (rb - lb - (lb != rb)) >= min_error_left_in_block
545  else if (error_left.total > 0)
546  {
547  // Insertion
548  if (error_left.insertion > 0)
549  {
550  using size_type = typename cursor_t::size_type;
551 
552  size_type const lb2 = lb - !go_right;
553  size_type const rb2 = rb + go_right;
554 
555  search_param error_left2{error_left};
556  error_left2.total--;
557  error_left2.insertion--;
558  // At the end of the current block
559  if (rb - lb == blocks_length[block_id])
560  {
561  // Leave the possibility for one or multiple deletions at the end of a block.
562  // Thus do not change the direction (go_right) yet.
563  // TODO: benchmark the improvement on preventing insertions followed by a deletion and vice versa. Does
564  // it pay off the additional complexity and documentation for the user? (Note that the user might only
565  // allow for insertions and deletion and not for mismatches).
566  if (search_ss_deletion<abort_on_hit>(cur, query, lb2, rb2, errors_spent + 1, block_id, go_right, search,
567  blocks_length, error_left2, delegate) && abort_on_hit)
568  {
569  return true;
570  }
571  }
572  else
573  {
574  if (search_ss<abort_on_hit>(cur, query, lb2, rb2, errors_spent + 1, block_id, go_right, search,
575  blocks_length, error_left2, delegate) && abort_on_hit)
576  {
577  return true;
578  }
579  }
580  }
581  if (search_ss_children<abort_on_hit>(cur, query, lb, rb, errors_spent, block_id, go_right,
582  min_error_left_in_block, search, blocks_length, error_left, delegate) &&
583  abort_on_hit)
584  {
585  return true;
586  }
587  }
588  return false;
589 }
590 
614 template <bool abort_on_hit, typename index_t, typename query_t, typename search_scheme_t, typename delegate_t>
615 inline void search_ss(index_t const & index, query_t & query, search_param const error_left,
616  search_scheme_t const & search_scheme, delegate_t && delegate)
617 {
618  // retrieve cumulative block lengths and starting position
619  auto const block_info = search_scheme_block_info(search_scheme, std::ranges::size(query));
620 
621  for (uint8_t search_id = 0; search_id < search_scheme.size(); ++search_id)
622  {
623  auto const & search = search_scheme[search_id];
624  auto const & [blocks_length, start_pos] = block_info[search_id];
625 
626  bool const hit = search_ss<abort_on_hit>(
627  index.cursor(), // cursor on the index
628  query, // query to be searched
629  start_pos, start_pos + 1, // infix range already searched (open interval)
630  // the first character of `query` has the index 1 (not 0)
631  0, // errors spent
632  0, // current block id in search scheme
633  true, // search the first block from left to right
634  search, blocks_length, // search scheme information
635  error_left, // errors left (broken down by error types)
636  delegate // delegate function called on hit
637  );
638 
639  if (abort_on_hit && hit)
640  return;
641  }
642 }
643 
662 template <typename configuration_t, typename index_t, typename ...policies_t>
663 template <bool abort_on_hit, typename query_t, typename delegate_t>
664 inline void search_scheme_algorithm<configuration_t, index_t, policies_t...>::search_algo_bi(
665  query_t & query,
666  search_param const error_left,
667  delegate_t && delegate)
668 {
669  switch (error_left.total)
670  {
671  case 0:
672  search_ss<abort_on_hit>(*index_ptr, query, error_left, optimum_search_scheme<0, 0>, delegate);
673  break;
674  case 1:
675  search_ss<abort_on_hit>(*index_ptr, query, error_left, optimum_search_scheme<0, 1>, delegate);
676  break;
677  case 2:
678  search_ss<abort_on_hit>(*index_ptr, query, error_left, optimum_search_scheme<0, 2>, delegate);
679  break;
680  case 3:
681  search_ss<abort_on_hit>(*index_ptr, query, error_left, optimum_search_scheme<0, 3>, delegate);
682  break;
683  default:
684  auto const & search_scheme{compute_ss(0, error_left.total)};
685  search_ss<abort_on_hit>(*index_ptr, query, error_left, search_scheme, delegate);
686  break;
687  }
688 }
689 
690 } // namespace seqan3::detail
T addressof(T... args)
Provides the bidirectional seqan3::bi_fm_index.
T clear(T... args)
T empty(T... args)
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition: concept.hpp:155
auto search(queries_t &&queries, index_t const &index, configuration_t const &cfg=search_cfg::default_configuration)
Search a query or a range of queries in an index.
Definition: search.hpp:104
constexpr size_t size
The size of a type pack.
Definition: traits.hpp:151
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:183
T max(T... args)
T push_back(T... args)
Provides the concept for seqan3::detail::sdsl_index.
Provides data structures used by different search algorithms.
Provides the data structures and precomputed instances for (optimum) search schemes.
Provides seqan3::detail::search_traits.
Provides seqan3::views::slice.
Provides seqan3::detail::transformation_trait_or.