SeqAn3  3.2.0
The Modern C++ library for sequence analysis.
fm_index_cursor.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <array>
16 #include <ranges>
17 #include <type_traits>
18 
19 #include <sdsl/suffix_trees.hpp>
20 
27 
28 namespace seqan3
29 {
33 {
35  size_t begin_position{};
37  size_t end_position{};
38 
47  friend bool operator==(suffix_array_interval const & lhs, suffix_array_interval const & rhs) noexcept
48  {
49  return lhs.begin_position == rhs.begin_position && lhs.end_position == rhs.end_position;
50  }
51 
57  friend bool operator!=(suffix_array_interval const & lhs, suffix_array_interval const & rhs) noexcept
58  {
59  return !(lhs == rhs);
60  }
62 };
63 
85 template <typename index_t>
87 {
88 public:
93  using index_type = index_t;
95  using size_type = typename index_type::size_type;
97 
98 private:
103  using node_type = detail::fm_index_cursor_node<index_t>;
105  using sdsl_char_type = typename index_type::sdsl_char_type;
107  using sdsl_index_type = typename index_t::sdsl_index_type;
109  using sdsl_sigma_type = typename index_type::sdsl_sigma_type;
111  using index_alphabet_type = typename index_t::alphabet_type;
117 
119  index_type const * index{nullptr};
121  size_type parent_lb{};
123  size_type parent_rb{};
125  node_type node{};
127  sdsl_sigma_type sigma{};
128 
129  template <typename _index_t>
130  friend class bi_fm_index_cursor;
131 
133  size_type offset() const noexcept
134  {
135  assert(index->index.size() > query_length());
136  return index->index.size() - query_length() - 1; // since the string is reversed during construction
137  }
138 
140  bool
141  backward_search(sdsl_index_type const & csa, sdsl_char_type const c, size_type & l, size_type & r) const noexcept
142  {
143  assert(l <= r && r < csa.size());
144 
145  size_type _l, _r;
146 
147  size_type cc = c;
148  if constexpr (!std::same_as<index_alphabet_type, sdsl::plain_byte_alphabet>)
149  {
150  cc = csa.char2comp[c];
151  if (cc == 0 && c > 0) // [[unlikely]]
152  return false;
153  }
154 
155  size_type const c_begin = csa.C[cc];
156  if (l == 0 && r + 1 == csa.size()) // [[unlikely]]
157  {
158  _l = c_begin;
159  _r = csa.C[cc + 1] - 1;
160  // if we use not the plain_byte_alphabet, we could return always return true here
161  }
162  else
163  {
164  _l = c_begin + csa.bwt.rank(l, c); // count c in bwt[0..l-1]
165  _r = c_begin + csa.bwt.rank(r + 1, c) - 1; // count c in bwt[0..r]
166  }
167 
168  if (_r >= _l)
169  {
170  r = _r;
171  l = _l;
172  assert(r + 1 - l >= 0);
173  return true;
174  }
175  return false;
176  }
177 
178 public:
183  // Default construction is necessary to make this class semi-regular and e.g., to allow construction of
184  // std::array of iterators.
185  fm_index_cursor() noexcept = default;
186  fm_index_cursor(fm_index_cursor const &) noexcept = default;
187  fm_index_cursor & operator=(fm_index_cursor const &) noexcept = default;
188  fm_index_cursor(fm_index_cursor &&) noexcept = default;
189  fm_index_cursor & operator=(fm_index_cursor &&) noexcept = default;
190  ~fm_index_cursor() = default;
191 
193  fm_index_cursor(index_t const & _index) noexcept :
194  index(&_index),
195  node({0, _index.index.size() - 1, 0, 0}),
196  sigma(_index.index.sigma - index_t::text_layout_mode)
197  {
198  assert(_index.index.size() != 0);
199  }
200  //\}
201 
214  bool operator==(fm_index_cursor const & rhs) const noexcept
215  {
216  assert(index != nullptr);
217  assert(node != rhs.node || (query_length() == 0 || (parent_lb == rhs.parent_lb && parent_rb == rhs.parent_rb)));
218 
219  // position in the implicit suffix tree is defined by the SA interval and depth.
220  // No need to compare parent intervals
221  return node == rhs.node;
222  }
223 
236  bool operator!=(fm_index_cursor const & rhs) const noexcept
237  {
238  assert(index != nullptr);
239 
240  return !(*this == rhs);
241  }
242 
260  bool extend_right() noexcept
261  {
262  // TODO: specialize extend_right() and cycle_back() for EPR-dictionaries
263  // store all cursors at once in a private std::array of cursors
264  assert(index != nullptr);
265 
266  sdsl_char_type c = 1; // NOTE: start with 0 or 1 depending on implicit_sentintel
267  size_type _lb = node.lb, _rb = node.rb;
268  while (c < sigma && !backward_search(index->index, index->index.comp2char[c], _lb, _rb))
269  {
270  ++c;
271  }
272 
273  if (c != sigma)
274  {
275  parent_lb = node.lb;
276  parent_rb = node.rb;
277  node = {_lb, _rb, node.depth + 1, c};
278  return true;
279  }
280  return false;
281  }
282 
296  template <typename char_t>
297  requires std::convertible_to<char_t, index_alphabet_type> bool
298  extend_right(char_t const c) noexcept
299  {
300  assert(index != nullptr);
301  // The rank cannot exceed 255 for single text and 254 for text collections as they are reserved as sentinels
302  // for the indexed text.
303  assert(seqan3::to_rank(static_cast<index_alphabet_type>(c))
304  < ((index_type::text_layout_mode == text_layout::single) ? 255 : 254));
305 
306  size_type _lb = node.lb, _rb = node.rb;
307 
308  sdsl_char_type c_char = seqan3::to_rank(static_cast<index_alphabet_type>(c)) + 1;
309 
310  if (backward_search(index->index, c_char, _lb, _rb))
311  {
312  parent_lb = node.lb;
313  parent_rb = node.rb;
314  node = {_lb, _rb, node.depth + 1, c_char};
315  return true;
316  }
317  return false;
318  }
319 
321  template <typename char_type>
322  requires detail::is_char_adaptation_v<char_type> bool
323  extend_right(char_type const * cstring) noexcept
324  {
326  }
327 
344  template <std::ranges::range seq_t>
345  bool extend_right(seq_t && seq) noexcept
346  {
347  static_assert(std::ranges::forward_range<seq_t>, "The query must model forward_range.");
348  static_assert(std::convertible_to<range_innermost_value_t<seq_t>, index_alphabet_type>,
349  "The alphabet of the sequence must be convertible to the alphabet of the index.");
350 
351  assert(index != nullptr); // range must not be empty!
352 
353  size_type _lb = node.lb, _rb = node.rb;
354  size_type new_parent_lb = parent_lb, new_parent_rb = parent_rb;
355 
356  sdsl_char_type c{};
357  size_t len{0};
358 
359  for (auto it = std::ranges::begin(seq); it != std::ranges::end(seq); ++len, ++it)
360  {
361  // The rank cannot exceed 255 for single text and 254 for text collections as they are reserved as sentinels
362  // for the indexed text.
363  assert(seqan3::to_rank(static_cast<index_alphabet_type>(*it))
364  < ((index_type::text_layout_mode == text_layout::single) ? 255 : 254));
365 
366  c = seqan3::to_rank(static_cast<index_alphabet_type>(*it)) + 1;
367 
368  new_parent_lb = _lb;
369  new_parent_rb = _rb;
370  if (!backward_search(index->index, c, _lb, _rb))
371  return false;
372  }
373 
374  parent_lb = new_parent_lb;
375  parent_rb = new_parent_rb;
376  node = {_lb, _rb, len + node.depth, c};
377  return true;
378  }
379 
406  bool cycle_back() noexcept
407  {
408  assert(index != nullptr && query_length() > 0);
409  // parent_lb > parent_rb --> invalid interval
410  assert(parent_lb <= parent_rb);
411 
412  sdsl_char_type c = node.last_char + 1;
413  size_type _lb = parent_lb, _rb = parent_rb;
414 
415  while (c < sigma && !backward_search(index->index, index->index.comp2char[c], _lb, _rb))
416  {
417  ++c;
418  }
419 
420  if (c != sigma) // Collection has additional sentinel as delimiter
421  {
422  node = {_lb, _rb, node.depth, c};
423  return true;
424  }
425  return false;
426  }
427 
443  size_type last_rank() const noexcept
444  {
445  // parent_lb > parent_rb --> invalid interval
446  assert(index != nullptr && query_length() > 0 && parent_lb <= parent_rb);
447 
448  return index->index.comp2char[node.last_char] - 1; // text is not allowed to contain ranks of 0
449  }
450 
467  {
468  assert(index != nullptr);
469 
470  return {node.lb, node.rb + 1};
471  }
472 
491  size_type query_length() const noexcept
492  {
493  assert(index != nullptr);
494  assert(node.depth != 0 || (node.lb == 0 && node.rb == index->size() - 1)); // depth == 0 -> root node
495 
496  return node.depth;
497  }
498 
516  template <std::ranges::range text_t>
517  auto path_label(text_t && text) const noexcept
518  requires (index_t::text_layout_mode == text_layout::single)
519  {
520  static_assert(std::ranges::input_range<text_t>, "The text must model input_range.");
521  static_assert(range_dimension_v<text_t> == 1, "The input cannot be a text collection.");
522  static_assert(std::same_as<range_innermost_value_t<text_t>, index_alphabet_type>,
523  "The alphabet types of the given text and index differ.");
524  assert(index != nullptr);
525 
526  size_type const query_begin = offset() - index->index[node.lb];
527  return text | views::slice(query_begin, query_begin + query_length());
528  }
529 
531  template <std::ranges::range text_t>
532  auto path_label(text_t && text) const noexcept
533  requires (index_t::text_layout_mode == text_layout::collection)
534  {
535  static_assert(std::ranges::input_range<text_t>, "The text collection must model input_range.");
536  static_assert(range_dimension_v<text_t> == 2, "The input must be a text collection.");
537  static_assert(std::same_as<range_innermost_value_t<text_t>, index_alphabet_type>,
538  "The alphabet types of the given text and index differ.");
539  assert(index != nullptr);
540 
541  // Position of query in concatenated text.
542  size_type const location = offset() - index->index[node.lb];
543 
544  // The rank represents the number of start positions of the individual sequences/texts in the collection
545  // before position `location + 1` and thereby also the number of delimiters.
546  size_type const rank = index->text_begin_rs.rank(location + 1);
547  assert(rank > 0);
548  size_type const text_id = rank - 1;
549 
550  // The start location of the `text_id`-th text in the sequence (position of the `rank`-th 1 in the bitvector).
551  size_type const start_location = index->text_begin_ss.select(rank);
552  // Substract lengths of previous sequences.
553  size_type const query_begin = location - start_location;
554 
555  // Take subtext, slice query out of it
556  return text[text_id] | views::slice(query_begin, query_begin + query_length());
557  }
558 
570  size_type count() const noexcept
571  {
572  assert(index != nullptr);
573 
574  return 1 + node.rb - node.lb;
575  }
576 
589  requires (index_t::text_layout_mode == text_layout::single)
590  {
591  assert(index != nullptr);
592 
593  locate_result_type occ{};
594  occ.reserve(count());
595  for (size_type i = 0; i < count(); ++i)
596  {
597  occ.emplace_back(0, offset() - index->index[node.lb + i]);
598  }
599 
600  return occ;
601  }
602 
605  requires (index_t::text_layout_mode == text_layout::collection)
606  {
607  assert(index != nullptr);
608 
609  locate_result_type occ;
610  occ.reserve(count());
611  for (size_type i = 0; i < count(); ++i)
612  {
613  size_type loc = offset() - index->index[node.lb + i];
614  size_type sequence_rank = index->text_begin_rs.rank(loc + 1);
615  size_type sequence_position = loc - index->text_begin_ss.select(sequence_rank);
616  occ.emplace_back(sequence_rank - 1, sequence_position);
617  }
618  return occ;
619  }
620 
633  auto lazy_locate() const
634  requires (index_t::text_layout_mode == text_layout::single)
635  {
636  assert(index != nullptr);
637 
638  return std::views::iota(node.lb, node.lb + count())
640  [*this, _offset = offset()](auto sa_pos)
641  {
642  return locate_result_value_type{0u, _offset - index->index[sa_pos]};
643  });
644  }
645 
647  auto lazy_locate() const
648  requires (index_t::text_layout_mode == text_layout::collection)
649  {
650  assert(index != nullptr);
651 
652  return std::views::iota(node.lb, node.lb + count())
654  [*this, _offset = offset()](auto sa_pos)
655  {
656  auto loc = _offset - index->index[sa_pos];
657  size_type sequence_rank = index->text_begin_rs.rank(loc + 1);
658  size_type sequence_position = loc - index->text_begin_ss.select(sequence_rank);
659  return locate_result_value_type{sequence_rank - 1, sequence_position};
660  });
661  }
662 
670  template <cereal_archive archive_t>
671  void CEREAL_SERIALIZE_FUNCTION_NAME(archive_t & archive)
672  {
673  archive(parent_lb);
674  archive(parent_rb);
675  archive(node);
676  archive(sigma);
677  }
679 };
680 
681 } // namespace seqan3
Core alphabet concept and free function/type trait wrappers.
Provides alphabet adaptations for standard char types.
The SeqAn FM Index Cursor.
Definition: fm_index_cursor.hpp:87
bool extend_right(seq_t &&seq) noexcept
Tries to extend the query by seq to the right.
Definition: fm_index_cursor.hpp:345
bool cycle_back() noexcept
Tries to replace the rightmost character of the query by the next lexicographically larger character ...
Definition: fm_index_cursor.hpp:406
bool operator!=(fm_index_cursor const &rhs) const noexcept
Compares two cursors.
Definition: fm_index_cursor.hpp:236
seqan3::suffix_array_interval suffix_array_interval() const noexcept
Returns the half-open suffix array interval.
Definition: fm_index_cursor.hpp:466
locate_result_type locate() const requires(index_t
Locates the occurrences of the searched query in the text.
Definition: fm_index_cursor.hpp:588
bool operator==(fm_index_cursor const &rhs) const noexcept
Compares two cursors.
Definition: fm_index_cursor.hpp:214
size_type count() const noexcept
Counts the number of occurrences of the searched query in the text.
Definition: fm_index_cursor.hpp:570
requires std::convertible_to< char_t, index_alphabet_type > bool extend_right(char_t const c) noexcept
Tries to extend the query by the character c to the right.
Definition: fm_index_cursor.hpp:298
size_type last_rank() const noexcept
Outputs the rightmost rank.
Definition: fm_index_cursor.hpp:443
bool extend_right() noexcept
Tries to extend the query by the smallest possible character to the right such that the query is foun...
Definition: fm_index_cursor.hpp:260
requires detail::is_char_adaptation_v< char_type > bool extend_right(char_type const *cstring) noexcept
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: fm_index_cursor.hpp:323
size_type query_length() const noexcept
Returns the length of the searched query.
Definition: fm_index_cursor.hpp:491
index_t index_type
Type of the index.
Definition: fm_index_cursor.hpp:93
auto path_label(text_t &&text) const noexcept requires(index_t
Returns the searched query.
Definition: fm_index_cursor.hpp:517
typename index_type::size_type size_type
Type for representing positions in the indexed text.
Definition: fm_index_cursor.hpp:95
auto lazy_locate() const requires(index_t
Locates the occurrences of the searched query in the text on demand, i.e. a std::ranges::view is retu...
Definition: fm_index_cursor.hpp:633
fm_index_cursor() noexcept=default
Default constructor. Accessing member functions on a default constructed object is undefined behavior...
Provides various transformation traits used by the range module.
Provides the internal representation of a node of the seqan3::fm_index_cursor.
T emplace_back(T... args)
requires requires
The rank_type of the semi-alphabet; defined as the return type of seqan3::to_rank....
Definition: alphabet/concept.hpp:164
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition: alphabet/concept.hpp:155
typename range_innermost_value< t >::type range_innermost_value_t
Shortcut for seqan3::range_innermost_value (transformation_trait shortcut).
Definition: core/range/type_traits.hpp:98
@ seq
The "sequence", usually a range of nucleotides or amino acids.
text_layout
The possible text layouts (single, collection) the seqan3::fm_index and seqan3::bi_fm_index can suppo...
Definition: search/fm_index/concept.hpp:91
@ single
The text is a single range.
Definition: search/fm_index/concept.hpp:93
@ collection
The text is a range of ranges.
Definition: search/fm_index/concept.hpp:95
decltype(detail::transform< trait_t >(list_t{})) transform
Apply a transformation trait to every type in the list and return a seqan3::type_list of the results.
Definition: type_list/traits.hpp:469
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:178
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
The <ranges> header from C++20's standard library.
T reserve(T... args)
Provides the concept for seqan3::detail::sdsl_index.
Provides seqan3::views::slice.
The underlying suffix array interval.
Definition: fm_index_cursor.hpp:33
size_t end_position
The exclusive end position of the interval ("right boundary").
Definition: fm_index_cursor.hpp:37
size_t begin_position
The begin position of the interval ("left boundary").
Definition: fm_index_cursor.hpp:35
friend bool operator==(suffix_array_interval const &lhs, suffix_array_interval const &rhs) noexcept
Test for equality.
Definition: fm_index_cursor.hpp:47
friend bool operator!=(suffix_array_interval const &lhs, suffix_array_interval const &rhs) noexcept
Test for inequality.
Definition: fm_index_cursor.hpp:57