libsemigroups  v3.1.2
C++ library for semigroups and monoids
Loading...
Searching...
No Matches
ukkonen.hpp
1//
2// libsemigroups - C++ library for semigroups and monoids
3// Copyright (C) 2021-2025 James D. Mitchell + Maria Tsalakou
4//
5// This program is free software: you can redistribute it and/or modify
6// it under the terms of the GNU General Public License as published by
7// the Free Software Foundation, either version 3 of the License, or
8// (at your option) any later version.
9//
10// This program is distributed in the hope that it will be useful,
11// but WITHOUT ANY WARRANTY; without even the implied warranty of
12// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13// GNU General Public License for more details.
14//
15// You should have received a copy of the GNU General Public License
16// along with this program. If not, see <http://www.gnu.org/licenses/>.
17//
18
19// This file contains an implementation of a generalised suffix tree, adapted
20// from:
21//
22// https://cp-algorithms.com/string/suffix-tree-ukkonen.html
23
24// TODO(1)
25// * there's an issue if you add a word containing a very large letter, then
26// add more words so that the unique letter used is now in the word with the
27// very large letter
28// * possibly move more of the inline functions in the ukkoken namespace into
29// the cpp file. Seemingly g++-14 complains about some of these (maybe the
30// others just aren't currently tested).
31
32#ifndef LIBSEMIGROUPS_UKKONEN_HPP_
33#define LIBSEMIGROUPS_UKKONEN_HPP_
34
35#include <algorithm> // for find_if, copy, max
36#include <cstring> // for size_t, strlen
37#include <iterator> // for distance
38#include <map> // for operator!=, map, _Rb_tree_iterator
39#include <numeric> // for accumulate
40#include <stack> // for stack
41#include <string> // for allocator, string, basic_string
42#include <type_traits> // for make_unsigned_t
43#include <utility> // for pair, make_pair
44#include <vector> // for vector
45
46#include "constants.hpp" // for operator==, operator!=, Max, UNDEFINED
47#include "debug.hpp" // for LIBSEMIGROUPS_ASSERT
48#include "dot.hpp" // for Dot
49#include "exception.hpp" // for LIBSEMIGROUPS_EXCEPTION
50#include "types.hpp" // for word_type, letter_type
51
52#include "detail/fmt.hpp" // for format, format_string
53#include "detail/string.hpp" // for maximum_common_prefix
54
55namespace libsemigroups {
56 namespace detail {
57 // Helper for unsigned integers
58 template <typename Iterator>
59 [[nodiscard]] letter_type deref_as_unsigned(Iterator it) {
60 auto val = *it;
61 return static_cast<std::make_unsigned_t<decltype(val)>>(*it);
62 }
63 } // namespace detail
64
77
90 class Ukkonen {
91 // Alias for index in _nodes
92 using node_index_type = size_t;
93
94 // Alias for index in side an edge
95 using edge_index_type = size_t;
96
97 public:
100 using unique_letter_type = size_t;
101
103 using word_index_type = size_t;
104
106 using const_iterator = typename word_type::const_iterator;
107
109 using index_type = size_t;
110
112 // Ukkonen - inner classes - public
114
121 struct State {
127 node_index_type v;
128
132 edge_index_type pos;
133
137 State() = default;
138
142 State(State const&) = default;
143
147 State(State&&) = default;
148
152 State& operator=(State const&) = default;
153
157 State& operator=(State&&) = default;
158
168 State(node_index_type vv, edge_index_type ppos) : v(vv), pos(ppos) {}
169
184 bool operator==(State const& that) const noexcept {
185 return v == that.v && pos == that.pos;
186 }
187 };
188
194 struct Node {
199
205
209 node_index_type parent;
210
211#ifndef LIBSEMIGROUPS_PARSED_BY_DOXYGEN
212 // No doc
213 node_index_type link;
214 // The next member is a weak indicator of whether or not the node
215 // corresponds to a real suffix. If the value is true, then the node
216 // corresponds to a real suffix. If the value is false, then the children
217 // should be checked via is_real_suffix(Node const&) in Ukkonen.
218 mutable bool is_real_suffix;
219#endif
220
225
229 Node(Node const&) = default;
230
234 Node(Node&&) = default;
235
239 Node& operator=(Node const&) = default;
240
244 Node& operator=(Node&&) = default;
245
260 explicit Node(index_type l = 0,
261 index_type r = 0,
262 node_index_type parent = UNDEFINED);
263
275 size_t length() const noexcept {
276 return r - l;
277 }
278
293 node_index_type& child(letter_type c);
294
309 node_index_type child(letter_type c) const;
310
322 bool is_leaf() const noexcept {
323 return children.empty();
324 }
325
337 bool is_root() const noexcept {
338 return parent == UNDEFINED;
339 }
340 };
341
342 private:
344 // Ukkonen - private data
346
347 size_t _max_word_length;
348 std::vector<size_t> _multiplicity;
349 unique_letter_type _next_unique_letter;
350 std::vector<Node> _nodes;
351 State _ptr;
352 std::vector<index_type> _word_begin;
353 std::vector<index_type> _word_index_lookup;
354 word_type _word;
355
356 public:
358 // Ukkonen - constructors - public
360
368
381
386
391
396
401
402 ~Ukkonen();
403
405 // Ukkonen - initialisation - public
407
430
442 add_word_no_checks(first, last);
443 }
444
446 // Ukkonen - attributes - public
448
461 std::vector<Node> const& nodes() const noexcept {
462 return _nodes;
463 }
464
479 size_t number_of_distinct_words() const noexcept {
480 return -1 - _next_unique_letter;
481 }
482
495 size_t length_of_distinct_words() const noexcept {
496 return _word.size() - number_of_distinct_words();
497 }
498
514 size_t length_of_words() const noexcept;
515
530 size_t number_of_words() const noexcept {
531 return std::accumulate(_multiplicity.cbegin(), _multiplicity.cend(), 0);
532 }
533
545 size_t max_word_length() const noexcept {
546 return _max_word_length;
547 }
548
562 const_iterator begin() const noexcept {
563 return _word.begin();
564 }
565
579 const_iterator cbegin() const noexcept {
580 return _word.cbegin();
581 }
582
596 const_iterator end() const noexcept {
597 return _word.end();
598 }
599
613 const_iterator cend() const noexcept {
614 return _word.cend();
615 }
616
635 LIBSEMIGROUPS_ASSERT(n.parent != UNDEFINED);
636 return word_index_no_checks(n.r - 1);
637 }
638
653 if (n.parent == UNDEFINED) {
655 "expected the parent of the parameter to not be UNDEFINED");
656 }
657 return word_index_no_checks(n);
658 }
659
680 LIBSEMIGROUPS_ASSERT(i < _word.size());
681 return _word_index_lookup[i];
682 }
683
700 if (i >= _word.size()) {
702 "expected the parameter to be in the range [0, {}), found {}",
703 _word.size(),
704 i);
705 }
706 return word_index_no_checks(i);
707 }
708
722 size_t distance_from_root(Node const& n) const;
723
737
753 return _multiplicity[i];
754 }
755
771 if (i >= _multiplicity.size()) {
773 "expected the parameter to be in the range [0, {}), found {}",
774 _multiplicity.size(),
775 i);
776 }
777 return multiplicity_no_checks(i);
778 }
779
796 LIBSEMIGROUPS_ASSERT(i < number_of_distinct_words());
797 return -1 - i;
798 }
799
815 bool is_unique_letter(letter_type l) const noexcept {
816 return l >= _next_unique_letter;
817 }
818
847 template <typename Iterator>
848 word_index_type index_no_checks(Iterator first, Iterator last) const;
849
850 // Returns the index of the word [first, last) in the suffix tree if it is
851 // contained (i.e. [first, last) is one of the words added using add_word),
852 // and UNDEFINED if not.
859 template <typename Iterator>
860 word_index_type index(Iterator first, Iterator last) const {
862 return index_no_checks(first, last);
863 }
864
895 template <typename Iterator>
896 Iterator traverse_no_checks(State& st, Iterator first, Iterator last) const;
897
904 template <typename Iterator>
905 Iterator traverse(State& st, Iterator first, Iterator last) const {
907 return traverse_no_checks(st, first, last);
908 }
909
938 template <typename Iterator>
940 Iterator last) const {
941 State st(0, 0);
942 return std::make_pair(st, traverse_no_checks(st, first, last));
943 }
944
951 template <typename Iterator>
952 std::pair<State, Iterator> traverse(Iterator first, Iterator last) const {
954 return traverse_no_checks(first, last);
955 }
956
958 // Ukkonen - validation - public
960
980 template <typename Iterator>
981 void throw_if_contains_unique_letter(Iterator first, Iterator last) const;
982
990
991 private:
993 // Ukkonen - helpers - private
995
996 bool is_real_suffix(Node const& n) const;
997
998 size_t word_length(word_index_type index) const {
999 LIBSEMIGROUPS_ASSERT(index < _word_begin.size() - 1);
1000 return (_word_begin[index + 1] - _word_begin[index]) - 1;
1001 }
1002
1004 // The following functions go, split, get_link, and tree_extend are
1005 // minimally adapted from:
1006 //
1007 // https://cp-algorithms.com/string/suffix-tree-ukkonen.html
1009
1010 // Follow the path in the tree starting at the position described by
1011 // State st, and corresponding to the range [l, r) in _word.
1012 void go(State& st, index_type l, index_type r) const;
1013
1014 // Split the node _nodes[st.v] into two nodes, the new node
1015 // with edge corresponding to
1016 //
1017 // [_nodes[st.v].l, _nodes[st.v].l + st.pos)
1018 //
1019 // and the old node with edge corresponding to
1020 //
1021 // [_nodes[st.v].l + st.pos, _nodes[st.v].r)
1022 node_index_type split(State const& st);
1023
1024 // Get the suffix link of a node by index
1025 node_index_type get_link(node_index_type v);
1026
1027 // Perform the phase starting with the pos letter of the word.
1028 void tree_extend(index_type pos);
1029 }; // class Ukkonen
1030
1034 namespace ukkonen {
1039
1046 template <typename Iterator>
1047 void add_word_no_checks(Ukkonen& u, Iterator first, Iterator last) {
1048 // TODO(later) it'd be better to just convert the values pointed at by the
1049 // iterators, than to allocate a word_type here, but this is currently a
1050 // bit tricky to set up
1051 word_type w;
1052 for (auto it = first; it != last; ++it) {
1053 w.push_back(detail::deref_as_unsigned(it));
1054 }
1055 add_word_no_checks(u, w);
1056 }
1057
1064 template <typename Word>
1065 void add_word_no_checks(Ukkonen& u, Word const& w) {
1066 add_word_no_checks(u, w.cbegin(), w.cend());
1067 }
1068
1072 void add_word_no_checks(Ukkonen& u, char const* w);
1073
1082 void add_word(Ukkonen& u, word_type const& w);
1083
1095 template <typename Iterator>
1096 void add_word(Ukkonen& u, Iterator first, Iterator last) {
1097 u.throw_if_contains_unique_letter(first, last);
1098 add_word_no_checks(u, first, last);
1099 }
1100
1109 template <typename Word>
1110 void add_word(Ukkonen& u, Word const& w) {
1111 add_word(u, w.cbegin(), w.cend());
1112 }
1113
1122 void add_word(Ukkonen& u, char const* w);
1123
1139
1159 template <typename Iterator1, typename Iterator2>
1160 void add_words_no_checks(Ukkonen& u, Iterator1 first, Iterator2 last) {
1161 for (auto it = first; it != last; ++it) {
1162 add_word_no_checks(u, *it);
1163 }
1164 }
1165
1175
1187 template <typename Iterator1, typename Iterator2>
1188 void add_words(Ukkonen& u, Iterator1 first, Iterator2 last) {
1189 for (auto it = first; it != last; ++it) {
1190 add_word(u, *it);
1191 }
1192 }
1193
1200 template <typename Word>
1201 auto traverse_no_checks(Ukkonen const& u, Word const& w) {
1202 return u.traverse_no_checks(w.cbegin(), w.cend());
1203 }
1204
1210
1215 char const* w);
1216
1228 template <typename Word>
1229 auto traverse(Ukkonen const& u, Word const& w) {
1230 return u.traverse(w.cbegin(), w.cend());
1231 }
1232
1243 traverse(Ukkonen const& u, word_type const& w);
1244
1254 char const* w);
1255
1264 template <typename Word>
1266 Ukkonen const& u,
1267 Word const& w) {
1268 return u.traverse_no_checks(st, w.cbegin(), w.cend());
1269 }
1270
1276 word_type::const_iterator traverse_no_checks(Ukkonen::State& st,
1277 Ukkonen const& u,
1278 word_type const& w);
1279
1286 Ukkonen const& u,
1287 char const* w);
1288
1302 template <typename Word>
1303 auto traverse(Ukkonen::State& st, Ukkonen const& u, Word const& w) {
1304 return u.traverse(st, w.cbegin(), w.cend());
1305 }
1306
1317 word_type::const_iterator traverse(Ukkonen::State& st,
1318 Ukkonen const& u,
1319 word_type const& w);
1320
1331 char const* traverse(Ukkonen::State& st, Ukkonen const& u, char const* w);
1332
1359 template <typename Iterator>
1360 bool is_subword_no_checks(Ukkonen const& u, Iterator first, Iterator last);
1361
1366 template <typename Word>
1367 bool is_subword_no_checks(Ukkonen const& u, Word const& w) {
1368 return is_subword_no_checks(u, w.cbegin(), w.cend());
1369 }
1370
1375 bool is_subword_no_checks(Ukkonen const& u, char const* w);
1376
1381 bool is_subword_no_checks(Ukkonen const& u, word_type const& w);
1382
1392 template <typename Iterator>
1393 bool is_subword(Ukkonen const& u, Iterator first, Iterator last) {
1394 u.throw_if_contains_unique_letter(first, last);
1395 return is_subword_no_checks(u, first, last);
1396 }
1397
1407 template <typename Word>
1408 bool is_subword(Ukkonen const& u, Word const& w) {
1409 return is_subword(u, w.cbegin(), w.cend());
1410 }
1411
1421 bool is_subword(Ukkonen const& u, char const* w);
1422
1432 bool is_subword(Ukkonen const& u, word_type const& w);
1433
1457 template <typename Iterator>
1458 bool is_suffix_no_checks(Ukkonen const& u, Iterator first, Iterator last);
1459
1464 template <typename Word>
1465 bool is_suffix_no_checks(Ukkonen const& u, Word const& w) {
1466 return is_suffix_no_checks(u, w.cbegin(), w.cend());
1467 }
1468
1473 // This function is required so that we can use initialiser list, as an
1474 // argument to is_suffix_no_checks
1475 bool is_suffix_no_checks(Ukkonen const& u, word_type const& w);
1476
1481 bool is_suffix_no_checks(Ukkonen const& u, char const* w);
1482
1492 template <typename Iterator>
1493 bool is_suffix(Ukkonen const& u, Iterator first, Iterator last) {
1494 u.throw_if_contains_unique_letter(first, last);
1495 return is_suffix_no_checks(u, first, last);
1496 }
1497
1507 template <typename Word>
1508 bool is_suffix(Ukkonen const& u, Word const& w) {
1509 return is_suffix(u, w.cbegin(), w.cend());
1510 }
1511
1521 // This function is required so that we can use initialiser list, as an
1522 // argument to is_suffix
1523 bool is_suffix(Ukkonen const& u, word_type const& w);
1524
1534 bool is_suffix(Ukkonen const& u, char const* w);
1535
1562 template <typename Iterator>
1564 Iterator first,
1565 Iterator last);
1566
1572 template <typename Word>
1573 typename Word::const_iterator
1574 maximal_piece_prefix_no_checks(Ukkonen const& u, Word const& w) {
1575 return maximal_piece_prefix_no_checks(u, w.cbegin(), w.cend());
1576 }
1577
1583 inline typename word_type::const_iterator
1585 return maximal_piece_prefix_no_checks(u, w.cbegin(), w.cend());
1586 }
1587
1593 inline char const* maximal_piece_prefix_no_checks(Ukkonen const& u,
1594 char const* w) {
1595 return maximal_piece_prefix_no_checks(u, w, w + std::strlen(w));
1596 }
1597
1608 template <typename Iterator>
1610 Iterator first,
1611 Iterator last) {
1612 u.throw_if_contains_unique_letter(first, last);
1613 return maximal_piece_prefix_no_checks(u, first, last);
1614 }
1615
1626 template <typename Word>
1627 typename Word::const_iterator maximal_piece_prefix(Ukkonen const& u,
1628 Word const& w) {
1629 return maximal_piece_prefix(u, w.cbegin(), w.cend());
1630 }
1631
1642 inline typename word_type::const_iterator
1644 return maximal_piece_prefix(u, w.cbegin(), w.cend());
1645 }
1646
1657 inline char const* maximal_piece_prefix(Ukkonen const& u, char const* w) {
1658 return maximal_piece_prefix(u, w, w + std::strlen(w));
1659 }
1660
1686 template <typename Iterator>
1688 Iterator first,
1689 Iterator last) {
1690 return std::distance(first,
1691 maximal_piece_prefix_no_checks(u, first, last));
1692 }
1693
1699 template <typename Word>
1701 Word const& w) {
1702 return length_maximal_piece_prefix_no_checks(u, w.cbegin(), w.cend());
1703 }
1704
1711 word_type const& w) {
1713 }
1714
1721 char const* w) {
1723 }
1724
1735 template <typename Iterator>
1737 Iterator first,
1738 Iterator last) {
1739 return std::distance(first, maximal_piece_prefix(u, first, last));
1740 }
1741
1752 template <typename Word>
1753 size_t length_maximal_piece_prefix(Ukkonen const& u, Word const& w) {
1754 return length_maximal_piece_prefix(u, w.cbegin(), w.cend());
1755 }
1756
1767 inline size_t length_maximal_piece_prefix(Ukkonen const& u,
1768 word_type const& w) {
1769 return length_maximal_piece_prefix(u, w.cbegin(), w.cend());
1770 }
1771
1782 inline size_t length_maximal_piece_prefix(Ukkonen const& u, char const* w) {
1783 return length_maximal_piece_prefix(u, w, w + std::strlen(w));
1784 }
1785
1811 template <typename Iterator>
1812 bool is_piece_no_checks(Ukkonen const& u, Iterator first, Iterator last) {
1813 return maximal_piece_prefix_no_checks(u, first, last) == last;
1814 }
1815
1819 template <typename Word>
1820 bool is_piece_no_checks(Ukkonen const& u, Word const& w) {
1821 return is_piece_no_checks(u, w.cbegin(), w.cend());
1822 }
1823
1826 inline bool is_piece_no_checks(Ukkonen const& u, word_type const& w) {
1827 return is_piece_no_checks(u, w.cbegin(), w.cend());
1828 }
1829
1833 inline bool is_piece_no_checks(Ukkonen const& u, char const* w) {
1834 return is_piece_no_checks(u, w, w + std::strlen(w));
1835 }
1836
1845 template <typename Iterator>
1846 bool is_piece(Ukkonen const& u, Iterator first, Iterator last) {
1847 return maximal_piece_prefix(u, first, last) == last;
1848 }
1849
1858 template <typename Word>
1859 bool is_piece(Ukkonen const& u, Word const& w) {
1860 return is_piece(u, w.cbegin(), w.cend());
1861 }
1862
1871 inline bool is_piece(Ukkonen const& u, word_type const& w) {
1872 return is_piece(u, w.cbegin(), w.cend());
1873 }
1874
1883 inline bool is_piece(Ukkonen const& u, char const* w) {
1884 return is_piece(u, w, w + std::strlen(w));
1885 }
1886
1915 template <typename Iterator>
1917 Iterator first,
1918 Iterator last);
1919
1925 template <typename Word>
1926 typename Word::const_iterator
1927 maximal_piece_suffix_no_checks(Ukkonen const& u, Word const& w) {
1928 return maximal_piece_suffix_no_checks(u, w.cbegin(), w.cend());
1929 }
1930
1936 inline typename word_type::const_iterator
1938 return maximal_piece_suffix_no_checks(u, w.cbegin(), w.cend());
1939 }
1940
1946 inline char const* maximal_piece_suffix_no_checks(Ukkonen const& u,
1947 char const* w) {
1948 return maximal_piece_suffix_no_checks(u, w, w + std::strlen(w));
1949 }
1950
1961 template <typename Iterator>
1963 Iterator first,
1964 Iterator last) {
1965 u.throw_if_contains_unique_letter(first, last);
1966 return maximal_piece_suffix_no_checks(u, first, last);
1967 }
1968
1979 template <typename Word>
1980 typename Word::const_iterator maximal_piece_suffix(Ukkonen const& u,
1981 Word const& w) {
1982 return maximal_piece_suffix(u, w.cbegin(), w.cend());
1983 }
1984
1995 inline typename word_type::const_iterator
1997 return maximal_piece_suffix(u, w.cbegin(), w.cend());
1998 }
1999
2010 inline char const* maximal_piece_suffix(Ukkonen const& u, char const* w) {
2011 return maximal_piece_suffix(u, w, w + std::strlen(w));
2012 }
2013
2039 template <typename Iterator>
2041 Iterator first,
2042 Iterator last) {
2043 return std::distance(maximal_piece_suffix_no_checks(u, first, last),
2044 last);
2045 }
2046
2052 template <typename Word>
2054 Word const& w) {
2055 return length_maximal_piece_suffix_no_checks(u, w.cbegin(), w.cend());
2056 }
2057
2064 word_type const& w) {
2066 }
2067
2074 char const* w) {
2076 }
2077
2088 template <typename Iterator>
2090 Iterator first,
2091 Iterator last) {
2092 return std::distance(maximal_piece_suffix(u, first, last), last);
2093 }
2094
2105 template <typename Word>
2106 size_t length_maximal_piece_suffix(Ukkonen const& u, Word const& w) {
2107 return length_maximal_piece_suffix(u, w.cbegin(), w.cend());
2108 }
2109
2120 inline size_t length_maximal_piece_suffix(Ukkonen const& u,
2121 word_type const& w) {
2122 return length_maximal_piece_suffix(u, w.cbegin(), w.cend());
2123 }
2124
2135 inline size_t length_maximal_piece_suffix(Ukkonen const& u, char const* w) {
2136 return length_maximal_piece_suffix(u, w, w + std::strlen(w));
2137 }
2138
2164 template <typename Iterator>
2166 Iterator first,
2167 Iterator last);
2168
2174 template <typename Word>
2175 size_t number_of_pieces_no_checks(Ukkonen const& u, Word const& w) {
2176 return number_of_pieces_no_checks(u, w.cbegin(), w.cend());
2177 }
2178
2184 inline size_t number_of_pieces_no_checks(Ukkonen const& u,
2185 word_type const& w) {
2186 return number_of_pieces_no_checks(u, w.cbegin(), w.cend());
2187 }
2188
2194 inline size_t number_of_pieces_no_checks(Ukkonen const& u, char const* w) {
2195 return number_of_pieces_no_checks(u, w, w + std::strlen(w));
2196 }
2197
2198 // clang-format off
2199 // NOLINTNEXTLINE(whitespace/line_length)
2201 // clang-format on
2210 template <typename Iterator>
2211 size_t number_of_pieces(Ukkonen const& u, Iterator first, Iterator last) {
2212 u.throw_if_contains_unique_letter(first, last);
2213 return number_of_pieces_no_checks(u, first, last);
2214 }
2215
2226 template <typename Word>
2227 size_t number_of_pieces(Ukkonen const& u, Word const& w) {
2228 return number_of_pieces(u, w.cbegin(), w.cend());
2229 }
2230
2241 inline size_t number_of_pieces(Ukkonen const& u, word_type const& w) {
2242 return number_of_pieces(u, w.cbegin(), w.cend());
2243 }
2244
2255 inline size_t number_of_pieces(Ukkonen const& u, char const* w) {
2256 return number_of_pieces(u, w, w + std::strlen(w));
2257 }
2258
2276
2303 template <typename Iterator>
2305 Iterator first,
2306 Iterator last);
2307
2311 template <typename Word>
2313
2318 word_type const& w);
2319
2324
2333 template <typename Iterator>
2335 Iterator first,
2336 Iterator last) {
2337 u.throw_if_contains_unique_letter(first, last);
2338 return pieces_no_checks(u, first, last);
2339 }
2340
2349 template <typename Word>
2350 std::vector<Word> pieces(Ukkonen const& u, Word const& w) {
2351 u.throw_if_contains_unique_letter(w.cbegin(), w.cend());
2352 return pieces_no_checks(u, w);
2353 }
2354
2363 inline std::vector<word_type> pieces(Ukkonen const& u, word_type const& w) {
2365 return pieces_no_checks(u, w);
2366 }
2367
2374 inline std::vector<std::string> pieces(Ukkonen const& u, char const* w) {
2376 return pieces_no_checks(u, w);
2377 }
2378
2396 [[nodiscard]] Dot dot(Ukkonen const& u);
2397
2424 template <typename T>
2425 auto dfs(Ukkonen const& u, T& helper);
2426
2427 } // namespace ukkonen
2428
2439 [[nodiscard]] inline std::string to_human_readable_repr(Ukkonen const& u) {
2440 using detail::group_digits;
2441 return fmt::format("<Ukkonen with {} distinct words>",
2442 group_digits(u.number_of_distinct_words()));
2443 }
2444
2457 [[nodiscard]] inline std::string
2459 std::string const& sep = "::") {
2460 using detail::group_digits;
2461 return fmt::format("<Ukkonen{}State with pos = {} and v = {}>",
2462 sep,
2463 group_digits(st.pos),
2464 group_digits(st.v));
2465 }
2466
2479 [[nodiscard]] inline std::string
2481 std::string const& sep = "::") {
2482 using detail::group_digits;
2483 return fmt::format(
2484 "<Ukkonen{}Node with {} children and parent edge label [{}, {})>",
2485 sep,
2486 group_digits(node.children.size()),
2487 group_digits(node.l),
2488 group_digits(node.r));
2489 }
2490
2492
2493} // namespace libsemigroups
2494
2495#include "ukkonen.tpp"
2496
2497#endif // LIBSEMIGROUPS_UKKONEN_HPP_
T accumulate(T... args)
T cbegin(T... args)
A representation of a graph in the DOT language of Graphviz.
Definition dot.hpp:115
For an implementation of Ukkonen's algorithm.
Definition ukkonen.hpp:90
Iterator traverse_no_checks(State &st, Iterator first, Iterator last) const
Traverse the suffix tree from the root.
Ukkonen(Ukkonen &&)
Default move constructor.
typename word_type::const_iterator const_iterator
Alias for word_type iterators.
Definition ukkonen.hpp:106
word_index_type word_index(index_type i) const
Returns the index of the word corresponding to a position.
Definition ukkonen.hpp:699
Ukkonen & operator=(Ukkonen &&)
Default move assignment.
const_iterator begin() const noexcept
Returns an iterator pointing to the first letter of the first word in the suffix tree.
Definition ukkonen.hpp:562
void throw_if_contains_unique_letter(Iterator first, Iterator last) const
Throw if the word [first, last) contains a letter equal to any of the unique letters added to the end...
size_t length_of_words() const noexcept
Returns the sum of the lengths of all of the words in the suffix tree.
size_t word_index_type
Alias for order that words were added.
Definition ukkonen.hpp:103
void add_word(const_iterator first, const_iterator last)
Check and add a word to the suffix tree.
Definition ukkonen.hpp:440
std::pair< State, Iterator > traverse_no_checks(Iterator first, Iterator last) const
Traverse the suffix tree from the root.
Definition ukkonen.hpp:939
std::vector< Node > const & nodes() const noexcept
Returns the nodes in the suffix tree.
Definition ukkonen.hpp:461
size_t distance_from_root(Node const &n) const
Returns the distance of a node from the root.
word_index_type word_index_no_checks(Node const &n) const
Returns the index of the word corresponding to a node.
Definition ukkonen.hpp:634
void add_word_no_checks(const_iterator first, const_iterator last)
Add a word to the suffix tree.
Ukkonen & init()
Initialize an existing Ukkonen object.
word_index_type index_no_checks(Iterator first, Iterator last) const
Find the index of a word in the suffix tree.
const_iterator end() const noexcept
Returns an iterator pointing one past the last letter of the last word in the suffix tree.
Definition ukkonen.hpp:596
size_t index_type
Alias for an index between begin and end.
Definition ukkonen.hpp:109
size_t max_word_length() const noexcept
Returns the maximum length of word in the suffix tree.
Definition ukkonen.hpp:545
word_index_type word_index_no_checks(index_type i) const
Returns the index of the word corresponding to a position.
Definition ukkonen.hpp:679
size_t multiplicity_no_checks(word_index_type i) const
Returns the multiplicity of a word by index.
Definition ukkonen.hpp:752
Ukkonen & operator=(Ukkonen const &)
Default copy assignment.
void throw_if_contains_unique_letter(word_type const &w) const
Throw if w contains a letter equal to any of the unique letters added to the end of words in the suff...
Definition ukkonen.hpp:987
size_t length_of_distinct_words() const noexcept
Returns the sum of the lengths of the distinct words in the suffix tree.
Definition ukkonen.hpp:495
const_iterator cend() const noexcept
Returns an iterator pointing one past the last letter of the last word in the suffix tree.
Definition ukkonen.hpp:613
word_index_type is_suffix(State const &st) const
Check if a state corresponds to a suffix.
word_index_type word_index(Node const &n) const
Returns the index of the word corresponding to a node.
Definition ukkonen.hpp:652
size_t number_of_words() const noexcept
Returns the number of non-empty words in the suffix tree.
Definition ukkonen.hpp:530
word_index_type index(Iterator first, Iterator last) const
Find the index of a word in the suffix tree.
Definition ukkonen.hpp:860
size_t unique_letter_type
Alias for any letter that is added by Ukkonen (so that unique strings end in unique letters).
Definition ukkonen.hpp:100
const_iterator cbegin() const noexcept
Returns an iterator pointing to the first letter of the first word in the suffix tree.
Definition ukkonen.hpp:579
Ukkonen(Ukkonen const &)
Default copy constructor.
Ukkonen()
Default constructor.
Iterator traverse(State &st, Iterator first, Iterator last) const
Traverse the suffix tree from the root.
Definition ukkonen.hpp:905
size_t multiplicity(word_index_type i) const
Returns the multiplicity of a word by index.
Definition ukkonen.hpp:770
std::pair< State, Iterator > traverse(Iterator first, Iterator last) const
Traverse the suffix tree from the root.
Definition ukkonen.hpp:952
unique_letter_type unique_letter(word_index_type i) const noexcept
Returns the unique letter added to the end of a word in the suffix tree.
Definition ukkonen.hpp:795
bool is_unique_letter(letter_type l) const noexcept
Check if a letter is a unique letter added to the end of a word in the suffix tree.
Definition ukkonen.hpp:815
size_t number_of_distinct_words() const noexcept
Returns the number of distinct non-empty words in the suffix tree.
Definition ukkonen.hpp:479
T distance(T... args)
T cend(T... args)
std::string to_human_readable_repr(AhoCorasick const &ac)
Return a string representation.
Undefined const UNDEFINED
Value for something undefined.
#define LIBSEMIGROUPS_EXCEPTION(...)
Throw a LibsemigroupsException.
Definition exception.hpp:99
std::vector< letter_type > word_type
Type for a word over the generators of a semigroup.
Definition types.hpp:99
size_t letter_type
Type for the index of a generator of a semigroup.
Definition types.hpp:96
T make_pair(T... args)
Namespace for Ukkonen helper functions.
Definition ukkonen.hpp:1034
Iterator maximal_piece_suffix(Ukkonen const &u, Iterator first, Iterator last)
Find the maximal suffix of a word occurring in two different places in a word in a suffix tree.
Definition ukkonen.hpp:1962
bool is_subword_no_checks(Ukkonen const &u, Iterator first, Iterator last)
Check if a word is a subword of any word in a suffix tree.
auto traverse(Ukkonen const &u, Word const &w)
Traverse the suffix tree from the root.
Definition ukkonen.hpp:1229
Iterator maximal_piece_suffix_no_checks(Ukkonen const &u, Iterator first, Iterator last)
Find the maximal suffix of a word occurring in two different places in a word in a suffix tree.
size_t length_maximal_piece_suffix_no_checks(Ukkonen const &u, Iterator first, Iterator last)
Find the length of the maximal suffix of a word occurring in two different places in a word in a suff...
Definition ukkonen.hpp:2040
auto dfs(Ukkonen const &u, T &helper)
Perform a depth first search in a suffix tree.
size_t number_of_distinct_subwords(Ukkonen const &u)
Returns the number of distinct subwords of the words in a suffix tree.
Iterator maximal_piece_prefix(Ukkonen const &u, Iterator first, Iterator last)
Find the maximal prefix of a word occurring in two different places in a word in a suffix tree.
Definition ukkonen.hpp:1609
void add_word(Ukkonen &u, word_type const &w)
Add a word to the suffix tree.
bool is_subword(Ukkonen const &u, Iterator first, Iterator last)
Check if a word is a subword of any word in a suffix tree.
Definition ukkonen.hpp:1393
Dot dot(Ukkonen const &u)
Returns a Dot object representing a suffix tree.
size_t number_of_pieces_no_checks(Ukkonen const &u, Iterator first, Iterator last)
Find the number of pieces in a decomposition of a word (if any).
std::vector< Iterator > pieces(Ukkonen const &u, Iterator first, Iterator last)
Find the pieces in a decomposition of a word (if any).
Definition ukkonen.hpp:2334
size_t length_maximal_piece_prefix(Ukkonen const &u, Iterator first, Iterator last)
Find the length of the maximal prefix of a word occurring in two different places in a word in a suff...
Definition ukkonen.hpp:1736
size_t length_maximal_piece_suffix(Ukkonen const &u, Iterator first, Iterator last)
Find the length of the maximal suffix of a word occurring in two different places in a word in a suff...
Definition ukkonen.hpp:2089
void add_words(Ukkonen &u, std::vector< word_type > const &words)
Add all words in a range to a Ukkonen object.
auto traverse_no_checks(Ukkonen const &u, Word const &w)
Traverse the suffix tree from the root.
Definition ukkonen.hpp:1201
bool is_suffix_no_checks(Ukkonen const &u, Iterator first, Iterator last)
Check if a word is a suffix of any word in a suffix tree.
std::vector< Iterator > pieces_no_checks(Ukkonen const &u, Iterator first, Iterator last)
Find the pieces in a decomposition of a word (if any).
Iterator maximal_piece_prefix_no_checks(Ukkonen const &u, Iterator first, Iterator last)
Find the maximal prefix of a word occurring in two different places in a word in a suffix tree.
bool is_piece(Ukkonen const &u, Iterator first, Iterator last)
Check if a word is a piece (occurs in two distinct places in the words of the suffix tree).
Definition ukkonen.hpp:1846
bool is_piece_no_checks(Ukkonen const &u, Iterator first, Iterator last)
Check if a word is a piece (occurs in two distinct places in the words of the suffix tree).
Definition ukkonen.hpp:1812
size_t length_maximal_piece_prefix_no_checks(Ukkonen const &u, Iterator first, Iterator last)
Find the length of the maximal prefix of a word occurring in two different places in a word in a suff...
Definition ukkonen.hpp:1687
size_t number_of_pieces(Ukkonen const &u, Iterator first, Iterator last)
Find the number of pieces in a decomposition of a word (if any).
Definition ukkonen.hpp:2211
void add_word_no_checks(Ukkonen &u, word_type const &w)
Add a word to the suffix tree.
bool is_suffix(Ukkonen const &u, Iterator first, Iterator last)
Check if a word is a suffix of any word in a suffix tree.
Definition ukkonen.hpp:1493
void add_words_no_checks(Ukkonen &u, std::vector< word_type > const &words)
Add all words in a std::vector to a Ukkonen object.
Namespace containing some operators for creating words.
Definition word-range.hpp:2095
Namespace for everything in the libsemigroups library.
Definition action.hpp:44
T push_back(T... args)
T size(T... args)
T strlen(T... args)
The type of the nodes in the tree.
Definition ukkonen.hpp:194
bool is_root() const noexcept
Returns true if the node is the root and false if not.
Definition ukkonen.hpp:337
Node & operator=(Node &&)=default
Default move assignment.
Node(Node &&)=default
Default move constructor.
size_t length() const noexcept
The length of the edge leading into the current node.
Definition ukkonen.hpp:275
node_index_type & child(letter_type c)
The index of the child node corresponding to a letter (if any).
index_type l
The index of the first letter in the edge leading to the node.
Definition ukkonen.hpp:198
index_type r
The index of one past the last letter in the edge leading to the node.
Definition ukkonen.hpp:204
std::map< letter_type, node_index_type > children
The children of the current node.
Definition ukkonen.hpp:224
Node(Node const &)=default
Default copy constructor.
node_index_type parent
The index of the parent node.
Definition ukkonen.hpp:209
Node(index_type l=0, index_type r=0, node_index_type parent=UNDEFINED)
Construct a node from left most index, right most index, and parent.
Node & operator=(Node const &)=default
Default copy assignment.
bool is_leaf() const noexcept
Returns true if the node is a leaf and false if not.
Definition ukkonen.hpp:322
node_index_type child(letter_type c) const
The index of the child node corresponding to a letter (if any).
The return type of traverse.
Definition ukkonen.hpp:121
State(node_index_type vv, edge_index_type ppos)
Construct from index and position.
Definition ukkonen.hpp:168
State & operator=(State const &)=default
Default copy assignment.
node_index_type v
The index in Ukkonen::nodes of the node at the end of the position reached.
Definition ukkonen.hpp:127
bool operator==(State const &that) const noexcept
Compare states.
Definition ukkonen.hpp:184
edge_index_type pos
The position in the edge leading to the node v reached.
Definition ukkonen.hpp:132
State()=default
Default constructor.
State(State &&)=default
Default move constructor.
State & operator=(State &&)=default
Default move assignment.
State(State const &)=default
Default copy constructor.