libsemigroups  v3.0.0
C++ library for semigroups and monoids
Loading...
Searching...
No Matches
ukkonen.hpp
1//
2// libsemigroups - C++ library for semigroups and monoids
3// Copyright (C) 2021-2025 James D. Mitchell + Maria Tsalakou
4//
5// This program is free software: you can redistribute it and/or modify
6// it under the terms of the GNU General Public License as published by
7// the Free Software Foundation, either version 3 of the License, or
8// (at your option) any later version.
9//
10// This program is distributed in the hope that it will be useful,
11// but WITHOUT ANY WARRANTY; without even the implied warranty of
12// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13// GNU General Public License for more details.
14//
15// You should have received a copy of the GNU General Public License
16// along with this program. If not, see <http://www.gnu.org/licenses/>.
17//
18
19// This file contains an implementation of a generalised suffix tree, adapted
20// from:
21//
22// https://cp-algorithms.com/string/suffix-tree-ukkonen.html
23
24// TODO(1)
25// * there's an issue if you add a word containing a very large letter, then
26// add more words so that the unique letter used is now in the word with the
27// very large letter
28// * possibly move more of the inline functions in the ukkoken namespace into
29// the cpp file. Seemingly g++-14 complains about some of these (maybe the
30// others just aren't currently tested).
31
32#ifndef LIBSEMIGROUPS_UKKONEN_HPP_
33#define LIBSEMIGROUPS_UKKONEN_HPP_
34
35#include <algorithm> // for find_if, copy, max
36#include <cstring> // for size_t, strlen
37#include <iterator> // for distance
38#include <map> // for operator!=, map, _Rb_tree_iterator
39#include <numeric> // for accumulate
40#include <stack> // for stack
41#include <string> // for allocator, string, basic_string
42#include <utility> // for pair, make_pair
43#include <vector> // for vector
44
45#include "constants.hpp" // for operator==, operator!=, Max, UNDEFINED
46#include "debug.hpp" // for LIBSEMIGROUPS_ASSERT
47#include "dot.hpp" // for Dot
48#include "exception.hpp" // for LIBSEMIGROUPS_EXCEPTION
49#include "types.hpp" // for word_type, letter_type
50
51#include "detail/fmt.hpp" // for format, format_string
52#include "detail/string.hpp" // for maximum_common_prefix
53
54namespace libsemigroups {
55
68
81 class Ukkonen {
82 // Alias for index in _nodes
83 using node_index_type = size_t;
84
85 // Alias for index in side an edge
86 using edge_index_type = size_t;
87
88 public:
91 using unique_letter_type = size_t;
92
94 using word_index_type = size_t;
95
97 using const_iterator = typename word_type::const_iterator;
98
100 using index_type = size_t;
101
103 // Ukkonen - inner classes - public
105
112 struct State {
118 node_index_type v;
119
123 edge_index_type pos;
124
128 State() = default;
129
133 State(State const&) = default;
134
138 State(State&&) = default;
139
143 State& operator=(State const&) = default;
144
148 State& operator=(State&&) = default;
149
159 State(node_index_type vv, edge_index_type ppos) : v(vv), pos(ppos) {}
160
175 bool operator==(State const& that) const noexcept {
176 return v == that.v && pos == that.pos;
177 }
178 };
179
185 struct Node {
190
196
200 node_index_type parent;
201
202#ifndef LIBSEMIGROUPS_PARSED_BY_DOXYGEN
203 // No doc
204 node_index_type link;
205 // The next member is a weak indicator of whether or not the node
206 // corresponds to a real suffix. If the value is true, then the node
207 // corresponds to a real suffix. If the value is false, then the children
208 // should be checked via is_real_suffix(Node const&) in Ukkonen.
209 mutable bool is_real_suffix;
210#endif
211
216
220 Node(Node const&) = default;
221
225 Node(Node&&) = default;
226
230 Node& operator=(Node const&) = default;
231
235 Node& operator=(Node&&) = default;
236
251 explicit Node(index_type l = 0,
252 index_type r = 0,
253 node_index_type parent = UNDEFINED);
254
266 size_t length() const noexcept {
267 return r - l;
268 }
269
284 node_index_type& child(letter_type c);
285
300 node_index_type child(letter_type c) const;
301
313 bool is_leaf() const noexcept {
314 return children.empty();
315 }
316
328 bool is_root() const noexcept {
329 return parent == UNDEFINED;
330 }
331 };
332
333 private:
335 // Ukkonen - private data
337
338 size_t _max_word_length;
339 std::vector<size_t> _multiplicity;
340 unique_letter_type _next_unique_letter;
341 std::vector<Node> _nodes;
342 State _ptr;
343 std::vector<index_type> _word_begin;
344 std::vector<index_type> _word_index_lookup;
345 word_type _word;
346
347 public:
349 // Ukkonen - constructors - public
351
359
372
377
382
387
392
393 ~Ukkonen();
394
396 // Ukkonen - initialisation - public
398
421
433 add_word_no_checks(first, last);
434 }
435
437 // Ukkonen - attributes - public
439
452 std::vector<Node> const& nodes() const noexcept {
453 return _nodes;
454 }
455
470 size_t number_of_distinct_words() const noexcept {
471 return -1 - _next_unique_letter;
472 }
473
486 size_t length_of_distinct_words() const noexcept {
487 return _word.size() - number_of_distinct_words();
488 }
489
505 size_t length_of_words() const noexcept;
506
521 size_t number_of_words() const noexcept {
522 return std::accumulate(_multiplicity.cbegin(), _multiplicity.cend(), 0);
523 }
524
536 size_t max_word_length() const noexcept {
537 return _max_word_length;
538 }
539
553 const_iterator begin() const noexcept {
554 return _word.begin();
555 }
556
570 const_iterator cbegin() const noexcept {
571 return _word.cbegin();
572 }
573
587 const_iterator end() const noexcept {
588 return _word.end();
589 }
590
604 const_iterator cend() const noexcept {
605 return _word.cend();
606 }
607
626 LIBSEMIGROUPS_ASSERT(n.parent != UNDEFINED);
627 return word_index_no_checks(n.r - 1);
628 }
629
644 if (n.parent == UNDEFINED) {
646 "expected the parent of the parameter to not be UNDEFINED");
647 }
648 return word_index_no_checks(n);
649 }
650
671 LIBSEMIGROUPS_ASSERT(i < _word.size());
672 return _word_index_lookup[i];
673 }
674
691 if (i >= _word.size()) {
693 "expected the parameter to be in the range [0, {}), found {}",
694 _word.size(),
695 i);
696 }
697 return word_index_no_checks(i);
698 }
699
713 size_t distance_from_root(Node const& n) const;
714
728
744 return _multiplicity[i];
745 }
746
762 if (i >= _multiplicity.size()) {
764 "expected the parameter to be in the range [0, {}), found {}",
765 _multiplicity.size(),
766 i);
767 }
768 return multiplicity_no_checks(i);
769 }
770
787 LIBSEMIGROUPS_ASSERT(i < number_of_distinct_words());
788 return -1 - i;
789 }
790
806 bool is_unique_letter(letter_type l) const noexcept {
807 return l >= _next_unique_letter;
808 }
809
835 template <typename Iterator>
836 word_index_type index_no_checks(Iterator first, Iterator last) const;
837
838 // Returns the index of the word [first, last) in the suffix tree if it is
839 // contained (i.e. [first, last) is one of the words added using add_word),
840 // and UNDEFINED if not.
847 template <typename Iterator>
848 word_index_type index(Iterator first, Iterator last) const {
850 return index_no_checks(first, last);
851 }
852
880 template <typename Iterator>
881 Iterator traverse_no_checks(State& st, Iterator first, Iterator last) const;
882
889 template <typename Iterator>
890 Iterator traverse(State& st, Iterator first, Iterator last) const {
892 return traverse_no_checks(st, first, last);
893 }
894
920 template <typename Iterator>
922 Iterator last) const {
923 State st(0, 0);
924 return std::make_pair(st, traverse(st, first, last));
925 }
926
933 template <typename Iterator>
934 std::pair<State, Iterator> traverse(Iterator first, Iterator last) const {
936 return traverse_no_checks(first, last);
937 }
938
940 // Ukkonen - validation - public
942
959 template <typename Iterator>
960 void throw_if_contains_unique_letter(Iterator first, Iterator last) const;
961
969
970 private:
972 // Ukkonen - helpers - private
974
975 bool is_real_suffix(Node const& n) const;
976
977 size_t word_length(word_index_type index) const {
978 LIBSEMIGROUPS_ASSERT(index < _word_begin.size() - 1);
979 return (_word_begin[index + 1] - _word_begin[index]) - 1;
980 }
981
983 // The following functions go, split, get_link, and tree_extend are
984 // minimally adapted from:
985 //
986 // https://cp-algorithms.com/string/suffix-tree-ukkonen.html
988
989 // Follow the path in the tree starting at the position described by
990 // State st, and corresponding to the range [l, r) in _word.
991 void go(State& st, index_type l, index_type r) const;
992
993 // Split the node _nodes[st.v] into two nodes, the new node
994 // with edge corresponding to
995 //
996 // [_nodes[st.v].l, _nodes[st.v].l + st.pos)
997 //
998 // and the old node with edge corresponding to
999 //
1000 // [_nodes[st.v].l + st.pos, _nodes[st.v].r)
1001 node_index_type split(State const& st);
1002
1003 // Get the suffix link of a node by index
1004 node_index_type get_link(node_index_type v);
1005
1006 // Perform the phase starting with the pos letter of the word.
1007 void tree_extend(index_type pos);
1008 };
1009
1013 namespace ukkonen {
1018
1022 template <typename Iterator>
1023 void add_word_no_checks(Ukkonen& u, Iterator first, Iterator last) {
1024 // TODO(later) it'd be better to just convert the values pointed at by the
1025 // iterators, than to allocate a word_type here, but this is currently a
1026 // bit tricky to set up
1027 add_word_no_checks(u, word_type(first, last));
1028 }
1029
1033 template <typename Word>
1034 void add_word_no_checks(Ukkonen& u, Word const& w) {
1035 add_word_no_checks(u, w.cbegin(), w.cend());
1036 }
1037
1041 void add_word_no_checks(Ukkonen& u, char const* w);
1042
1051 void add_word(Ukkonen& u, word_type const& w);
1052
1061 template <typename Iterator>
1062 void add_word(Ukkonen& u, Iterator first, Iterator last) {
1063 // TODO(later) it'd be better to just convert the values pointed at by the
1064 // iterators, than to allocate a word_type here, but this is currently a
1065 // bit tricky to set up
1066 add_word(u, word_type(first, last));
1067 }
1068
1077 template <typename Word>
1078 void add_word(Ukkonen& u, Word const& w) {
1079 add_word(u, w.cbegin(), w.cend());
1080 }
1081
1090 void add_word(Ukkonen& u, char const* w);
1091
1107
1124 template <typename Iterator1, typename Iterator2>
1125 void add_words_no_checks(Ukkonen& u, Iterator1 first, Iterator2 last) {
1126 for (auto it = first; it != last; ++it) {
1127 add_word_no_checks(u, *it);
1128 }
1129 }
1130
1140
1149 template <typename Iterator1, typename Iterator2>
1150 void add_words(Ukkonen& u, Iterator1 first, Iterator2 last) {
1151 for (auto it = first; it != last; ++it) {
1152 add_word(u, *it);
1153 }
1154 }
1155
1159 template <typename Word>
1160 auto traverse_no_checks(Ukkonen const& u, Word const& w) {
1161 return u.traverse_no_checks(w.cbegin(), w.cend());
1162 }
1163
1169
1174 char const* w);
1175
1184 template <typename Word>
1185 auto traverse(Ukkonen const& u, Word const& w) {
1186 return u.traverse(w.cbegin(), w.cend());
1187 }
1188
1199 traverse(Ukkonen const& u, word_type const& w);
1200
1210 char const* w);
1211
1217 template <typename Word>
1219 Ukkonen const& u,
1220 Word const& w) {
1221 return u.traverse_no_checks(st, w.cbegin(), w.cend());
1222 }
1223
1229 word_type::const_iterator traverse_no_checks(Ukkonen::State& st,
1230 Ukkonen const& u,
1231 word_type const& w);
1232
1239 Ukkonen const& u,
1240 char const* w);
1241
1252 template <typename Word>
1253 auto traverse(Ukkonen::State& st, Ukkonen const& u, Word const& w) {
1254 return u.traverse(st, w.cbegin(), w.cend());
1255 }
1256
1267 word_type::const_iterator traverse(Ukkonen::State& st,
1268 Ukkonen const& u,
1269 word_type const& w);
1270
1281 char const* traverse(Ukkonen::State& st, Ukkonen const& u, char const* w);
1282
1306 template <typename Iterator>
1307 bool is_subword_no_checks(Ukkonen const& u, Iterator first, Iterator last);
1308
1313 template <typename Word>
1314 bool is_subword_no_checks(Ukkonen const& u, Word const& w) {
1315 return is_subword_no_checks(u, w.cbegin(), w.cend());
1316 }
1317
1322 bool is_subword_no_checks(Ukkonen const& u, char const* w);
1323
1328 bool is_subword_no_checks(Ukkonen const& u, word_type const& w);
1329
1339 template <typename Iterator>
1340 bool is_subword(Ukkonen const& u, Iterator first, Iterator last) {
1341 u.throw_if_contains_unique_letter(first, last);
1342 return is_subword_no_checks(u, first, last);
1343 }
1344
1354 template <typename Word>
1355 bool is_subword(Ukkonen const& u, Word const& w) {
1356 return is_subword(u, w.cbegin(), w.cend());
1357 }
1358
1368 bool is_subword(Ukkonen const& u, char const* w);
1369
1379 bool is_subword(Ukkonen const& u, word_type const& w);
1380
1404 template <typename Iterator>
1405 bool is_suffix_no_checks(Ukkonen const& u, Iterator first, Iterator last);
1406
1411 template <typename Word>
1412 bool is_suffix_no_checks(Ukkonen const& u, Word const& w) {
1413 return is_suffix_no_checks(u, w.cbegin(), w.cend());
1414 }
1415
1420 // This function is required so that we can use initialiser list, as an
1421 // argument to is_suffix_no_checks
1422 bool is_suffix_no_checks(Ukkonen const& u, word_type const& w);
1423
1428 bool is_suffix_no_checks(Ukkonen const& u, char const* w);
1429
1439 template <typename Iterator>
1440 bool is_suffix(Ukkonen const& u, Iterator first, Iterator last) {
1441 u.throw_if_contains_unique_letter(first, last);
1442 return is_suffix_no_checks(u, first, last);
1443 }
1444
1454 template <typename Word>
1455 bool is_suffix(Ukkonen const& u, Word const& w) {
1456 return is_suffix(u, w.cbegin(), w.cend());
1457 }
1458
1468 // This function is required so that we can use initialiser list, as an
1469 // argument to is_suffix
1470 bool is_suffix(Ukkonen const& u, word_type const& w);
1471
1481 bool is_suffix(Ukkonen const& u, char const* w);
1482
1509 template <typename Iterator>
1511 Iterator first,
1512 Iterator last);
1513
1519 template <typename Word>
1520 typename Word::const_iterator
1521 maximal_piece_prefix_no_checks(Ukkonen const& u, Word const& w) {
1522 return maximal_piece_prefix_no_checks(u, w.cbegin(), w.cend());
1523 }
1524
1530 inline typename word_type::const_iterator
1532 return maximal_piece_prefix_no_checks(u, w.cbegin(), w.cend());
1533 }
1534
1540 inline char const* maximal_piece_prefix_no_checks(Ukkonen const& u,
1541 char const* w) {
1542 return maximal_piece_prefix_no_checks(u, w, w + std::strlen(w));
1543 }
1544
1555 template <typename Iterator>
1557 Iterator first,
1558 Iterator last) {
1559 u.throw_if_contains_unique_letter(first, last);
1560 return maximal_piece_prefix_no_checks(u, first, last);
1561 }
1562
1573 template <typename Word>
1574 typename Word::const_iterator maximal_piece_prefix(Ukkonen const& u,
1575 Word const& w) {
1576 return maximal_piece_prefix(u, w.cbegin(), w.cend());
1577 }
1578
1589 inline typename word_type::const_iterator
1591 return maximal_piece_prefix(u, w.cbegin(), w.cend());
1592 }
1593
1604 inline char const* maximal_piece_prefix(Ukkonen const& u, char const* w) {
1605 return maximal_piece_prefix(u, w, w + std::strlen(w));
1606 }
1607
1633 template <typename Iterator>
1635 Iterator first,
1636 Iterator last) {
1637 return std::distance(first,
1638 maximal_piece_prefix_no_checks(u, first, last));
1639 }
1640
1646 template <typename Word>
1648 Word const& w) {
1649 return length_maximal_piece_prefix_no_checks(u, w.cbegin(), w.cend());
1650 }
1651
1658 word_type const& w) {
1660 }
1661
1668 char const* w) {
1670 }
1671
1682 template <typename Iterator>
1684 Iterator first,
1685 Iterator last) {
1686 return std::distance(first, maximal_piece_prefix(u, first, last));
1687 }
1688
1699 template <typename Word>
1700 size_t length_maximal_piece_prefix(Ukkonen const& u, Word const& w) {
1701 return length_maximal_piece_prefix(u, w.cbegin(), w.cend());
1702 }
1703
1714 inline size_t length_maximal_piece_prefix(Ukkonen const& u,
1715 word_type const& w) {
1716 return length_maximal_piece_prefix(u, w.cbegin(), w.cend());
1717 }
1718
1729 inline size_t length_maximal_piece_prefix(Ukkonen const& u, char const* w) {
1730 return length_maximal_piece_prefix(u, w, w + std::strlen(w));
1731 }
1732
1758 template <typename Iterator>
1759 bool is_piece_no_checks(Ukkonen const& u, Iterator first, Iterator last) {
1760 return maximal_piece_prefix_no_checks(u, first, last) == last;
1761 }
1762
1766 template <typename Word>
1767 bool is_piece_no_checks(Ukkonen const& u, Word const& w) {
1768 return is_piece_no_checks(u, w.cbegin(), w.cend());
1769 }
1770
1773 inline bool is_piece_no_checks(Ukkonen const& u, word_type const& w) {
1774 return is_piece_no_checks(u, w.cbegin(), w.cend());
1775 }
1776
1780 inline bool is_piece_no_checks(Ukkonen const& u, char const* w) {
1781 return is_piece_no_checks(u, w, w + std::strlen(w));
1782 }
1783
1792 template <typename Iterator>
1793 bool is_piece(Ukkonen const& u, Iterator first, Iterator last) {
1794 return maximal_piece_prefix(u, first, last) == last;
1795 }
1796
1805 template <typename Word>
1806 bool is_piece(Ukkonen const& u, Word const& w) {
1807 return is_piece(u, w.cbegin(), w.cend());
1808 }
1809
1818 inline bool is_piece(Ukkonen const& u, word_type const& w) {
1819 return is_piece(u, w.cbegin(), w.cend());
1820 }
1821
1830 inline bool is_piece(Ukkonen const& u, char const* w) {
1831 return is_piece(u, w, w + std::strlen(w));
1832 }
1833
1862 template <typename Iterator>
1864 Iterator first,
1865 Iterator last);
1866
1872 template <typename Word>
1873 typename Word::const_iterator
1874 maximal_piece_suffix_no_checks(Ukkonen const& u, Word const& w) {
1875 return maximal_piece_suffix_no_checks(u, w.cbegin(), w.cend());
1876 }
1877
1883 inline typename word_type::const_iterator
1885 return maximal_piece_suffix_no_checks(u, w.cbegin(), w.cend());
1886 }
1887
1893 inline char const* maximal_piece_suffix_no_checks(Ukkonen const& u,
1894 char const* w) {
1895 return maximal_piece_suffix_no_checks(u, w, w + std::strlen(w));
1896 }
1897
1908 template <typename Iterator>
1910 Iterator first,
1911 Iterator last) {
1912 u.throw_if_contains_unique_letter(first, last);
1913 return maximal_piece_suffix_no_checks(u, first, last);
1914 }
1915
1926 template <typename Word>
1927 typename Word::const_iterator maximal_piece_suffix(Ukkonen const& u,
1928 Word const& w) {
1929 return maximal_piece_suffix(u, w.cbegin(), w.cend());
1930 }
1931
1942 inline typename word_type::const_iterator
1944 return maximal_piece_suffix(u, w.cbegin(), w.cend());
1945 }
1946
1957 inline char const* maximal_piece_suffix(Ukkonen const& u, char const* w) {
1958 return maximal_piece_suffix(u, w, w + std::strlen(w));
1959 }
1960
1986 template <typename Iterator>
1988 Iterator first,
1989 Iterator last) {
1990 return std::distance(maximal_piece_suffix_no_checks(u, first, last),
1991 last);
1992 }
1993
1999 template <typename Word>
2001 Word const& w) {
2002 return length_maximal_piece_suffix_no_checks(u, w.cbegin(), w.cend());
2003 }
2004
2011 word_type const& w) {
2013 }
2014
2021 char const* w) {
2023 }
2024
2035 template <typename Iterator>
2037 Iterator first,
2038 Iterator last) {
2039 return std::distance(maximal_piece_suffix(u, first, last), last);
2040 }
2041
2052 template <typename Word>
2053 size_t length_maximal_piece_suffix(Ukkonen const& u, Word const& w) {
2054 return length_maximal_piece_suffix(u, w.cbegin(), w.cend());
2055 }
2056
2067 inline size_t length_maximal_piece_suffix(Ukkonen const& u,
2068 word_type const& w) {
2069 return length_maximal_piece_suffix(u, w.cbegin(), w.cend());
2070 }
2071
2082 inline size_t length_maximal_piece_suffix(Ukkonen const& u, char const* w) {
2083 return length_maximal_piece_suffix(u, w, w + std::strlen(w));
2084 }
2085
2111 template <typename Iterator>
2113 Iterator first,
2114 Iterator last);
2115
2121 template <typename Word>
2122 size_t number_of_pieces_no_checks(Ukkonen const& u, Word const& w) {
2123 return number_of_pieces_no_checks(u, w.cbegin(), w.cend());
2124 }
2125
2131 inline size_t number_of_pieces_no_checks(Ukkonen const& u,
2132 word_type const& w) {
2133 return number_of_pieces_no_checks(u, w.cbegin(), w.cend());
2134 }
2135
2141 inline size_t number_of_pieces_no_checks(Ukkonen const& u, char const* w) {
2142 return number_of_pieces_no_checks(u, w, w + std::strlen(w));
2143 }
2144
2145 // clang-format off
2146 // NOLINTNEXTLINE(whitespace/line_length)
2148 // clang-format on
2157 template <typename Iterator>
2158 size_t number_of_pieces(Ukkonen const& u, Iterator first, Iterator last) {
2159 u.throw_if_contains_unique_letter(first, last);
2160 return number_of_pieces_no_checks(u, first, last);
2161 }
2162
2173 template <typename Word>
2174 size_t number_of_pieces(Ukkonen const& u, Word const& w) {
2175 return number_of_pieces(u, w.cbegin(), w.cend());
2176 }
2177
2188 inline size_t number_of_pieces(Ukkonen const& u, word_type const& w) {
2189 return number_of_pieces(u, w.cbegin(), w.cend());
2190 }
2191
2202 inline size_t number_of_pieces(Ukkonen const& u, char const* w) {
2203 return number_of_pieces(u, w, w + std::strlen(w));
2204 }
2205
2223
2250 template <typename Iterator>
2252 Iterator first,
2253 Iterator last);
2254
2258 template <typename Word>
2260
2265 word_type const& w);
2266
2271
2280 template <typename Iterator>
2282 Iterator first,
2283 Iterator last) {
2284 u.throw_if_contains_unique_letter(first, last);
2285 return pieces_no_checks(u, first, last);
2286 }
2287
2296 template <typename Word>
2297 std::vector<Word> pieces(Ukkonen const& u, Word const& w) {
2298 u.throw_if_contains_unique_letter(w.cbegin(), w.cend());
2299 return pieces_no_checks(u, w);
2300 }
2301
2310 inline std::vector<word_type> pieces(Ukkonen const& u, word_type const& w) {
2312 return pieces_no_checks(u, w);
2313 }
2314
2321 inline std::vector<std::string> pieces(Ukkonen const& u, char const* w) {
2323 return pieces_no_checks(u, w);
2324 }
2325
2343 [[nodiscard]] Dot dot(Ukkonen const& u);
2344
2371 template <typename T>
2372 auto dfs(Ukkonen const& u, T& helper);
2373
2374 } // namespace ukkonen
2375
2386 [[nodiscard]] inline std::string to_human_readable_repr(Ukkonen const& u) {
2387 return fmt::format("<Ukkonen with {} distinct words>",
2389 }
2390
2403 [[nodiscard]] inline std::string
2405 std::string const& sep = "::") {
2406 return fmt::format(
2407 "<Ukkonen{}State with pos = {} and v = {}>", sep, st.pos, st.v);
2408 }
2409
2422 [[nodiscard]] inline std::string
2424 std::string const& sep = "::") {
2425 return fmt::format(
2426 "<Ukkonen{}Node with {} children and parent edge label [{}, {})>",
2427 sep,
2428 node.children.size(),
2429 node.l,
2430 node.r);
2431 }
2432
2434
2435} // namespace libsemigroups
2436
2437#include "ukkonen.tpp"
2438
2439#endif // LIBSEMIGROUPS_UKKONEN_HPP_
T accumulate(T... args)
T cbegin(T... args)
A representation of a graph in the DOT language of Graphviz.
Definition dot.hpp:115
For an implementation of Ukkonen's algorithm.
Definition ukkonen.hpp:81
Iterator traverse_no_checks(State &st, Iterator first, Iterator last) const
Traverse the suffix tree from the root.
Ukkonen(Ukkonen &&)
Default move constructor.
typename word_type::const_iterator const_iterator
Alias for word_type iterators.
Definition ukkonen.hpp:97
word_index_type word_index(index_type i) const
Returns the index of the word corresponding to a position.
Definition ukkonen.hpp:690
Ukkonen & operator=(Ukkonen &&)
Default move assignment.
const_iterator begin() const noexcept
Returns an iterator pointing to the first letter of the first word in the suffix tree.
Definition ukkonen.hpp:553
void throw_if_contains_unique_letter(Iterator first, Iterator last) const
Throw if the word [first, last) contains a letter equal to any of the unique letters added to the end...
size_t length_of_words() const noexcept
Returns the sum of the lengths of all of the words in the suffix tree.
size_t word_index_type
Alias for order that words were added.
Definition ukkonen.hpp:94
void add_word(const_iterator first, const_iterator last)
Check and add a word to the suffix tree.
Definition ukkonen.hpp:431
std::pair< State, Iterator > traverse_no_checks(Iterator first, Iterator last) const
Traverse the suffix tree from the root.
Definition ukkonen.hpp:921
std::vector< Node > const & nodes() const noexcept
Returns the nodes in the suffix tree.
Definition ukkonen.hpp:452
size_t distance_from_root(Node const &n) const
Returns the distance of a node from the root.
word_index_type word_index_no_checks(Node const &n) const
Returns the index of the word corresponding to a node.
Definition ukkonen.hpp:625
void add_word_no_checks(const_iterator first, const_iterator last)
Add a word to the suffix tree.
Ukkonen & init()
Initialize an existing Ukkonen object.
word_index_type index_no_checks(Iterator first, Iterator last) const
Find the index of a word in the suffix tree.
const_iterator end() const noexcept
Returns an iterator pointing one past the last letter of the last word in the suffix tree.
Definition ukkonen.hpp:587
size_t index_type
Alias for an index between begin and end.
Definition ukkonen.hpp:100
size_t max_word_length() const noexcept
Returns the maximum length of word in the suffix tree.
Definition ukkonen.hpp:536
word_index_type word_index_no_checks(index_type i) const
Returns the index of the word corresponding to a position.
Definition ukkonen.hpp:670
size_t multiplicity_no_checks(word_index_type i) const
Returns the multiplicity of a word by index.
Definition ukkonen.hpp:743
Ukkonen & operator=(Ukkonen const &)
Default copy assignment.
void throw_if_contains_unique_letter(word_type const &w) const
Throw if w contains a letter equal to any of the unique letters added to the end of words in the suff...
Definition ukkonen.hpp:966
size_t length_of_distinct_words() const noexcept
Returns the sum of the lengths of the distinct words in the suffix tree.
Definition ukkonen.hpp:486
const_iterator cend() const noexcept
Returns an iterator pointing one past the last letter of the last word in the suffix tree.
Definition ukkonen.hpp:604
word_index_type is_suffix(State const &st) const
Check if a state corresponds to a suffix.
word_index_type word_index(Node const &n) const
Returns the index of the word corresponding to a node.
Definition ukkonen.hpp:643
size_t number_of_words() const noexcept
Returns the number of non-empty words in the suffix tree.
Definition ukkonen.hpp:521
word_index_type index(Iterator first, Iterator last) const
Find the index of a word in the suffix tree.
Definition ukkonen.hpp:848
size_t unique_letter_type
Alias for any letter that is added by Ukkonen (so that unique strings end in unique letters).
Definition ukkonen.hpp:91
const_iterator cbegin() const noexcept
Returns an iterator pointing to the first letter of the first word in the suffix tree.
Definition ukkonen.hpp:570
Ukkonen(Ukkonen const &)
Default copy constructor.
Ukkonen()
Default constructor.
Iterator traverse(State &st, Iterator first, Iterator last) const
Traverse the suffix tree from the root.
Definition ukkonen.hpp:890
size_t multiplicity(word_index_type i) const
Returns the multiplicity of a word by index.
Definition ukkonen.hpp:761
std::pair< State, Iterator > traverse(Iterator first, Iterator last) const
Traverse the suffix tree from the root.
Definition ukkonen.hpp:934
unique_letter_type unique_letter(word_index_type i) const noexcept
Returns the unique letter added to the end of a word in the suffix tree.
Definition ukkonen.hpp:786
bool is_unique_letter(letter_type l) const noexcept
Check if a letter is a unique letter added to the end of a word in the suffix tree.
Definition ukkonen.hpp:806
size_t number_of_distinct_words() const noexcept
Returns the number of distinct non-empty words in the suffix tree.
Definition ukkonen.hpp:470
T distance(T... args)
T cend(T... args)
std::string to_human_readable_repr(AhoCorasick const &ac)
Return a string representation.
Undefined const UNDEFINED
Value for something undefined.
#define LIBSEMIGROUPS_EXCEPTION(...)
Throw a LibsemigroupsException.
Definition exception.hpp:95
std::vector< letter_type > word_type
Type for a word over the generators of a semigroup.
Definition types.hpp:101
size_t letter_type
Type for the index of a generator of a semigroup.
Definition types.hpp:98
T make_pair(T... args)
Namespace for Ukkonen helper functions.
Definition ukkonen.hpp:1013
Iterator maximal_piece_suffix(Ukkonen const &u, Iterator first, Iterator last)
Find the maximal suffix of a word occurring in two different places in a word in a suffix tree.
Definition ukkonen.hpp:1909
bool is_subword_no_checks(Ukkonen const &u, Iterator first, Iterator last)
Check if a word is a subword of any word in a suffix tree.
auto traverse(Ukkonen const &u, Word const &w)
Traverse the suffix tree from the root.
Definition ukkonen.hpp:1185
Iterator maximal_piece_suffix_no_checks(Ukkonen const &u, Iterator first, Iterator last)
Find the maximal suffix of a word occurring in two different places in a word in a suffix tree.
size_t length_maximal_piece_suffix_no_checks(Ukkonen const &u, Iterator first, Iterator last)
Find the length of the maximal suffix of a word occurring in two different places in a word in a suff...
Definition ukkonen.hpp:1987
auto dfs(Ukkonen const &u, T &helper)
Perform a depth first search in a suffix tree.
size_t number_of_distinct_subwords(Ukkonen const &u)
Returns the number of distinct subwords of the words in a suffix tree.
Iterator maximal_piece_prefix(Ukkonen const &u, Iterator first, Iterator last)
Find the maximal prefix of a word occurring in two different places in a word in a suffix tree.
Definition ukkonen.hpp:1556
void add_word(Ukkonen &u, word_type const &w)
Add a word to the suffix tree.
bool is_subword(Ukkonen const &u, Iterator first, Iterator last)
Check if a word is a subword of any word in a suffix tree.
Definition ukkonen.hpp:1340
Dot dot(Ukkonen const &u)
Returns a Dot object representing a suffix tree.
size_t number_of_pieces_no_checks(Ukkonen const &u, Iterator first, Iterator last)
Find the number of pieces in a decomposition of a word (if any).
std::vector< Iterator > pieces(Ukkonen const &u, Iterator first, Iterator last)
Find the pieces in a decomposition of a word (if any).
Definition ukkonen.hpp:2281
size_t length_maximal_piece_prefix(Ukkonen const &u, Iterator first, Iterator last)
Find the length of the maximal prefix of a word occurring in two different places in a word in a suff...
Definition ukkonen.hpp:1683
size_t length_maximal_piece_suffix(Ukkonen const &u, Iterator first, Iterator last)
Find the length of the maximal suffix of a word occurring in two different places in a word in a suff...
Definition ukkonen.hpp:2036
void add_words(Ukkonen &u, std::vector< word_type > const &words)
Add all words in a range to a Ukkonen object.
auto traverse_no_checks(Ukkonen const &u, Word const &w)
Traverse the suffix tree from the root.
Definition ukkonen.hpp:1160
bool is_suffix_no_checks(Ukkonen const &u, Iterator first, Iterator last)
Check if a word is a suffix of any word in a suffix tree.
std::vector< Iterator > pieces_no_checks(Ukkonen const &u, Iterator first, Iterator last)
Find the pieces in a decomposition of a word (if any).
Iterator maximal_piece_prefix_no_checks(Ukkonen const &u, Iterator first, Iterator last)
Find the maximal prefix of a word occurring in two different places in a word in a suffix tree.
bool is_piece(Ukkonen const &u, Iterator first, Iterator last)
Check if a word is a piece (occurs in two distinct places in the words of the suffix tree).
Definition ukkonen.hpp:1793
bool is_piece_no_checks(Ukkonen const &u, Iterator first, Iterator last)
Check if a word is a piece (occurs in two distinct places in the words of the suffix tree).
Definition ukkonen.hpp:1759
size_t length_maximal_piece_prefix_no_checks(Ukkonen const &u, Iterator first, Iterator last)
Find the length of the maximal prefix of a word occurring in two different places in a word in a suff...
Definition ukkonen.hpp:1634
size_t number_of_pieces(Ukkonen const &u, Iterator first, Iterator last)
Find the number of pieces in a decomposition of a word (if any).
Definition ukkonen.hpp:2158
void add_word_no_checks(Ukkonen &u, word_type const &w)
Add a word to the suffix tree.
bool is_suffix(Ukkonen const &u, Iterator first, Iterator last)
Check if a word is a suffix of any word in a suffix tree.
Definition ukkonen.hpp:1440
void add_words_no_checks(Ukkonen &u, std::vector< word_type > const &words)
Add all words in a std::vector to a Ukkonen object.
Namespace containing some operators for creating words.
Definition word-range.hpp:2072
Namespace for everything in the libsemigroups library.
Definition action.hpp:44
T size(T... args)
T strlen(T... args)
The type of the nodes in the tree.
Definition ukkonen.hpp:185
bool is_root() const noexcept
Returns true if the node is the root and false if not.
Definition ukkonen.hpp:328
Node & operator=(Node &&)=default
Default move assignment.
Node(Node &&)=default
Default move constructor.
size_t length() const noexcept
The length of the edge leading into the current node.
Definition ukkonen.hpp:266
node_index_type & child(letter_type c)
The index of the child node corresponding to a letter (if any).
index_type l
The index of the first letter in the edge leading to the node.
Definition ukkonen.hpp:189
index_type r
The index of one past the last letter in the edge leading to the node.
Definition ukkonen.hpp:195
std::map< letter_type, node_index_type > children
The children of the current node.
Definition ukkonen.hpp:215
Node(Node const &)=default
Default copy constructor.
node_index_type parent
The index of the parent node.
Definition ukkonen.hpp:200
Node(index_type l=0, index_type r=0, node_index_type parent=UNDEFINED)
Construct a node from left most index, right most index, and parent.
Node & operator=(Node const &)=default
Default copy assignment.
bool is_leaf() const noexcept
Returns true if the node is a leaf and false if not.
Definition ukkonen.hpp:313
node_index_type child(letter_type c) const
The index of the child node corresponding to a letter (if any).
The return type of traverse.
Definition ukkonen.hpp:112
State(node_index_type vv, edge_index_type ppos)
Construct from index and position.
Definition ukkonen.hpp:159
State & operator=(State const &)=default
Default copy assignment.
node_index_type v
The index in Ukkonen::nodes of the node at the end of the position reached.
Definition ukkonen.hpp:118
bool operator==(State const &that) const noexcept
Compare states.
Definition ukkonen.hpp:175
edge_index_type pos
The position in the edge leading to the node v reached.
Definition ukkonen.hpp:123
State()=default
Default constructor.
State(State &&)=default
Default move constructor.
State & operator=(State &&)=default
Default move assignment.
State(State const &)=default
Default copy constructor.