libsemigroups  v3.1.2
C++ library for semigroups and monoids
Loading...
Searching...
No Matches
aho-corasick.hpp
1//
2// libsemigroups - C++ library for semigroups and monoids
3// Copyright (C) 2023-2025 Joseph Edwards + James D. Mitchell
4//
5// This program is free software: you can redistribute it and/or modify
6// it under the terms of the GNU General Public License as published by
7// the Free Software Foundation, either version 3 of the License, or
8// (at your option) any later version.
9//
10// This program is distributed in the hope that it will be useful,
11// but WITHOUT ANY WARRANTY; without even the implied warranty of
12// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13// GNU General Public License for more details.
14//
15// You should have received a copy of the GNU General Public License
16// along with this program. If not, see <http://www.gnu.org/licenses/>.
17//
18
19// This file contains the implementation of a trie with suffix links for use by
20// the Aho-Corasick dictionary search algorithm
21
22#ifndef LIBSEMIGROUPS_AHO_CORASICK_HPP_
23#define LIBSEMIGROUPS_AHO_CORASICK_HPP_
24
25#include <cstddef> // for size_t
26#include <memory> // for allocator_traits<>::value_type
27#include <set> // for set
28#include <stack> // for stack
29#include <string> // for string
30#include <unordered_map> // for unordered_map
31#include <vector> // for vector
32
33#include "constants.hpp" // for Undefined, operator!=, UNDEFINED, operator==
34#include "debug.hpp" // for LIBSEMIGROUPS_ASSERT
35#include "exception.hpp" // for LIBSEMIGROUPS_EXCEPTION
36#include "ranges.hpp" // for rx::iterator_range
37#include "types.hpp" // for letter_type, word_type
38
39#include "detail/print.hpp" // for to_printable
40
41// TODO(2) make nodes accessible as indices of some list (numbered nodes).
42// Make sure this address the badness of traversals (lots of different data
43// types and it just feels a bit hacky)
44// TODO(2) is it worthwhile storing a pointer to the terminal nodes beneath
45// each node? If this can be updated quickly, it would save a lot of time in
46// overlap/confluence checking. One compromise is to have a pointer to the rules
47// any given node is contained within. This could be updated easily when adding
48// new rules, but more care would be needed when removing rules.
49// TODO(1) change names from set_X and get_X to X(val) and X(). e.g.
50// set_suffix_link('a') -> suffix_link('a')
51// TODO(2) add something that gets a ranges element to find all terminal nodes.
52// TODO(2) change all_nodes[i] to node_no_checks(i);
53
58namespace libsemigroups {
59
60 class Dot; // forward decl
61
83 public:
85 using index_type = size_t;
86
88 using const_iterator = typename word_type::const_iterator;
89
91 static constexpr index_type root = 0;
92
93 private:
94 // Implements the node of a trie
95 class Node {
96 private:
98 mutable index_type _link;
99 mutable size_t _height;
100 index_type _parent;
101 letter_type _parent_letter;
102 bool _terminal;
103
104 public:
105 Node() : Node(UNDEFINED, UNDEFINED) {}
106
107 Node(index_type parent, letter_type a)
108 : _children(),
109 _link(),
110 _height(),
111 _parent(),
112 _parent_letter(),
113 _terminal() {
114 init(parent, a);
115 }
116
117 Node(const Node&) = default;
118 Node& operator=(const Node&) = default;
119 Node(Node&&) = default;
120 Node& operator=(Node&&) = default;
121
122 Node& init(index_type parent, letter_type a) noexcept;
123
124 Node& init() noexcept {
125 return init(UNDEFINED, UNDEFINED);
126 }
127
128 [[nodiscard]] index_type child(letter_type a) const;
129
130 [[nodiscard]] size_t height() const noexcept {
131 return _height;
132 }
133
134 [[nodiscard]] index_type suffix_link() const noexcept {
135 return _link;
136 }
137
138 void set_suffix_link(index_type val) const noexcept {
139 _link = val;
140 }
141
142 void clear_suffix_link() const noexcept;
143
144 [[nodiscard]] decltype(_children)& children() const noexcept {
145 return _children;
146 }
147
148 [[nodiscard]] size_t number_of_children() const noexcept {
149 return _children.size();
150 }
151
152 [[nodiscard]] bool is_terminal() const noexcept {
153 return _terminal;
154 }
155
156 Node& set_terminal(bool val) noexcept {
157 _terminal = val;
158 return *this;
159 }
160
161 [[nodiscard]] index_type parent() const noexcept {
162 return _parent;
163 }
164
165 [[nodiscard]] letter_type parent_letter() const noexcept {
166 return _parent_letter;
167 }
168
169 void set_height(size_t val) const noexcept {
170 _height = val;
171 }
172 };
173
174 std::vector<Node> _all_nodes;
175 std::set<index_type> _active_nodes_index;
176 std::stack<index_type> _inactive_nodes_index;
177 mutable bool _valid_links;
178
179 public:
185
189 AhoCorasick(AhoCorasick const&) = default;
190
195
200
205
206 ~AhoCorasick();
207
221
234 [[nodiscard]] size_t number_of_nodes() const noexcept {
235 return _active_nodes_index.size();
236 }
237
250 [[nodiscard]] rx::iterator_range<std::set<index_type>::const_iterator>
251 active_nodes() const {
252 return rx::iterator_range(cbegin_nodes(), cend_nodes());
253 }
254
268 cbegin_nodes() const noexcept {
269 return _active_nodes_index.cbegin();
270 }
271
285 cend_nodes() const noexcept {
286 return _active_nodes_index.cend();
287 }
288
301 [[nodiscard]] std::set<index_type>::iterator begin_nodes() const noexcept {
302 return _active_nodes_index.begin();
303 }
304
317 [[nodiscard]] std::set<index_type>::iterator end_nodes() const noexcept {
318 return _active_nodes_index.end();
319 }
320
331 template <typename Iterator>
332 index_type add_word(Iterator first, Iterator last);
333
358 template <typename Iterator>
359 index_type add_word_no_checks(Iterator first, Iterator last);
360
371 template <typename Iterator>
372 index_type rm_word(Iterator first, Iterator last);
373
405 template <typename Iterator>
406 index_type rm_word_no_checks(Iterator first, Iterator last);
407
426 // TODO(2) Should this be templated?
428 letter_type a) const;
429
439 [[nodiscard]] index_type traverse(index_type current, letter_type a) const {
441 return traverse_no_checks(current, a);
442 }
443
463 // TODO(2) template to accept Iterator not word_type&
465
486 word_type w;
488 return w;
489 }
490
504
518
537 [[nodiscard]] size_t height_no_checks(index_type i) const;
538
548 [[nodiscard]] size_t height(index_type i) const {
550 return height_no_checks(i);
551 }
552
573 [[nodiscard]] index_type suffix_link_no_checks(index_type current) const;
574
584 [[nodiscard]] index_type suffix_link(index_type current) const {
586 return suffix_link_no_checks(current);
587 }
588
610 [[nodiscard]] Node const& node_no_checks(index_type i) const {
611 LIBSEMIGROUPS_ASSERT(i < _all_nodes.size());
612 return _all_nodes[i];
613 }
614
624 [[nodiscard]] Node const& node(index_type i) const {
626 return node_no_checks(i);
627 }
628
650 letter_type letter) const {
651 LIBSEMIGROUPS_ASSERT(parent < _all_nodes.size());
652 LIBSEMIGROUPS_ASSERT(_active_nodes_index.count(parent) == 1);
653 return _all_nodes[parent].child(letter);
654 }
655
665 [[nodiscard]] index_type child(index_type parent,
666 letter_type letter) const {
668 return _all_nodes[parent].child(letter);
669 }
670
685
702
703 private:
704 [[nodiscard]] index_type new_active_node_no_checks(index_type parent,
705 letter_type a);
706
707 [[nodiscard]] index_type new_active_node(index_type parent, letter_type a) {
709 return new_active_node_no_checks(parent, a);
710 }
711
712 void deactivate_node_no_checks(index_type i);
713
714 void deactivate_node(index_type i) {
716 deactivate_node_no_checks(i);
717 }
718
719 template <typename Iterator>
720 [[nodiscard]] index_type traverse_trie(Iterator first, Iterator last) const;
721
722 void clear_suffix_links() const;
723 };
724
737
748 namespace aho_corasick {
751
771 template <typename Word>
773 return ac.add_word_no_checks(w.cbegin(), w.cend());
774 }
775
795 template <typename Word>
797 return ac.rm_word_no_checks(w.cbegin(), w.cend());
798 }
799
818 template <typename Word>
819 index_type add_word(AhoCorasick& ac, Word const& w) {
820 return ac.add_word(w.cbegin(), w.cend());
821 }
822
840 inline index_type add_word(AhoCorasick& ac, char const* w) {
841 return ac.add_word(w, w + std::strlen(w));
842 }
843
862 template <typename Word>
863 index_type rm_word(AhoCorasick& ac, Word const& w) {
864 return ac.rm_word(w.cbegin(), w.cend());
865 }
866
884 inline index_type rm_word(AhoCorasick& ac, char const* w) {
885 return ac.rm_word(w, w + std::strlen(w));
886 }
887
911 template <typename Iterator>
913 index_type start,
914 Iterator first,
915 Iterator last);
916
924 [[nodiscard]] inline index_type
926 index_type start,
927 word_type const& w) {
928 return traverse_word_no_checks(ac, start, w.cbegin(), w.cend());
929 }
930
942 template <typename Iterator>
943 [[nodiscard]] index_type traverse_word(AhoCorasick const& ac,
944 index_type start,
945 Iterator first,
946 Iterator last) {
948 return traverse_word_no_checks(ac, start, first, last);
949 }
950
958 template <typename Word>
959 [[nodiscard]] inline index_type traverse_word(AhoCorasick const& ac,
960 index_type start,
961 Word const& w) {
962 return traverse_word(ac, start, w.cbegin(), w.cend());
963 }
964
977 template <typename Iterator>
978 [[nodiscard]] index_type traverse_word(AhoCorasick const& ac,
979 Iterator first,
980 Iterator last) {
981 return traverse_word_no_checks(ac, AhoCorasick::root, first, last);
982 }
983
996 template <typename Word>
997 [[nodiscard]] index_type traverse_word(AhoCorasick const& ac,
998 Word const& w) {
999 return traverse_word(ac, AhoCorasick::root, w.cbegin(), w.cend());
1000 }
1001
1006 [[nodiscard]] Dot dot(AhoCorasick& ac);
1007
1008 } // namespace aho_corasick
1009
1010} // namespace libsemigroups
1011
1012#include "aho-corasick.tpp"
1013
1014#endif // LIBSEMIGROUPS_AHO_CORASICK_HPP_
T cbegin(T... args)
For an implementation of the Aho-Corasick algorithm.
Definition aho-corasick.hpp:82
index_type suffix_link_no_checks(index_type current) const
Calculate the index of the suffix link of a node.
word_type signature(index_type i) const
Find the signature of a node (out-of-place).
Definition aho-corasick.hpp:514
typename word_type::const_iterator const_iterator
Used for word_type iterators.
Definition aho-corasick.hpp:88
AhoCorasick & operator=(AhoCorasick &&)=default
Default move assignment.
std::set< index_type >::const_iterator cbegin_nodes() const noexcept
Return a const iterator pointing to the first active node.
Definition aho-corasick.hpp:268
void signature(word_type &w, index_type i) const
Find the signature of a node (in-place).
Definition aho-corasick.hpp:500
AhoCorasick & operator=(AhoCorasick const &)=default
Default copy assignment.
void signature_no_checks(word_type &w, index_type i) const
Find the signature of a node (in-place).
size_t number_of_nodes() const noexcept
Returns the number of nodes in the trie.
Definition aho-corasick.hpp:234
Node const & node_no_checks(index_type i) const
Return the node given an index.
Definition aho-corasick.hpp:610
size_t height(index_type i) const
Calculate the height of a node.
Definition aho-corasick.hpp:548
index_type suffix_link(index_type current) const
Calculate the index of the suffix link of a node.
Definition aho-corasick.hpp:584
Node const & node(index_type i) const
Return the node given an index.
Definition aho-corasick.hpp:624
AhoCorasick()
Construct an empty AhoCorasick.
index_type add_word_no_checks(Iterator first, Iterator last)
Add a word to the trie.
void throw_if_node_index_out_of_range(index_type i) const
Check if an index corresponds to a node.
index_type child(index_type parent, letter_type letter) const
Return the child of parent with edge-label letter.
Definition aho-corasick.hpp:665
size_t index_type
Alias for the index of a node in the trie.
Definition aho-corasick.hpp:85
std::set< index_type >::iterator end_nodes() const noexcept
Return an iterator pointing one beyond the last active node.
Definition aho-corasick.hpp:317
size_t height_no_checks(index_type i) const
Calculate the height of a node.
void throw_if_node_index_not_active(index_type i) const
Check if an index corresponds to a node currently in the trie.
index_type traverse_no_checks(index_type current, letter_type a) const
Traverse the trie using suffix links where necessary.
index_type traverse(index_type current, letter_type a) const
Traverse the trie using suffix links where necessary.
Definition aho-corasick.hpp:439
std::set< index_type >::const_iterator cend_nodes() const noexcept
Return a const iterator pointing one beyond the last active node.
Definition aho-corasick.hpp:285
index_type rm_word_no_checks(Iterator first, Iterator last)
Remove a word from the trie.
rx::iterator_range< std::set< index_type >::const_iterator > active_nodes() const
Return the active nodes.
Definition aho-corasick.hpp:251
AhoCorasick(AhoCorasick &&)=default
Default move constructor.
static constexpr index_type root
Constant for the root of the trie.
Definition aho-corasick.hpp:91
index_type rm_word(Iterator first, Iterator last)
Check and add a word to the trie.
std::set< index_type >::iterator begin_nodes() const noexcept
Return an iterator pointing to the first active node.
Definition aho-corasick.hpp:301
AhoCorasick(AhoCorasick const &)=default
Default copy constructor.
AhoCorasick & init()
Reinitialise an existing AhoCorasick object.
word_type signature_no_checks(index_type i) const
Find the signature of a node (out-of-place).
Definition aho-corasick.hpp:485
index_type child_no_checks(index_type parent, letter_type letter) const
Return the child of parent with edge-label letter.
Definition aho-corasick.hpp:649
index_type add_word(Iterator first, Iterator last)
Check and add a word to the trie.
A representation of a graph in the DOT language of Graphviz.
Definition dot.hpp:115
T cend(T... args)
std::string to_human_readable_repr(AhoCorasick const &ac)
Return a string representation.
Undefined const UNDEFINED
Value for something undefined.
std::vector< letter_type > word_type
Type for a word over the generators of a semigroup.
Definition types.hpp:99
size_t letter_type
Type for the index of a generator of a semigroup.
Definition types.hpp:96
Namespace for AhoCorasick helper functions.
Definition aho-corasick.hpp:748
AhoCorasick::index_type index_type
Alias for the index of a node in the trie.
Definition aho-corasick.hpp:750
index_type rm_word_no_checks(AhoCorasick &ac, Word const &w)
Remove a word from the trie of ac.
Definition aho-corasick.hpp:796
index_type traverse_word_no_checks(AhoCorasick const &ac, index_type start, Iterator first, Iterator last)
Traverse the trie of ac using suffix links where necessary.
index_type add_word_no_checks(AhoCorasick &ac, Word const &w)
Add a word to the trie of ac.
Definition aho-corasick.hpp:772
index_type add_word(AhoCorasick &ac, Word const &w)
Add a word.
Definition aho-corasick.hpp:819
Dot dot(AhoCorasick &ac)
Construct a dot object of ac.
index_type traverse_word(AhoCorasick const &ac, index_type start, Iterator first, Iterator last)
Traverse the trie of ac using suffix links where necessary.
Definition aho-corasick.hpp:943
index_type rm_word(AhoCorasick &ac, Word const &w)
Remove a word.
Definition aho-corasick.hpp:863
Namespace for everything in the libsemigroups library.
Definition action.hpp:44
T size(T... args)
T strlen(T... args)