libsemigroups  v3.0.0
C++ library for semigroups and monoids
Loading...
Searching...
No Matches
aho-corasick.hpp
1//
2// libsemigroups - C++ library for semigroups and monoids
3// Copyright (C) 2023-2025 Joseph Edwards + James D. Mitchell
4//
5// This program is free software: you can redistribute it and/or modify
6// it under the terms of the GNU General Public License as published by
7// the Free Software Foundation, either version 3 of the License, or
8// (at your option) any later version.
9//
10// This program is distributed in the hope that it will be useful,
11// but WITHOUT ANY WARRANTY; without even the implied warranty of
12// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13// GNU General Public License for more details.
14//
15// You should have received a copy of the GNU General Public License
16// along with this program. If not, see <http://www.gnu.org/licenses/>.
17//
18
19// This file contains the implementation of a trie with suffix links for use by
20// the Aho-Corasick dictionary search algorithm
21
22#ifndef LIBSEMIGROUPS_AHO_CORASICK_HPP_
23#define LIBSEMIGROUPS_AHO_CORASICK_HPP_
24
25#include <memory> // for allocator_traits<>::value_type
26#include <set> // for set
27#include <stack> // for stack
28#include <stddef.h> // for size_t
29#include <string> // for string
30#include <unordered_map> // for unordered_map
31#include <vector> // for vector
32
33#include "constants.hpp" // for Undefined, operator!=, UNDEFINED, operator==
34#include "debug.hpp" // for LIBSEMIGROUPS_ASSERT
35#include "exception.hpp" // for LIBSEMIGROUPS_EXCEPTION
36#include "ranges.hpp" // for rx::iterator_range
37#include "types.hpp" // for letter_type, word_type
38
39// TODO(2) make nodes accessible as indices of some list (numbered nodes).
40// Make sure this address the badness of traversals (lots of different data
41// types and it just feels a bit hacky)
42// TODO(2) is it worthwhile storing a pointer to the terminal nodes beneath
43// each node? If this can be updated quickly, it would save a lot of time in
44// overlap/confluence checking. One compromise is to have a pointer to the rules
45// any given node is contained within. This could be updated easily when adding
46// new rules, but more care would be needed when removing rules.
47// TODO(1) change names from set_X and get_X to X(val) and X(). e.g.
48// set_suffix_link('a') -> suffix_link('a')
49// TODO(2) add something that gets a ranges element to find all terminal nodes.
50// TODO(2) change all_nodes[i] to node_no_checks(i);
51
56namespace libsemigroups {
57
58 class Dot; // forward decl
59
81 public:
83 using index_type = size_t;
84
86 using const_iterator = typename word_type::const_iterator;
87
89 static constexpr index_type root = 0;
90
91 private:
92 // Implements the node of a trie
93 class Node {
94 private:
96 mutable index_type _link;
97 mutable size_t _height;
98 index_type _parent;
99 letter_type _parent_letter;
100 bool _terminal;
101
102 public:
103 Node() : Node(UNDEFINED, UNDEFINED) {}
104
105 Node(index_type parent, letter_type a)
106 : _children(),
107 _link(),
108 _height(),
109 _parent(),
110 _parent_letter(),
111 _terminal() {
112 init(parent, a);
113 }
114
115 Node(const Node&) = default;
116 Node& operator=(const Node&) = default;
117 Node(Node&&) = default;
118 Node& operator=(Node&&) = default;
119
120 Node& init(index_type parent, letter_type a) noexcept;
121
122 Node& init() noexcept {
123 return init(UNDEFINED, UNDEFINED);
124 }
125
126 [[nodiscard]] index_type child(letter_type a) const;
127
128 [[nodiscard]] size_t height() const noexcept {
129 return _height;
130 }
131
132 [[nodiscard]] index_type suffix_link() const noexcept {
133 return _link;
134 }
135
136 void set_suffix_link(index_type val) const noexcept {
137 _link = val;
138 }
139
140 void clear_suffix_link() const noexcept;
141
142 [[nodiscard]] decltype(_children)& children() const noexcept {
143 return _children;
144 }
145
146 [[nodiscard]] size_t number_of_children() const noexcept {
147 return _children.size();
148 }
149
150 [[nodiscard]] bool is_terminal() const noexcept {
151 return _terminal;
152 }
153
154 Node& set_terminal(bool val) noexcept {
155 _terminal = val;
156 return *this;
157 }
158
159 [[nodiscard]] index_type parent() const noexcept {
160 return _parent;
161 }
162
163 [[nodiscard]] letter_type parent_letter() const noexcept {
164 return _parent_letter;
165 }
166
167 void set_height(size_t val) const noexcept {
168 _height = val;
169 }
170 };
171
172 std::vector<Node> _all_nodes;
173 std::set<index_type> _active_nodes_index;
174 std::stack<index_type> _inactive_nodes_index;
175 mutable bool _valid_links;
176
177 public:
183
187 AhoCorasick(AhoCorasick const&) = default;
188
193
198
203
204 ~AhoCorasick();
205
219
232 [[nodiscard]] size_t number_of_nodes() const noexcept {
233 return _active_nodes_index.size();
234 }
235
248 [[nodiscard]] rx::iterator_range<std::set<index_type>::const_iterator>
249 active_nodes() const {
250 return rx::iterator_range(cbegin_nodes(), cend_nodes());
251 }
252
266 cbegin_nodes() const noexcept {
267 return _active_nodes_index.cbegin();
268 }
269
283 cend_nodes() const noexcept {
284 return _active_nodes_index.cend();
285 }
286
299 [[nodiscard]] std::set<index_type>::iterator begin_nodes() const noexcept {
300 return _active_nodes_index.begin();
301 }
302
315 [[nodiscard]] std::set<index_type>::iterator end_nodes() const noexcept {
316 return _active_nodes_index.end();
317 }
318
329 template <typename Iterator>
330 index_type add_word(Iterator first, Iterator last);
331
356 template <typename Iterator>
357 index_type add_word_no_checks(Iterator first, Iterator last);
358
369 template <typename Iterator>
370 index_type rm_word(Iterator first, Iterator last);
371
403 template <typename Iterator>
404 index_type rm_word_no_checks(Iterator first, Iterator last);
405
424 // TODO(2) Should this be templated?
426 letter_type a) const;
427
437 [[nodiscard]] index_type traverse(index_type current, letter_type a) const {
439 return traverse_no_checks(current, a);
440 }
441
461 // TODO(2) template to accept Iterator not word_type&
463
484 word_type w;
486 return w;
487 }
488
502
516
535 [[nodiscard]] size_t height_no_checks(index_type i) const;
536
546 [[nodiscard]] size_t height(index_type i) const {
548 return height_no_checks(i);
549 }
550
571 [[nodiscard]] index_type suffix_link_no_checks(index_type current) const;
572
582 [[nodiscard]] index_type suffix_link(index_type current) const {
584 return suffix_link_no_checks(current);
585 }
586
608 [[nodiscard]] Node const& node_no_checks(index_type i) const {
609 LIBSEMIGROUPS_ASSERT(i < _all_nodes.size());
610 return _all_nodes[i];
611 }
612
622 [[nodiscard]] Node const& node(index_type i) const {
624 return node_no_checks(i);
625 }
626
648 letter_type letter) const {
649 LIBSEMIGROUPS_ASSERT(parent < _all_nodes.size());
650 LIBSEMIGROUPS_ASSERT(_active_nodes_index.count(parent) == 1);
651 return _all_nodes[parent].child(letter);
652 }
653
663 [[nodiscard]] index_type child(index_type parent,
664 letter_type letter) const {
666 return _all_nodes[parent].child(letter);
667 }
668
683
700
701 private:
702 [[nodiscard]] index_type new_active_node_no_checks(index_type parent,
703 letter_type a);
704
705 [[nodiscard]] index_type new_active_node(index_type parent, letter_type a) {
707 return new_active_node_no_checks(parent, a);
708 }
709
710 void deactivate_node_no_checks(index_type i);
711
712 void deactivate_node(index_type i) {
714 deactivate_node_no_checks(i);
715 }
716
717 template <typename Iterator>
718 [[nodiscard]] index_type traverse_trie(Iterator first, Iterator last) const;
719
720 void clear_suffix_links() const;
721 };
722
735
746 namespace aho_corasick {
749
769 template <typename Word>
771 return ac.add_word_no_checks(w.cbegin(), w.cend());
772 }
773
793 template <typename Word>
795 return ac.rm_word_no_checks(w.cbegin(), w.cend());
796 }
797
816 template <typename Word>
817 index_type add_word(AhoCorasick& ac, Word const& w) {
818 return ac.add_word(w.cbegin(), w.cend());
819 }
820
839 template <typename Word>
840 index_type rm_word(AhoCorasick& ac, Word const& w) {
841 return ac.rm_word(w.cbegin(), w.cend());
842 }
843
867 template <typename Iterator>
869 index_type start,
870 Iterator first,
871 Iterator last);
872
880 [[nodiscard]] inline index_type
882 index_type start,
883 word_type const& w) {
884 return traverse_word_no_checks(ac, start, w.cbegin(), w.cend());
885 }
886
898 template <typename Iterator>
899 [[nodiscard]] index_type traverse_word(AhoCorasick const& ac,
900 index_type start,
901 Iterator first,
902 Iterator last) {
904 return traverse_word_no_checks(ac, start, first, last);
905 }
906
914 template <typename Word>
915 [[nodiscard]] inline index_type traverse_word(AhoCorasick const& ac,
916 index_type start,
917 Word const& w) {
918 return traverse_word(ac, start, w.cbegin(), w.cend());
919 }
920
933 template <typename Iterator>
934 [[nodiscard]] index_type traverse_word(AhoCorasick const& ac,
935 Iterator first,
936 Iterator last) {
937 return traverse_word_no_checks(ac, AhoCorasick::root, first, last);
938 }
939
952 template <typename Word>
953 [[nodiscard]] index_type traverse_word(AhoCorasick const& ac,
954 Word const& w) {
955 return traverse_word(ac, AhoCorasick::root, w.cbegin(), w.cend());
956 }
957
962 [[nodiscard]] Dot dot(AhoCorasick& ac);
963
964 } // namespace aho_corasick
965
966} // namespace libsemigroups
967
968#include "aho-corasick.tpp"
969
970#endif // LIBSEMIGROUPS_AHO_CORASICK_HPP_
T cbegin(T... args)
For an implementation of the Aho-Corasick algorithm.
Definition aho-corasick.hpp:80
index_type suffix_link_no_checks(index_type current) const
Calculate the index of the suffix link of a node.
word_type signature(index_type i) const
Find the signature of a node (out-of-place).
Definition aho-corasick.hpp:512
typename word_type::const_iterator const_iterator
Used for word_type iterators.
Definition aho-corasick.hpp:86
AhoCorasick & operator=(AhoCorasick &&)=default
Default move assignment.
std::set< index_type >::const_iterator cbegin_nodes() const noexcept
Return a const iterator pointing to the first active node.
Definition aho-corasick.hpp:266
void signature(word_type &w, index_type i) const
Find the signature of a node (in-place).
Definition aho-corasick.hpp:498
AhoCorasick & operator=(AhoCorasick const &)=default
Default copy assignment.
void signature_no_checks(word_type &w, index_type i) const
Find the signature of a node (in-place).
size_t number_of_nodes() const noexcept
Returns the number of nodes in the trie.
Definition aho-corasick.hpp:232
Node const & node_no_checks(index_type i) const
Return the node given an index.
Definition aho-corasick.hpp:608
size_t height(index_type i) const
Calculate the height of a node.
Definition aho-corasick.hpp:546
index_type suffix_link(index_type current) const
Calculate the index of the suffix link of a node.
Definition aho-corasick.hpp:582
Node const & node(index_type i) const
Return the node given an index.
Definition aho-corasick.hpp:622
AhoCorasick()
Construct an empty AhoCorasick.
index_type add_word_no_checks(Iterator first, Iterator last)
Add a word to the trie.
void throw_if_node_index_out_of_range(index_type i) const
Check if an index corresponds to a node.
index_type child(index_type parent, letter_type letter) const
Return the child of parent with edge-label letter.
Definition aho-corasick.hpp:663
size_t index_type
Alias for the index of a node in the trie.
Definition aho-corasick.hpp:83
std::set< index_type >::iterator end_nodes() const noexcept
Return an iterator pointing one beyond the last active node.
Definition aho-corasick.hpp:315
size_t height_no_checks(index_type i) const
Calculate the height of a node.
void throw_if_node_index_not_active(index_type i) const
Check if an index corresponds to a node currently in the trie.
index_type traverse_no_checks(index_type current, letter_type a) const
Traverse the trie using suffix links where necessary.
index_type traverse(index_type current, letter_type a) const
Traverse the trie using suffix links where necessary.
Definition aho-corasick.hpp:437
std::set< index_type >::const_iterator cend_nodes() const noexcept
Return a const iterator pointing one beyond the last active node.
Definition aho-corasick.hpp:283
index_type rm_word_no_checks(Iterator first, Iterator last)
Remove a word from the trie.
rx::iterator_range< std::set< index_type >::const_iterator > active_nodes() const
Return the active nodes.
Definition aho-corasick.hpp:249
AhoCorasick(AhoCorasick &&)=default
Default move constructor.
static constexpr index_type root
Constant for the root of the trie.
Definition aho-corasick.hpp:89
index_type rm_word(Iterator first, Iterator last)
Check and add a word to the trie.
std::set< index_type >::iterator begin_nodes() const noexcept
Return an iterator pointing to the first active node.
Definition aho-corasick.hpp:299
AhoCorasick(AhoCorasick const &)=default
Default copy constructor.
AhoCorasick & init()
Reinitialise an existing AhoCorasick object.
word_type signature_no_checks(index_type i) const
Find the signature of a node (out-of-place).
Definition aho-corasick.hpp:483
index_type child_no_checks(index_type parent, letter_type letter) const
Return the child of parent with edge-label letter.
Definition aho-corasick.hpp:647
index_type add_word(Iterator first, Iterator last)
Check and add a word to the trie.
A representation of a graph in the DOT language of Graphviz.
Definition dot.hpp:115
T cend(T... args)
std::string to_human_readable_repr(AhoCorasick const &ac)
Return a string representation.
Undefined const UNDEFINED
Value for something undefined.
std::vector< letter_type > word_type
Type for a word over the generators of a semigroup.
Definition types.hpp:101
size_t letter_type
Type for the index of a generator of a semigroup.
Definition types.hpp:98
Namespace for AhoCorasick helper functions.
Definition aho-corasick.hpp:746
AhoCorasick::index_type index_type
Alias for the index of a node in the trie.
Definition aho-corasick.hpp:748
index_type rm_word_no_checks(AhoCorasick &ac, Word const &w)
Remove a word from the trie of ac.
Definition aho-corasick.hpp:794
index_type traverse_word_no_checks(AhoCorasick const &ac, index_type start, Iterator first, Iterator last)
Traverse the trie of ac using suffix links where necessary.
index_type add_word_no_checks(AhoCorasick &ac, Word const &w)
Add a word to the trie of ac.
Definition aho-corasick.hpp:770
index_type add_word(AhoCorasick &ac, Word const &w)
Add a word to the trie of ac.
Definition aho-corasick.hpp:817
Dot dot(AhoCorasick &ac)
Construct a dot object of ac.
index_type traverse_word(AhoCorasick const &ac, index_type start, Iterator first, Iterator last)
Traverse the trie of ac using suffix links where necessary.
Definition aho-corasick.hpp:899
index_type rm_word(AhoCorasick &ac, Word const &w)
Remove a word from the trie of ac.
Definition aho-corasick.hpp:840
Namespace for everything in the libsemigroups library.
Definition action.hpp:44
T size(T... args)