libsemigroups  v3.1.2
C++ library for semigroups and monoids
Loading...
Searching...
No Matches
aho-corasick-impl.hpp
1//
2// libsemigroups - C++ library for semigroups and monoids
3// Copyright (C) 2023-2025 Joseph Edwards + James D. Mitchell
4//
5// This program is free software: you can redistribute it and/or modify
6// it under the terms of the GNU General Public License as published by
7// the Free Software Foundation, either version 3 of the License, or
8// (at your option) any later version.
9//
10// This program is distributed in the hope that it will be useful,
11// but WITHOUT ANY WARRANTY; without even the implied warranty of
12// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13// GNU General Public License for more details.
14//
15// You should have received a copy of the GNU General Public License
16// along with this program. If not, see <http://www.gnu.org/licenses/>.
17//
18
19// This file contains the implementation of a trie with suffix links for use by
20// the Aho-Corasick dictionary search algorithm
21
22#ifndef LIBSEMIGROUPS_DETAIL_AHO_CORASICK_IMPL_HPP_
23#define LIBSEMIGROUPS_DETAIL_AHO_CORASICK_IMPL_HPP_
24
25#include <cstddef> // for size_t
26#include <memory> // for allocator_traits<>::value_type
27#include <set> // for set
28#include <stack> // for stack
29#include <string> // for string
30#include <unordered_set> // for unordered_set
31#include <vector> // for vector
32
33#include "libsemigroups/constants.hpp" // for Undefined, operator!=, UNDEFINED, operator==
34#include "libsemigroups/debug.hpp" // for LIBSEMIGROUPS_ASSERT
35#include "libsemigroups/exception.hpp" // for LIBSEMIGROUPS_EXCEPTION
36#include "libsemigroups/ranges.hpp" // for rx::iterator_range
37#include "libsemigroups/types.hpp" // for letter_type, word_type
38
39#include "containers.hpp" // DynamicArray2
40#include "print.hpp" // for to_printable
41
42// TODO(2) is it worthwhile storing a pointer to the terminal nodes beneath
43// each node? If this can be updated quickly, it would save a lot of time in
44// overlap/confluence checking. One compromise is to have a pointer to the rules
45// any given node is contained within. This could be updated easily when adding
46// new rules, but more care would be needed when removing rules.
47// TODO(2) add something that gets a ranges element to find all terminal nodes.
48// TODO(2) change all_nodes[i] to node_no_checks(i);
49
50namespace libsemigroups {
51 namespace detail {
52
53 class AhoCorasickImpl {
54 public:
55 using index_type = uint32_t;
56
57 static constexpr const index_type root = 0;
58
59 private:
60 class Node {
61 friend class AhoCorasickImpl;
63 // Private data
65 private:
66 uint32_t _height;
67 index_type _link;
68 index_type _parent;
69 letter_type _parent_letter;
70 std::unordered_set<index_type> _suffix_link_sources;
71 bool _terminal;
72
73 Node& init() noexcept {
74 return init(UNDEFINED, UNDEFINED);
75 }
76
77 Node& init(index_type parent, letter_type a) noexcept;
78
79 public:
81 // Constructors/initializers - public
83
84 Node() : Node(UNDEFINED, UNDEFINED) {}
85 Node(index_type parent, letter_type a);
86
87 Node(Node const&) = default;
88 Node& operator=(Node const&) = default;
89 Node(Node&&) = default;
90 Node& operator=(Node&&) = default;
91
92 ~Node() = default;
93
95 // Getters - public
97
98 [[nodiscard]] size_t height() const noexcept {
99 return _height;
100 }
101
102 [[nodiscard]] index_type suffix_link() const noexcept {
103 return _link;
104 }
105
106 [[nodiscard]] bool terminal() const noexcept {
107 return _terminal;
108 }
109
110 [[nodiscard]] index_type parent() const noexcept {
111 return _parent;
112 }
113
114 [[nodiscard]] letter_type parent_letter() const noexcept {
115 return _parent_letter;
116 }
117
118 private:
120 // Setters - private
122
123 // All setters are private to avoid corrupting the objects.
124
125 Node const& height(size_t val) noexcept {
126 _height = val;
127 return *this;
128 }
129
130 Node const& suffix_link(index_type val) noexcept {
131 _link = val;
132 return *this;
133 }
134
135 Node& terminal(bool val) noexcept {
136 _terminal = val;
137 return *this;
138 }
139
140 std::unordered_set<index_type>& suffix_link_sources() noexcept {
141 return _suffix_link_sources;
142 }
143 }; // class Node
144
145 // TODO(1) if we store pointers here instead of Nodes, then inside the
146 // Nodes themselves we could store pointers to the parents etc, rather
147 // than indices, which would mean we could set the _link and other info in
148 // the Node::init() function, also will remove the index <-> Node
149 // conversions everywhere.
150 std::vector<Node> _all_nodes;
151 detail::DynamicArray2<index_type> _children;
152 std::unordered_set<index_type> _active_nodes_index;
153 std::vector<index_type> _inactive_nodes_index;
154 std::vector<index_type> _node_indices_to_update;
155
156 // TODO(1): it seems likely that the positions of the active nodes in
157 // _all_nodes will become scattered and disordered over time, and so it'd
158 // probably be best to periodically (or maybe always?) compress, and sort
159 // the nodes.
160
161 public:
162 AhoCorasickImpl();
163 AhoCorasickImpl& init();
164
165 explicit AhoCorasickImpl(size_t num_letters);
166 AhoCorasickImpl& init(size_t num_letters);
167
168 AhoCorasickImpl(AhoCorasickImpl const&);
169 AhoCorasickImpl& operator=(AhoCorasickImpl const&);
170 AhoCorasickImpl(AhoCorasickImpl&&);
171 AhoCorasickImpl& operator=(AhoCorasickImpl&&);
172
173 ~AhoCorasickImpl();
174
175 size_t alphabet_size() const noexcept {
176 return _children.number_of_cols();
177 }
178
179 AhoCorasickImpl& increase_alphabet_size_by(size_t val);
180
181 [[nodiscard]] size_t number_of_nodes() const noexcept {
182 LIBSEMIGROUPS_ASSERT(_children.number_of_rows() == _all_nodes.size());
183 return _active_nodes_index.size();
184 }
185
186 template <typename Iterator>
187 index_type add_word(Iterator first, Iterator last);
188
189 template <typename Iterator>
190 index_type add_word_no_checks(Iterator first, Iterator last);
191
192 template <typename Iterator>
193 index_type rm_word(Iterator first, Iterator last);
194
195 template <typename Iterator>
196 index_type rm_word_no_checks(Iterator first, Iterator last);
197
198 // The following function is critical for KnuthBendix and so we leave it
199 // here to be inlined possibly.
200 [[nodiscard]] index_type traverse_no_checks(index_type current,
201 letter_type a) const {
202 LIBSEMIGROUPS_ASSERT(current < _all_nodes.size());
203 LIBSEMIGROUPS_ASSERT(_active_nodes_index.count(current) == 1);
204 index_type next = _children.get(current, a);
205 if (next != UNDEFINED) {
206 return next;
207 } else if (current == root) {
208 return root;
209 }
210 return traverse_no_checks(suffix_link_no_checks(current), a);
211 }
212
213 [[nodiscard]] index_type traverse(index_type current,
214 letter_type a) const {
215 throw_if_node_index_not_active(current);
216 return traverse_no_checks(current, a);
217 }
218
219 [[nodiscard]] size_t height_no_checks(index_type i) const;
220
221 [[nodiscard]] size_t height(index_type i) const {
222 throw_if_node_index_not_active(i);
223 return height_no_checks(i);
224 }
225
226 [[nodiscard]] bool terminal_no_checks(index_type i) const;
227
228 [[nodiscard]] bool terminal(index_type i) const {
229 throw_if_node_index_not_active(i);
230 return terminal_no_checks(i);
231 }
232
233 [[nodiscard]] index_type suffix_link_no_checks(index_type i) const {
234 LIBSEMIGROUPS_ASSERT(i < _all_nodes.size());
235 LIBSEMIGROUPS_ASSERT(_active_nodes_index.count(i) == 1);
236 return _all_nodes[i].suffix_link();
237 }
238
239 [[nodiscard]] index_type suffix_link(index_type current) const {
240 throw_if_node_index_not_active(current);
241 return suffix_link_no_checks(current);
242 }
243
244 [[nodiscard]] Node const& node_no_checks(index_type i) const {
245 LIBSEMIGROUPS_ASSERT(i < _all_nodes.size());
246 return _all_nodes[i];
247 }
248
249 [[nodiscard]] Node const& node(index_type i) const {
250 throw_if_node_index_out_of_range(i);
251 return node_no_checks(i);
252 }
253
254 [[nodiscard]] index_type child_no_checks(index_type parent,
255 letter_type letter) const {
256 LIBSEMIGROUPS_ASSERT(parent < _all_nodes.size());
257 LIBSEMIGROUPS_ASSERT(_active_nodes_index.count(parent) == 1);
258 return _children.get(parent, letter);
259 }
260
261 [[nodiscard]] index_type child(index_type parent,
262 letter_type letter) const {
263 throw_if_node_index_not_active(parent);
264 return child_no_checks(parent, letter);
265 }
266
267 [[nodiscard]] size_t
268 number_of_children_no_checks(index_type i) const noexcept {
269 return _children.number_of_cols()
270 - std::count(
271 _children.cbegin_row(i), _children.cend_row(i), UNDEFINED);
272 }
273
274 [[nodiscard]] size_t number_of_children(index_type i) const noexcept {
275 throw_if_node_index_not_active(i);
276 return number_of_children_no_checks(i);
277 }
278
279 template <typename Iterator>
280 [[nodiscard]] index_type traverse_trie_no_checks(Iterator first,
281 Iterator last) const;
282
283 template <typename Iterator>
284 [[nodiscard]] index_type traverse_trie(Iterator first,
285 Iterator last) const {
286 throw_if_any_letter_out_of_range(first, last);
287 return traverse_trie_no_checks(first, last);
288 }
289
290 [[nodiscard]] bool empty() const noexcept {
291 return number_of_nodes() == 1;
292 }
293
294 void throw_if_node_index_out_of_range(index_type i) const;
295 void throw_if_node_index_not_active(index_type i) const;
296
297 private:
299 // Exceptions
301
302 void throw_if_letter_out_of_range(index_type i) const;
303
304 template <typename Iterator>
305 void throw_if_any_letter_out_of_range(Iterator first,
306 Iterator last) const {
307 for (auto it = first; it != last; ++it) {
308 throw_if_letter_out_of_range(*it);
309 }
310 }
311
313 // Activate or deactivate a node
315
316 [[nodiscard]] bool is_active_node(index_type i) {
317 return _active_nodes_index.find(i) != _active_nodes_index.end();
318 }
319
320 [[nodiscard]] index_type new_active_node_no_checks(index_type parent,
321 letter_type a);
322
323 void deactivate_node_no_checks(index_type i);
324
326 // Update suffix link sources
328
329 // Add <source_index> as a suffix link source of <target_index>, i.e.
330 // _all_nodes[source_index].suffix_link() == target_index
331 void add_suffix_link_source(index_type source_index,
332 index_type target_index);
333
334 // Remove <source_index> as a suffix link source of <target_index>, i.e.
335 // _all_nodes[source_index].suffix_link() == target_index
336 void rm_suffix_link_source(index_type source_index,
337 index_type target_index);
338
339 void populate_node_indices_to_update(index_type target_index,
340 index_type new_node_index,
341 letter_type a);
342 }; // class AhoCorasickImpl
343
344 namespace aho_corasick_impl {
345
346 template <typename Word>
347 AhoCorasickImpl::index_type add_word_no_checks(AhoCorasickImpl& ac,
348 Word const& w) {
349 return ac.add_word_no_checks(w.begin(), w.end());
350 }
351
352 template <typename Word>
353 AhoCorasickImpl::index_type rm_word_no_checks(AhoCorasickImpl& ac,
354 Word const& w) {
355 return ac.rm_word_no_checks(w.begin(), w.end());
356 }
357
358 // Check if a word is one of those used to create the trie
359 template <typename Iterator>
360 [[nodiscard]] bool contains_no_checks(AhoCorasickImpl const& ac,
361 Iterator first,
362 Iterator last) {
363 auto index = ac.traverse_trie_no_checks(first, last);
364 return index == UNDEFINED ? false : ac.node_no_checks(index).terminal();
365 }
366
367 template <typename Word>
368 [[nodiscard]] AhoCorasickImpl::index_type
369 contains_no_checks(AhoCorasickImpl& ac, Word const& w) {
370 return contains_no_checks(ac, w.begin(), w.end());
371 }
372
373 template <typename Iterator>
374 AhoCorasickImpl::index_type
375 traverse_word_no_checks(AhoCorasickImpl const& ac,
376 AhoCorasickImpl::index_type start,
377 Iterator first,
378 Iterator last);
379
380 template <typename Iterator>
381 AhoCorasickImpl::index_type
382 traverse_word_no_checks(AhoCorasickImpl const& ac,
383 Iterator first,
384 Iterator last) {
385 return traverse_word_no_checks(ac, ac.root, first, last);
386 }
387
388 template <typename Word>
389 [[nodiscard]] AhoCorasickImpl::index_type
390 traverse_word_no_checks(AhoCorasickImpl& ac, Word const& w) {
391 return traverse_word_no_checks(ac, w.begin(), w.end());
392 }
393
394 template <typename Iterator>
395 class SearchIterator {
396 using index_type = AhoCorasickImpl::index_type;
397
398 Iterator _first;
399 Iterator _last;
400 index_type _prefix;
401 index_type _suffix;
402 AhoCorasickImpl const& _trie;
403
404 public:
405 using iterator_category = std::input_iterator_tag;
406 using value_type = index_type;
407 using difference_type = std::ptrdiff_t;
408 using pointer = value_type const*;
409 using reference = value_type const&;
410
411 SearchIterator(AhoCorasickImpl const& trie,
412 Iterator first,
413 Iterator last);
414
415 explicit SearchIterator(AhoCorasickImpl const& trie);
416
417 reference operator*() const {
418 // TODO(1) would be easy enough to return the position of the match
419 // also, I think it's just height(_prefix) - height(_suffix)
420 return _suffix;
421 }
422
423 // Pre-increment
424 SearchIterator& operator++();
425
426 // Post-increment
427 SearchIterator operator++(int) {
428 SearchIterator tmp = *this;
429 ++(*this);
430 return tmp;
431 }
432
433 friend bool operator==(SearchIterator const& a,
434 SearchIterator const& b) {
435 // TODO(1) more?
436 return a._prefix == b._prefix && a._suffix == b._suffix;
437 }
438
439 friend bool operator!=(SearchIterator const& a,
440 SearchIterator const& b) {
441 return !(a == b);
442 }
443 }; // class SearchIterator
444
445 // Deduction guide
446 template <typename Iterator>
447 SearchIterator(AhoCorasickImpl const& ac, Iterator first, Iterator last)
448 -> SearchIterator<Iterator>;
449
450 template <typename Iterator>
451 [[nodiscard]] auto begin_search_no_checks(AhoCorasickImpl const& ac,
452 Iterator first,
453 Iterator last) {
454 return SearchIterator(ac, first, last);
455 }
456
457 template <typename Iterator>
458 [[nodiscard]] auto end_search_no_checks(AhoCorasickImpl const& ac,
459 Iterator,
460 Iterator) {
461 return SearchIterator<Iterator>(ac);
462 }
463
464 template <typename Word>
465 [[nodiscard]] auto begin_search_no_checks(AhoCorasickImpl& ac,
466 Word const& w) {
467 return begin_search_no_checks(ac, w.begin(), w.end());
468 }
469
470 template <typename Word>
471 [[nodiscard]] auto end_search_no_checks(AhoCorasickImpl& ac,
472 Word const& w) {
473 return end_search_no_checks(ac, w.begin(), w.end());
474 }
475
476 } // namespace aho_corasick_impl
477 } // namespace detail
478} // namespace libsemigroups
479
480#include "aho-corasick-impl.tpp"
481
482#endif // LIBSEMIGROUPS_DETAIL_AHO_CORASICK_IMPL_HPP_
T count(T... args)
Undefined const UNDEFINED
Value for something undefined.
size_t letter_type
Type for the index of a generator of a semigroup.
Definition types.hpp:96
AhoCorasick::index_type index_type
Alias for the index of a node in the trie.
Definition aho-corasick.hpp:750
Namespace for everything in the libsemigroups library.
Definition action.hpp:44
T next(T... args)