HPCombi
High Performance Combinatorics in C++ using vector instructions v1.0.0
Loading...
Searching...
No Matches
epu8_impl.hpp
Go to the documentation of this file.
1//****************************************************************************//
2// Copyright (C) 2016-2024 Florent Hivert <Florent.Hivert@lisn.fr>, //
3// //
4// This file is part of HP-Combi <https://github.com/libsemigroups/HPCombi> //
5// //
6// HP-Combi is free software: you can redistribute it and/or modify it //
7// under the terms of the GNU General Public License as published by the //
8// Free Software Foundation, either version 3 of the License, or //
9// (at your option) any later version. //
10// //
11// HP-Combi is distributed in the hope that it will be useful, but WITHOUT //
12// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or //
13// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License //
14// for more details. //
15// //
16// You should have received a copy of the GNU General Public License along //
17// with HP-Combi. If not, see <https://www.gnu.org/licenses/>. //
18//****************************************************************************//
19
20// NOLINT(build/header_guard)
21
22// This is the implementation part of epu8.hpp this should be seen as
23// implementation details and should not be included directly.
24
25#include <initializer_list>
26#include <iostream>
27#include <random>
28#include <sstream>
29
30#include "vect_generic.hpp"
31
32#ifdef SIMDE_X86_SSE4_2_NATIVE
33// Comparison mode for _mm_cmpestri
34#define FIRST_DIFF \
35 (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_EACH | \
36 SIMDE_SIDD_NEGATIVE_POLARITY)
37#define LAST_DIFF \
38 (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_EACH | \
39 SIMDE_SIDD_NEGATIVE_POLARITY | SIMDE_SIDD_MOST_SIGNIFICANT)
40#define FIRST_ZERO (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_ANY)
41#define LAST_ZERO \
42 (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_ANY | \
43 SIMDE_SIDD_MOST_SIGNIFICANT)
44#define FIRST_NON_ZERO \
45 (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_ANY | \
46 SIMDE_SIDD_MASKED_NEGATIVE_POLARITY)
47#define LAST_NON_ZERO \
48 (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_ANY | \
49 SIMDE_SIDD_MASKED_NEGATIVE_POLARITY | SIMDE_SIDD_MOST_SIGNIFICANT)
50#endif
51
52namespace HPCombi {
53
55// Implementation part for inline functions
57
59inline epu8 permuted_ref(epu8 a, epu8 b) noexcept {
60 epu8 res;
61 for (uint64_t i = 0; i < 16; i++)
62 res[i] = a[b[i] & 0xF];
63 return res;
64}
65
66// Msk is supposed to be a boolean mask (i.e. each entry is either 0 or
67// 255)
68inline uint64_t first_mask(epu8 msk, size_t bound) {
69 uint64_t res = simde_mm_movemask_epi8(msk & (Epu8.id() < Epu8(bound)));
70 return res == 0 ? 16 : (__builtin_ffsll(res) - 1);
71}
72inline uint64_t last_mask(epu8 msk, size_t bound) {
73 auto res = simde_mm_movemask_epi8(msk & (Epu8.id() < Epu8(bound)));
74 return res == 0 ? 16 : (63 - __builtin_clzll(res));
75}
76
77inline uint64_t first_diff_ref(epu8 a, epu8 b, size_t bound) noexcept {
78 for (size_t i = 0; i < bound; i++)
79 if (a[i] != b[i])
80 return i;
81 return 16;
82}
83#ifdef SIMDE_X86_SSE4_2_NATIVE
84inline uint64_t first_diff_cmpstr(epu8 a, epu8 b, size_t bound) noexcept {
85 return unsigned(_mm_cmpestri(a, bound, b, bound, FIRST_DIFF));
86}
87#endif
88inline uint64_t first_diff_mask(epu8 a, epu8 b, size_t bound) noexcept {
89 return first_mask(a != b, bound);
90}
91
92inline uint64_t last_diff_ref(epu8 a, epu8 b, size_t bound) noexcept {
93 while (bound != 0) {
94 --bound;
95 if (a[bound] != b[bound])
96 return bound;
97 }
98 return 16;
99}
100#ifdef SIMDE_X86_SSE4_2_NATIVE
101inline uint64_t last_diff_cmpstr(epu8 a, epu8 b, size_t bound) noexcept {
102 return unsigned(_mm_cmpestri(a, bound, b, bound, LAST_DIFF));
103}
104#endif
105inline uint64_t last_diff_mask(epu8 a, epu8 b, size_t bound) noexcept {
106 return last_mask(a != b, bound);
107}
108
109inline bool less(epu8 a, epu8 b) noexcept {
110 uint64_t diff = first_diff(a, b);
111 return (diff < 16) && (a[diff] < b[diff]);
112}
113inline int8_t less_partial(epu8 a, epu8 b, int k) noexcept {
114 uint64_t diff = first_diff(a, b, k);
115 return (diff == 16)
116 ? 0
117 : static_cast<int8_t>(a[diff]) - static_cast<int8_t>(b[diff]);
118}
119
120inline uint64_t first_zero(epu8 v, int bnd) noexcept {
121 return first_mask(v == epu8{}, bnd);
122}
123inline uint64_t last_zero(epu8 v, int bnd) noexcept {
124 return last_mask(v == epu8{}, bnd);
125}
126inline uint64_t first_non_zero(epu8 v, int bnd) noexcept {
127 return first_mask(v != epu8{}, bnd);
128}
129inline uint64_t last_non_zero(epu8 v, int bnd) noexcept {
130 return last_mask(v != epu8{}, bnd);
131}
132
134template <bool Increasing = true, size_t sz>
135inline epu8 network_sort(epu8 res, std::array<epu8, sz> rounds) {
136 for (auto round : rounds) {
137 // This conditional should be optimized out by the compiler
138 epu8 mask = Increasing ? round < Epu8.id() : Epu8.id() < round;
139 epu8 b = permuted(res, round);
140 // res = mask ? min(res,b) : max(res,b); is not accepted by clang
141 res = simde_mm_blendv_epi8(min(res, b), max(res, b), mask);
142 }
143 return res;
144}
145
147template <bool Increasing = true, size_t sz>
148inline epu8 network_sort_perm(epu8 &v, std::array<epu8, sz> rounds) {
149 epu8 res = Epu8.id();
150 for (auto round : rounds) {
151 // This conditional should be optimized out by the compiler
152 epu8 mask = Increasing ? round < Epu8.id() : Epu8.id() < round;
153 epu8 b = permuted(v, round);
154 epu8 cmp = simde_mm_blendv_epi8(b < v, v < b, mask);
155 v = simde_mm_blendv_epi8(v, b, cmp);
156 res = simde_mm_blendv_epi8(res, permuted(res, round), cmp);
157 }
158 return res;
159}
160
167constexpr std::array<epu8, 9> sorting_rounds
168 // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
169 {{epu8{1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14},
170 epu8{2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13},
171 epu8{4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11},
172 epu8{8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7},
173 epu8{0, 2, 1, 12, 8, 10, 9, 11, 4, 6, 5, 7, 3, 14, 13, 15},
174 epu8{0, 4, 8, 10, 1, 9, 12, 13, 2, 5, 3, 14, 6, 7, 11, 15},
175 epu8{0, 1, 4, 5, 2, 3, 8, 9, 6, 7, 12, 13, 10, 11, 14, 15},
176 epu8{0, 1, 2, 6, 4, 8, 3, 10, 5, 12, 7, 11, 9, 13, 14, 15},
177 epu8{0, 1, 2, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 13, 14, 15}}};
178
187constexpr std::array<epu8, 6> sorting_rounds8
188 // clang-format off
189 // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
190{{
191 epu8 { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14},
192 epu8 { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13},
193 epu8 { 0, 2, 1, 3, 4, 6, 5, 7, 8, 10, 9, 11, 12, 14, 13, 15},
194 epu8 { 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11},
195 epu8 { 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15},
196 epu8 { 0, 2, 1, 4, 3, 6, 5, 7, 8, 10, 9, 12, 11, 14, 13, 15}
197}};
198// clang-format on
199
200inline bool is_sorted(epu8 a) noexcept {
201 return simde_mm_movemask_epi8(shifted_right(a) > a) == 0;
202}
203inline epu8 sorted(epu8 a) noexcept {
204 return network_sort<true>(a, sorting_rounds);
205}
206inline epu8 sorted8(epu8 a) noexcept {
207 return network_sort<true>(a, sorting_rounds8);
208}
209inline epu8 revsorted(epu8 a) noexcept {
210 return network_sort<false>(a, sorting_rounds);
211}
212inline epu8 revsorted8(epu8 a) noexcept {
213 return network_sort<false>(a, sorting_rounds8);
214}
215
216inline epu8 sort_perm(epu8 &a) noexcept {
217 return network_sort_perm<true>(a, sorting_rounds);
218}
219inline epu8 sort8_perm(epu8 &a) noexcept {
220 return network_sort_perm<true>(a, sorting_rounds8);
221}
222
223constexpr std::array<epu8, 6> merge_rounds
224 // clang-format off
225 // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
226{{
227 epu8 { 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7},
228 epu8 { 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11},
229 epu8 { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13},
230 epu8 { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14},
231}};
232// clang-format on
233inline void merge_rev(epu8 &a, epu8 &b) noexcept {
234 epu8 mn = min(a, b);
235 b = max(a, b);
236 a = mn;
237 a = network_sort<true>(a, merge_rounds);
238 b = network_sort<true>(b, merge_rounds);
239}
240inline void merge(epu8 &a, epu8 &b) noexcept {
241 a = permuted(a, Epu8.rev());
242 merge_rev(a, b);
243}
244// TODO : AVX2 version.
245// TODO : compute merge_rounds on the fly instead of loading those from
246// memory
247
248inline epu8 random_epu8(uint16_t bnd) {
249 epu8 res;
250
251 static std::random_device rd;
252 static std::default_random_engine e1(rd());
253 std::uniform_int_distribution<int> uniform_dist(0, bnd - 1);
254
255 for (size_t i = 0; i < 16; i++)
256 res[i] = uniform_dist(e1);
257 return res;
258}
259
260inline epu8 remove_dups(epu8 v, uint8_t repl) noexcept {
261 // Vector ternary operator is not supported by clang.
262 // return (v != shifted_right(v) ? v : Epu8(repl);
263 return simde_mm_blendv_epi8(Epu8(repl), v, v != shifted_right(v));
264}
265
266// Gather at the front numbers with (3-i)-th bit not set.
267constexpr std::array<epu8, 3> inverting_rounds{{
268 // clang-format off
269 // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
270 epu8 { 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15},
271 epu8 { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15},
272 epu8 { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15}
273 // clang-format on
274}};
275
276#ifdef SIMDE_X86_SSE4_2_NATIVE
277#define FIND_IN_VECT \
278 (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_ANY | SIMDE_SIDD_UNIT_MASK | \
279 SIMDE_SIDD_NEGATIVE_POLARITY)
280#define FIND_IN_VECT_COMPL \
281 (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_ANY | SIMDE_SIDD_UNIT_MASK)
282
283inline epu8 permutation_of_cmpestrm(epu8 a, epu8 b) noexcept {
284 epu8 res = -static_cast<epu8>(_mm_cmpestrm(a, 8, b, 16, FIND_IN_VECT));
285 for (epu8 round : inverting_rounds) {
286 a = permuted(a, round);
287 res <<= 1;
288 res -= static_cast<epu8>(_mm_cmpestrm(a, 8, b, 16, FIND_IN_VECT));
289 }
290 return res;
291}
292#endif
293
294inline epu8 permutation_of_ref(epu8 a, epu8 b) noexcept {
295 auto ar = as_array(a);
296 epu8 res{};
297 for (size_t i = 0; i < 16; i++) {
298 res[i] =
299 std::distance(ar.begin(), std::find(ar.begin(), ar.end(), b[i]));
300 }
301 return res;
302}
303inline epu8 permutation_of(epu8 a, epu8 b) noexcept {
304#ifdef SIMDE_X86_SSE4_2_NATIVE
305 return permutation_of_cmpestrm(a, b);
306#else
307 return permutation_of_ref(a, b);
308#endif
309}
310
311#if defined(FF)
312#error FF is defined !
313#endif /* FF */
314#define FF 0xff
315
317constexpr std::array<epu8, 4> summing_rounds{{
318 // clang-format off
319 // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
320 epu8 { FF, 0, FF, 2, FF, 4, FF, 6, FF, 8, FF, 10, FF, 12, FF, 14},
321 epu8 { FF, FF, 1, 1, FF, FF, 5, 5, FF, FF, 9, 9, FF, FF, 13, 13},
322 epu8 { FF, FF, FF, FF, 3, 3, 3, 3, FF, FF, FF, FF, 11, 11, 11, 11},
323 epu8 { FF, FF, FF, FF, FF, FF, FF, FF, 7, 7, 7, 7, 7, 7, 7, 7}
324 // clang-format on
325}};
326
327constexpr std::array<epu8, 4> mining_rounds{{
328 // clang-format off
329 // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
330 epu8 { 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14},
331 epu8 { 0, 1, 1, 1, 4, 5, 5, 5, 8, 9, 9, 9, 12, 13, 13, 13},
332 epu8 { 0, 1, 2, 3, 3, 3, 3, 3, 8, 9, 10, 11, 11, 11, 11, 11},
333 epu8 { 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7}
334 // clang-format on
335}};
336
337#undef FF
338
339inline uint8_t horiz_sum_ref(epu8 v) noexcept {
340 uint8_t res = 0;
341 for (size_t i = 0; i < 16; i++)
342 res += v[i];
343 return res;
344}
345inline uint8_t horiz_sum_gen(epu8 v) noexcept {
346 return as_VectGeneric(v).horiz_sum();
347}
348inline uint8_t horiz_sum4(epu8 v) noexcept { return partial_sums_round(v)[15]; }
349inline uint8_t horiz_sum3(epu8 v) noexcept {
350 auto sr = summing_rounds;
351 v += permuted(v, sr[0]);
352 v += permuted(v, sr[1]);
353 v += permuted(v, sr[2]);
354 return v[7] + v[15];
355}
356
357inline epu8 partial_sums_ref(epu8 v) noexcept {
358 epu8 res{};
359 res[0] = v[0];
360 for (size_t i = 1; i < 16; i++)
361 res[i] = res[i - 1] + v[i];
362 return res;
363}
364inline epu8 partial_sums_gen(epu8 v) noexcept {
365 as_VectGeneric(v).partial_sums_inplace();
366 return v;
367}
368inline epu8 partial_sums_round(epu8 v) noexcept {
369 for (epu8 round : summing_rounds)
370 v += permuted(v, round);
371 return v;
372}
373
374inline uint8_t horiz_max_ref(epu8 v) noexcept {
375 uint8_t res = 0;
376 for (size_t i = 0; i < 16; i++)
377 res = std::max(res, v[i]);
378 return res;
379}
380inline uint8_t horiz_max_gen(epu8 v) noexcept {
381 return as_VectGeneric(v).horiz_max();
382}
383inline uint8_t horiz_max4(epu8 v) noexcept { return partial_max_round(v)[15]; }
384inline uint8_t horiz_max3(epu8 v) noexcept {
385 auto sr = summing_rounds;
386 v = max(v, permuted(v, sr[0]));
387 v = max(v, permuted(v, sr[1]));
388 v = max(v, permuted(v, sr[2]));
389 return std::max(v[7], v[15]);
390}
391
392inline epu8 partial_max_ref(epu8 v) noexcept {
393 epu8 res;
394 res[0] = v[0];
395 for (size_t i = 1; i < 16; i++)
396 res[i] = std::max(res[i - 1], v[i]);
397 return res;
398}
399inline epu8 partial_max_gen(epu8 v) noexcept {
400 as_VectGeneric(v).partial_max_inplace();
401 return v;
402}
403inline epu8 partial_max_round(epu8 v) noexcept {
404 for (epu8 round : summing_rounds)
405 v = max(v, permuted(v, round));
406 return v;
407}
408
409inline uint8_t horiz_min_ref(epu8 v) noexcept {
410 uint8_t res = 255;
411 for (size_t i = 0; i < 16; i++)
412 res = std::min(res, v[i]);
413 return res;
414}
415inline uint8_t horiz_min_gen(epu8 v) noexcept {
416 return as_VectGeneric(v).horiz_min();
417}
418inline uint8_t horiz_min4(epu8 v) noexcept { return partial_min_round(v)[15]; }
419inline uint8_t horiz_min3(epu8 v) noexcept {
420 auto sr = mining_rounds;
421 v = min(v, permuted(v, sr[0]));
422 v = min(v, permuted(v, sr[1]));
423 v = min(v, permuted(v, sr[2]));
424 return std::min(v[7], v[15]);
425}
426
427inline epu8 partial_min_ref(epu8 v) noexcept {
428 epu8 res;
429 res[0] = v[0];
430 for (size_t i = 1; i < 16; i++)
431 res[i] = std::min(res[i - 1], v[i]);
432 return res;
433}
434inline epu8 partial_min_gen(epu8 v) noexcept {
435 as_VectGeneric(v).partial_min_inplace();
436 return v;
437}
438inline epu8 partial_min_round(epu8 v) noexcept {
439 for (epu8 round : mining_rounds)
440 v = min(v, permuted(v, round));
441 return v;
442}
443
444inline epu8 eval16_ref(epu8 v) noexcept {
445 epu8 res{};
446 for (size_t i = 0; i < 16; i++)
447 if (v[i] < 16)
448 res[v[i]]++;
449 return res;
450}
451
452inline epu8 eval16_arr(epu8 v8) noexcept {
453 decltype(Epu8)::array res{};
454 auto v = as_array(v8);
455 for (size_t i = 0; i < 16; i++)
456 if (v[i] < 16)
457 res[v[i]]++;
458 return Epu8(res);
459}
460inline epu8 eval16_gen(epu8 v) noexcept {
461 return Epu8(as_VectGeneric(v).eval().v);
462}
463inline epu8 eval16_cycle(epu8 v) noexcept {
464 epu8 res = -(Epu8.id() == v);
465 for (int i = 1; i < 16; i++) {
466 v = permuted(v, Epu8.left_cycle());
467 res -= (Epu8.id() == v);
468 }
469 return res;
470}
471inline epu8 eval16_popcount(epu8 v) noexcept {
472 epu8 res{};
473 for (size_t i = 0; i < 16; i++) {
474 res[i] =
475 __builtin_popcountl(simde_mm_movemask_epi8(v == Epu8(uint8_t(i))));
476 }
477 return res;
478}
479
480inline epu8 popcount16(epu8 v) noexcept {
481 return (permuted(Epu8.popcount(), v & Epu8(0x0f)) +
482 permuted(Epu8.popcount(), v >> 4));
483}
484
485inline bool is_partial_transformation(epu8 v, const size_t k) noexcept {
486 uint64_t diff = last_diff(v, Epu8.id(), 16);
487 // (forall x in v, x + 1 <= 16) and
488 // (v = Perm16::one() or last diff index < 16)
489 return (simde_mm_movemask_epi8(v + Epu8(1) <= Epu8(0x10)) == 0xffff) &&
490 (diff == 16 || diff < k);
491}
492
493inline bool is_transformation(epu8 v, const size_t k) noexcept {
494 uint64_t diff = last_diff(v, Epu8.id(), 16);
495 return (simde_mm_movemask_epi8(v < Epu8(0x10)) == 0xffff) &&
496 (diff == 16 || diff < k);
497}
498
499inline bool is_partial_permutation(epu8 v, const size_t k) noexcept {
500 uint64_t diff = last_diff(v, Epu8.id(), 16);
501 // (forall x in v, x <= 15) and
502 // (forall x < 15, multiplicity x v <= 1
503 // (v = Perm16::one() or last diff index < 16)
504 return (simde_mm_movemask_epi8(v + Epu8(1) <= Epu8(0x10)) == 0xffff) &&
505 (simde_mm_movemask_epi8(eval16(v) <= Epu8(1)) == 0xffff) &&
506 (diff == 16 || diff < k);
507}
508
509#ifdef SIMDE_X86_SSE4_2_NATIVE
510inline bool is_permutation_cmpestri(epu8 v, const size_t k) noexcept {
511 uint64_t diff = last_diff(v, Epu8.id(), 16);
512 // (forall x in v, x in Perm16::one()) and
513 // (forall x in Perm16::one(), x in v) and
514 // (v = Perm16::one() or last diff index < 16)
515 return _mm_cmpestri(Epu8.id(), 16, v, 16, FIRST_NON_ZERO) == 16 &&
516 _mm_cmpestri(v, 16, Epu8.id(), 16, FIRST_NON_ZERO) == 16 &&
517 (diff == 16 || diff < k);
518}
519#endif
520
521inline bool is_permutation_sort(epu8 v, const size_t k) noexcept {
522 uint64_t diff = last_diff(v, Epu8.id(), 16);
523 return equal(sorted(v), Epu8.id()) && (diff == 16 || diff < k);
524}
525inline bool is_permutation_eval(epu8 v, const size_t k) noexcept {
526 uint64_t diff = last_diff(v, Epu8.id(), 16);
527 return equal(eval16(v), Epu8({}, 1)) && (diff == 16 || diff < k);
528}
529
530inline bool is_permutation(epu8 v, const size_t k) noexcept {
531#ifdef SIMDE_X86_SSE4_2_NATIVE
532 return is_permutation_cmpestri(v, k);
533#else
534 return is_permutation_sort(v, k);
535#endif
536}
537
538} // namespace HPCombi
539
540namespace std {
541
542inline std::ostream &operator<<(std::ostream &stream, HPCombi::epu8 const &a) {
543 stream << "{" << std::setw(2) << unsigned(a[0]);
544 for (unsigned i = 1; i < 16; ++i)
545 stream << "," << std::setw(2) << unsigned(a[i]);
546 stream << "}";
547 return stream;
548}
549
550inline std::string to_string(HPCombi::epu8 const &a) {
551 std::ostringstream ss;
552 ss << a;
553 return ss.str();
554}
555
556template <> struct equal_to<HPCombi::epu8> {
557 bool operator()(const HPCombi::epu8 &lhs,
558 const HPCombi::epu8 &rhs) const noexcept {
559 return HPCombi::equal(lhs, rhs);
560 }
561};
562
563template <> struct not_equal_to<HPCombi::epu8> {
564 bool operator()(const HPCombi::epu8 &lhs,
565 const HPCombi::epu8 &rhs) const noexcept {
566 return HPCombi::not_equal(lhs, rhs);
567 }
568};
569
570template <> struct hash<HPCombi::epu8> {
571 inline size_t operator()(HPCombi::epu8 a) const noexcept {
572 unsigned __int128 v0 = simde_mm_extract_epi64(a, 0);
573 unsigned __int128 v1 = simde_mm_extract_epi64(a, 1);
574 return ((v1 * HPCombi::prime + v0) * HPCombi::prime) >> 64;
575
576 /* The following is extremely slow on Renner benchmark
577 uint64_t v0 = simde_mm_extract_epi64(ar.v, 0);
578 uint64_t v1 = simde_mm_extract_epi64(ar.v, 1);
579 size_t seed = v0 + 0x9e3779b9;
580 seed ^= v1 + 0x9e3779b9 + (seed<<6) + (seed>>2);
581 return seed;
582 */
583 }
584};
585
586template <> struct less<HPCombi::epu8> {
587 // WARNING: due to endianness this is not lexicographic comparison,
588 // but we don't care when using in std::set.
589 // 10% faster than calling the lexicographic comparison operator !
590 inline size_t operator()(const HPCombi::epu8 &v1,
591 const HPCombi::epu8 &v2) const noexcept {
592 simde__m128 v1v = simde__m128(v1), v2v = simde__m128(v2);
593 return v1v[0] == v2v[0] ? v1v[1] < v2v[1] : v1v[0] < v2v[0];
594 }
595};
596
597} // namespace std
#define FF
Definition bmat8_impl.hpp:246
std::array< std::tuple< uint16_t, uint16_t, std::array< uint16_t, gens.size()> >, 65536 > res
Definition image.cpp:66
Definition bmat8.hpp:41
uint8_t horiz_min4(epu8) noexcept
Horizontal sum of a HPCombi::epu8.
Definition epu8_impl.hpp:418
epu8 max(epu8 a, epu8 b) noexcept
Vector max between two HPCombi::epu8 0.
Definition epu8.hpp:91
uint64_t last_diff_ref(epu8 a, epu8 b, size_t bound=16) noexcept
The last difference between two HPCombi::epu8.
Definition epu8_impl.hpp:92
uint64_t first_non_zero(epu8 v, int bnd) noexcept
return the index of the first non zero entry or 16 if there are none Only index smaller than bound ar...
Definition epu8_impl.hpp:126
uint8_t horiz_min_ref(epu8) noexcept
Horizontal sum of a HPCombi::epu8.
Definition epu8_impl.hpp:409
epu8 eval16_arr(epu8 v) noexcept
Evaluation of a HPCombi::epu8.
Definition epu8_impl.hpp:452
epu8 permuted(epu8 a, epu8 b) noexcept
Permuting a HPCombi::epu8.
Definition epu8.hpp:72
epu8 sort8_perm(epu8 &a) noexcept
Sort this and return the sorting permutation.
Definition epu8_impl.hpp:219
epu8 shifted_right(epu8 a) noexcept
Left shifted of a HPCombi::epu8 inserting a 0.
Definition epu8.hpp:78
uint64_t first_diff_ref(epu8 a, epu8 b, size_t bound=16) noexcept
The first difference between two HPCombi::epu8.
Definition epu8_impl.hpp:77
uint64_t first_diff_mask(epu8 a, epu8 b, size_t bound=16) noexcept
The first difference between two HPCombi::epu8.
Definition epu8_impl.hpp:88
epu8 partial_sums_ref(epu8) noexcept
Horizontal partial sum of a HPCombi::epu8.
Definition epu8_impl.hpp:357
epu8 eval16_gen(epu8 v) noexcept
Definition epu8_impl.hpp:460
epu8 network_sort(epu8 res, std::array< epu8, sz > rounds)
Apply a sorting network.
Definition epu8_impl.hpp:135
epu8 remove_dups(epu8 a, uint8_t repl=0) noexcept
Remove duplicates in a sorted HPCombi::epu8.
Definition epu8_impl.hpp:260
epu8 revsorted8(epu8 a) noexcept
Return a HPCombi::epu8 with the two half reverse sorted.
Definition epu8_impl.hpp:212
epu8 permutation_of(epu8 a, epu8 b) noexcept
Find if a vector is a permutation of one other.
Definition epu8_impl.hpp:303
bool is_permutation(epu8 v, const size_t k=16) noexcept
Definition epu8_impl.hpp:530
void merge_rev(epu8 &a, epu8 &b) noexcept
Definition epu8_impl.hpp:233
constexpr std::array< epu8, 4 > summing_rounds
Permutation Round for partial and horizontal sums.
Definition epu8_impl.hpp:317
int8_t less_partial(epu8 a, epu8 b, int k) noexcept
Partial lexicographic comparison between two HPCombi::epu8.
Definition epu8_impl.hpp:113
uint8_t horiz_max4(epu8) noexcept
Horizontal sum of a HPCombi::epu8.
Definition epu8_impl.hpp:383
uint64_t last_diff_mask(epu8 a, epu8 b, size_t bound=16) noexcept
The last difference between two HPCombi::epu8.
Definition epu8_impl.hpp:105
uint8_t horiz_max3(epu8) noexcept
Horizontal sum of a HPCombi::epu8.
Definition epu8_impl.hpp:384
constexpr uint64_t prime
A prime number good for hashing.
Definition epu8.hpp:164
uint8_t horiz_min_gen(epu8) noexcept
Horizontal sum of a HPCombi::epu8.
Definition epu8_impl.hpp:415
bool is_partial_permutation(epu8 v, const size_t k=16) noexcept
Test for partial permutations.
Definition epu8_impl.hpp:499
bool is_permutation_sort(epu8 v, const size_t k=16) noexcept
Definition epu8_impl.hpp:521
uint64_t last_zero(epu8 v, int bnd) noexcept
return the index of the last zero entry or 16 if there are none Only index smaller than bound are tak...
Definition epu8_impl.hpp:123
void merge(epu8 &a, epu8 &b) noexcept
Merge two sorted epu8.
Definition epu8_impl.hpp:240
uint8_t horiz_sum4(epu8) noexcept
Horizontal sum of a HPCombi::epu8.
Definition epu8_impl.hpp:348
epu8 popcount16(epu8 v) noexcept
a vector popcount function
Definition epu8_impl.hpp:480
epu8 partial_sums_round(epu8) noexcept
Horizontal partial sum of a HPCombi::epu8.
Definition epu8_impl.hpp:368
uint8_t horiz_sum3(epu8) noexcept
Horizontal sum of a HPCombi::epu8.
Definition epu8_impl.hpp:349
uint8_t horiz_sum_ref(epu8) noexcept
Horizontal sum of a HPCombi::epu8.
Definition epu8_impl.hpp:339
epu8 partial_sums_gen(epu8) noexcept
Horizontal partial sum of a HPCombi::epu8.
Definition epu8_impl.hpp:364
bool equal(epu8 a, epu8 b) noexcept
Equality of HPCombi::epu8.
Definition epu8.hpp:63
epu8 permutation_of_ref(epu8 a, epu8 b) noexcept
Find if a vector is a permutation of one other.
Definition epu8_impl.hpp:294
epu8 min(epu8 a, epu8 b) noexcept
Vector min between two HPCombi::epu8 0.
Definition epu8.hpp:89
uint64_t first_mask(epu8 msk, size_t bound)
Definition epu8_impl.hpp:68
epu8 partial_max_ref(epu8) noexcept
Horizontal partial sum of a HPCombi::epu8.
Definition epu8_impl.hpp:392
epu8 sorted8(epu8 a) noexcept
Return a HPCombi::epu8 with the two half sorted.
Definition epu8_impl.hpp:206
epu8 partial_max_round(epu8) noexcept
Horizontal partial sum of a HPCombi::epu8.
Definition epu8_impl.hpp:403
epu8 eval16(epu8 v) noexcept
Evaluation of a HPCombi::epu8.
Definition epu8.hpp:408
epu8 network_sort_perm(epu8 &v, std::array< epu8, sz > rounds)
Apply a sorting network in place and return the permutation.
Definition epu8_impl.hpp:148
constexpr std::array< epu8, 6 > sorting_rounds8
A duplicated 8-way sorting network.
Definition epu8_impl.hpp:190
epu8 eval16_cycle(epu8 v) noexcept
Evaluation of a HPCombi::epu8.
Definition epu8_impl.hpp:463
constexpr std::array< epu8, 9 > sorting_rounds
A 16-way sorting network.
Definition epu8_impl.hpp:169
epu8 eval16_ref(epu8 v) noexcept
Evaluation of a HPCombi::epu8.
Definition epu8_impl.hpp:444
epu8 partial_max_gen(epu8) noexcept
Horizontal partial sum of a HPCombi::epu8.
Definition epu8_impl.hpp:399
bool less(epu8 a, epu8 b) noexcept
Lexicographic comparison between two HPCombi::epu8.
Definition epu8_impl.hpp:109
uint8_t horiz_max_ref(epu8) noexcept
Horizontal sum of a HPCombi::epu8.
Definition epu8_impl.hpp:374
epu8 sorted(epu8 a) noexcept
Return a sorted HPCombi::epu8.
Definition epu8_impl.hpp:203
uint64_t last_mask(epu8 msk, size_t bound)
Definition epu8_impl.hpp:72
constexpr std::array< epu8, 4 > mining_rounds
Definition epu8_impl.hpp:327
constexpr TPUBuild< epu8 > Epu8
Factory object acting as a class constructor for type HPCombi::epu8.
Definition epu8.hpp:53
bool is_permutation_eval(epu8 v, const size_t k=16) noexcept
Definition epu8_impl.hpp:525
VectGeneric< TPUBuild< TPU >::size > & as_VectGeneric(TPU &v)
Cast a HPCombi::epu8 to a c++ HPCombi::VectGeneric.
Definition builder.hpp:156
uint64_t first_zero(epu8 v, int bnd) noexcept
return the index of the first zero entry or 16 if there are none Only index smaller than bound are ta...
Definition epu8_impl.hpp:120
epu8 random_epu8(uint16_t bnd)
A random HPCombi::epu8.
Definition epu8_impl.hpp:248
epu8 revsorted(epu8 a) noexcept
Return a reverse sorted HPCombi::epu8.
Definition epu8_impl.hpp:209
epu8 partial_min_round(epu8) noexcept
Horizontal partial sum of a HPCombi::epu8.
Definition epu8_impl.hpp:438
bool is_sorted(epu8 a) noexcept
Testing if a HPCombi::epu8 is sorted.
Definition epu8_impl.hpp:200
uint8_t __attribute__((vector_size(16))) epu8
SIMD vector of 16 unsigned bytes.
Definition epu8.hpp:45
constexpr std::array< epu8, 3 > inverting_rounds
Definition epu8_impl.hpp:267
uint64_t last_non_zero(epu8 v, int bnd) noexcept
return the index of the last non zero entry or 16 if there are none Only index smaller than bound are...
Definition epu8_impl.hpp:129
uint64_t last_diff(epu8 a, epu8 b, size_t bound=16) noexcept
The last difference between two HPCombi::epu8.
Definition epu8.hpp:486
epu8 eval16_popcount(epu8 v) noexcept
Evaluation of a HPCombi::epu8.
Definition epu8_impl.hpp:471
epu8 sort_perm(epu8 &a) noexcept
Sort this and return the sorting permutation.
Definition epu8_impl.hpp:216
epu8 partial_min_gen(epu8) noexcept
Horizontal partial sum of a HPCombi::epu8.
Definition epu8_impl.hpp:434
epu8 partial_min_ref(epu8) noexcept
Horizontal partial sum of a HPCombi::epu8.
Definition epu8_impl.hpp:427
uint8_t horiz_max_gen(epu8) noexcept
Horizontal sum of a HPCombi::epu8.
Definition epu8_impl.hpp:380
bool is_transformation(epu8 v, const size_t k=16) noexcept
Test for transformation.
Definition epu8_impl.hpp:493
uint64_t first_diff(epu8 a, epu8 b, size_t bound=16) noexcept
The first difference between two HPCombi::epu8.
Definition epu8.hpp:446
bool is_partial_transformation(epu8 v, const size_t k=16) noexcept
Test for partial transformation.
Definition epu8_impl.hpp:485
epu8 permuted_ref(epu8 a, epu8 b) noexcept
Permuting a HPCombi::epu8.
Definition epu8_impl.hpp:59
constexpr std::array< epu8, 6 > merge_rounds
Definition epu8_impl.hpp:226
TPUBuild< TPU >::array & as_array(TPU &v) noexcept
Cast a TPU to a c++ std::array.
Definition builder.hpp:139
uint8_t horiz_sum_gen(epu8) noexcept
Horizontal sum of a HPCombi::epu8.
Definition epu8_impl.hpp:345
uint8_t horiz_min3(epu8) noexcept
Horizontal sum of a HPCombi::epu8.
Definition epu8_impl.hpp:419
bool not_equal(epu8 a, epu8 b) noexcept
Non equality of HPCombi::epu8.
Definition epu8.hpp:67
Definition bmat8.hpp:364
std::ostream & operator<<(std::ostream &os, HPCombi::BMat8 const &bm)
Definition bmat8_impl.hpp:508
std::string to_string(HPCombi::epu8 const &a)
Definition epu8_impl.hpp:550
bool operator()(const HPCombi::epu8 &lhs, const HPCombi::epu8 &rhs) const noexcept
Definition epu8_impl.hpp:557
size_t operator()(HPCombi::epu8 a) const noexcept
Definition epu8_impl.hpp:571
size_t operator()(const HPCombi::epu8 &v1, const HPCombi::epu8 &v2) const noexcept
Definition epu8_impl.hpp:590
bool operator()(const HPCombi::epu8 &lhs, const HPCombi::epu8 &rhs) const noexcept
Definition epu8_impl.hpp:564