HPCombi
High Performance Combinatorics in C++ using vector instructions v1.0.3
Loading...
Searching...
No Matches
epu8_impl.hpp
Go to the documentation of this file.
1//****************************************************************************//
2// Copyright (C) 2016-2024 Florent Hivert <Florent.Hivert@lisn.fr>, //
3// //
4// This file is part of HP-Combi <https://github.com/libsemigroups/HPCombi> //
5// //
6// HP-Combi is free software: you can redistribute it and/or modify it //
7// under the terms of the GNU General Public License as published by the //
8// Free Software Foundation, either version 3 of the License, or //
9// (at your option) any later version. //
10// //
11// HP-Combi is distributed in the hope that it will be useful, but WITHOUT //
12// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or //
13// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License //
14// for more details. //
15// //
16// You should have received a copy of the GNU General Public License along //
17// with HP-Combi. If not, see <https://www.gnu.org/licenses/>. //
18//****************************************************************************//
19
20// NOLINT(build/header_guard)
21
25
26#include <initializer_list>
27#include <iostream>
28#include <random>
29#include <sstream>
30
31#include "vect_generic.hpp"
32
33#ifdef SIMDE_X86_SSE4_2_NATIVE
34// Comparison mode for _mm_cmpestri
35#define FIRST_DIFF \
36 (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_EACH | \
37 SIMDE_SIDD_NEGATIVE_POLARITY)
38#define LAST_DIFF \
39 (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_EACH | \
40 SIMDE_SIDD_NEGATIVE_POLARITY | SIMDE_SIDD_MOST_SIGNIFICANT)
41#define FIRST_ZERO (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_ANY)
42#define LAST_ZERO \
43 (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_ANY | \
44 SIMDE_SIDD_MOST_SIGNIFICANT)
45#define FIRST_NON_ZERO \
46 (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_ANY | \
47 SIMDE_SIDD_MASKED_NEGATIVE_POLARITY)
48#define LAST_NON_ZERO \
49 (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_ANY | \
50 SIMDE_SIDD_MASKED_NEGATIVE_POLARITY | SIMDE_SIDD_MOST_SIGNIFICANT)
51#endif
52
53namespace HPCombi {
54
56// Implementation part for inline functions
58
60inline epu8 permuted_ref(epu8 a, epu8 b) noexcept {
61 epu8 res;
62 for (uint64_t i = 0; i < 16; i++)
63 res[i] = a[b[i] & 0xF];
64 return res;
65}
66
67// Msk is supposed to be a boolean mask (i.e. each entry is either 0 or
68// 255)
69inline uint64_t first_mask(epu8 msk, size_t bound) {
70 uint64_t res = simde_mm_movemask_epi8(msk & (Epu8.id() < Epu8(bound)));
71 return res == 0 ? 16 : (__builtin_ffsll(res) - 1);
72}
73inline uint64_t last_mask(epu8 msk, size_t bound) {
74 auto res = simde_mm_movemask_epi8(msk & (Epu8.id() < Epu8(bound)));
75 return res == 0 ? 16 : (63 - __builtin_clzll(res));
76}
77
78inline uint64_t first_diff_ref(epu8 a, epu8 b, size_t bound) noexcept {
79 for (size_t i = 0; i < bound; i++)
80 if (a[i] != b[i])
81 return i;
82 return 16;
83}
84#ifdef SIMDE_X86_SSE4_2_NATIVE
85inline uint64_t first_diff_cmpstr(epu8 a, epu8 b, size_t bound) noexcept {
86 return unsigned(_mm_cmpestri(a, bound, b, bound, FIRST_DIFF));
87}
88#endif
89inline uint64_t first_diff_mask(epu8 a, epu8 b, size_t bound) noexcept {
90 return first_mask(a != b, bound);
91}
92
93inline uint64_t last_diff_ref(epu8 a, epu8 b, size_t bound) noexcept {
94 while (bound != 0) {
95 --bound;
96 if (a[bound] != b[bound])
97 return bound;
98 }
99 return 16;
100}
101#ifdef SIMDE_X86_SSE4_2_NATIVE
102inline uint64_t last_diff_cmpstr(epu8 a, epu8 b, size_t bound) noexcept {
103 return unsigned(_mm_cmpestri(a, bound, b, bound, LAST_DIFF));
104}
105#endif
106inline uint64_t last_diff_mask(epu8 a, epu8 b, size_t bound) noexcept {
107 return last_mask(a != b, bound);
108}
109
110inline bool less(epu8 a, epu8 b) noexcept {
111 uint64_t diff = first_diff(a, b);
112 return (diff < 16) && (a[diff] < b[diff]);
113}
114inline int8_t less_partial(epu8 a, epu8 b, int k) noexcept {
115 uint64_t diff = first_diff(a, b, k);
116 return (diff == 16)
117 ? 0
118 : static_cast<int8_t>(a[diff]) - static_cast<int8_t>(b[diff]);
119}
120
121inline uint64_t first_zero(epu8 v, int bnd) noexcept {
122 return first_mask(v == epu8{}, bnd);
123}
124inline uint64_t last_zero(epu8 v, int bnd) noexcept {
125 return last_mask(v == epu8{}, bnd);
126}
127inline uint64_t first_non_zero(epu8 v, int bnd) noexcept {
128 return first_mask(v != epu8{}, bnd);
129}
130inline uint64_t last_non_zero(epu8 v, int bnd) noexcept {
131 return last_mask(v != epu8{}, bnd);
132}
133
135template <bool Increasing = true, size_t sz>
136inline epu8 network_sort(epu8 res, std::array<epu8, sz> rounds) {
137 for (auto round : rounds) {
138 // This conditional should be optimized out by the compiler
139 epu8 mask = Increasing ? round < Epu8.id() : Epu8.id() < round;
140 epu8 b = permuted(res, round);
141 // res = mask ? min(res,b) : max(res,b); is not accepted by clang
142 res = simde_mm_blendv_epi8(min(res, b), max(res, b), mask);
143 }
144 return res;
145}
146
148template <bool Increasing = true, size_t sz>
149inline epu8 network_sort_perm(epu8 &v, std::array<epu8, sz> rounds) {
150 epu8 res = Epu8.id();
151 for (auto round : rounds) {
152 // This conditional should be optimized out by the compiler
153 epu8 mask = Increasing ? round < Epu8.id() : Epu8.id() < round;
154 epu8 b = permuted(v, round);
155 epu8 cmp = simde_mm_blendv_epi8(b < v, v < b, mask);
156 v = simde_mm_blendv_epi8(v, b, cmp);
157 res = simde_mm_blendv_epi8(res, permuted(res, round), cmp);
158 }
159 return res;
160}
161
168constexpr std::array<epu8, 9> sorting_rounds
169 // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
170 {{epu8{1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14},
171 epu8{2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13},
172 epu8{4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11},
173 epu8{8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7},
174 epu8{0, 2, 1, 12, 8, 10, 9, 11, 4, 6, 5, 7, 3, 14, 13, 15},
175 epu8{0, 4, 8, 10, 1, 9, 12, 13, 2, 5, 3, 14, 6, 7, 11, 15},
176 epu8{0, 1, 4, 5, 2, 3, 8, 9, 6, 7, 12, 13, 10, 11, 14, 15},
177 epu8{0, 1, 2, 6, 4, 8, 3, 10, 5, 12, 7, 11, 9, 13, 14, 15},
178 epu8{0, 1, 2, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 13, 14, 15}}};
179
188constexpr std::array<epu8, 6> sorting_rounds8
189 // clang-format off
190 // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
191{{
192 epu8 { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14},
193 epu8 { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13},
194 epu8 { 0, 2, 1, 3, 4, 6, 5, 7, 8, 10, 9, 11, 12, 14, 13, 15},
195 epu8 { 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11},
196 epu8 { 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15},
197 epu8 { 0, 2, 1, 4, 3, 6, 5, 7, 8, 10, 9, 12, 11, 14, 13, 15}
198}};
199// clang-format on
200
201inline bool is_sorted(epu8 a) noexcept {
202 return simde_mm_movemask_epi8(shifted_right(a) > a) == 0;
203}
204inline epu8 sorted(epu8 a) noexcept {
206}
207inline epu8 sorted8(epu8 a) noexcept {
209}
210inline epu8 revsorted(epu8 a) noexcept {
212}
213inline epu8 revsorted8(epu8 a) noexcept {
215}
216
217inline epu8 sort_perm(epu8 &a) noexcept {
219}
220inline epu8 sort8_perm(epu8 &a) noexcept {
222}
223
224constexpr std::array<epu8, 6> merge_rounds
225 // clang-format off
226 // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
227{{
228 epu8 { 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7},
229 epu8 { 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11},
230 epu8 { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13},
231 epu8 { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14},
232}};
233// clang-format on
234inline void merge_rev(epu8 &a, epu8 &b) noexcept {
235 epu8 mn = min(a, b);
236 b = max(a, b);
237 a = mn;
240}
241inline void merge(epu8 &a, epu8 &b) noexcept {
242 a = permuted(a, Epu8.rev());
243 merge_rev(a, b);
244}
245// TODO : AVX2 version.
246// TODO : compute merge_rounds on the fly instead of loading those from
247// memory
248
249inline epu8 random_epu8(uint16_t bnd) {
250 epu8 res;
251
252 static std::random_device rd;
253 static std::default_random_engine e1(rd());
254 std::uniform_int_distribution<int> uniform_dist(0, bnd - 1);
255
256 for (size_t i = 0; i < 16; i++)
257 res[i] = uniform_dist(e1);
258 return res;
259}
260
261inline epu8 remove_dups(epu8 v, uint8_t repl) noexcept {
262 // Vector ternary operator is not supported by clang.
263 // return (v != shifted_right(v) ? v : Epu8(repl);
264 return simde_mm_blendv_epi8(Epu8(repl), v, v != shifted_right(v));
265}
266
267// Gather at the front numbers with (3-i)-th bit not set.
268constexpr std::array<epu8, 3> inverting_rounds{{
269 // clang-format off
270 // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
271 epu8 { 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15},
272 epu8 { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15},
273 epu8 { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15}
274 // clang-format on
275}};
276
277#ifdef SIMDE_X86_SSE4_2_NATIVE
278#define FIND_IN_VECT \
279 (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_ANY | SIMDE_SIDD_UNIT_MASK | \
280 SIMDE_SIDD_NEGATIVE_POLARITY)
281#define FIND_IN_VECT_COMPL \
282 (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_ANY | SIMDE_SIDD_UNIT_MASK)
283
284inline epu8 permutation_of_cmpestrm(epu8 a, epu8 b) noexcept {
285 epu8 res = -static_cast<epu8>(_mm_cmpestrm(a, 8, b, 16, FIND_IN_VECT));
286 for (epu8 round : inverting_rounds) {
287 a = permuted(a, round);
288 res <<= 1;
289 res -= static_cast<epu8>(_mm_cmpestrm(a, 8, b, 16, FIND_IN_VECT));
290 }
291 return res;
292}
293#endif
294
295inline epu8 permutation_of_ref(epu8 a, epu8 b) noexcept {
296 auto ar = as_array(a);
297 epu8 res{};
298 for (size_t i = 0; i < 16; i++) {
299 res[i] =
300 std::distance(ar.begin(), std::find(ar.begin(), ar.end(), b[i]));
301 }
302 return res;
303}
304inline epu8 permutation_of(epu8 a, epu8 b) noexcept {
305#ifdef SIMDE_X86_SSE4_2_NATIVE
306 return permutation_of_cmpestrm(a, b);
307#else
308 return permutation_of_ref(a, b);
309#endif
310}
311
312#if defined(FF)
313#error FF is defined !
314#endif /* FF */
315#define FF 0xff
316
318constexpr std::array<epu8, 4> summing_rounds{{
319 // clang-format off
320 // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
321 epu8 { FF, 0, FF, 2, FF, 4, FF, 6, FF, 8, FF, 10, FF, 12, FF, 14},
322 epu8 { FF, FF, 1, 1, FF, FF, 5, 5, FF, FF, 9, 9, FF, FF, 13, 13},
323 epu8 { FF, FF, FF, FF, 3, 3, 3, 3, FF, FF, FF, FF, 11, 11, 11, 11},
324 epu8 { FF, FF, FF, FF, FF, FF, FF, FF, 7, 7, 7, 7, 7, 7, 7, 7}
325 // clang-format on
326}};
327
328constexpr std::array<epu8, 4> mining_rounds{{
329 // clang-format off
330 // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
331 epu8 { 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14},
332 epu8 { 0, 1, 1, 1, 4, 5, 5, 5, 8, 9, 9, 9, 12, 13, 13, 13},
333 epu8 { 0, 1, 2, 3, 3, 3, 3, 3, 8, 9, 10, 11, 11, 11, 11, 11},
334 epu8 { 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7}
335 // clang-format on
336}};
337
338#undef FF
339
340inline uint8_t horiz_sum_ref(epu8 v) noexcept {
341 uint8_t res = 0;
342 for (size_t i = 0; i < 16; i++)
343 res += v[i];
344 return res;
345}
346inline uint8_t horiz_sum_gen(epu8 v) noexcept {
347 return as_VectGeneric(v).horiz_sum();
348}
349inline uint8_t horiz_sum4(epu8 v) noexcept { return partial_sums_round(v)[15]; }
350inline uint8_t horiz_sum3(epu8 v) noexcept {
351 auto sr = summing_rounds;
352 v += permuted(v, sr[0]);
353 v += permuted(v, sr[1]);
354 v += permuted(v, sr[2]);
355 return v[7] + v[15];
356}
357
358inline epu8 partial_sums_ref(epu8 v) noexcept {
359 epu8 res{};
360 res[0] = v[0];
361 for (size_t i = 1; i < 16; i++)
362 res[i] = res[i - 1] + v[i];
363 return res;
364}
365inline epu8 partial_sums_gen(epu8 v) noexcept {
366 as_VectGeneric(v).partial_sums_inplace();
367 return v;
368}
369inline epu8 partial_sums_round(epu8 v) noexcept {
370 for (epu8 round : summing_rounds)
371 v += permuted(v, round);
372 return v;
373}
374
375inline uint8_t horiz_max_ref(epu8 v) noexcept {
376 uint8_t res = 0;
377 for (size_t i = 0; i < 16; i++)
378 res = std::max(res, v[i]);
379 return res;
380}
381inline uint8_t horiz_max_gen(epu8 v) noexcept {
382 return as_VectGeneric(v).horiz_max();
383}
384inline uint8_t horiz_max4(epu8 v) noexcept { return partial_max_round(v)[15]; }
385inline uint8_t horiz_max3(epu8 v) noexcept {
386 auto sr = summing_rounds;
387 v = max(v, permuted(v, sr[0]));
388 v = max(v, permuted(v, sr[1]));
389 v = max(v, permuted(v, sr[2]));
390 return std::max(v[7], v[15]);
391}
392
393inline epu8 partial_max_ref(epu8 v) noexcept {
394 epu8 res;
395 res[0] = v[0];
396 for (size_t i = 1; i < 16; i++)
397 res[i] = std::max(res[i - 1], v[i]);
398 return res;
399}
400inline epu8 partial_max_gen(epu8 v) noexcept {
401 as_VectGeneric(v).partial_max_inplace();
402 return v;
403}
404inline epu8 partial_max_round(epu8 v) noexcept {
405 for (epu8 round : summing_rounds)
406 v = max(v, permuted(v, round));
407 return v;
408}
409
410inline uint8_t horiz_min_ref(epu8 v) noexcept {
411 uint8_t res = 255;
412 for (size_t i = 0; i < 16; i++)
413 res = std::min(res, v[i]);
414 return res;
415}
416inline uint8_t horiz_min_gen(epu8 v) noexcept {
417 return as_VectGeneric(v).horiz_min();
418}
419inline uint8_t horiz_min4(epu8 v) noexcept { return partial_min_round(v)[15]; }
420inline uint8_t horiz_min3(epu8 v) noexcept {
421 auto sr = mining_rounds;
422 v = min(v, permuted(v, sr[0]));
423 v = min(v, permuted(v, sr[1]));
424 v = min(v, permuted(v, sr[2]));
425 return std::min(v[7], v[15]);
426}
427
428inline epu8 partial_min_ref(epu8 v) noexcept {
429 epu8 res;
430 res[0] = v[0];
431 for (size_t i = 1; i < 16; i++)
432 res[i] = std::min(res[i - 1], v[i]);
433 return res;
434}
435inline epu8 partial_min_gen(epu8 v) noexcept {
436 as_VectGeneric(v).partial_min_inplace();
437 return v;
438}
439inline epu8 partial_min_round(epu8 v) noexcept {
440 for (epu8 round : mining_rounds)
441 v = min(v, permuted(v, round));
442 return v;
443}
444
445inline epu8 eval16_ref(epu8 v) noexcept {
446 epu8 res{};
447 for (size_t i = 0; i < 16; i++)
448 if (v[i] < 16)
449 res[v[i]]++;
450 return res;
451}
452
453inline epu8 eval16_arr(epu8 v8) noexcept {
454 decltype(Epu8)::array res{};
455 auto v = as_array(v8);
456 for (size_t i = 0; i < 16; i++)
457 if (v[i] < 16)
458 res[v[i]]++;
459 return Epu8(res);
460}
461inline epu8 eval16_gen(epu8 v) noexcept {
462 return Epu8(as_VectGeneric(v).eval().v);
463}
464inline epu8 eval16_cycle(epu8 v) noexcept {
465 epu8 res = -(Epu8.id() == v);
466 for (int i = 1; i < 16; i++) {
467 v = permuted(v, Epu8.left_cycle());
468 res -= (Epu8.id() == v);
469 }
470 return res;
471}
472inline epu8 eval16_popcount(epu8 v) noexcept {
473 epu8 res{};
474 for (size_t i = 0; i < 16; i++) {
475 res[i] =
476 __builtin_popcountl(simde_mm_movemask_epi8(v == Epu8(uint8_t(i))));
477 }
478 return res;
479}
480
481inline epu8 popcount16(epu8 v) noexcept {
482 return (permuted(Epu8.popcount(), v & Epu8(0x0f)) +
483 permuted(Epu8.popcount(), v >> 4));
484}
485
486inline bool is_partial_transformation(epu8 v, const size_t k) noexcept {
487 uint64_t diff = last_diff(v, Epu8.id(), 16);
488 // (forall x in v, x + 1 <= 16) and
489 // (v = Perm16::one() or last diff index < 16)
490 return (simde_mm_movemask_epi8(v + Epu8(1) <= Epu8(0x10)) == 0xffff) &&
491 (diff == 16 || diff < k);
492}
493
494inline bool is_transformation(epu8 v, const size_t k) noexcept {
495 uint64_t diff = last_diff(v, Epu8.id(), 16);
496 return (simde_mm_movemask_epi8(v < Epu8(0x10)) == 0xffff) &&
497 (diff == 16 || diff < k);
498}
499
500inline bool is_partial_permutation(epu8 v, const size_t k) noexcept {
501 uint64_t diff = last_diff(v, Epu8.id(), 16);
502 // (forall x in v, x <= 15) and
503 // (forall x < 15, multiplicity x v <= 1
504 // (v = Perm16::one() or last diff index < 16)
505 return (simde_mm_movemask_epi8(v + Epu8(1) <= Epu8(0x10)) == 0xffff) &&
506 (simde_mm_movemask_epi8(eval16(v) <= Epu8(1)) == 0xffff) &&
507 (diff == 16 || diff < k);
508}
509
510#ifdef SIMDE_X86_SSE4_2_NATIVE
511inline bool is_permutation_cmpestri(epu8 v, const size_t k) noexcept {
512 uint64_t diff = last_diff(v, Epu8.id(), 16);
513 // (forall x in v, x in Perm16::one()) and
514 // (forall x in Perm16::one(), x in v) and
515 // (v = Perm16::one() or last diff index < 16)
516 return _mm_cmpestri(Epu8.id(), 16, v, 16, FIRST_NON_ZERO) == 16 &&
517 _mm_cmpestri(v, 16, Epu8.id(), 16, FIRST_NON_ZERO) == 16 &&
518 (diff == 16 || diff < k);
519}
520#endif
521
522inline bool is_permutation_sort(epu8 v, const size_t k) noexcept {
523 uint64_t diff = last_diff(v, Epu8.id(), 16);
524 return equal(sorted(v), Epu8.id()) && (diff == 16 || diff < k);
525}
526inline bool is_permutation_eval(epu8 v, const size_t k) noexcept {
527 uint64_t diff = last_diff(v, Epu8.id(), 16);
528 return equal(eval16(v), Epu8({}, 1)) && (diff == 16 || diff < k);
529}
530
531inline bool is_permutation(epu8 v, const size_t k) noexcept {
532#ifdef SIMDE_X86_SSE4_2_NATIVE
533 return is_permutation_cmpestri(v, k);
534#else
535 return is_permutation_sort(v, k);
536#endif
537}
538
539} // namespace HPCombi
540
541namespace std {
542
543inline std::ostream &operator<<(std::ostream &stream, HPCombi::epu8 const &a) {
544 stream << "{" << std::setw(2) << unsigned(a[0]);
545 for (unsigned i = 1; i < 16; ++i)
546 stream << "," << std::setw(2) << unsigned(a[i]);
547 stream << "}";
548 return stream;
549}
550
551inline std::string to_string(HPCombi::epu8 const &a) {
552 std::ostringstream ss;
553 ss << a;
554 return ss.str();
555}
556
559template <> struct equal_to<HPCombi::epu8> {
560 bool operator()(const HPCombi::epu8 &lhs,
561 const HPCombi::epu8 &rhs) const noexcept {
562 return HPCombi::equal(lhs, rhs);
563 }
564};
565
568template <> struct not_equal_to<HPCombi::epu8> {
569 bool operator()(const HPCombi::epu8 &lhs,
570 const HPCombi::epu8 &rhs) const noexcept {
571 return HPCombi::not_equal(lhs, rhs);
572 }
573};
574
577template <> struct hash<HPCombi::epu8> {
578 inline size_t operator()(HPCombi::epu8 a) const noexcept {
579 unsigned __int128 v0 = simde_mm_extract_epi64(a, 0);
580 unsigned __int128 v1 = simde_mm_extract_epi64(a, 1);
581 return ((v1 * HPCombi::prime + v0) * HPCombi::prime) >> 64;
582
583 /* The following is extremely slow on Renner benchmark
584 uint64_t v0 = simde_mm_extract_epi64(ar.v, 0);
585 uint64_t v1 = simde_mm_extract_epi64(ar.v, 1);
586 size_t seed = v0 + 0x9e3779b9;
587 seed ^= v1 + 0x9e3779b9 + (seed<<6) + (seed>>2);
588 return seed;
589 */
590 }
591};
592
595template <> struct less<HPCombi::epu8> {
596 // WARNING: due to endianness this is not lexicographic comparison,
597 // but we don't care when using in std::set.
598 // 10% faster than calling the lexicographic comparison operator!
599 inline size_t operator()(const HPCombi::epu8 &v1,
600 const HPCombi::epu8 &v2) const noexcept {
601 simde__m128 v1v = simde__m128(v1), v2v = simde__m128(v2);
602 return v1v[0] == v2v[0] ? v1v[1] < v2v[1] : v1v[0] < v2v[0];
603 }
604};
605
606} // namespace std
uint8_t __attribute__((vector_size(16))) epu8
epu8 stands for Extended Packed Unsigned, grouped by 8 bits; this is the low level type chosen by Int...
Definition epu8.hpp:73
#define FF
Definition bmat8_impl.hpp:297
std::array< std::tuple< uint16_t, uint16_t, std::array< uint16_t, gens.size()> >, 65536 > res
Definition image.cpp:66
Definition bmat16.hpp:39
uint8_t horiz_min4(epu8) noexcept
Same interface as horiz_min but with a different implementation.
Definition epu8_impl.hpp:419
epu8 max(epu8 a, epu8 b) noexcept
Vector max between two HPCombi::epu8 0.
Definition epu8.hpp:125
uint64_t last_diff_ref(epu8 a, epu8 b, size_t bound=16) noexcept
Same interface as last_diff but with a different implementation.
Definition epu8_impl.hpp:93
uint64_t first_non_zero(epu8 v, int bnd) noexcept
return the index of the first non zero entry or 16 if there are none Only index smaller than bound ar...
Definition epu8_impl.hpp:127
uint8_t horiz_min_ref(epu8) noexcept
Same interface as horiz_min but with a different implementation.
Definition epu8_impl.hpp:410
epu8 eval16_arr(epu8 v) noexcept
Same interface as eval16 but with a different implementation.
Definition epu8_impl.hpp:453
epu8 permuted(epu8 a, epu8 b) noexcept
Same as permuted_ref but with an optimized implementation using intrinsics.
Definition epu8.hpp:103
epu8 sort8_perm(epu8 &a) noexcept
Sort this and return the sorting permutation.
Definition epu8_impl.hpp:220
epu8 shifted_right(epu8 a) noexcept
Left shifted of a HPCombi::epu8 inserting a 0.
Definition epu8.hpp:110
uint64_t first_diff_ref(epu8 a, epu8 b, size_t bound=16) noexcept
Same interface as first_diff but with a different implementation.
Definition epu8_impl.hpp:78
uint64_t first_diff_mask(epu8 a, epu8 b, size_t bound=16) noexcept
Same interface as first_diff but with a different implementation.
Definition epu8_impl.hpp:89
epu8 partial_sums_ref(epu8) noexcept
Same interface as partial_sums but with a different implementation.
Definition epu8_impl.hpp:358
epu8 eval16_gen(epu8 v) noexcept
Definition epu8_impl.hpp:461
epu8 network_sort(epu8 res, std::array< epu8, sz > rounds)
Apply a sorting network.
Definition epu8_impl.hpp:136
epu8 remove_dups(epu8 a, uint8_t repl=0) noexcept
Remove duplicates in a sorted HPCombi::epu8.
Definition epu8_impl.hpp:261
epu8 revsorted8(epu8 a) noexcept
Return a HPCombi::epu8 with both halves reverse sorted.
Definition epu8_impl.hpp:213
epu8 permutation_of(epu8 a, epu8 b) noexcept
Find if a vector is a permutation of another one.
Definition epu8_impl.hpp:304
bool is_permutation(epu8 v, const size_t k=16) noexcept
Definition epu8_impl.hpp:531
void merge_rev(epu8 &a, epu8 &b) noexcept
Definition epu8_impl.hpp:234
constexpr std::array< epu8, 4 > summing_rounds
Permutation Round for partial and horizontal sums.
Definition epu8_impl.hpp:318
int8_t less_partial(epu8 a, epu8 b, int k) noexcept
Partial lexicographic comparison between two HPCombi::epu8.
Definition epu8_impl.hpp:114
uint8_t horiz_max4(epu8) noexcept
Same interface as horiz_max but with a different implementation.
Definition epu8_impl.hpp:384
uint64_t last_diff_mask(epu8 a, epu8 b, size_t bound=16) noexcept
Same interface as last_diff but with a different implementation.
Definition epu8_impl.hpp:106
uint8_t horiz_max3(epu8) noexcept
Same interface as horiz_max but with a different implementation.
Definition epu8_impl.hpp:385
constexpr uint64_t prime
A prime number good for hashing.
Definition epu8.hpp:198
uint8_t horiz_min_gen(epu8) noexcept
Same interface as horiz_min but with a different implementation.
Definition epu8_impl.hpp:416
bool is_partial_permutation(epu8 v, const size_t k=16) noexcept
Test for partial permutations.
Definition epu8_impl.hpp:500
bool is_permutation_sort(epu8 v, const size_t k=16) noexcept
Same interface as is_permutation but with a different implementation.
Definition epu8_impl.hpp:522
uint64_t last_zero(epu8 v, int bnd) noexcept
return the index of the last zero entry or 16 if there are none Only index smaller than bound are tak...
Definition epu8_impl.hpp:124
void merge(epu8 &a, epu8 &b) noexcept
Merge two sorted epu8.
Definition epu8_impl.hpp:241
uint8_t horiz_sum4(epu8) noexcept
Same interface as horiz_sum but with a different implementation.
Definition epu8_impl.hpp:349
epu8 popcount16(epu8 v) noexcept
a vector popcount function
Definition epu8_impl.hpp:481
epu8 partial_sums_round(epu8) noexcept
Same interface as partial_sums but with a different implementation.
Definition epu8_impl.hpp:369
uint8_t horiz_sum3(epu8) noexcept
Same interface as horiz_sum but with a different implementation.
Definition epu8_impl.hpp:350
uint8_t horiz_sum_ref(epu8) noexcept
Same interface as horiz_sum but with a different implementation.
Definition epu8_impl.hpp:340
epu8 partial_sums_gen(epu8) noexcept
Same interface as partial_sums but with a different implementation.
Definition epu8_impl.hpp:365
bool equal(epu8 a, epu8 b) noexcept
Equality of HPCombi::epu8.
Definition epu8.hpp:91
epu8 permutation_of_ref(epu8 a, epu8 b) noexcept
Same interface as permutation_of but with a different implementation.
Definition epu8_impl.hpp:295
epu8 min(epu8 a, epu8 b) noexcept
Vector min between two HPCombi::epu8 0.
Definition epu8.hpp:123
uint64_t first_mask(epu8 msk, size_t bound)
Definition epu8_impl.hpp:69
epu8 partial_max_ref(epu8) noexcept
Same interface as partial_max but with a different implementation.
Definition epu8_impl.hpp:393
epu8 sorted8(epu8 a) noexcept
Return a HPCombi::epu8 with both halves sorted.
Definition epu8_impl.hpp:207
epu8 partial_max_round(epu8) noexcept
Same interface as partial_max but with a different implementation.
Definition epu8_impl.hpp:404
epu8 eval16(epu8 v) noexcept
Evaluation of a HPCombi::epu8: count how many times each int of 0..15 appears in the input.
Definition epu8.hpp:488
epu8 network_sort_perm(epu8 &v, std::array< epu8, sz > rounds)
Apply a sorting network in place and return the permutation.
Definition epu8_impl.hpp:149
constexpr std::array< epu8, 6 > sorting_rounds8
A duplicated 8-way sorting network.
Definition epu8_impl.hpp:191
epu8 eval16_cycle(epu8 v) noexcept
Same interface as eval16 but with a different implementation.
Definition epu8_impl.hpp:464
constexpr std::array< epu8, 9 > sorting_rounds
A 16-way sorting network.
Definition epu8_impl.hpp:170
epu8 eval16_ref(epu8 v) noexcept
Same interface as eval16 but with a different implementation.
Definition epu8_impl.hpp:445
epu8 partial_max_gen(epu8) noexcept
Same interface as partial_max but with a different implementation.
Definition epu8_impl.hpp:400
bool less(epu8 a, epu8 b) noexcept
Lexicographic comparison between two HPCombi::epu8.
Definition epu8_impl.hpp:110
uint8_t horiz_max_ref(epu8) noexcept
Same interface as horiz_max but with a different implementation.
Definition epu8_impl.hpp:375
epu8 sorted(epu8 a) noexcept
Return a sorted HPCombi::epu8.
Definition epu8_impl.hpp:204
uint64_t last_mask(epu8 msk, size_t bound)
Definition epu8_impl.hpp:73
constexpr std::array< epu8, 4 > mining_rounds
Definition epu8_impl.hpp:328
constexpr TPUBuild< epu8 > Epu8
Factory object acting as a class constructor for type HPCombi::epu8.
Definition epu8.hpp:81
bool is_permutation_eval(epu8 v, const size_t k=16) noexcept
Same interface as is_permutation but with a different implementation.
Definition epu8_impl.hpp:526
VectGeneric< TPUBuild< TPU >::size > & as_VectGeneric(TPU &v)
Cast a HPCombi::epu8 to a c++ HPCombi::VectGeneric.
Definition builder.hpp:162
uint64_t first_zero(epu8 v, int bnd) noexcept
return the index of the first zero entry or 16 if there are none Only index smaller than bound are ta...
Definition epu8_impl.hpp:121
epu8 random_epu8(uint16_t bnd)
A random HPCombi::epu8.
Definition epu8_impl.hpp:249
epu8 revsorted(epu8 a) noexcept
Return a reverse sorted HPCombi::epu8.
Definition epu8_impl.hpp:210
epu8 partial_min_round(epu8) noexcept
Same interface as partial_min but with a different implementation.
Definition epu8_impl.hpp:439
bool is_sorted(epu8 a) noexcept
Testing if a HPCombi::epu8 is sorted.
Definition epu8_impl.hpp:201
constexpr std::array< epu8, 3 > inverting_rounds
Definition epu8_impl.hpp:268
uint64_t last_non_zero(epu8 v, int bnd) noexcept
return the index of the last non zero entry or 16 if there are none Only index smaller than bound are...
Definition epu8_impl.hpp:130
uint64_t last_diff(epu8 a, epu8 b, size_t bound=16) noexcept
The last difference between two HPCombi::epu8.
Definition epu8.hpp:576
epu8 eval16_popcount(epu8 v) noexcept
Same interface as eval16 but with a different implementation.
Definition epu8_impl.hpp:472
epu8 sort_perm(epu8 &a) noexcept
Sort this and return the sorting permutation.
Definition epu8_impl.hpp:217
epu8 partial_min_gen(epu8) noexcept
Same interface as partial_min but with a different implementation.
Definition epu8_impl.hpp:435
epu8 partial_min_ref(epu8) noexcept
Same interface as partial_min but with a different implementation.
Definition epu8_impl.hpp:428
uint8_t horiz_max_gen(epu8) noexcept
Same interface as horiz_max but with a different implementation.
Definition epu8_impl.hpp:381
uint8_t __attribute__((vector_size(16))) epu8
epu8 stands for Extended Packed Unsigned, grouped by 8 bits; this is the low level type chosen by Int...
Definition epu8.hpp:73
bool is_transformation(epu8 v, const size_t k=16) noexcept
Test for transformation.
Definition epu8_impl.hpp:494
uint64_t first_diff(epu8 a, epu8 b, size_t bound=16) noexcept
The first difference between two HPCombi::epu8.
Definition epu8.hpp:531
bool is_partial_transformation(epu8 v, const size_t k=16) noexcept
Test for partial transformation.
Definition epu8_impl.hpp:486
epu8 permuted_ref(epu8 a, epu8 b) noexcept
Apply a permutation b on the vector a: for i=0..16 {result[i] = a[b[i]}.
Definition epu8_impl.hpp:60
constexpr std::array< epu8, 6 > merge_rounds
Definition epu8_impl.hpp:227
TPUBuild< TPU >::array & as_array(TPU &v) noexcept
Cast a TPU to a c++ std::array.
Definition builder.hpp:145
uint8_t horiz_sum_gen(epu8) noexcept
Same interface as horiz_sum but with a different implementation.
Definition epu8_impl.hpp:346
uint8_t horiz_min3(epu8) noexcept
Same interface as horiz_min but with a different implementation.
Definition epu8_impl.hpp:420
bool not_equal(epu8 a, epu8 b) noexcept
Non equality of HPCombi::epu8.
Definition epu8.hpp:95
Definition bmat16_impl.hpp:362
std::ostream & operator<<(std::ostream &os, HPCombi::BMat16 const &bm)
Definition bmat16_impl.hpp:365
std::string to_string(HPCombi::epu8 const &a)
Definition epu8_impl.hpp:551
bool operator()(const HPCombi::epu8 &lhs, const HPCombi::epu8 &rhs) const noexcept
Definition epu8_impl.hpp:560
size_t operator()(HPCombi::epu8 a) const noexcept
Definition epu8_impl.hpp:578
size_t operator()(const HPCombi::epu8 &v1, const HPCombi::epu8 &v2) const noexcept
Definition epu8_impl.hpp:599
bool operator()(const HPCombi::epu8 &lhs, const HPCombi::epu8 &rhs) const noexcept
Definition epu8_impl.hpp:569
HPCombi::VectGeneric.