HPCombi/epu8__impl_8hpp_source.html

//****************************************************************************//

//     Copyright (C) 2016-2024 Florent Hivert <Florent.Hivert@lisn.fr>,       //

//                                                                            //

//  This file is part of HP-Combi <https://github.com/libsemigroups/HPCombi>  //

//                                                                            //

//  HP-Combi is free software: you can redistribute it and/or modify it       //

//  under the terms of the GNU General Public License as published by the     //

//  Free Software Foundation, either version 3 of the License, or             //

//  (at your option) any later version.                                       //

//                                                                            //

//  HP-Combi is distributed in the hope that it will be useful, but WITHOUT   //

//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or     //

//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License      //

//  for  more details.                                                        //

//                                                                            //

//  You should have received a copy of the GNU General Public License along   //

//  with HP-Combi. If not, see <https://www.gnu.org/licenses/>.               //

//****************************************************************************//


// NOLINT(build/header_guard)


#include <initializer_list>

#include <iostream>

#include <random>

#include <sstream>


#include "vect_generic.hpp"


#ifdef SIMDE_X86_SSE4_2_NATIVE
// Comparison mode for _mm_cmpestri
//
// Each macro ORs SIMDE_SIDD_* flags into the immediate control operand passed
// to the SSE4.2 string-compare intrinsics. They are only defined when
// SIMDE_X86_SSE4_2_NATIVE is defined.

// Unsigned bytes, "equal each" compare, negative polarity.
// Used by first_diff_cmpstr.
#define FIRST_DIFF                                                             \
    (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_EACH |                        \
     SIMDE_SIDD_NEGATIVE_POLARITY)
// Same as FIRST_DIFF but reporting the most significant index.
// Used by last_diff_cmpstr.
#define LAST_DIFF                                                              \
    (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_EACH |                        \
     SIMDE_SIDD_NEGATIVE_POLARITY | SIMDE_SIDD_MOST_SIGNIFICANT)
// Unsigned bytes, "equal any" compare.
#define FIRST_ZERO (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_ANY)
// Same as FIRST_ZERO but reporting the most significant index.
#define LAST_ZERO                                                              \
    (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_ANY |                         \
     SIMDE_SIDD_MOST_SIGNIFICANT)
// Unsigned bytes, "equal any" compare, masked negative polarity.
// Used by is_permutation_cmpestri.
#define FIRST_NON_ZERO                                                         \
    (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_ANY |                         \
     SIMDE_SIDD_MASKED_NEGATIVE_POLARITY)
// Same as FIRST_NON_ZERO but reporting the most significant index.
#define LAST_NON_ZERO                                                          \
    (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_ANY |                         \
     SIMDE_SIDD_MASKED_NEGATIVE_POLARITY | SIMDE_SIDD_MOST_SIGNIFICANT)
#endif


namespace HPCombi {


// Implementation part for inline functions


// Scalar reference implementation of vector permutation: entry i of the
// result is a[b[i] & 0xF], i.e. only the low four bits of each index are used.
inline epu8 permuted_ref(epu8 a, epu8 b) noexcept {
    epu8 out;
    uint64_t pos = 0;
    while (pos < 16) {
        out[pos] = a[b[pos] & 0xF];
        ++pos;
    }
    return out;
}


// Msk is supposed to be a boolean mask (i.e. each entry is either 0 or

// 255)


// Smallest index i < bound whose lane is set in the boolean mask msk (each
// entry expected to be 0 or 255), or 16 when no such lane exists.
inline uint64_t first_mask(epu8 msk, size_t bound) {
    // Keep only the lanes below bound, then collapse them to one bit each.
    const uint64_t hits =
        simde_mm_movemask_epi8(msk & (Epu8.id() < Epu8(bound)));
    if (hits == 0)
        return 16;
    return __builtin_ffsll(hits) - 1;
}


// Largest index i < bound whose lane is set in the boolean mask msk (each
// entry expected to be 0 or 255), or 16 when no such lane exists.
inline uint64_t last_mask(epu8 msk, size_t bound) {
    // Keep only the lanes below bound, then collapse them to one bit each.
    const auto hits = simde_mm_movemask_epi8(msk & (Epu8.id() < Epu8(bound)));
    if (hits == 0)
        return 16;
    // Position of the highest set bit in a 64-bit word.
    return 63 - __builtin_clzll(hits);
}


// Scalar reference: smallest index i < bound with a[i] != b[i], or 16 when
// the two vectors agree on every index below bound.
inline uint64_t first_diff_ref(epu8 a, epu8 b, size_t bound) noexcept {
    size_t pos = 0;
    while (pos < bound && a[pos] == b[pos])
        ++pos;
    return pos < bound ? pos : 16;
}


#ifdef SIMDE_X86_SSE4_2_NATIVE

// first_diff implemented with the SSE4.2 string-compare instruction: both
// operands are given the explicit length bound and compared in FIRST_DIFF mode.
inline uint64_t first_diff_cmpstr(epu8 a, epu8 b, size_t bound) noexcept {
    const int idx = _mm_cmpestri(a, bound, b, bound, FIRST_DIFF);
    return static_cast<unsigned>(idx);
}

#endif


// first_diff implemented through first_mask on the lane-wise "a != b" mask.
inline uint64_t first_diff_mask(epu8 a, epu8 b, size_t bound) noexcept {
    const epu8 differs = (a != b);
    return first_mask(differs, bound);
}


// Scalar reference: largest index i < bound with a[i] != b[i], or 16 when
// the two vectors agree on every index below bound.
inline uint64_t last_diff_ref(epu8 a, epu8 b, size_t bound) noexcept {
    for (size_t remaining = bound; remaining > 0; --remaining) {
        const size_t idx = remaining - 1;
        if (a[idx] != b[idx])
            return idx;
    }
    return 16;
}


#ifdef SIMDE_X86_SSE4_2_NATIVE

// last_diff implemented with the SSE4.2 string-compare instruction: both
// operands are given the explicit length bound and compared in LAST_DIFF mode.
inline uint64_t last_diff_cmpstr(epu8 a, epu8 b, size_t bound) noexcept {
    const int idx = _mm_cmpestri(a, bound, b, bound, LAST_DIFF);
    return static_cast<unsigned>(idx);
}

#endif


// last_diff implemented through last_mask on the lane-wise "a != b" mask.
inline uint64_t last_diff_mask(epu8 a, epu8 b, size_t bound) noexcept {
    const epu8 differs = (a != b);
    return last_mask(differs, bound);
}


// Lexicographic "strictly less" on two vectors: a < b iff they differ
// somewhere and a holds the smaller byte at the first differing index.
inline bool less(epu8 a, epu8 b) noexcept {
    const uint64_t pos = first_diff(a, b);
    if (pos >= 16)
        return false;  // no difference: the vectors are equal
    return a[pos] < b[pos];
}


// Three-way lexicographic comparison of a and b restricted to the indices
// below k.
//
// Returns 0 when a and b agree on every index < k; otherwise a value whose
// sign is that of a[d] - b[d], where d is the first differing index.
//
// The difference is computed on the unsigned byte values in int and then
// saturated to the int8_t range. Casting each byte to int8_t before the
// subtraction (as was done previously) gives the wrong sign as soon as the
// two bytes differ by 128 or more, e.g. 200 vs 10. Whenever the true
// difference fits in int8_t the returned value is unchanged.
inline int8_t less_partial(epu8 a, epu8 b, int k) noexcept {
    const uint64_t diff = first_diff(a, b, k);
    if (diff == 16)
        return 0;
    const int delta = static_cast<int>(a[diff]) - static_cast<int>(b[diff]);
    if (delta > 127)
        return 127;
    if (delta < -128)
        return -128;
    return static_cast<int8_t>(delta);
}


// Index of the first zero entry of v among the indices below bnd, or 16 when
// there is none.
inline uint64_t first_zero(epu8 v, int bnd) noexcept {
    const epu8 is_zero = (v == epu8{});
    return first_mask(is_zero, bnd);
}


// Index of the last zero entry of v among the indices below bnd, or 16 when
// there is none.
inline uint64_t last_zero(epu8 v, int bnd) noexcept {
    const epu8 is_zero = (v == epu8{});
    return last_mask(is_zero, bnd);
}


// Index of the first non-zero entry of v among the indices below bnd, or 16
// when there is none.
inline uint64_t first_non_zero(epu8 v, int bnd) noexcept {
    const epu8 is_set = (v != epu8{});
    return first_mask(is_set, bnd);
}


// Index of the last non-zero entry of v among the indices below bnd, or 16
// when there is none.
inline uint64_t last_non_zero(epu8 v, int bnd) noexcept {
    const epu8 is_set = (v != epu8{});
    return last_mask(is_set, bnd);
}


// Runs a sorting network over res and returns the result.
//
// Each element of rounds is a permutation pairing every position with its
// comparison partner. In each round every lane receives either the min or
// the max of itself and its partner, selected by comparing the partner index
// with the lane's own index; Increasing chooses the direction.
template <bool Increasing = true, size_t sz>
inline epu8 network_sort(epu8 res, std::array<epu8, sz> rounds) {
    for (size_t r = 0; r < sz; ++r) {
        const epu8 round = rounds[r];
        // This conditional should be optimized out by the compiler
        const epu8 mask = Increasing ? round < Epu8.id() : Epu8.id() < round;
        const epu8 partner = permuted(res, round);
        const epu8 lower = min(res, partner);
        const epu8 upper = max(res, partner);
        // A vector ternary (mask ? lower : upper) is not accepted by clang,
        // hence the explicit blend.
        res = simde_mm_blendv_epi8(lower, upper, mask);
    }
    return res;
}


// Runs a sorting network over v in place and returns the permutation that
// was applied: the identity is pushed through the same exchanges as v.
template <bool Increasing = true, size_t sz>
inline epu8 network_sort_perm(epu8 &v, std::array<epu8, sz> rounds) {
    epu8 perm = Epu8.id();
    for (size_t r = 0; r < sz; ++r) {
        const epu8 round = rounds[r];
        // This conditional should be optimized out by the compiler
        const epu8 mask = Increasing ? round < Epu8.id() : Epu8.id() < round;
        const epu8 partner = permuted(v, round);
        // Lanes that must take their partner's value in this round.
        const epu8 exchange =
            simde_mm_blendv_epi8(partner < v, v < partner, mask);
        v = simde_mm_blendv_epi8(v, partner, exchange);
        perm = simde_mm_blendv_epi8(perm, permuted(perm, round), exchange);
    }
    return perm;
}


// Rounds of the 16-entry sorting network used by sorted, revsorted and
// sort_perm. Each row is fed to permuted() by network_sort /
// network_sort_perm: entry i names the position that lane i is compared with
// in that round (a lane pointing at itself is left unchanged).
constexpr std::array<epu8, 9> sorting_rounds
    //     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
    {{epu8{1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14},
      epu8{2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13},
      epu8{4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11},
      epu8{8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7},
      epu8{0, 2, 1, 12, 8, 10, 9, 11, 4, 6, 5, 7, 3, 14, 13, 15},
      epu8{0, 4, 8, 10, 1, 9, 12, 13, 2, 5, 3, 14, 6, 7, 11, 15},
      epu8{0, 1, 4, 5, 2, 3, 8, 9, 6, 7, 12, 13, 10, 11, 14, 15},
      epu8{0, 1, 2, 6, 4, 8, 3, 10, 5, 12, 7, 11, 9, 13, 14, 15},
      epu8{0, 1, 2, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 13, 14, 15}}};


// Rounds of the sorting network used by sorted8, revsorted8 and sort8_perm.
// No row pairs an index below 8 with an index of 8 or more, so the two
// halves of the vector are processed independently.
constexpr std::array<epu8, 6> sorting_rounds8
    // clang-format off
    //     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
{{
    epu8 { 1,  0,  3,  2,  5,  4,  7,  6,  9,  8, 11, 10, 13, 12, 15, 14},
    epu8 { 2,  3,  0,  1,  6,  7,  4,  5, 10, 11,  8,  9, 14, 15, 12, 13},
    epu8 { 0,  2,  1,  3,  4,  6,  5,  7,  8, 10,  9, 11, 12, 14, 13, 15},
    epu8 { 4,  5,  6,  7,  0,  1,  2,  3, 12, 13, 14, 15,  8,  9, 10, 11},
    epu8 { 0,  1,  4,  5,  2,  3,  6,  7,  8,  9, 12, 13, 10, 11, 14, 15},
    epu8 { 0,  2,  1,  4,  3,  6,  5,  7,  8, 10,  9, 12, 11, 14, 13, 15}
}};

// clang-format on


// True when no entry of a is strictly smaller than its predecessor, tested
// by comparing a lane-wise with shifted_right(a).
inline bool is_sorted(epu8 a) noexcept {
    const int descents = simde_mm_movemask_epi8(shifted_right(a) > a);
    return descents == 0;
}


// Returns a copy of a run through the sorting_rounds network in increasing
// mode.
inline epu8 sorted(epu8 a) noexcept {
    const epu8 result = network_sort<true>(a, sorting_rounds);
    return result;
}


// Returns a copy of a run through the sorting_rounds8 network in increasing
// mode.
inline epu8 sorted8(epu8 a) noexcept {
    const epu8 result = network_sort<true>(a, sorting_rounds8);
    return result;
}


// Returns a copy of a run through the sorting_rounds network in decreasing
// mode.
inline epu8 revsorted(epu8 a) noexcept {
    const epu8 result = network_sort<false>(a, sorting_rounds);
    return result;
}


// Returns a copy of a run through the sorting_rounds8 network in decreasing
// mode.
inline epu8 revsorted8(epu8 a) noexcept {
    const epu8 result = network_sort<false>(a, sorting_rounds8);
    return result;
}


// Sorts a in place with the sorting_rounds network and returns the
// permutation computed by network_sort_perm.
inline epu8 sort_perm(epu8 &a) noexcept {
    const epu8 perm = network_sort_perm<true>(a, sorting_rounds);
    return perm;
}


// Sorts a in place with the sorting_rounds8 network and returns the
// permutation computed by network_sort_perm.
inline epu8 sort8_perm(epu8 &a) noexcept {
    const epu8 perm = network_sort_perm<true>(a, sorting_rounds8);
    return perm;
}


// Rounds applied by merge_rev to each of its two operands: partners are at
// distance 8, then 4, then 2, then 1.
//
// NOTE(review): the array is declared with 6 elements but only 4 rows are
// listed, so the last two elements are value-initialized (all-zero) vectors
// that network_sort still iterates over. Presumably the size should be 4 --
// confirm before changing, since the size is part of the constant's type.
constexpr std::array<epu8, 6> merge_rounds
    // clang-format off
    //     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
{{
    epu8 { 8,  9, 10, 11, 12, 13, 14, 15,  0,  1,  2,  3,  4,  5,  6,  7},
    epu8 { 4,  5,  6,  7,  0,  1,  2,  3, 12, 13, 14, 15,  8,  9, 10, 11},
    epu8 { 2,  3,  0,  1,  6,  7,  4,  5, 10, 11,  8,  9, 14, 15, 12, 13},
    epu8 { 1,  0,  3,  2,  5,  4,  7,  6,  9,  8, 11, 10, 13, 12, 15, 14},
}};

// clang-format on


// Replaces a by the lane-wise minimum and b by the lane-wise maximum of the
// two inputs, then runs each of them through the merge_rounds network in
// increasing mode.
inline void merge_rev(epu8 &a, epu8 &b) noexcept {
    const epu8 lower = min(a, b);
    const epu8 upper = max(a, b);
    a = network_sort<true>(lower, merge_rounds);
    b = network_sort<true>(upper, merge_rounds);
}


// Merges a and b in place: a is first permuted by Epu8.rev(), then the pair
// is handed to merge_rev.
inline void merge(epu8 &a, epu8 &b) noexcept {
    const epu8 reversal = Epu8.rev();
    a = permuted(a, reversal);
    merge_rev(a, b);
}


// TODO : AVX2 version.

// TODO : compute merge_rounds on the fly instead of loading those from

// memory


// Returns a vector whose 16 entries are drawn independently and uniformly
// from 0 .. bnd - 1.
//
// The random engine is a function-local static seeded once from
// std::random_device and shared by all calls. bnd must be at least 1:
// std::uniform_int_distribution requires its lower bound not to exceed its
// upper bound.
inline epu8 random_epu8(uint16_t bnd) {
    static std::random_device seed_source;
    static std::default_random_engine engine(seed_source());
    std::uniform_int_distribution<int> pick(0, bnd - 1);

    epu8 out;
    size_t pos = 0;
    while (pos < 16) {
        out[pos] = pick(engine);
        ++pos;
    }
    return out;
}


// Replaces by repl every entry of v that equals the corresponding entry of
// shifted_right(v); the other entries are kept.
inline epu8 remove_dups(epu8 v, uint8_t repl) noexcept {
    const epu8 previous = shifted_right(v);
    const epu8 filler = Epu8(repl);
    // The vector ternary operator is not supported by clang, so the
    // selection "v != previous ? v : filler" is spelled as a blend.
    return simde_mm_blendv_epi8(filler, v, v != previous);
}


// Gather at the front numbers with (3-i)-th bit not set.
//
// Row i is a permutation of 0..15 listing first the eight indices whose bit
// (3-i) is clear and then the eight whose bit is set. The rows are applied
// successively by permutation_of_cmpestrm.
constexpr std::array<epu8, 3> inverting_rounds{{
    // clang-format off
    //     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
    epu8 { 0,  1,  2,  3,  8,  9, 10, 11,  4,  5,  6,  7, 12, 13, 14, 15},
    epu8 { 0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15},
    epu8 { 0,  2,  4,  6,  8, 10, 12, 14,  1,  3,  5,  7,  9, 11, 13, 15}
    // clang-format on
}};


#ifdef SIMDE_X86_SSE4_2_NATIVE

// Control operands for _mm_cmpestrm.
// FIND_IN_VECT: unsigned bytes, "equal any" compare, unit (byte) mask output,
// negative polarity. Used by permutation_of_cmpestrm.
#define FIND_IN_VECT                                                           \
    (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_ANY | SIMDE_SIDD_UNIT_MASK |  \
     SIMDE_SIDD_NEGATIVE_POLARITY)
// FIND_IN_VECT_COMPL: same as FIND_IN_VECT without the negative polarity flag.
#define FIND_IN_VECT_COMPL                                                     \
    (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_ANY | SIMDE_SIDD_UNIT_MASK)


// permutation_of implemented with _mm_cmpestrm.
//
// The result is assembled one bit at a time: a first probe of the first 8
// bytes of a against the 16 bytes of b yields one bit per lane, then for each
// row of inverting_rounds a is re-arranged, the accumulated result is shifted
// left by one and the next probe is subtracted from it.
inline epu8 permutation_of_cmpestrm(epu8 a, epu8 b) noexcept {
    epu8 acc = -static_cast<epu8>(_mm_cmpestrm(a, 8, b, 16, FIND_IN_VECT));
    for (size_t r = 0; r < inverting_rounds.size(); ++r) {
        a = permuted(a, inverting_rounds[r]);
        acc = (acc << 1) -
              static_cast<epu8>(_mm_cmpestrm(a, 8, b, 16, FIND_IN_VECT));
    }
    return acc;
}

#endif


// Scalar reference for permutation_of: entry i of the result is the index of
// the first occurrence of b[i] in a, or 16 when b[i] does not occur in a.
inline epu8 permutation_of_ref(epu8 a, epu8 b) noexcept {
    const auto &source = as_array(a);
    epu8 out{};
    for (size_t pos = 0; pos < 16; ++pos) {
        const auto hit = std::find(source.begin(), source.end(), b[pos]);
        out[pos] = std::distance(source.begin(), hit);
    }
    return out;
}


// Dispatches to the _mm_cmpestrm implementation when SIMDe reports native
// SSE4.2 support, and to the scalar reference implementation otherwise.
inline epu8 permutation_of(epu8 a, epu8 b) noexcept {
#ifndef SIMDE_X86_SSE4_2_NATIVE
    return permutation_of_ref(a, b);
#else
    return permutation_of_cmpestrm(a, b);
#endif
}


// FF is a short-lived helper macro (0xff) used while spelling out the round
// tables that follow. Refuse to compile if something else already defined
// it, so that no foreign definition is silently overridden.
#if defined(FF)
#error FF is defined !
#endif /* FF */
#define FF 0xff


// Rounds used by the partial/horizontal sum and max functions. In round r
// (0-based), lanes in the upper half of each block of 2^(r+1) lanes fetch
// the last lane of the lower half of that block; the other lanes hold FF.
// NOTE(review): presumably permuted() yields 0 for an FF (0xff) index, so
// those lanes are left unchanged by "+=" and max -- confirm against
// permuted() in epu8.hpp.
constexpr std::array<epu8, 4> summing_rounds{{
    // clang-format off
    //      0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
    epu8 { FF,  0, FF,  2, FF,  4, FF,  6, FF,  8, FF, 10, FF, 12, FF, 14},
    epu8 { FF, FF,  1,  1, FF, FF,  5,  5, FF, FF,  9,  9, FF, FF, 13, 13},
    epu8 { FF, FF, FF, FF,  3,  3,  3,  3, FF, FF, FF, FF, 11, 11, 11, 11},
    epu8 { FF, FF, FF, FF, FF, FF, FF, FF,  7,  7,  7,  7,  7,  7,  7,  7}
    // clang-format on
}};


// Rounds used by the partial/horizontal min functions. Same layout as
// summing_rounds, except that lanes which do not fetch a partner point at
// themselves instead of holding FF, so taking the min with the permuted
// vector leaves them unchanged.
constexpr std::array<epu8, 4> mining_rounds{{
    // clang-format off
    //      0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
    epu8 {  0,  0,  2,  2,  4,  4,  6,  6,  8,  8, 10, 10, 12, 12, 14, 14},
    epu8 {  0,  1,  1,  1,  4,  5,  5,  5,  8,  9,  9,  9, 12, 13, 13, 13},
    epu8 {  0,  1,  2,  3,  3,  3,  3,  3,  8,  9, 10, 11, 11, 11, 11, 11},
    epu8 {  0,  1,  2,  3,  4,  5,  6,  7,  7,  7,  7,  7,  7,  7,  7,  7}
    // clang-format on
}};

// The helper macro is no longer needed past the round tables.
#undef FF


// Scalar reference: sum of the 16 entries of v, reduced modulo 256.
inline uint8_t horiz_sum_ref(epu8 v) noexcept {
    unsigned total = 0;
    for (size_t pos = 0; pos < 16; ++pos)
        total += v[pos];
    // Truncating the wide total gives the same value as accumulating in
    // uint8_t with wrap-around.
    return static_cast<uint8_t>(total);
}


// Horizontal sum delegated to the VectGeneric implementation.
inline uint8_t horiz_sum_gen(epu8 v) noexcept {
    auto &generic = as_VectGeneric(v);
    return generic.horiz_sum();
}


// Horizontal sum read off as the last entry of the four-round partial sums.
inline uint8_t horiz_sum4(epu8 v) noexcept {
    const epu8 prefix = partial_sums_round(v);
    return prefix[15];
}


// Horizontal sum using only the first three summing rounds; the two half
// totals then sit in entries 7 and 15 and are added together.
inline uint8_t horiz_sum3(epu8 v) noexcept {
    for (size_t r = 0; r < 3; ++r)
        v += permuted(v, summing_rounds[r]);
    return static_cast<uint8_t>(v[7] + v[15]);
}


// Scalar reference: entry i of the result is v[0] + ... + v[i] modulo 256.
inline epu8 partial_sums_ref(epu8 v) noexcept {
    epu8 out{};
    uint8_t running = v[0];
    out[0] = running;
    for (size_t pos = 1; pos < 16; ++pos) {
        running = running + v[pos];
        out[pos] = running;
    }
    return out;
}


// Partial sums delegated to the in-place VectGeneric implementation, which
// operates on the local copy v through the as_VectGeneric view.
inline epu8 partial_sums_gen(epu8 v) noexcept {
    auto &generic = as_VectGeneric(v);
    generic.partial_sums_inplace();
    return v;
}


// Partial sums obtained by adding, for each row of summing_rounds in turn,
// the vector permuted by that row to itself.
inline epu8 partial_sums_round(epu8 v) noexcept {
    for (size_t r = 0; r < summing_rounds.size(); ++r)
        v += permuted(v, summing_rounds[r]);
    return v;
}


// Scalar reference: largest of the 16 entries of v.
inline uint8_t horiz_max_ref(epu8 v) noexcept {
    uint8_t best = 0;
    for (size_t pos = 0; pos < 16; ++pos) {
        if (v[pos] > best)
            best = v[pos];
    }
    return best;
}


// Horizontal max delegated to the VectGeneric implementation.
inline uint8_t horiz_max_gen(epu8 v) noexcept {
    auto &generic = as_VectGeneric(v);
    return generic.horiz_max();
}


// Horizontal max read off as the last entry of the four-round partial maxima.
inline uint8_t horiz_max4(epu8 v) noexcept {
    const epu8 prefix = partial_max_round(v);
    return prefix[15];
}


// Horizontal max using only the first three summing rounds; the two half
// maxima then sit in entries 7 and 15 and the larger one is returned.
inline uint8_t horiz_max3(epu8 v) noexcept {
    for (size_t r = 0; r < 3; ++r)
        v = max(v, permuted(v, summing_rounds[r]));
    return v[7] < v[15] ? v[15] : v[7];
}


// Scalar reference: entry i of the result is max(v[0], ..., v[i]).
inline epu8 partial_max_ref(epu8 v) noexcept {
    epu8 out;
    uint8_t running = v[0];
    out[0] = running;
    for (size_t pos = 1; pos < 16; ++pos) {
        if (v[pos] > running)
            running = v[pos];
        out[pos] = running;
    }
    return out;
}


// Partial maxima delegated to the in-place VectGeneric implementation, which
// operates on the local copy v through the as_VectGeneric view.
inline epu8 partial_max_gen(epu8 v) noexcept {
    auto &generic = as_VectGeneric(v);
    generic.partial_max_inplace();
    return v;
}


// Partial maxima obtained by taking, for each row of summing_rounds in turn,
// the lane-wise max of the vector and itself permuted by that row.
inline epu8 partial_max_round(epu8 v) noexcept {
    for (size_t r = 0; r < summing_rounds.size(); ++r)
        v = max(v, permuted(v, summing_rounds[r]));
    return v;
}


// Scalar reference: smallest of the 16 entries of v.
inline uint8_t horiz_min_ref(epu8 v) noexcept {
    uint8_t lowest = 255;
    for (size_t pos = 0; pos < 16; ++pos) {
        if (v[pos] < lowest)
            lowest = v[pos];
    }
    return lowest;
}


// Horizontal min delegated to the VectGeneric implementation.
inline uint8_t horiz_min_gen(epu8 v) noexcept {
    auto &generic = as_VectGeneric(v);
    return generic.horiz_min();
}


// Horizontal min read off as the last entry of the four-round partial minima.
inline uint8_t horiz_min4(epu8 v) noexcept {
    const epu8 prefix = partial_min_round(v);
    return prefix[15];
}


// Horizontal min using only the first three mining rounds; the two half
// minima then sit in entries 7 and 15 and the smaller one is returned.
inline uint8_t horiz_min3(epu8 v) noexcept {
    for (size_t r = 0; r < 3; ++r)
        v = min(v, permuted(v, mining_rounds[r]));
    return v[15] < v[7] ? v[15] : v[7];
}


// Scalar reference: entry i of the result is min(v[0], ..., v[i]).
inline epu8 partial_min_ref(epu8 v) noexcept {
    epu8 out;
    uint8_t running = v[0];
    out[0] = running;
    for (size_t pos = 1; pos < 16; ++pos) {
        if (v[pos] < running)
            running = v[pos];
        out[pos] = running;
    }
    return out;
}


// Partial minima delegated to the in-place VectGeneric implementation, which
// operates on the local copy v through the as_VectGeneric view.
inline epu8 partial_min_gen(epu8 v) noexcept {
    auto &generic = as_VectGeneric(v);
    generic.partial_min_inplace();
    return v;
}


// Partial minima obtained by taking, for each row of mining_rounds in turn,
// the lane-wise min of the vector and itself permuted by that row.
inline epu8 partial_min_round(epu8 v) noexcept {
    for (size_t r = 0; r < mining_rounds.size(); ++r)
        v = min(v, permuted(v, mining_rounds[r]));
    return v;
}


// Scalar reference: entry x of the result counts how many entries of v are
// equal to x, for x in 0..15. Entries of v that are 16 or more are ignored.
inline epu8 eval16_ref(epu8 v) noexcept {
    epu8 counts{};
    for (size_t pos = 0; pos < 16; ++pos) {
        const uint8_t value = v[pos];
        if (value >= 16)
            continue;
        counts[value] += 1;
    }
    return counts;
}


// Same counting as eval16_ref, but accumulating into a plain array that is
// converted to an epu8 at the end.
inline epu8 eval16_arr(epu8 v8) noexcept {
    decltype(Epu8)::array counts{};
    const auto &source = as_array(v8);
    for (size_t pos = 0; pos < 16; ++pos) {
        const auto value = source[pos];
        if (value >= 16)
            continue;
        counts[value] += 1;
    }
    return Epu8(counts);
}


// Evaluation delegated to VectGeneric::eval(); the resulting array member v
// is converted back to an epu8.
inline epu8 eval16_gen(epu8 v) noexcept {
    auto &generic = as_VectGeneric(v);
    return Epu8(generic.eval().v);
}


// Evaluation by rotation: v is compared lane-wise with the identity, then
// permuted by Epu8.left_cycle() and compared again, 16 comparisons in total.
// A matching lane compares to all-ones (-1), so subtracting the comparison
// result adds one to that lane's count.
inline epu8 eval16_cycle(epu8 v) noexcept {
    epu8 counts = -(Epu8.id() == v);
    for (int remaining = 15; remaining > 0; --remaining) {
        v = permuted(v, Epu8.left_cycle());
        counts -= (Epu8.id() == v);
    }
    return counts;
}


// Evaluation by population count: for each value x in 0..15, v is compared
// with the constant vector x and the set bits of the resulting lane mask are
// counted.
inline epu8 eval16_popcount(epu8 v) noexcept {
    epu8 counts{};
    for (uint8_t value = 0; value < 16; ++value) {
        const int hits = simde_mm_movemask_epi8(v == Epu8(value));
        counts[value] = __builtin_popcountl(hits);
    }
    return counts;
}


// Lane-wise bit count: the low and the high nibble of every byte are looked
// up separately in the Epu8.popcount() table and the two results are added.
inline epu8 popcount16(epu8 v) noexcept {
    const epu8 table = Epu8.popcount();
    const epu8 low_nibbles = v & Epu8(0x0f);
    const epu8 high_nibbles = v >> 4;
    return permuted(table, low_nibbles) + permuted(table, high_nibbles);
}


// Tests whether v is a partial transformation acting on the first k points.
//
// Two conditions must hold:
//   - every entry x satisfies x + 1 <= 16 in 8-bit arithmetic, which accepts
//     0..15 and also 0xFF (since 0xFF + 1 wraps to 0);
//   - v equals the identity, or the last index where it differs from the
//     identity is below k.
inline bool is_partial_transformation(epu8 v, const size_t k) noexcept {
    const bool entries_ok =
        simde_mm_movemask_epi8(v + Epu8(1) <= Epu8(0x10)) == 0xffff;
    if (!entries_ok)
        return false;
    const uint64_t diff = last_diff(v, Epu8.id(), 16);
    return diff == 16 || diff < k;
}


// Tests whether v is a transformation acting on the first k points: every
// entry is below 16, and v equals the identity or the last index where it
// differs from the identity is below k.
inline bool is_transformation(epu8 v, const size_t k) noexcept {
    const bool entries_ok = simde_mm_movemask_epi8(v < Epu8(0x10)) == 0xffff;
    if (!entries_ok)
        return false;
    const uint64_t diff = last_diff(v, Epu8.id(), 16);
    return diff == 16 || diff < k;
}


// Tests whether v is a partial permutation acting on the first k points.
//
// Three conditions must hold:
//   - every entry x satisfies x + 1 <= 16 in 8-bit arithmetic (0..15 or 0xFF);
//   - every value counted by eval16 occurs at most once in v;
//   - v equals the identity, or the last index where it differs from the
//     identity is below k.
inline bool is_partial_permutation(epu8 v, const size_t k) noexcept {
    const bool entries_ok =
        simde_mm_movemask_epi8(v + Epu8(1) <= Epu8(0x10)) == 0xffff;
    if (!entries_ok)
        return false;
    const bool no_repeat =
        simde_mm_movemask_epi8(eval16(v) <= Epu8(1)) == 0xffff;
    if (!no_repeat)
        return false;
    const uint64_t diff = last_diff(v, Epu8.id(), 16);
    return diff == 16 || diff < k;
}


#ifdef SIMDE_X86_SSE4_2_NATIVE

// is_permutation implemented with _mm_cmpestri.
//
// Three conditions must hold:
//   - every entry of v occurs in the identity vector;
//   - every entry of the identity vector occurs in v;
//   - v equals the identity, or the last index where it differs from the
//     identity is below k.
inline bool is_permutation_cmpestri(epu8 v, const size_t k) noexcept {
    if (_mm_cmpestri(Epu8.id(), 16, v, 16, FIRST_NON_ZERO) != 16)
        return false;
    if (_mm_cmpestri(v, 16, Epu8.id(), 16, FIRST_NON_ZERO) != 16)
        return false;
    const uint64_t diff = last_diff(v, Epu8.id(), 16);
    return diff == 16 || diff < k;
}

#endif


// is_permutation implemented by sorting: v is accepted when sorted(v) is the
// identity and v equals the identity or last differs from it below index k.
inline bool is_permutation_sort(epu8 v, const size_t k) noexcept {
    if (!equal(sorted(v), Epu8.id()))
        return false;
    const uint64_t diff = last_diff(v, Epu8.id(), 16);
    return diff == 16 || diff < k;
}


// is_permutation implemented by counting: v is accepted when eval16(v)
// equals the vector Epu8({}, 1) and v equals the identity or last differs
// from it below index k.
inline bool is_permutation_eval(epu8 v, const size_t k) noexcept {
    if (!equal(eval16(v), Epu8({}, 1)))
        return false;
    const uint64_t diff = last_diff(v, Epu8.id(), 16);
    return diff == 16 || diff < k;
}


// Dispatches to the _mm_cmpestri implementation when SIMDe reports native
// SSE4.2 support, and to the sorting-based implementation otherwise.
inline bool is_permutation(epu8 v, const size_t k) noexcept {
#ifndef SIMDE_X86_SSE4_2_NATIVE
    return is_permutation_sort(v, k);
#else
    return is_permutation_cmpestri(v, k);
#endif
}


}  // namespace HPCombi


namespace std {


// Writes a as "{ 0, 1,...}": the 16 entries printed as unsigned numbers of
// width 2, separated by commas and enclosed in braces.
inline std::ostream &operator<<(std::ostream &stream, HPCombi::epu8 const &a) {
    for (unsigned pos = 0; pos < 16; ++pos) {
        stream << (pos == 0 ? "{" : ",") << std::setw(2)
               << static_cast<unsigned>(a[pos]);
    }
    return stream << "}";
}


// Returns the text produced by streaming a through operator<<.
inline std::string to_string(HPCombi::epu8 const &a) {
    std::ostringstream buffer;
    buffer << a;
    std::string text = buffer.str();
    return text;
}


// Equality functor for HPCombi::epu8, forwarding to HPCombi::equal.
template <> struct equal_to<HPCombi::epu8> {
    bool operator()(const HPCombi::epu8 &lhs,
                    const HPCombi::epu8 &rhs) const noexcept {
        const bool same = HPCombi::equal(lhs, rhs);
        return same;
    }
};


// Inequality functor for HPCombi::epu8, forwarding to HPCombi::not_equal.
template <> struct not_equal_to<HPCombi::epu8> {
    bool operator()(const HPCombi::epu8 &lhs,
                    const HPCombi::epu8 &rhs) const noexcept {
        const bool differs = HPCombi::not_equal(lhs, rhs);
        return differs;
    }
};


// Hash functor for HPCombi::epu8.
//
// The two 64-bit halves of the vector are combined in 128-bit arithmetic as
// (high * prime + low) * prime and the upper 64 bits of that product are
// returned.
template <> struct hash<HPCombi::epu8> {
    inline size_t operator()(HPCombi::epu8 a) const noexcept {
        unsigned __int128 low = simde_mm_extract_epi64(a, 0);
        unsigned __int128 high = simde_mm_extract_epi64(a, 1);
        unsigned __int128 mixed = high * HPCombi::prime + low;
        mixed *= HPCombi::prime;
        return mixed >> 64;

        /* A boost-style combination was tried and found extremely slow on
           the Renner benchmark:
           uint64_t v0 = simde_mm_extract_epi64(ar.v, 0);
           uint64_t v1 = simde_mm_extract_epi64(ar.v, 1);
           size_t seed = v0 + 0x9e3779b9;
           seed ^= v1 + 0x9e3779b9 + (seed<<6) + (seed>>2);
           return seed;
        */
    }
};


// Ordering functor for HPCombi::epu8, meant for ordered containers such as
// std::set.
template <> struct less<HPCombi::epu8> {
    // WARNING: due to endianness this is not lexicographic comparison,
    //          but we don't care when using in std::set.
    //
    // The vector is compared as two signed 64-bit integers, low half first.
    // The previous code reinterpreted the operands as simde__m128 (four
    // floats) and compared lanes 0 and 1 only: bytes 8..15 never took part,
    // so vectors differing only there were treated as equivalent, and a
    // lane holding a NaN bit pattern broke the strict weak ordering that
    // ordered containers require.
    //
    // The size_t return type is kept for compatibility with existing users;
    // the value is always 0 or 1.
    // NOTE(review): the original comment claimed this was 10% faster than
    // the lexicographic comparison operator; not re-measured for this form.
    inline size_t operator()(const HPCombi::epu8 &v1,
                             const HPCombi::epu8 &v2) const noexcept {
        const int64_t low1 = simde_mm_extract_epi64(v1, 0);
        const int64_t low2 = simde_mm_extract_epi64(v2, 0);
        if (low1 != low2)
            return low1 < low2;
        const int64_t high1 = simde_mm_extract_epi64(v1, 1);
        const int64_t high2 = simde_mm_extract_epi64(v2, 1);
        return high1 < high2;
    }
};


}  // namespace std

epu8
uint8_t __attribute__((vector_size(16))) epu8
epu8 stands for Extended Packed Unsigned, grouped by 8 bits; this is the low level type chosen by Int...
Definition epu8.hpp:73

FF
#define FF
Definition bmat8_impl.hpp:297

res
std::array< std::tuple< uint16_t, uint16_t, std::array< uint16_t, gens.size()> >, 65536 > res
Definition image.cpp:66

HPCombi
Definition bmat16.hpp:39

HPCombi::horiz_min4
uint8_t horiz_min4(epu8) noexcept
Same interface as horiz_min but with a different implementation.
Definition epu8_impl.hpp:419

HPCombi::max
epu8 max(epu8 a, epu8 b) noexcept
Vector max between two HPCombi::epu8.
Definition epu8.hpp:125

HPCombi::last_diff_ref
uint64_t last_diff_ref(epu8 a, epu8 b, size_t bound=16) noexcept
Same interface as last_diff but with a different implementation.
Definition epu8_impl.hpp:93

HPCombi::first_non_zero
uint64_t first_non_zero(epu8 v, int bnd) noexcept
return the index of the first non zero entry or 16 if there are none Only index smaller than bound ar...
Definition epu8_impl.hpp:127

HPCombi::horiz_min_ref
uint8_t horiz_min_ref(epu8) noexcept
Same interface as horiz_min but with a different implementation.
Definition epu8_impl.hpp:410

HPCombi::eval16_arr
epu8 eval16_arr(epu8 v) noexcept
Same interface as eval16 but with a different implementation.
Definition epu8_impl.hpp:453

HPCombi::permuted
epu8 permuted(epu8 a, epu8 b) noexcept
Same as permuted_ref but with an optimized implementation using intrinsics.
Definition epu8.hpp:103

HPCombi::sort8_perm
epu8 sort8_perm(epu8 &a) noexcept
Sort this and return the sorting permutation.
Definition epu8_impl.hpp:220

HPCombi::shifted_right
epu8 shifted_right(epu8 a) noexcept
Left shifted of a HPCombi::epu8 inserting a 0.
Definition epu8.hpp:110

HPCombi::first_diff_ref
uint64_t first_diff_ref(epu8 a, epu8 b, size_t bound=16) noexcept
Same interface as first_diff but with a different implementation.
Definition epu8_impl.hpp:78

HPCombi::first_diff_mask
uint64_t first_diff_mask(epu8 a, epu8 b, size_t bound=16) noexcept
Same interface as first_diff but with a different implementation.
Definition epu8_impl.hpp:89

HPCombi::partial_sums_ref
epu8 partial_sums_ref(epu8) noexcept
Same interface as partial_sums but with a different implementation.
Definition epu8_impl.hpp:358

HPCombi::eval16_gen
epu8 eval16_gen(epu8 v) noexcept
Definition epu8_impl.hpp:461

HPCombi::network_sort
epu8 network_sort(epu8 res, std::array< epu8, sz > rounds)
Apply a sorting network.
Definition epu8_impl.hpp:136

HPCombi::remove_dups
epu8 remove_dups(epu8 a, uint8_t repl=0) noexcept
Remove duplicates in a sorted HPCombi::epu8.
Definition epu8_impl.hpp:261

HPCombi::revsorted8
epu8 revsorted8(epu8 a) noexcept
Return a HPCombi::epu8 with both halves reverse sorted.
Definition epu8_impl.hpp:213

HPCombi::permutation_of
epu8 permutation_of(epu8 a, epu8 b) noexcept
Find if a vector is a permutation of another one.
Definition epu8_impl.hpp:304

HPCombi::is_permutation
bool is_permutation(epu8 v, const size_t k=16) noexcept
Definition epu8_impl.hpp:531

HPCombi::merge_rev
void merge_rev(epu8 &a, epu8 &b) noexcept
Definition epu8_impl.hpp:234

HPCombi::summing_rounds
constexpr std::array< epu8, 4 > summing_rounds
Permutation Round for partial and horizontal sums.
Definition epu8_impl.hpp:318

HPCombi::less_partial
int8_t less_partial(epu8 a, epu8 b, int k) noexcept
Partial lexicographic comparison between two HPCombi::epu8.
Definition epu8_impl.hpp:114

HPCombi::horiz_max4
uint8_t horiz_max4(epu8) noexcept
Same interface as horiz_max but with a different implementation.
Definition epu8_impl.hpp:384

HPCombi::last_diff_mask
uint64_t last_diff_mask(epu8 a, epu8 b, size_t bound=16) noexcept
Same interface as last_diff but with a different implementation.
Definition epu8_impl.hpp:106

HPCombi::horiz_max3
uint8_t horiz_max3(epu8) noexcept
Same interface as horiz_max but with a different implementation.
Definition epu8_impl.hpp:385

HPCombi::prime
constexpr uint64_t prime
A prime number good for hashing.
Definition epu8.hpp:198

HPCombi::horiz_min_gen
uint8_t horiz_min_gen(epu8) noexcept
Same interface as horiz_min but with a different implementation.
Definition epu8_impl.hpp:416

HPCombi::is_partial_permutation
bool is_partial_permutation(epu8 v, const size_t k=16) noexcept
Test for partial permutations.
Definition epu8_impl.hpp:500

HPCombi::is_permutation_sort
bool is_permutation_sort(epu8 v, const size_t k=16) noexcept
Same interface as is_permutation but with a different implementation.
Definition epu8_impl.hpp:522

HPCombi::last_zero
uint64_t last_zero(epu8 v, int bnd) noexcept
return the index of the last zero entry or 16 if there are none Only index smaller than bound are tak...
Definition epu8_impl.hpp:124

HPCombi::merge
void merge(epu8 &a, epu8 &b) noexcept
Merge two sorted epu8.
Definition epu8_impl.hpp:241

HPCombi::horiz_sum4
uint8_t horiz_sum4(epu8) noexcept
Same interface as horiz_sum but with a different implementation.
Definition epu8_impl.hpp:349

HPCombi::popcount16
epu8 popcount16(epu8 v) noexcept
a vector popcount function
Definition epu8_impl.hpp:481

HPCombi::partial_sums_round
epu8 partial_sums_round(epu8) noexcept
Same interface as partial_sums but with a different implementation.
Definition epu8_impl.hpp:369

HPCombi::horiz_sum3
uint8_t horiz_sum3(epu8) noexcept
Same interface as horiz_sum but with a different implementation.
Definition epu8_impl.hpp:350

HPCombi::horiz_sum_ref
uint8_t horiz_sum_ref(epu8) noexcept
Same interface as horiz_sum but with a different implementation.
Definition epu8_impl.hpp:340

HPCombi::partial_sums_gen
epu8 partial_sums_gen(epu8) noexcept
Same interface as partial_sums but with a different implementation.
Definition epu8_impl.hpp:365

HPCombi::equal
bool equal(epu8 a, epu8 b) noexcept
Equality of HPCombi::epu8.
Definition epu8.hpp:91

HPCombi::permutation_of_ref
epu8 permutation_of_ref(epu8 a, epu8 b) noexcept
Same interface as permutation_of but with a different implementation.
Definition epu8_impl.hpp:295

HPCombi::min
epu8 min(epu8 a, epu8 b) noexcept
Vector min between two HPCombi::epu8.
Definition epu8.hpp:123

HPCombi::first_mask
uint64_t first_mask(epu8 msk, size_t bound)
Definition epu8_impl.hpp:69

HPCombi::partial_max_ref
epu8 partial_max_ref(epu8) noexcept
Same interface as partial_max but with a different implementation.
Definition epu8_impl.hpp:393

HPCombi::sorted8
epu8 sorted8(epu8 a) noexcept
Return a HPCombi::epu8 with both halves sorted.
Definition epu8_impl.hpp:207

HPCombi::partial_max_round
epu8 partial_max_round(epu8) noexcept
Same interface as partial_max but with a different implementation.
Definition epu8_impl.hpp:404

HPCombi::eval16
epu8 eval16(epu8 v) noexcept
Evaluation of a HPCombi::epu8: count how many times each int of 0..15 appears in the input.
Definition epu8.hpp:488

HPCombi::network_sort_perm
epu8 network_sort_perm(epu8 &v, std::array< epu8, sz > rounds)
Apply a sorting network in place and return the permutation.
Definition epu8_impl.hpp:149

HPCombi::sorting_rounds8
constexpr std::array< epu8, 6 > sorting_rounds8
A duplicated 8-way sorting network.
Definition epu8_impl.hpp:191

HPCombi::eval16_cycle
epu8 eval16_cycle(epu8 v) noexcept
Same interface as eval16 but with a different implementation.
Definition epu8_impl.hpp:464

HPCombi::sorting_rounds
constexpr std::array< epu8, 9 > sorting_rounds
A 16-way sorting network.
Definition epu8_impl.hpp:170

HPCombi::eval16_ref
epu8 eval16_ref(epu8 v) noexcept
Same interface as eval16 but with a different implementation.
Definition epu8_impl.hpp:445

HPCombi::partial_max_gen
epu8 partial_max_gen(epu8) noexcept
Same interface as partial_max but with a different implementation.
Definition epu8_impl.hpp:400

HPCombi::less
bool less(epu8 a, epu8 b) noexcept
Lexicographic comparison between two HPCombi::epu8.
Definition epu8_impl.hpp:110

HPCombi::horiz_max_ref
uint8_t horiz_max_ref(epu8) noexcept
Same interface as horiz_max but with a different implementation.
Definition epu8_impl.hpp:375

HPCombi::sorted
epu8 sorted(epu8 a) noexcept
Return a sorted HPCombi::epu8.
Definition epu8_impl.hpp:204

HPCombi::last_mask
uint64_t last_mask(epu8 msk, size_t bound)
Definition epu8_impl.hpp:73

HPCombi::mining_rounds
constexpr std::array< epu8, 4 > mining_rounds
Definition epu8_impl.hpp:328

HPCombi::Epu8
constexpr TPUBuild< epu8 > Epu8
Factory object acting as a class constructor for type HPCombi::epu8.
Definition epu8.hpp:81

HPCombi::is_permutation_eval
bool is_permutation_eval(epu8 v, const size_t k=16) noexcept
Same interface as is_permutation but with a different implementation.
Definition epu8_impl.hpp:526

HPCombi::as_VectGeneric
VectGeneric< TPUBuild< TPU >::size > & as_VectGeneric(TPU &v)
Cast a HPCombi::epu8 to a c++ HPCombi::VectGeneric.
Definition builder.hpp:162

HPCombi::first_zero
uint64_t first_zero(epu8 v, int bnd) noexcept
return the index of the first zero entry or 16 if there are none Only index smaller than bound are ta...
Definition epu8_impl.hpp:121

HPCombi::random_epu8
epu8 random_epu8(uint16_t bnd)
A random HPCombi::epu8.
Definition epu8_impl.hpp:249

HPCombi::revsorted
epu8 revsorted(epu8 a) noexcept
Return a reverse sorted HPCombi::epu8.
Definition epu8_impl.hpp:210

HPCombi::partial_min_round
epu8 partial_min_round(epu8) noexcept
Same interface as partial_min but with a different implementation.
Definition epu8_impl.hpp:439

HPCombi::is_sorted
bool is_sorted(epu8 a) noexcept
Testing if a HPCombi::epu8 is sorted.
Definition epu8_impl.hpp:201

HPCombi::inverting_rounds
constexpr std::array< epu8, 3 > inverting_rounds
Definition epu8_impl.hpp:268

HPCombi::last_non_zero
uint64_t last_non_zero(epu8 v, int bnd) noexcept
return the index of the last non zero entry or 16 if there are none Only index smaller than bound are...
Definition epu8_impl.hpp:130

HPCombi::last_diff
uint64_t last_diff(epu8 a, epu8 b, size_t bound=16) noexcept
The last difference between two HPCombi::epu8.
Definition epu8.hpp:576

HPCombi::eval16_popcount
epu8 eval16_popcount(epu8 v) noexcept
Same interface as eval16 but with a different implementation.
Definition epu8_impl.hpp:472

HPCombi::sort_perm
epu8 sort_perm(epu8 &a) noexcept
Sort this and return the sorting permutation.
Definition epu8_impl.hpp:217

HPCombi::partial_min_gen
epu8 partial_min_gen(epu8) noexcept
Same interface as partial_min but with a different implementation.
Definition epu8_impl.hpp:435

HPCombi::partial_min_ref
epu8 partial_min_ref(epu8) noexcept
Same interface as partial_min but with a different implementation.
Definition epu8_impl.hpp:428

HPCombi::horiz_max_gen
uint8_t horiz_max_gen(epu8) noexcept
Same interface as horiz_max but with a different implementation.
Definition epu8_impl.hpp:381

HPCombi::epu8
uint8_t __attribute__((vector_size(16))) epu8
epu8 stands for Extended Packed Unsigned, grouped by 8 bits; this is the low level type chosen by Int...
Definition epu8.hpp:73

HPCombi::is_transformation
bool is_transformation(epu8 v, const size_t k=16) noexcept
Test for transformation.
Definition epu8_impl.hpp:494

HPCombi::first_diff
uint64_t first_diff(epu8 a, epu8 b, size_t bound=16) noexcept
The first difference between two HPCombi::epu8.
Definition epu8.hpp:531

HPCombi::is_partial_transformation
bool is_partial_transformation(epu8 v, const size_t k=16) noexcept
Test for partial transformation.
Definition epu8_impl.hpp:486

HPCombi::permuted_ref
epu8 permuted_ref(epu8 a, epu8 b) noexcept
Apply a permutation b on the vector a: for i = 0..15 { result[i] = a[b[i]] }.
Definition epu8_impl.hpp:60

HPCombi::merge_rounds
constexpr std::array< epu8, 6 > merge_rounds
Definition epu8_impl.hpp:227

HPCombi::as_array
TPUBuild< TPU >::array & as_array(TPU &v) noexcept
Cast a TPU to a c++ std::array.
Definition builder.hpp:145

HPCombi::horiz_sum_gen
uint8_t horiz_sum_gen(epu8) noexcept
Same interface as horiz_sum but with a different implementation.
Definition epu8_impl.hpp:346

HPCombi::horiz_min3
uint8_t horiz_min3(epu8) noexcept
Same interface as horiz_min but with a different implementation.
Definition epu8_impl.hpp:420

HPCombi::not_equal
bool not_equal(epu8 a, epu8 b) noexcept
Non equality of HPCombi::epu8.
Definition epu8.hpp:95

std
Definition bmat16_impl.hpp:362

std::operator<<
std::ostream & operator<<(std::ostream &os, HPCombi::BMat16 const &bm)
Definition bmat16_impl.hpp:365

std::to_string
std::string to_string(HPCombi::epu8 const &a)
Definition epu8_impl.hpp:551

std::equal_to< HPCombi::epu8 >::operator()
bool operator()(const HPCombi::epu8 &lhs, const HPCombi::epu8 &rhs) const noexcept
Definition epu8_impl.hpp:560

std::hash< HPCombi::epu8 >::operator()
size_t operator()(HPCombi::epu8 a) const noexcept
Definition epu8_impl.hpp:578

std::less< HPCombi::epu8 >::operator()
size_t operator()(const HPCombi::epu8 &v1, const HPCombi::epu8 &v2) const noexcept
Definition epu8_impl.hpp:599

std::not_equal_to< HPCombi::epu8 >::operator()
bool operator()(const HPCombi::epu8 &lhs, const HPCombi::epu8 &rhs) const noexcept
Definition epu8_impl.hpp:569

vect_generic.hpp
HPCombi::VectGeneric.