HPCombi
High Performance Combinatorics in C++ using vector instructions v1.0.3
All Classes Namespaces Files Functions Variables Typedefs Friends Macros Pages
bmat16_impl.hpp
Go to the documentation of this file.
1//****************************************************************************//
2// Copyright (C) 2018-2024 Finn Smith <fls3@st-andrews.ac.uk> //
3// Copyright (C) 2018-2024 James Mitchell <jdm3@st-andrews.ac.uk> //
4// Copyright (C) 2018-2024 Florent Hivert <Florent.Hivert@lisn.fr>, //
5// //
6// This file is part of HP-Combi <https://github.com/libsemigroups/HPCombi> //
7// //
8// HP-Combi is free software: you can redistribute it and/or modify it //
9// under the terms of the GNU General Public License as published by the //
10// Free Software Foundation, either version 3 of the License, or //
11// (at your option) any later version. //
12// //
13// HP-Combi is distributed in the hope that it will be useful, but WITHOUT //
14// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or //
15// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License //
16// for more details. //
17// //
18// You should have received a copy of the GNU General Public License along //
19// with HP-Combi. If not, see <https://www.gnu.org/licenses/>. //
20//****************************************************************************//
21
22// This file contains an implementation of fast boolean matrices up to
23// dimension 16 x 16.
24
25// NOLINT(build/header_guard)
26
27namespace HPCombi {
28static_assert(std::is_trivial<BMat16>(), "BMat16 is not a trivial class!");
29
30static constexpr xpu16 line{0x800, 0x901, 0xa02, 0xb03, 0xc04, 0xd05,
31 0xe06, 0xf07, 0x800, 0x901, 0xa02, 0xb03,
32 0xc04, 0xd05, 0xe06, 0xf07};
33static constexpr xpu16 block{0x200, 0x604, 0xa08, 0xe0c, 0x301, 0x705,
34 0xb09, 0xf0d, 0x200, 0x604, 0xa08, 0xe0c,
35 0x301, 0x705, 0xb09, 0xf0d};
36
37inline xpu64 to_line(xpu64 vect) {
38 return simde_mm256_shuffle_epi8(vect, line);
39}
40
41inline xpu64 to_block(xpu64 vect) {
42 return simde_mm256_shuffle_epi8(vect, block);
43}
44
45inline BMat16::BMat16(uint64_t n0, uint64_t n1, uint64_t n2,
46 uint64_t n3) noexcept {
47 xpu64 tmp{n0, n1, n2, n3};
48 _data = to_line(tmp);
49}
50
51inline BMat16::BMat16(std::vector<std::vector<bool>> const &mat) noexcept {
52 HPCOMBI_ASSERT(mat.size() <= 16);
53 HPCOMBI_ASSERT(0 < mat.size());
54 std::array<uint64_t, 4> tmp = {0, 0, 0, 0};
55 for (int i = mat.size() - 1; i >= 0; --i) {
56 HPCOMBI_ASSERT(mat.size() == mat[i].size());
57 tmp[i / 4] <<= 16 - mat.size();
58 for (int j = mat[i].size() - 1; j >= 0; --j) {
59 tmp[i / 4] = (tmp[i / 4] << 1) | mat[i][j];
60 }
61 }
62 _data = xpu64{tmp[0], tmp[1], tmp[2], tmp[3]};
63}
64
65inline bool BMat16::operator()(size_t i, size_t j) const noexcept {
66 return (_data[i / 4] >> (16 * (i % 4) + j)) & 1;
67}
68
69inline void BMat16::set(size_t i, size_t j, bool val) noexcept {
70 HPCOMBI_ASSERT(i < 16);
71 HPCOMBI_ASSERT(j < 16);
72 uint64_t a = 1;
73 a <<= 16 * (i % 4) + j;
74 xpu64 mask{(i / 4 == 0) * a, (i / 4 == 1) * a, (i / 4 == 2) * a,
75 (i / 4 == 3) * a};
76 _data ^= (-val ^ _data) & mask;
77}
78
79inline bool BMat16::operator==(BMat16 const &that) const noexcept {
80 xpu64 tmp = _data ^ that._data;
81 return simde_mm256_testz_si256(tmp, tmp);
82}
83
84inline bool BMat16::operator<(BMat16 const &that) const noexcept {
85 return _data[0] < that._data[0] ||
86 (_data[0] == that._data[0] &&
87 (_data[1] < that._data[1] ||
88 (_data[1] == that._data[1] &&
89 (_data[2] < that._data[2] ||
90 (_data[2] == that._data[2] && (_data[3] < that._data[3]))))));
91}
92
93inline bool BMat16::operator>(BMat16 const &that) const noexcept {
94 return _data[0] > that._data[0] ||
95 (_data[0] == that._data[0] &&
96 (_data[1] > that._data[1] ||
97 (_data[1] == that._data[1] &&
98 (_data[2] > that._data[2] ||
99 (_data[2] == that._data[2] && (_data[3] > that._data[3]))))));
100}
101
102inline std::array<std::array<bool, 16>, 16> BMat16::to_array() const noexcept {
103 xpu64 tmp = to_block(_data);
104 uint64_t a = tmp[0], b = tmp[1], c = tmp[2], d = tmp[3];
105 std::array<std::array<bool, 16>, 16> res;
106 for (size_t i = 0; i < 64; ++i) {
107 res[i / 8][i % 8] = a & 1;
108 a >>= 1;
109 res[i / 8][8 + i % 8] = b & 1;
110 b >>= 1;
111 res[8 + i / 8][i % 8] = c & 1;
112 c >>= 1;
113 res[8 + i / 8][8 + i % 8] = d & 1;
114 d >>= 1;
115 }
116 return res;
117}
118
119inline BMat16 BMat16::transpose_naive() const noexcept {
120 uint64_t a = 0, b = 0, c = 0, d = 0;
121 for (int i = 7; i >= 0; --i) {
122 for (int j = 7; j >= 0; --j) {
123 a = (a << 1) | (*this)(j, i);
124 b = (b << 1) | (*this)(j + 8, i);
125 c = (c << 1) | (*this)(j, i + 8);
126 d = (d << 1) | (*this)(j + 8, i + 8);
127 }
128 }
129 return BMat16(a, b, c, d);
130}
131
132inline BMat16 BMat16::transpose() const noexcept {
133 xpu64 tmp = to_block(_data);
134 xpu64 x = simde_mm256_set_epi64x(tmp[3], tmp[1], tmp[2], tmp[0]);
135 xpu64 y = (x ^ (x >> 7)) & (xpu64{0xAA00AA00AA00AA, 0xAA00AA00AA00AA,
136 0xAA00AA00AA00AA, 0xAA00AA00AA00AA});
137 x = x ^ y ^ (y << 7);
138 y = (x ^ (x >> 14)) &
139 (xpu64{0xCCCC0000CCCC, 0xCCCC0000CCCC, 0xCCCC0000CCCC, 0xCCCC0000CCCC});
140 x = x ^ y ^ (y << 14);
141 y = (x ^ (x >> 28)) &
142 (xpu64{0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0});
143 x = x ^ y ^ (y << 28);
144 return BMat16(to_line(x));
145}
146
147static constexpr xpu16 rot{0x302, 0x504, 0x706, 0x908, 0xb0a, 0xd0c,
148 0xf0e, 0x100, 0x302, 0x504, 0x706, 0x908,
149 0xb0a, 0xd0c, 0xf0e, 0x100};
150
151inline BMat16 BMat16::mult_transpose(BMat16 const &that) const noexcept {
152 xpu16 x = _data;
153 xpu16 y1 = that._data;
154 xpu16 y2 = simde_mm256_set_epi64x(that._data[1], that._data[0],
155 that._data[3], that._data[2]);
156 xpu16 zero = simde_mm256_setzero_si256();
157 xpu16 data = simde_mm256_setzero_si256();
158 xpu16 diag1{0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
159 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000};
160 xpu16 diag2{0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000,
161 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
162 for (size_t i = 0; i < 8; ++i) {
163 data |= ((x & y1) != zero) & diag1;
164 data |= ((x & y2) != zero) & diag2;
165 y1 = simde_mm256_shuffle_epi8(y1, rot);
166 y2 = simde_mm256_shuffle_epi8(y2, rot);
167 diag1 = simde_mm256_shuffle_epi8(diag1, rot);
168 diag2 = simde_mm256_shuffle_epi8(diag2, rot);
169 }
170 return BMat16(data);
171}
172
173inline BMat16 BMat16::mult_4bmat8(BMat16 const &that) const noexcept {
174 BMat16 tmp = that.transpose();
175 xpu64 t1 = to_block(_data), t2 = to_block(tmp._data);
176 BMat8 a1(t1[0]), b1(t1[1]), c1(t1[2]), d1(t1[3]), a2(t2[0]), b2(t2[1]),
177 c2(t2[2]), d2(t2[3]);
178 return BMat16((a1.mult_transpose(a2) | b1.mult_transpose(b2)).to_int(),
179 (a1.mult_transpose(c2) | b1.mult_transpose(d2)).to_int(),
180 (c1.mult_transpose(a2) | d1.mult_transpose(b2)).to_int(),
181 (c1.mult_transpose(c2) | d1.mult_transpose(d2)).to_int());
182}
183
184inline BMat16 BMat16::mult_naive(BMat16 const &that) const noexcept {
185 uint64_t a = 0, b = 0, c = 0, d = 0;
186 for (int i = 7; i >= 0; --i) {
187 for (int j = 7; j >= 0; --j) {
188 a <<= 1;
189 b <<= 1;
190 c <<= 1;
191 d <<= 1;
192 for (size_t k = 0; k < 8; ++k) {
193 a |= ((*this)(i, k) & that(k, j)) |
194 ((*this)(i, k + 8) & that(k + 8, j));
195 b |= ((*this)(i, k) & that(k, j + 8)) |
196 ((*this)(i, k + 8) & that(k + 8, j + 8));
197 c |= ((*this)(i + 8, k) & that(k, j)) |
198 ((*this)(i + 8, k + 8) & that(k + 8, j));
199 d |= ((*this)(i + 8, k) & that(k, j + 8)) |
200 ((*this)(i + 8, k + 8) & that(k + 8, j + 8));
201 }
202 }
203 }
204 return BMat16(a, b, c, d);
205}
206
207inline BMat16 BMat16::mult_naive_array(BMat16 const &that) const noexcept {
208 std::array<std::array<bool, 16>, 16> tab1 = to_array(),
209 tab2 = that.to_array();
210 uint64_t a = 0, b = 0, c = 0, d = 0;
211 for (int i = 7; i >= 0; --i) {
212 for (int j = 7; j >= 0; --j) {
213 a <<= 1;
214 b <<= 1;
215 c <<= 1;
216 d <<= 1;
217 for (size_t k = 0; k < 16; ++k) {
218 a |= tab1[i][k] & tab2[k][j];
219 b |= tab1[i][k] & tab2[k][j + 8];
220 c |= tab1[i + 8][k] & tab2[k][j];
221 d |= tab1[i + 8][k] & tab2[k][j + 8];
222 }
223 }
224 }
225 return BMat16(a, b, c, d);
226}
227
228inline size_t BMat16::nr_rows() const noexcept {
229 size_t res = 0;
230 for (size_t i = 0; i < 16; ++i)
231 if ((_data[i / 4] << (16 * (i % 4)) >> 48) != 0)
232 ++res;
233 return res;
234
237 // xpu16 tmp = _data, zero = simde_mm256_setzero_si256();
238 // xpu16 x = (tmp != zero);
239 // return simde_mm256_popcnt_epi16(x);
240}
241
242inline std::vector<uint16_t> BMat16::rows() const {
243 std::vector<uint16_t> rows;
244 for (size_t i = 0; i < 16; ++i) {
245 uint16_t row_rev = (_data[i / 4] << (16 * (3 - i % 4)) >> 48);
246
247 // The row needs to be reversed
248 uint16_t row = 0;
249 for (size_t j = 0; j < 16; ++j) {
250 row = (row << 1) | (row_rev & 1);
251 row_rev >>= 1;
252 }
253 rows.push_back(row);
254 }
255 return rows;
256}
257
259 static std::random_device _rd;
260 static std::mt19937 _gen(_rd());
261 static std::uniform_int_distribution<uint64_t> _dist(0, 0xffffffffffffffff);
262
263 return BMat16(_dist(_gen), _dist(_gen), _dist(_gen), _dist(_gen));
264}
265
266static const constexpr std::array<xpu64, 16> ROW_MASK16 = {
267 static_cast<xpu64>(
268 xpu16{0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}),
269 static_cast<xpu64>(
270 xpu16{0, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}),
271 static_cast<xpu64>(
272 xpu16{0, 0, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}),
273 static_cast<xpu64>(
274 xpu16{0, 0, 0, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}),
275 static_cast<xpu64>(
276 xpu16{0, 0, 0, 0, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}),
277 static_cast<xpu64>(
278 xpu16{0, 0, 0, 0, 0, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}),
279 static_cast<xpu64>(
280 xpu16{0, 0, 0, 0, 0, 0, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0}),
281 static_cast<xpu64>(
282 xpu16{0, 0, 0, 0, 0, 0, 0, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0}),
283 static_cast<xpu64>(
284 xpu16{0, 0, 0, 0, 0, 0, 0, 0, 0xffff, 0, 0, 0, 0, 0, 0, 0}),
285 static_cast<xpu64>(
286 xpu16{0, 0, 0, 0, 0, 0, 0, 0, 0, 0xffff, 0, 0, 0, 0, 0, 0}),
287 static_cast<xpu64>(
288 xpu16{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xffff, 0, 0, 0, 0, 0}),
289 static_cast<xpu64>(
290 xpu16{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xffff, 0, 0, 0, 0}),
291 static_cast<xpu64>(
292 xpu16{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xffff, 0, 0, 0}),
293 static_cast<xpu64>(
294 xpu16{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xffff, 0, 0}),
295 static_cast<xpu64>(
296 xpu16{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xffff, 0}),
297 static_cast<xpu64>(
298 xpu16{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xffff})};
299
300static const constexpr std::array<xpu64, 16> COL_MASK16 = {
301 static_cast<xpu64>(xpu16{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}),
302 static_cast<xpu64>(xpu16{2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}),
303 static_cast<xpu64>(xpu16{4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4}),
304 static_cast<xpu64>(xpu16{8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}),
305 static_cast<xpu64>(xpu16{0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
306 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10}),
307 static_cast<xpu64>(xpu16{0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
308 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20}),
309 static_cast<xpu64>(xpu16{0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
310 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40}),
311 static_cast<xpu64>(xpu16{0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
312 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}),
313 static_cast<xpu64>(xpu16{0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100,
314 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100,
315 0x100, 0x100}),
316 static_cast<xpu64>(xpu16{0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200,
317 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200,
318 0x200, 0x200}),
319 static_cast<xpu64>(xpu16{0x400, 0x400, 0x400, 0x400, 0x400, 0x400, 0x400,
320 0x400, 0x400, 0x400, 0x400, 0x400, 0x400, 0x400,
321 0x400, 0x400}),
322 static_cast<xpu64>(xpu16{0x800, 0x800, 0x800, 0x800, 0x800, 0x800, 0x800,
323 0x800, 0x800, 0x800, 0x800, 0x800, 0x800, 0x800,
324 0x800, 0x800}),
325 static_cast<xpu64>(xpu16{0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000,
326 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000,
327 0x1000, 0x1000, 0x1000, 0x1000}),
328 static_cast<xpu64>(xpu16{0x2000, 0x2000, 0x2000, 0x2000, 0x2000, 0x2000,
329 0x2000, 0x2000, 0x2000, 0x2000, 0x2000, 0x2000,
330 0x2000, 0x2000, 0x2000, 0x2000}),
331 static_cast<xpu64>(xpu16{0x4000, 0x4000, 0x4000, 0x4000, 0x4000, 0x4000,
332 0x4000, 0x4000, 0x4000, 0x4000, 0x4000, 0x4000,
333 0x4000, 0x4000, 0x4000, 0x4000}),
334 static_cast<xpu64>(xpu16{0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
335 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
336 0x8000, 0x8000, 0x8000, 0x8000})};
337
338inline BMat16 BMat16::random(size_t const dim) {
339 // TO DO : Instead of nulling all the cols/rows one by one, one could do
340 // that at once with the proper mask
341 HPCOMBI_ASSERT(0 < dim && dim <= 16);
342 BMat16 bm = BMat16::random();
343 for (size_t i = dim; i < 16; ++i) {
344 bm._data &= ~ROW_MASK16[i];
345 bm._data &= ~COL_MASK16[i];
346 }
347 return bm;
348}
349
350inline std::ostream &BMat16::write(std::ostream &os) const {
351 for (size_t i = 0; i < 16; ++i) {
352 for (size_t j = 0; j < 16; ++j) {
353 os << (*this)(i, j);
354 }
355 os << "\n";
356 }
357 return os;
358}
359
360} // namespace HPCombi
361
362namespace std {
363
364// Not noexcept because BMat16::write isn't
365inline std::ostream &operator<<(std::ostream &os, HPCombi::BMat16 const &bm) {
366 return bm.write(os);
367}
368
369} // namespace std
Class for fast boolean matrices of dimension up to 16 x 16.
Definition bmat16.hpp:65
std::ostream & write(std::ostream &os) const
Write this on os.
Definition bmat16_impl.hpp:350
size_t nr_rows() const noexcept
Returns the number of non-zero rows of this.
Definition bmat16_impl.hpp:228
bool operator>(BMat16 const &that) const noexcept
Returns true if this is greater than that.
Definition bmat16_impl.hpp:93
BMat16 mult_transpose(BMat16 const &that) const noexcept
Returns the matrix product of this and the transpose of that.
Definition bmat16_impl.hpp:151
std::vector< uint16_t > rows() const
Returns a std::vector for rows of this.
Definition bmat16_impl.hpp:242
bool operator==(BMat16 const &that) const noexcept
Returns true if this equals that.
Definition bmat16_impl.hpp:79
BMat16 mult_4bmat8(BMat16 const &that) const noexcept
Returns the matrix product of this and that.
Definition bmat16_impl.hpp:173
BMat16 mult_naive(BMat16 const &that) const noexcept
Returns the matrix product of this and that.
Definition bmat16_impl.hpp:184
BMat16 mult_naive_array(BMat16 const &that) const noexcept
Returns the matrix product of this and that.
Definition bmat16_impl.hpp:207
BMat16 transpose() const noexcept
Returns the transpose of this.
Definition bmat16_impl.hpp:132
void set(size_t i, size_t j, bool val) noexcept
Sets the (i, j)th position to val.
Definition bmat16_impl.hpp:69
std::array< std::array< bool, 16 >, 16 > to_array() const noexcept
Returns the array representation of this.
Definition bmat16_impl.hpp:102
BMat16 transpose_naive() const noexcept
Returns the transpose of this.
Definition bmat16_impl.hpp:119
bool operator<(BMat16 const &that) const noexcept
Returns true if this is less than that.
Definition bmat16_impl.hpp:84
static BMat16 random()
Returns a random BMat16.
Definition bmat16_impl.hpp:258
BMat16() noexcept=default
A default constructor.
bool operator()(size_t i, size_t j) const noexcept
Returns the entry in the (i, j)th position.
Definition bmat16_impl.hpp:65
Boolean matrices of dimension up to 8×8, stored as a single uint64; isomorph to binary relations with...
Definition bmat8.hpp:55
#define HPCOMBI_ASSERT(x)
Definition debug.hpp:31
const Transf16 a1
Definition image.cpp:52
const Transf16 a2
Definition image.cpp:53
std::array< std::tuple< uint16_t, uint16_t, std::array< uint16_t, gens.size()> >, 65536 > res
Definition image.cpp:66
Definition bmat16.hpp:39
xpu64 to_block(xpu64 vect)
Converting storage type from rows to blocks of a xpu64 representing a 16x16 matrix (used in BMat16).
Definition bmat16_impl.hpp:41
uint64_t __attribute__((vector_size(32))) xpu64
Definition bmat16.hpp:41
uint16_t __attribute__((vector_size(32))) xpu16
Definition bmat16.hpp:40
xpu64 to_line(xpu64 vect)
Converting storage type from blocks to rows of a xpu64 representing a 16x16 matrix (used in BMat16).
Definition bmat16_impl.hpp:37
Definition bmat16_impl.hpp:362
std::ostream & operator<<(std::ostream &os, HPCombi::BMat16 const &bm)
Definition bmat16_impl.hpp:365