//-----------------------------------------------------------------------------
// Copyright (C) 2016, 2017 by piwi
//
// This code is licensed to you under the terms of the GNU GPL, version 2 or,
// at your option, any later version. See the LICENSE.txt file for the text of
// the license.
//-----------------------------------------------------------------------------
// Implements a card only attack based on crypto text (encrypted nonces
// received during a nested authentication) only. Unlike other card only
// attacks this doesn't rely on implementation errors but only on the
// inherent weaknesses of the crypto1 cypher. Described in
//   Carlo Meijer, Roel Verdult, "Ciphertext-only Cryptanalysis on Hardened
//   Mifare Classic Cards" in Proceedings of the 22nd ACM SIGSAC Conference on
//   Computer and Communications Security, 2015
//-----------------------------------------------------------------------------
// some helper functions which can benefit from SIMD instructions or other special instructions
19 #include "hardnested_bitarray_core.h"
20 #include "hardnested_bf_core.h"
// this needs to be compiled several times for each instruction set.
// For each instruction set, define a dedicated function name:
//
// Exactly one branch is taken per compilation, chosen from the compiler's
// predefined target macros, so every generic NAME below expands to a
// name_<ISA> symbol. The final #else supplies the plain-C (NOSIMD) names.
#if defined (__AVX512F__)
#define MALLOC_BITARRAY malloc_bitarray_AVX512
#define FREE_BITARRAY free_bitarray_AVX512
#define BITCOUNT bitcount_AVX512
#define COUNT_STATES count_states_AVX512
#define BITARRAY_AND bitarray_AND_AVX512
#define BITARRAY_LOW20_AND bitarray_low20_AND_AVX512
#define COUNT_BITARRAY_AND count_bitarray_AND_AVX512
#define COUNT_BITARRAY_LOW20_AND count_bitarray_low20_AND_AVX512
#define BITARRAY_AND4 bitarray_AND4_AVX512
#define BITARRAY_OR bitarray_OR_AVX512
#define COUNT_BITARRAY_AND2 count_bitarray_AND2_AVX512
#define COUNT_BITARRAY_AND3 count_bitarray_AND3_AVX512
#define COUNT_BITARRAY_AND4 count_bitarray_AND4_AVX512
#elif defined (__AVX2__)
#define MALLOC_BITARRAY malloc_bitarray_AVX2
#define FREE_BITARRAY free_bitarray_AVX2
#define BITCOUNT bitcount_AVX2
#define COUNT_STATES count_states_AVX2
#define BITARRAY_AND bitarray_AND_AVX2
#define BITARRAY_LOW20_AND bitarray_low20_AND_AVX2
#define COUNT_BITARRAY_AND count_bitarray_AND_AVX2
#define COUNT_BITARRAY_LOW20_AND count_bitarray_low20_AND_AVX2
#define BITARRAY_AND4 bitarray_AND4_AVX2
#define BITARRAY_OR bitarray_OR_AVX2
#define COUNT_BITARRAY_AND2 count_bitarray_AND2_AVX2
#define COUNT_BITARRAY_AND3 count_bitarray_AND3_AVX2
#define COUNT_BITARRAY_AND4 count_bitarray_AND4_AVX2
#elif defined (__AVX__)
#define MALLOC_BITARRAY malloc_bitarray_AVX
#define FREE_BITARRAY free_bitarray_AVX
#define BITCOUNT bitcount_AVX
#define COUNT_STATES count_states_AVX
#define BITARRAY_AND bitarray_AND_AVX
#define BITARRAY_LOW20_AND bitarray_low20_AND_AVX
#define COUNT_BITARRAY_AND count_bitarray_AND_AVX
#define COUNT_BITARRAY_LOW20_AND count_bitarray_low20_AND_AVX
#define BITARRAY_AND4 bitarray_AND4_AVX
#define BITARRAY_OR bitarray_OR_AVX
#define COUNT_BITARRAY_AND2 count_bitarray_AND2_AVX
#define COUNT_BITARRAY_AND3 count_bitarray_AND3_AVX
#define COUNT_BITARRAY_AND4 count_bitarray_AND4_AVX
#elif defined (__SSE2__)
#define MALLOC_BITARRAY malloc_bitarray_SSE2
#define FREE_BITARRAY free_bitarray_SSE2
#define BITCOUNT bitcount_SSE2
#define COUNT_STATES count_states_SSE2
#define BITARRAY_AND bitarray_AND_SSE2
#define BITARRAY_LOW20_AND bitarray_low20_AND_SSE2
#define COUNT_BITARRAY_AND count_bitarray_AND_SSE2
#define COUNT_BITARRAY_LOW20_AND count_bitarray_low20_AND_SSE2
#define BITARRAY_AND4 bitarray_AND4_SSE2
#define BITARRAY_OR bitarray_OR_SSE2
#define COUNT_BITARRAY_AND2 count_bitarray_AND2_SSE2
#define COUNT_BITARRAY_AND3 count_bitarray_AND3_SSE2
#define COUNT_BITARRAY_AND4 count_bitarray_AND4_SSE2
#elif defined (__MMX__)
#define MALLOC_BITARRAY malloc_bitarray_MMX
#define FREE_BITARRAY free_bitarray_MMX
#define BITCOUNT bitcount_MMX
#define COUNT_STATES count_states_MMX
#define BITARRAY_AND bitarray_AND_MMX
#define BITARRAY_LOW20_AND bitarray_low20_AND_MMX
#define COUNT_BITARRAY_AND count_bitarray_AND_MMX
#define COUNT_BITARRAY_LOW20_AND count_bitarray_low20_AND_MMX
#define BITARRAY_AND4 bitarray_AND4_MMX
#define BITARRAY_OR bitarray_OR_MMX
#define COUNT_BITARRAY_AND2 count_bitarray_AND2_MMX
#define COUNT_BITARRAY_AND3 count_bitarray_AND3_MMX
#define COUNT_BITARRAY_AND4 count_bitarray_AND4_MMX
#elif defined (__ARM_NEON) && !defined (NOSIMD_BUILD)
#define MALLOC_BITARRAY malloc_bitarray_NEON
#define FREE_BITARRAY free_bitarray_NEON
#define BITCOUNT bitcount_NEON
#define COUNT_STATES count_states_NEON
#define BITARRAY_AND bitarray_AND_NEON
#define BITARRAY_LOW20_AND bitarray_low20_AND_NEON
#define COUNT_BITARRAY_AND count_bitarray_AND_NEON
#define COUNT_BITARRAY_LOW20_AND count_bitarray_low20_AND_NEON
#define BITARRAY_AND4 bitarray_AND4_NEON
#define BITARRAY_OR bitarray_OR_NEON
#define COUNT_BITARRAY_AND2 count_bitarray_AND2_NEON
#define COUNT_BITARRAY_AND3 count_bitarray_AND3_NEON
#define COUNT_BITARRAY_AND4 count_bitarray_AND4_NEON
#else
#define MALLOC_BITARRAY malloc_bitarray_NOSIMD
#define FREE_BITARRAY free_bitarray_NOSIMD
#define BITCOUNT bitcount_NOSIMD
#define COUNT_STATES count_states_NOSIMD
#define BITARRAY_AND bitarray_AND_NOSIMD
#define BITARRAY_LOW20_AND bitarray_low20_AND_NOSIMD
#define COUNT_BITARRAY_AND count_bitarray_AND_NOSIMD
#define COUNT_BITARRAY_LOW20_AND count_bitarray_low20_AND_NOSIMD
#define BITARRAY_AND4 bitarray_AND4_NOSIMD
#define BITARRAY_OR bitarray_OR_NOSIMD
#define COUNT_BITARRAY_AND2 count_bitarray_AND2_NOSIMD
#define COUNT_BITARRAY_AND3 count_bitarray_AND3_NOSIMD
#define COUNT_BITARRAY_AND4 count_bitarray_AND4_NOSIMD
#endif
// typedefs and declaration of functions:
//
// For every helper there is one function type, followed by a declaration of
// each per-instruction-set variant and of the runtime dispatcher.
typedef uint32_t *malloc_bitarray_t(uint32_t);
malloc_bitarray_t malloc_bitarray_AVX512, malloc_bitarray_AVX2, malloc_bitarray_AVX, malloc_bitarray_SSE2, malloc_bitarray_MMX, malloc_bitarray_NOSIMD, malloc_bitarray_NEON, malloc_bitarray_dispatch;
typedef void free_bitarray_t(uint32_t *);
free_bitarray_t free_bitarray_AVX512, free_bitarray_AVX2, free_bitarray_AVX, free_bitarray_SSE2, free_bitarray_MMX, free_bitarray_NOSIMD, free_bitarray_NEON, free_bitarray_dispatch;
typedef uint32_t bitcount_t(uint32_t);
bitcount_t bitcount_AVX512, bitcount_AVX2, bitcount_AVX, bitcount_SSE2, bitcount_MMX, bitcount_NOSIMD, bitcount_NEON, bitcount_dispatch;
typedef uint32_t count_states_t(uint32_t *);
count_states_t count_states_AVX512, count_states_AVX2, count_states_AVX, count_states_SSE2, count_states_MMX, count_states_NOSIMD, count_states_NEON, count_states_dispatch;
typedef void bitarray_AND_t(uint32_t[], uint32_t[]);
bitarray_AND_t bitarray_AND_AVX512, bitarray_AND_AVX2, bitarray_AND_AVX, bitarray_AND_SSE2, bitarray_AND_MMX, bitarray_AND_NOSIMD, bitarray_AND_NEON, bitarray_AND_dispatch;
typedef void bitarray_low20_AND_t(uint32_t *, uint32_t *);
bitarray_low20_AND_t bitarray_low20_AND_AVX512, bitarray_low20_AND_AVX2, bitarray_low20_AND_AVX, bitarray_low20_AND_SSE2, bitarray_low20_AND_MMX, bitarray_low20_AND_NOSIMD, bitarray_low20_AND_NEON, bitarray_low20_AND_dispatch;
typedef uint32_t count_bitarray_AND_t(uint32_t *, uint32_t *);
count_bitarray_AND_t count_bitarray_AND_AVX512, count_bitarray_AND_AVX2, count_bitarray_AND_AVX, count_bitarray_AND_SSE2, count_bitarray_AND_MMX, count_bitarray_AND_NOSIMD, count_bitarray_AND_NEON, count_bitarray_AND_dispatch;
typedef uint32_t count_bitarray_low20_AND_t(uint32_t *, uint32_t *);
count_bitarray_low20_AND_t count_bitarray_low20_AND_AVX512, count_bitarray_low20_AND_AVX2, count_bitarray_low20_AND_AVX, count_bitarray_low20_AND_SSE2, count_bitarray_low20_AND_MMX, count_bitarray_low20_AND_NOSIMD, count_bitarray_low20_AND_NEON, count_bitarray_low20_AND_dispatch;
typedef void bitarray_AND4_t(uint32_t *, uint32_t *, uint32_t *, uint32_t *);
bitarray_AND4_t bitarray_AND4_AVX512, bitarray_AND4_AVX2, bitarray_AND4_AVX, bitarray_AND4_SSE2, bitarray_AND4_MMX, bitarray_AND4_NOSIMD, bitarray_AND4_NEON, bitarray_AND4_dispatch;
typedef void bitarray_OR_t(uint32_t[], uint32_t[]);
bitarray_OR_t bitarray_OR_AVX512, bitarray_OR_AVX2, bitarray_OR_AVX, bitarray_OR_SSE2, bitarray_OR_MMX, bitarray_OR_NOSIMD, bitarray_OR_NEON, bitarray_OR_dispatch;
typedef uint32_t count_bitarray_AND2_t(uint32_t *, uint32_t *);
count_bitarray_AND2_t count_bitarray_AND2_AVX512, count_bitarray_AND2_AVX2, count_bitarray_AND2_AVX, count_bitarray_AND2_SSE2, count_bitarray_AND2_MMX, count_bitarray_AND2_NOSIMD, count_bitarray_AND2_NEON, count_bitarray_AND2_dispatch;
typedef uint32_t count_bitarray_AND3_t(uint32_t *, uint32_t *, uint32_t *);
count_bitarray_AND3_t count_bitarray_AND3_AVX512, count_bitarray_AND3_AVX2, count_bitarray_AND3_AVX, count_bitarray_AND3_SSE2, count_bitarray_AND3_MMX, count_bitarray_AND3_NOSIMD, count_bitarray_AND3_NEON, count_bitarray_AND3_dispatch;
typedef uint32_t count_bitarray_AND4_t(uint32_t *, uint32_t *, uint32_t *, uint32_t *);
count_bitarray_AND4_t count_bitarray_AND4_AVX512, count_bitarray_AND4_AVX2, count_bitarray_AND4_AVX, count_bitarray_AND4_SSE2, count_bitarray_AND4_MMX, count_bitarray_AND4_NOSIMD, count_bitarray_AND4_NEON, count_bitarray_AND4_dispatch;
// Allocates x bytes aligned to __BIGGEST_ALIGNMENT__ and returns the pointer
// wrapped in __builtin_assume_aligned so later code may rely on the alignment.
// Three allocator variants are visible: _aligned_malloc, posix_memalign (in
// the __APPLE__ branch) and memalign.
// NOTE(review): this copy of the block is incomplete - the opening #if that
// guards the _aligned_malloc variant, the branch taken when posix_memalign
// reports failure, the #else/#endif around the memalign variant and the
// closing brace are not present. Restore them from the project history
// rather than by guesswork.
161 inline uint32_t *MALLOC_BITARRAY(uint32_t x
) {
163 return __builtin_assume_aligned(_aligned_malloc((x
), __BIGGEST_ALIGNMENT__
), __BIGGEST_ALIGNMENT__
);
164 #elif defined (__APPLE__)
165 uint32_t *allocated_memory
;
// posix_memalign returns non-zero on failure, so the block opened here is the error path
166 if (posix_memalign((void **)&allocated_memory
, __BIGGEST_ALIGNMENT__
, x
)) {
169 return __builtin_assume_aligned(allocated_memory
, __BIGGEST_ALIGNMENT__
);
172 return __builtin_assume_aligned(memalign(__BIGGEST_ALIGNMENT__
, (x
)), __BIGGEST_ALIGNMENT__
);
// Releases the bit array x.
// NOTE(review): only the signature is present in this copy of the file; the
// body is missing. Presumably it must use the deallocator matching the one
// MALLOC_BITARRAY picked for the platform - confirm against project history.
177 inline void FREE_BITARRAY(uint32_t *x
) {
// Returns the number of set bits in a.
inline uint32_t BITCOUNT(uint32_t a) {
    // the builtin takes an unsigned long; widening a 32-bit value adds no set bits
    return __builtin_popcountl(a);
}
// Returns the total number of set bits in the bit array A.
// A must provide 2^19 32-bit words (2^24 bits); all of them are read.
inline uint32_t COUNT_STATES(uint32_t *A) {
    uint32_t count = 0;
    for (uint32_t i = 0; i < (1 << 19); i++) {
        count += BITCOUNT(A[i]);
    }
    return count;
}
// Combines the bit arrays A and B word by word over 2^19 32-bit words.
// Both pointers are declared non-aliasing (restrict) and are promised to the
// compiler as aligned to __BIGGEST_ALIGNMENT__.
// NOTE(review): the loop body and the closing braces are missing from this
// copy of the file. Going by the name it is an in-place bitwise AND, but the
// exact statement (and which array receives the result) must be confirmed
// against project history.
200 inline void BITARRAY_AND(uint32_t *restrict A
, uint32_t *restrict B
) {
201 A
= __builtin_assume_aligned(A
, __BIGGEST_ALIGNMENT__
);
202 B
= __builtin_assume_aligned(B
, __BIGGEST_ALIGNMENT__
);
203 for (uint32_t i
= 0; i
< (1 << 19); i
++) {
// Processes the bit arrays A and B as 16-bit elements: both pointers are
// reinterpreted as uint16_t * (after being declared aligned to
// __BIGGEST_ALIGNMENT__) and the loop visits 2^20 such elements.
// NOTE(review): the loop body and the closing braces are missing from this
// copy of the file, so what is done per 16-bit element cannot be stated here;
// confirm against project history.
209 inline void BITARRAY_LOW20_AND(uint32_t *restrict A
, uint32_t *restrict B
) {
210 uint16_t *a
= (uint16_t *)__builtin_assume_aligned(A
, __BIGGEST_ALIGNMENT__
);
211 uint16_t *b
= (uint16_t *)__builtin_assume_aligned(B
, __BIGGEST_ALIGNMENT__
);
213 for (uint32_t i
= 0; i
< (1 << 20); i
++) {
// Walks 2^19 32-bit words of A and B and accumulates BITCOUNT(A[i]) into
// count. Both pointers are declared non-aliasing and aligned to
// __BIGGEST_ALIGNMENT__.
// NOTE(review): this copy of the block is incomplete - the declaration of
// count, the statement that precedes the accumulation inside the loop
// (presumably the one that folds B into A - confirm), the return and the
// closing braces are not present.
221 inline uint32_t COUNT_BITARRAY_AND(uint32_t *restrict A
, uint32_t *restrict B
) {
222 A
= __builtin_assume_aligned(A
, __BIGGEST_ALIGNMENT__
);
223 B
= __builtin_assume_aligned(B
, __BIGGEST_ALIGNMENT__
);
225 for (uint32_t i
= 0; i
< (1 << 19); i
++) {
227 count
+= BITCOUNT(A
[i
]);
// Walks A and B as 2^20 16-bit elements (both pointers are reinterpreted as
// uint16_t * after being declared aligned to __BIGGEST_ALIGNMENT__) and
// accumulates BITCOUNT(a[i]) into count.
// NOTE(review): this copy of the block is incomplete - the declaration of
// count, the statements that precede the accumulation inside the loop, the
// return and the closing braces are not present; confirm the per-element
// operation against project history.
233 inline uint32_t COUNT_BITARRAY_LOW20_AND(uint32_t *restrict A
, uint32_t *restrict B
) {
234 uint16_t *a
= (uint16_t *)__builtin_assume_aligned(A
, __BIGGEST_ALIGNMENT__
);
235 uint16_t *b
= (uint16_t *)__builtin_assume_aligned(B
, __BIGGEST_ALIGNMENT__
);
238 for (uint32_t i
= 0; i
< (1 << 20); i
++) {
242 count
+= BITCOUNT(a
[i
]);
// Stores the bitwise AND of B, C and D into A, word by word.
// All four arrays must provide 2^19 32-bit words, must not overlap (restrict)
// and must be aligned to __BIGGEST_ALIGNMENT__. The previous contents of A
// are overwritten, not combined.
inline void BITARRAY_AND4(uint32_t *restrict A, uint32_t *restrict B, uint32_t *restrict C, uint32_t *restrict D) {
    A = __builtin_assume_aligned(A, __BIGGEST_ALIGNMENT__);
    B = __builtin_assume_aligned(B, __BIGGEST_ALIGNMENT__);
    C = __builtin_assume_aligned(C, __BIGGEST_ALIGNMENT__);
    D = __builtin_assume_aligned(D, __BIGGEST_ALIGNMENT__);
    for (uint32_t i = 0; i < (1 << 19); i++) {
        A[i] = B[i] & C[i] & D[i];
    }
}
// Combines the bit arrays A and B word by word over 2^19 32-bit words.
// Both pointers are declared non-aliasing (restrict) and are promised to the
// compiler as aligned to __BIGGEST_ALIGNMENT__.
// NOTE(review): the loop body and the closing braces are missing from this
// copy of the file. Going by the name it is an in-place bitwise OR, but the
// exact statement (and which array receives the result) must be confirmed
// against project history.
259 inline void BITARRAY_OR(uint32_t *restrict A
, uint32_t *restrict B
) {
260 A
= __builtin_assume_aligned(A
, __BIGGEST_ALIGNMENT__
);
261 B
= __builtin_assume_aligned(B
, __BIGGEST_ALIGNMENT__
);
262 for (uint32_t i
= 0; i
< (1 << 19); i
++) {
// Returns the number of bits set in both A and B.
// Neither array is modified. Both must provide 2^19 32-bit words, must not
// overlap (restrict) and must be aligned to __BIGGEST_ALIGNMENT__.
inline uint32_t COUNT_BITARRAY_AND2(uint32_t *restrict A, uint32_t *restrict B) {
    A = __builtin_assume_aligned(A, __BIGGEST_ALIGNMENT__);
    B = __builtin_assume_aligned(B, __BIGGEST_ALIGNMENT__);
    uint32_t count = 0;
    for (uint32_t i = 0; i < (1 << 19); i++) {
        count += BITCOUNT(A[i] & B[i]);
    }
    return count;
}
// Returns the number of bits set in all of A, B and C.
// No array is modified. Each must provide 2^19 32-bit words, must not overlap
// the others (restrict) and must be aligned to __BIGGEST_ALIGNMENT__.
inline uint32_t COUNT_BITARRAY_AND3(uint32_t *restrict A, uint32_t *restrict B, uint32_t *restrict C) {
    A = __builtin_assume_aligned(A, __BIGGEST_ALIGNMENT__);
    B = __builtin_assume_aligned(B, __BIGGEST_ALIGNMENT__);
    C = __builtin_assume_aligned(C, __BIGGEST_ALIGNMENT__);
    uint32_t count = 0;
    for (uint32_t i = 0; i < (1 << 19); i++) {
        count += BITCOUNT(A[i] & B[i] & C[i]);
    }
    return count;
}
// Returns the number of bits set in all of A, B, C and D.
// No array is modified. Each must provide 2^19 32-bit words, must not overlap
// the others (restrict) and must be aligned to __BIGGEST_ALIGNMENT__.
inline uint32_t COUNT_BITARRAY_AND4(uint32_t *restrict A, uint32_t *restrict B, uint32_t *restrict C, uint32_t *restrict D) {
    A = __builtin_assume_aligned(A, __BIGGEST_ALIGNMENT__);
    B = __builtin_assume_aligned(B, __BIGGEST_ALIGNMENT__);
    C = __builtin_assume_aligned(C, __BIGGEST_ALIGNMENT__);
    D = __builtin_assume_aligned(D, __BIGGEST_ALIGNMENT__);
    uint32_t count = 0;
    for (uint32_t i = 0; i < (1 << 19); i++) {
        count += BITCOUNT(A[i] & B[i] & C[i] & D[i]);
    }
    return count;
}
306 // pointers to functions:
307 malloc_bitarray_t
*malloc_bitarray_function_p
= &malloc_bitarray_dispatch
;
308 free_bitarray_t
*free_bitarray_function_p
= &free_bitarray_dispatch
;
309 bitcount_t
*bitcount_function_p
= &bitcount_dispatch
;
310 count_states_t
*count_states_function_p
= &count_states_dispatch
;
311 bitarray_AND_t
*bitarray_AND_function_p
= &bitarray_AND_dispatch
;
312 bitarray_low20_AND_t
*bitarray_low20_AND_function_p
= &bitarray_low20_AND_dispatch
;
313 count_bitarray_AND_t
*count_bitarray_AND_function_p
= &count_bitarray_AND_dispatch
;
314 count_bitarray_low20_AND_t
*count_bitarray_low20_AND_function_p
= &count_bitarray_low20_AND_dispatch
;
315 bitarray_AND4_t
*bitarray_AND4_function_p
= &bitarray_AND4_dispatch
;
316 bitarray_OR_t
*bitarray_OR_function_p
= &bitarray_OR_dispatch
;
317 count_bitarray_AND2_t
*count_bitarray_AND2_function_p
= &count_bitarray_AND2_dispatch
;
318 count_bitarray_AND3_t
*count_bitarray_AND3_function_p
= &count_bitarray_AND3_dispatch
;
319 count_bitarray_AND4_t
*count_bitarray_AND4_function_p
= &count_bitarray_AND4_dispatch
;
321 // determine the available instruction set at runtime and call the correct function
322 uint32_t *malloc_bitarray_dispatch(uint32_t x
) {
323 #if defined(COMPILER_HAS_SIMD_NEON)
324 if (arm_has_neon()) malloc_bitarray_function_p
= &malloc_bitarray_NEON
;
328 #if defined(COMPILER_HAS_SIMD_AVX512)
329 if (__builtin_cpu_supports("avx512f")) malloc_bitarray_function_p
= &malloc_bitarray_AVX512
;
332 #if defined(COMPILER_HAS_SIMD_X86)
333 if (__builtin_cpu_supports("avx2")) malloc_bitarray_function_p
= &malloc_bitarray_AVX2
;
334 else if (__builtin_cpu_supports("avx")) malloc_bitarray_function_p
= &malloc_bitarray_AVX
;
335 else if (__builtin_cpu_supports("sse2")) malloc_bitarray_function_p
= &malloc_bitarray_SSE2
;
336 else if (__builtin_cpu_supports("mmx")) malloc_bitarray_function_p
= &malloc_bitarray_MMX
;
339 malloc_bitarray_function_p
= &malloc_bitarray_NOSIMD
;
341 // call the most optimized function for this CPU
342 return (*malloc_bitarray_function_p
)(x
);
345 void free_bitarray_dispatch(uint32_t *x
) {
346 #if defined(COMPILER_HAS_SIMD_NEON)
347 if (arm_has_neon()) free_bitarray_function_p
= &free_bitarray_NEON
;
351 #if defined(COMPILER_HAS_SIMD_AVX512)
352 if (__builtin_cpu_supports("avx512f")) free_bitarray_function_p
= &free_bitarray_AVX512
;
355 #if defined(COMPILER_HAS_SIMD_X86)
356 if (__builtin_cpu_supports("avx2")) free_bitarray_function_p
= &free_bitarray_AVX2
;
357 else if (__builtin_cpu_supports("avx")) free_bitarray_function_p
= &free_bitarray_AVX
;
358 else if (__builtin_cpu_supports("sse2")) free_bitarray_function_p
= &free_bitarray_SSE2
;
359 else if (__builtin_cpu_supports("mmx")) free_bitarray_function_p
= &free_bitarray_MMX
;
362 free_bitarray_function_p
= &free_bitarray_NOSIMD
;
364 // call the most optimized function for this CPU
365 (*free_bitarray_function_p
)(x
);
368 uint32_t bitcount_dispatch(uint32_t a
) {
369 #if defined(COMPILER_HAS_SIMD_NEON)
370 if (arm_has_neon()) bitcount_function_p
= &bitcount_NEON
;
374 #if defined(COMPILER_HAS_SIMD_AVX512)
375 if (__builtin_cpu_supports("avx512f")) bitcount_function_p
= &bitcount_AVX512
;
378 #if defined(COMPILER_HAS_SIMD_X86)
379 if (__builtin_cpu_supports("avx2")) bitcount_function_p
= &bitcount_AVX2
;
380 else if (__builtin_cpu_supports("avx")) bitcount_function_p
= &bitcount_AVX
;
381 else if (__builtin_cpu_supports("sse2")) bitcount_function_p
= &bitcount_SSE2
;
382 else if (__builtin_cpu_supports("mmx")) bitcount_function_p
= &bitcount_MMX
;
385 bitcount_function_p
= &bitcount_NOSIMD
;
387 // call the most optimized function for this CPU
388 return (*bitcount_function_p
)(a
);
391 uint32_t count_states_dispatch(uint32_t *bitarray
) {
392 #if defined(COMPILER_HAS_SIMD_NEON)
393 if (arm_has_neon()) count_states_function_p
= &count_states_NEON
;
397 #if defined(COMPILER_HAS_SIMD_AVX512)
398 if (__builtin_cpu_supports("avx512f")) count_states_function_p
= &count_states_AVX512
;
401 #if defined(COMPILER_HAS_SIMD_X86)
402 if (__builtin_cpu_supports("avx2")) count_states_function_p
= &count_states_AVX2
;
403 else if (__builtin_cpu_supports("avx")) count_states_function_p
= &count_states_AVX
;
404 else if (__builtin_cpu_supports("sse2")) count_states_function_p
= &count_states_SSE2
;
405 else if (__builtin_cpu_supports("mmx")) count_states_function_p
= &count_states_MMX
;
408 count_states_function_p
= &count_states_NOSIMD
;
410 // call the most optimized function for this CPU
411 return (*count_states_function_p
)(bitarray
);
414 void bitarray_AND_dispatch(uint32_t *A
, uint32_t *B
) {
415 #if defined(COMPILER_HAS_SIMD_NEON)
416 if (arm_has_neon()) bitarray_AND_function_p
= &bitarray_AND_NEON
;
420 #if defined(COMPILER_HAS_SIMD_AVX512)
421 if (__builtin_cpu_supports("avx512f")) bitarray_AND_function_p
= &bitarray_AND_AVX512
;
424 #if defined(COMPILER_HAS_SIMD_X86)
425 if (__builtin_cpu_supports("avx2")) bitarray_AND_function_p
= &bitarray_AND_AVX2
;
426 else if (__builtin_cpu_supports("avx")) bitarray_AND_function_p
= &bitarray_AND_AVX
;
427 else if (__builtin_cpu_supports("sse2")) bitarray_AND_function_p
= &bitarray_AND_SSE2
;
428 else if (__builtin_cpu_supports("mmx")) bitarray_AND_function_p
= &bitarray_AND_MMX
;
431 bitarray_AND_function_p
= &bitarray_AND_NOSIMD
;
433 // call the most optimized function for this CPU
434 (*bitarray_AND_function_p
)(A
, B
);
437 void bitarray_low20_AND_dispatch(uint32_t *A
, uint32_t *B
) {
438 #if defined(COMPILER_HAS_SIMD_NEON)
439 if (arm_has_neon()) bitarray_low20_AND_function_p
= &bitarray_low20_AND_NEON
;
443 #if defined(COMPILER_HAS_SIMD_AVX512)
444 if (__builtin_cpu_supports("avx512f")) bitarray_low20_AND_function_p
= &bitarray_low20_AND_AVX512
;
447 #if defined(COMPILER_HAS_SIMD_X86)
448 if (__builtin_cpu_supports("avx2")) bitarray_low20_AND_function_p
= &bitarray_low20_AND_AVX2
;
449 else if (__builtin_cpu_supports("avx")) bitarray_low20_AND_function_p
= &bitarray_low20_AND_AVX
;
450 else if (__builtin_cpu_supports("sse2")) bitarray_low20_AND_function_p
= &bitarray_low20_AND_SSE2
;
451 else if (__builtin_cpu_supports("mmx")) bitarray_low20_AND_function_p
= &bitarray_low20_AND_MMX
;
454 bitarray_low20_AND_function_p
= &bitarray_low20_AND_NOSIMD
;
456 // call the most optimized function for this CPU
457 (*bitarray_low20_AND_function_p
)(A
, B
);
460 uint32_t count_bitarray_AND_dispatch(uint32_t *A
, uint32_t *B
) {
461 #if defined(COMPILER_HAS_SIMD_NEON)
462 if (arm_has_neon()) count_bitarray_AND_function_p
= &count_bitarray_AND_NEON
;
466 #if defined(COMPILER_HAS_SIMD_AVX512)
467 if (__builtin_cpu_supports("avx512f")) count_bitarray_AND_function_p
= &count_bitarray_AND_AVX512
;
470 #if defined(COMPILER_HAS_SIMD_X86)
471 if (__builtin_cpu_supports("avx2")) count_bitarray_AND_function_p
= &count_bitarray_AND_AVX2
;
472 else if (__builtin_cpu_supports("avx")) count_bitarray_AND_function_p
= &count_bitarray_AND_AVX
;
473 else if (__builtin_cpu_supports("sse2")) count_bitarray_AND_function_p
= &count_bitarray_AND_SSE2
;
474 else if (__builtin_cpu_supports("mmx")) count_bitarray_AND_function_p
= &count_bitarray_AND_MMX
;
477 count_bitarray_AND_function_p
= &count_bitarray_AND_NOSIMD
;
479 // call the most optimized function for this CPU
480 return (*count_bitarray_AND_function_p
)(A
, B
);
483 uint32_t count_bitarray_low20_AND_dispatch(uint32_t *A
, uint32_t *B
) {
484 #if defined(COMPILER_HAS_SIMD_NEON)
485 if (arm_has_neon()) count_bitarray_low20_AND_function_p
= &count_bitarray_low20_AND_NEON
;
489 #if defined(COMPILER_HAS_SIMD_AVX512)
490 if (__builtin_cpu_supports("avx512f")) count_bitarray_low20_AND_function_p
= &count_bitarray_low20_AND_AVX512
;
493 #if defined(COMPILER_HAS_SIMD_X86)
494 if (__builtin_cpu_supports("avx2")) count_bitarray_low20_AND_function_p
= &count_bitarray_low20_AND_AVX2
;
495 else if (__builtin_cpu_supports("avx")) count_bitarray_low20_AND_function_p
= &count_bitarray_low20_AND_AVX
;
496 else if (__builtin_cpu_supports("sse2")) count_bitarray_low20_AND_function_p
= &count_bitarray_low20_AND_SSE2
;
497 else if (__builtin_cpu_supports("mmx")) count_bitarray_low20_AND_function_p
= &count_bitarray_low20_AND_MMX
;
500 count_bitarray_low20_AND_function_p
= &count_bitarray_low20_AND_NOSIMD
;
502 // call the most optimized function for this CPU
503 return (*count_bitarray_low20_AND_function_p
)(A
, B
);
506 void bitarray_AND4_dispatch(uint32_t *A
, uint32_t *B
, uint32_t *C
, uint32_t *D
) {
507 #if defined(COMPILER_HAS_SIMD_NEON)
508 if (arm_has_neon()) bitarray_AND4_function_p
= &bitarray_AND4_NEON
;
512 #if defined(COMPILER_HAS_SIMD_AVX512)
513 if (__builtin_cpu_supports("avx512f")) bitarray_AND4_function_p
= &bitarray_AND4_AVX512
;
516 #if defined(COMPILER_HAS_SIMD_X86)
517 if (__builtin_cpu_supports("avx2")) bitarray_AND4_function_p
= &bitarray_AND4_AVX2
;
518 else if (__builtin_cpu_supports("avx")) bitarray_AND4_function_p
= &bitarray_AND4_AVX
;
519 else if (__builtin_cpu_supports("sse2")) bitarray_AND4_function_p
= &bitarray_AND4_SSE2
;
520 else if (__builtin_cpu_supports("mmx")) bitarray_AND4_function_p
= &bitarray_AND4_MMX
;
523 bitarray_AND4_function_p
= &bitarray_AND4_NOSIMD
;
525 // call the most optimized function for this CPU
526 (*bitarray_AND4_function_p
)(A
, B
, C
, D
);
529 void bitarray_OR_dispatch(uint32_t *A
, uint32_t *B
) {
530 #if defined(COMPILER_HAS_SIMD_NEON)
531 if (arm_has_neon()) bitarray_OR_function_p
= &bitarray_OR_NEON
;
535 #if defined(COMPILER_HAS_SIMD_AVX512)
536 if (__builtin_cpu_supports("avx512f")) bitarray_OR_function_p
= &bitarray_OR_AVX512
;
539 #if defined(COMPILER_HAS_SIMD_X86)
540 if (__builtin_cpu_supports("avx2")) bitarray_OR_function_p
= &bitarray_OR_AVX2
;
541 else if (__builtin_cpu_supports("avx")) bitarray_OR_function_p
= &bitarray_OR_AVX
;
542 else if (__builtin_cpu_supports("sse2")) bitarray_OR_function_p
= &bitarray_OR_SSE2
;
543 else if (__builtin_cpu_supports("mmx")) bitarray_OR_function_p
= &bitarray_OR_MMX
;
546 bitarray_OR_function_p
= &bitarray_OR_NOSIMD
;
548 // call the most optimized function for this CPU
549 (*bitarray_OR_function_p
)(A
, B
);
552 uint32_t count_bitarray_AND2_dispatch(uint32_t *A
, uint32_t *B
) {
553 #if defined(COMPILER_HAS_SIMD_NEON)
554 if (arm_has_neon()) count_bitarray_AND2_function_p
= &count_bitarray_AND2_NEON
;
558 #if defined(COMPILER_HAS_SIMD_AVX512)
559 if (__builtin_cpu_supports("avx512f")) count_bitarray_AND2_function_p
= &count_bitarray_AND2_AVX512
;
562 #if defined(COMPILER_HAS_SIMD_X86)
563 if (__builtin_cpu_supports("avx2")) count_bitarray_AND2_function_p
= &count_bitarray_AND2_AVX2
;
564 else if (__builtin_cpu_supports("avx")) count_bitarray_AND2_function_p
= &count_bitarray_AND2_AVX
;
565 else if (__builtin_cpu_supports("sse2")) count_bitarray_AND2_function_p
= &count_bitarray_AND2_SSE2
;
566 else if (__builtin_cpu_supports("mmx")) count_bitarray_AND2_function_p
= &count_bitarray_AND2_MMX
;
569 count_bitarray_AND2_function_p
= &count_bitarray_AND2_NOSIMD
;
571 // call the most optimized function for this CPU
572 return (*count_bitarray_AND2_function_p
)(A
, B
);
575 uint32_t count_bitarray_AND3_dispatch(uint32_t *A
, uint32_t *B
, uint32_t *C
) {
576 #if defined(COMPILER_HAS_SIMD_NEON)
577 if (arm_has_neon()) count_bitarray_AND3_function_p
= &count_bitarray_AND3_NEON
;
581 #if defined(COMPILER_HAS_SIMD_AVX512)
582 if (__builtin_cpu_supports("avx512f")) count_bitarray_AND3_function_p
= &count_bitarray_AND3_AVX512
;
585 #if defined(COMPILER_HAS_SIMD_X86)
586 if (__builtin_cpu_supports("avx2")) count_bitarray_AND3_function_p
= &count_bitarray_AND3_AVX2
;
587 else if (__builtin_cpu_supports("avx")) count_bitarray_AND3_function_p
= &count_bitarray_AND3_AVX
;
588 else if (__builtin_cpu_supports("sse2")) count_bitarray_AND3_function_p
= &count_bitarray_AND3_SSE2
;
589 else if (__builtin_cpu_supports("mmx")) count_bitarray_AND3_function_p
= &count_bitarray_AND3_MMX
;
592 count_bitarray_AND3_function_p
= &count_bitarray_AND3_NOSIMD
;
594 // call the most optimized function for this CPU
595 return (*count_bitarray_AND3_function_p
)(A
, B
, C
);
598 uint32_t count_bitarray_AND4_dispatch(uint32_t *A
, uint32_t *B
, uint32_t *C
, uint32_t *D
) {
599 #if defined(COMPILER_HAS_SIMD_NEON)
600 if (arm_has_neon()) count_bitarray_AND4_function_p
= &count_bitarray_AND4_NEON
;
604 #if defined(COMPILER_HAS_SIMD_AVX512)
605 if (__builtin_cpu_supports("avx512f")) count_bitarray_AND4_function_p
= &count_bitarray_AND4_AVX512
;
608 #if defined(COMPILER_HAS_SIMD_X86)
609 if (__builtin_cpu_supports("avx2")) count_bitarray_AND4_function_p
= &count_bitarray_AND4_AVX2
;
610 else if (__builtin_cpu_supports("avx")) count_bitarray_AND4_function_p
= &count_bitarray_AND4_AVX
;
611 else if (__builtin_cpu_supports("sse2")) count_bitarray_AND4_function_p
= &count_bitarray_AND4_SSE2
;
612 else if (__builtin_cpu_supports("mmx")) count_bitarray_AND4_function_p
= &count_bitarray_AND4_MMX
;
615 count_bitarray_AND4_function_p
= &count_bitarray_AND4_NOSIMD
;
617 // call the most optimized function for this CPU
618 return (*count_bitarray_AND4_function_p
)(A
, B
, C
, D
);
/////////////////////////////////////////////////
// Entries to dispatched function calls
625 uint32_t *malloc_bitarray(uint32_t x
) {
626 return (*malloc_bitarray_function_p
)(x
);
629 void free_bitarray(uint32_t *x
) {
630 (*free_bitarray_function_p
)(x
);
633 uint32_t bitcount(uint32_t a
) {
634 return (*bitcount_function_p
)(a
);
637 uint32_t count_states(uint32_t *A
) {
638 return (*count_states_function_p
)(A
);
641 void bitarray_AND(uint32_t *A
, uint32_t *B
) {
642 (*bitarray_AND_function_p
)(A
, B
);
645 void bitarray_low20_AND(uint32_t *A
, uint32_t *B
) {
646 (*bitarray_low20_AND_function_p
)(A
, B
);
649 uint32_t count_bitarray_AND(uint32_t *A
, uint32_t *B
) {
650 return (*count_bitarray_AND_function_p
)(A
, B
);
653 uint32_t count_bitarray_low20_AND(uint32_t *A
, uint32_t *B
) {
654 return (*count_bitarray_low20_AND_function_p
)(A
, B
);
657 void bitarray_AND4(uint32_t *A
, uint32_t *B
, uint32_t *C
, uint32_t *D
) {
658 (*bitarray_AND4_function_p
)(A
, B
, C
, D
);
661 void bitarray_OR(uint32_t *A
, uint32_t *B
) {
662 (*bitarray_OR_function_p
)(A
, B
);
665 uint32_t count_bitarray_AND2(uint32_t *A
, uint32_t *B
) {
666 return (*count_bitarray_AND2_function_p
)(A
, B
);
669 uint32_t count_bitarray_AND3(uint32_t *A
, uint32_t *B
, uint32_t *C
) {
670 return (*count_bitarray_AND3_function_p
)(A
, B
, C
);
673 uint32_t count_bitarray_AND4(uint32_t *A
, uint32_t *B
, uint32_t *C
, uint32_t *D
) {
674 return (*count_bitarray_AND4_function_p
)(A
, B
, C
, D
);