/*
 * Copyright (C) 2024 Mikulas Patocka
 *
 * This file is part of Ajla.
 *
 * Ajla is free software: you can redistribute it and/or modify it under the
 * terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * Ajla is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
 * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * Ajla. If not, see <https://www.gnu.org/licenses/>.
 */

#ifndef AJLA_ARITHM_B_H
#define AJLA_ARITHM_B_H

#include "asm.h"

#ifndef DEBUG_NOINLINE
#define ipret_inline	attr_always_inline
#else
#define ipret_inline	attr_noinline attr_noclone
#endif

/*
 * DIV
 */

#define gen_generic_div_mod(fn, type, utype, op, us) \
static maybe_inline bool attr_unused cat4(FIXED_binary_,fn,_,type)(const utype *op1, const utype *op2, utype *res)\
{ \
	const bool mod = !(1 op 1); \
	if (unlikely(!*op2)) { \
		*res = !mod ? 0 : *op1; \
		return true; \
	} \
	if (us) { \
		*res = *op1 op *op2; \
	} else if (DIVIDE_ROUNDS_TO_ZERO) { \
		if (sizeof(type) >= sizeof(int) && \
		    unlikely(*op2 == (utype)-1) && \
		    unlikely(*op1 == sign_bit(utype))) { \
			*res = !mod ? *op1 : 0; \
			return true; \
		} \
		*res = (type)*op1 op (type)*op2; \
	} else { \
		utype o1 = (type)*op1 < 0 ? -*op1 : *op1; \
		utype o2 = (type)*op2 < 0 ? -*op2 : *op2; \
		utype neg = !mod \
			? (*op1 ^ *op2) & sign_bit(type) \
			: (utype)((type)*op1 < 0); \
		utype r = o1 op o2; \
		if (unlikely(neg != 0)) \
			r = -r; \
		*res = r; \
	} \
	return true; \
}
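
/*
 * How the generic division picks a strategy:
 * - `mod` is computed at compile time: 1 / 1 is 1 and 1 % 1 is 0, so
 *   !(1 op 1) distinguishes modulo from division.
 * - Division by zero is defined away: x / 0 == 0 and x % 0 == x.
 * - Unsigned operands (`us`) use the C operator directly.
 * - If DIVIDE_ROUNDS_TO_ZERO, signed C division is usable, except that
 *   INT_MIN / -1 overflows, so that one case is special-cased
 *   (INT_MIN / -1 == INT_MIN, INT_MIN % -1 == 0).
 * - Otherwise the quotient is computed on absolute values and the sign
 *   is patched afterwards, e.g. -7 / 2 -> 7 / 2 == 3, the operand signs
 *   differ, so the result is -3 (truncation toward zero).
 */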

#define gen_generic_div_functions(type, utype) \
gen_generic_div_mod(divide, type, utype, /, 0) \
gen_generic_div_mod(udivide, type, utype, /, 1) \
gen_generic_div_mod(modulo, type, utype, %, 0) \
gen_generic_div_mod(umodulo, type, utype, %, 1)

#define gen_arm_div_mod(type, utype, int_type, s, alt) \
static ipret_inline bool attr_unused cat3(FIXED_binary_divide,alt,type)(const utype *op1, const utype *op2, utype *res)\
{ \
	int_type r; \
	if (!ARM_ASM_DIV_NO_TRAP && unlikely(!*op2)) { *res = 0; return true; }\
	__asm__ (ARM_ASM_PREFIX "sdiv %"s"0, %"s"1, %"s"2" : \
		"=r"(r) : "r"((int_type)(type)*op1), "r"((int_type)(type)*op2));\
	*res = r; \
	return true; \
} \
static ipret_inline bool attr_unused cat3(FIXED_binary_udivide,alt,type)(const utype *op1, const utype *op2, utype *res)\
{ \
	int_type r; \
	if (!ARM_ASM_DIV_NO_TRAP && unlikely(!*op2)) { *res = 0; return true; }\
	__asm__ (ARM_ASM_PREFIX "udiv %"s"0, %"s"1, %"s"2" : \
		"=r"(r) : "r"((int_type)*op1), "r"((int_type)*op2)); \
	*res = r; \
	return true; \
} \
static ipret_inline bool attr_unused cat3(FIXED_binary_modulo,alt,type)(const utype *op1, const utype *op2, utype *res)\
{ \
	int_type r; \
	if (!ARM_ASM_DIV_NO_TRAP && unlikely(!*op2)) { *res = *op1; return true; }\
	__asm__ (ARM_ASM_PREFIX "sdiv %"s"0, %"s"1, %"s"2" : \
		"=r"(r) : "r"((int_type)(type)*op1), "r"((int_type)(type)*op2));\
	*res = *op1 - (type)*op2 * r; \
	return true; \
} \
static ipret_inline bool attr_unused cat3(FIXED_binary_umodulo,alt,type)(const utype *op1, const utype *op2, utype *res)\
{ \
	int_type r; \
	if (!ARM_ASM_DIV_NO_TRAP && unlikely(!*op2)) { *res = *op1; return true; }\
	__asm__ (ARM_ASM_PREFIX "udiv %"s"0, %"s"1, %"s"2" : \
		"=r"(r) : "r"((int_type)*op1), "r"((int_type)*op2)); \
	*res = *op1 - *op2 * r; \
	return true; \
}
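
/*
 * ARM sdiv/udiv produce only the quotient; the remainder is recovered
 * with a multiply-subtract (op1 - op2 * (op1 / op2)), which the
 * compiler can fuse into mls. When the divide instruction may trap on
 * a zero divisor (!ARM_ASM_DIV_NO_TRAP), zero is filtered out first,
 * preserving the x / 0 == 0 and x % 0 == x convention used above.
 */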

#if defined(INLINE_ASM_GCC_ARM) && defined(HAVE_ARM_ASSEMBLER_SDIV_UDIV)
#define FIXED_DIVIDE_ALT1_FEATURES	(cpu_feature_mask(CPU_FEATURE_idiv))
#define FIXED_DIVIDE_ALT1_TYPES		0x7
#define FIXED_UDIVIDE_ALT1_FEATURES	(cpu_feature_mask(CPU_FEATURE_idiv))
#define FIXED_UDIVIDE_ALT1_TYPES	0x7
#define FIXED_MODULO_ALT1_FEATURES	(cpu_feature_mask(CPU_FEATURE_idiv))
#define FIXED_MODULO_ALT1_TYPES		0x7
#define FIXED_UMODULO_ALT1_FEATURES	(cpu_feature_mask(CPU_FEATURE_idiv))
#define FIXED_UMODULO_ALT1_TYPES	0x7
gen_arm_div_mod(int8_t, uint8_t, uint32_t, "", _alt1_)
gen_arm_div_mod(int16_t, uint16_t, uint32_t, "", _alt1_)
gen_arm_div_mod(int32_t, uint32_t, uint32_t, "", _alt1_)
#endif
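
/*
 * The _alt1_ variants are presumably selected at runtime when the CPU
 * reports the corresponding feature bit; the *_ALT1_TYPES values appear
 * to be bitmaps of the fixed-width types covered (0x7 matching the
 * three instantiated widths: 8, 16 and 32 bits).
 */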

/*
 * POWER
 */

#define gen_generic_power(type, utype) \
static bool attr_unused cat(FIXED_binary_power_,type)(const utype *op1, const utype *op2, utype *res)\
{ \
	broken_128bit_multiply utype r = 1; \
	broken_128bit_multiply utype o1 = *op1; \
	utype o2 = *op2; \
	do { \
		if (o2 & 1) \
			r *= o1; \
		o1 *= o1; \
		o2 >>= 1; \
	} while (o2); \
	*res = r; \
	return true; \
}
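
/*
 * Exponentiation by squaring: each pass squares the base and consumes
 * one exponent bit, so the loop runs O(log *op2) times. Example: 3^5
 * with 5 = 0b101 multiplies r by 3 (bit 0) and by 81 (bit 2), giving
 * 243. A zero exponent falls through the single do-while pass with
 * r == 1; overflow wraps modulo 2^N in unsigned arithmetic.
 */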

/*
 * ROL/ROR
 */

#define gen_generic_rot(fn, type, utype, mode) \
static maybe_inline bool attr_unused cat4(FIXED_binary_,fn,_,type)(const utype *op1, const utype *op2, utype *res)\
{ \
	const uint8_t mask = sizeof(type) * 8 - 1; \
	if (!(mode)) \
		*res = (*op1 << (*op2 & mask)) | (*op1 >> (-*op2 & mask));\
	else \
		*res = (*op1 >> (*op2 & mask)) | (*op1 << (-*op2 & mask));\
	return true; \
}
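
/*
 * Branchless rotate: masking both shift counts keeps them in
 * 0..bits-1, so a rotate count of 0 never produces an undefined
 * full-width shift -- rol(x, 0) evaluates to (x << 0) | (x >> 0).
 * Compilers recognize this pattern and emit a single rotate
 * instruction where one exists.
 */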

#define gen_x86_rot(fn, type, utype, asmtag, constr) \
static ipret_inline bool attr_unused cat4(FIXED_binary_,fn,_,type)(const utype *op1, const utype *op2, utype *res)\
{ \
	__asm__ (#fn #asmtag " %2, %0" : constr(*res) : "0"(*op1), "cN"((uint8_t)*op2) : "cc");\
	return true; \
}

#define gen_arm_rot(fn, type, utype, int_type, right, s) \
static ipret_inline bool attr_unused cat4(FIXED_binary_,fn,_,type)(const utype *op1, const utype *op2, utype *res)\
{ \
	int_type o1; \
	int_type o2 = (uint8_t)*op2; \
	int_type r; \
	if (!(right)) \
		o2 = -o2; \
	o1 = *op1 & (utype)(-1); \
	if (sizeof(type) == 1) { \
		o2 &= sizeof(type) * 8 - 1; \
		o1 |= o1 << 8; \
		__asm__(ARM_ASM_PREFIX "lsr"ARM_ASM_S" %"s"0, %"s"1, %"s"2" : \
			"=r"(r) : ARM_ASM_IF_T12("0","r")(o1), "r"(o2) ARM_ASM_S_CLOB);\
	} else { \
		if (sizeof(type) == 2) \
			o1 |= o1 << 16; \
		__asm__(ARM_ASM_PREFIX "ror"ARM_ASM_S" %"s"0, %"s"1, %"s"2" : \
			"=r"(r) : ARM_ASM_IF_T12("0","r")(o1), "r"(o2) ARM_ASM_S_CLOB);\
	} \
	*res = (utype)r; \
	return true; \
}
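
/*
 * ARM only rotates full registers, so narrow rotates are synthesized:
 * a left rotate becomes a right rotate by the negated count, 16-bit
 * values are replicated into both halfwords so that ror pulls the
 * correct bits in, and 8-bit values are replicated into bits 8..15 and
 * shifted right by the masked count (0..7), the upper copy supplying
 * the wrapped-around bits.
 */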

/*
 * BTS/BTR/BTC
 */

#define gen_generic_bit_mask(type, utype) \
static attr_always_inline utype cat(bits_mask_,type)(uint8_t num) \
{ \
	return (utype)1 << (num & (sizeof(type) * 8 - 1)); \
} \
static attr_always_inline utype cat(bitr_mask_,type)(uint8_t num) \
{ \
	return ~((utype)1 << (num & (sizeof(type) * 8 - 1))); \
} \
static attr_always_inline utype cat(bitt_ror_,type)(utype val, uint8_t num)\
{ \
	return val >> ((num & (sizeof(type) * 8 - 1))); \
}

#define gen_x86_bit_mask(type, utype, asmtag, constr) \
static attr_always_inline utype cat(bits_mask_,type)(uint8_t num) \
{ \
	utype result; \
	__asm__ ("rol"#asmtag" %2, %1":constr(result):"0"((utype)1),"c"(num):"cc");\
	return result; \
} \
static attr_always_inline utype cat(bitr_mask_,type)(uint8_t num) \
{ \
	utype result; \
	__asm__ ("rol"#asmtag" %2, %1":constr(result):"0"((utype)-2),"c"(num):"cc");\
	return result; \
} \
static attr_always_inline utype cat(bitt_ror_,type)(utype val, uint8_t num)\
{ \
	utype result; \
	__asm__ ("ror"#asmtag" %2, %1":constr(result):"0"(val),"c"(num):"cc");\
	return result; \
}
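
/*
 * Rotating a constant builds each mask in one instruction: 1 rotated
 * left by n gives the set mask (1 << n), and -2 (all ones except bit 0)
 * rotated left by n gives the clear mask ~(1 << n). The rotate also
 * wraps the count for free, matching the masked generic versions above.
 */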

#define gen_generic_bit_functions(type, utype) \
static maybe_inline bool attr_unused cat(FIXED_binary_bts_,type)(const utype *op1, const utype *op2, utype *res)\
{ \
	*res = *op1 | cat(bits_mask_,type)((uint8_t)*op2); \
	return true; \
} \
static maybe_inline bool attr_unused cat(FIXED_binary_btr_,type)(const utype *op1, const utype *op2, utype *res)\
{ \
	*res = *op1 & cat(bitr_mask_,type)((uint8_t)*op2); \
	return true; \
} \
static maybe_inline bool attr_unused cat(FIXED_binary_btc_,type)(const utype *op1, const utype *op2, utype *res)\
{ \
	*res = *op1 ^ cat(bits_mask_,type)((uint8_t)*op2); \
	return true; \
} \
static maybe_inline bool attr_unused cat(FIXED_binary_bt_,type)(const utype *op1, const utype *op2, ajla_flat_option_t *res)\
{ \
	*res = cat(bitt_ror_,type)(*op1, (uint8_t)*op2) & 1; \
	return true; \
}

/*
 * BSWAP
 */

#define gen_generic_bswap_8() \
static ipret_inline void attr_unused FIXED_unary_bswap_int8_t(const uint8_t *op, uint8_t *res)\
{ \
	*res = *op; \
}
#define gen_generic_bswap_16() \
static ipret_inline void attr_unused FIXED_unary_bswap_int16_t(const uint16_t *op, uint16_t *res)\
{ \
	*res = (*op << 8 | *op >> 8); \
}
#if defined(HAVE___BUILTIN_BSWAP32) && defined(HAVE___BUILTIN_BSWAP64) && !defined(UNUSUAL_ARITHMETICS)
#define gen_generic_bswap_32() \
static ipret_inline void attr_unused FIXED_unary_bswap_int32_t(const uint32_t *op, uint32_t *res)\
{ \
	*res = __builtin_bswap32(*op); \
}
#define gen_generic_bswap_64() \
static ipret_inline void attr_unused FIXED_unary_bswap_int64_t(const uint64_t *op, uint64_t *res)\
{ \
	*res = __builtin_bswap64(*op); \
}
#else
#define gen_generic_bswap_32() \
static ipret_inline void attr_unused FIXED_unary_bswap_int32_t(const uint32_t *op, uint32_t *res)\
{ \
	*res = (*op >> 24) | \
	       ((*op >> 8) & 0xff00U) | \
	       ((*op & 0xff00U) << 8) | \
	       (*op << 24); \
}
#define gen_generic_bswap_64() \
static ipret_inline void attr_unused FIXED_unary_bswap_int64_t(const uint64_t *op, uint64_t *res)\
{ \
	uint32_t o_lo = (uint32_t)*op; \
	uint32_t o_hi = (uint32_t)(*op >> 32); \
	FIXED_unary_bswap_int32_t(&o_lo, &o_lo); \
	FIXED_unary_bswap_int32_t(&o_hi, &o_hi); \
	*res = o_hi | ((uint64_t)o_lo << 32); \
}
#endif
#define gen_generic_bswap_128() \
static ipret_inline void attr_unused FIXED_unary_bswap_int128_t(const uint128_t *op, uint128_t *res)\
{ \
	uint64_t o_lo = *op; \
	uint64_t o_hi = *op >> 64; \
	FIXED_unary_bswap_int64_t(&o_lo, &o_lo); \
	FIXED_unary_bswap_int64_t(&o_hi, &o_hi); \
	*res = o_hi | ((uint128_t)o_lo << 64); \
}
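
/*
 * Wide byte swaps are composed from narrower ones: swap each half,
 * then exchange the halves. E.g. bswap64(0x0011223344556677) swaps the
 * 32-bit halves to 0x33221100 and 0x77665544 and stores them in the
 * opposite order, giving 0x7766554433221100.
 */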

#if defined(INLINE_ASM_GCC_I386) && !(defined(HAVE___BUILTIN_BSWAP32) && defined(HAVE___BUILTIN_BSWAP64) && static_test_bswap)
#define FIXED_BSWAP_ALT1_FEATURES	cpu_feature_mask(CPU_FEATURE_bswap)
#define FIXED_BSWAP_ALT1_TYPES		0xc
static ipret_inline void attr_unused FIXED_unary_bswap_alt1_int32_t(const uint32_t *op, uint32_t *res)
{
	__asm__ ("bswap %0":"=r"(*res):"0"(*op));
}
#if TYPE_FIXED_N >= 4
static ipret_inline void attr_unused FIXED_unary_bswap_alt1_int64_t(const uint64_t *op, uint64_t *res)
{
	__asm__ ("bswap %%eax; bswap %%edx; xchg %%eax, %%edx":"=A"(*res):"0"(*op));
}
#endif
#endif
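
/*
 * On i386 a 64-bit value lives in the eax:edx pair (the "A"
 * constraint), so the 64-bit swap is two 32-bit bswaps plus an xchg
 * that exchanges the halves. bswap first appeared on the 486, hence
 * the runtime CPU_FEATURE_bswap gate.
 */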

/*
 * BREV
 */

#define brev_distribute_mask(utype, m)	((utype)(m) * 0x01010101UL * ((one << 15 << 15 << 2) + 1) * ((one << 15 << 15 << 15 << 15 << 4) + 1))

#define gen_generic_brev(type, utype) \
static maybe_inline void attr_unused cat(FIXED_unary_brev_,type)(const utype *op, utype *res)\
{ \
	utype one = 1;	/* avoid shift overflow warning in clang */ \
	utype mask; \
	utype o = *op; \
	mask = (utype)brev_distribute_mask(utype, 0x55); \
	o = ((o & mask) << 1) | ((o & ~mask) >> 1); \
	mask = (utype)brev_distribute_mask(utype, 0x33); \
	o = ((o & mask) << 2) | ((o & ~mask) >> 2); \
	mask = (utype)brev_distribute_mask(utype, 0x0f); \
	o = ((o & mask) << 4) | ((o & ~mask) >> 4); \
	cat(FIXED_unary_bswap_,type)(&o, res); \
}
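
/*
 * Bit reversal in three mask-and-swap rounds plus a byte swap:
 * adjacent bits, then 2-bit pairs, then nibbles are exchanged, which
 * reverses the bits within each byte; the final bswap reverses the
 * byte order. brev_distribute_mask replicates a one-byte pattern
 * (0x55, 0x33, 0x0f) across the whole word; its multipliers are
 * written as chained shifts ((one << 15 << 15 << 2) + 1, ...) so each
 * individual shift stays below the type width and the factor
 * harmlessly collapses to 1 when utype is narrower than 64 or 128
 * bits.
 */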

#define gen_arm_brev(type, utype, int_type, s, alt) \
static ipret_inline void attr_unused cat3(FIXED_unary_brev,alt,type)(const utype *op, utype *res)\
{ \
	int_type r; \
	__asm__ (ARM_ASM_PREFIX "rbit %"s"0, %"s"1" : "=r"(r) : "r"((int_type)*op));\
	*res = r >> ((sizeof(int_type) - sizeof(type)) * 8); \
}
#define gen_arm_brev_2reg(type, utype, int_type, s, alt) \
static ipret_inline void attr_unused cat3(FIXED_unary_brev,alt,type)(const utype *op, utype *res)\
{ \
	const int shift = (int)sizeof(int_type) * 8; \
	utype o1 = *op; \
	int_type r1, r2; \
	__asm__ (ARM_ASM_PREFIX "rbit %"s"0, %"s"1" : "=r"(r2) : "r"((int_type)o1));\
	__asm__ (ARM_ASM_PREFIX "rbit %"s"0, %"s"1" : "=r"(r1) : "r"((int_type)(o1 >> shift)));\
	*res = ((utype)r2 << shift) | r1; \
}

#if defined(INLINE_ASM_GCC_ARM) && defined(HAVE_ARM_ASSEMBLER_RBIT)
#define FIXED_BREV_ALT1_FEATURES	cpu_feature_mask(CPU_FEATURE_armv6t2)
#define FIXED_BREV_ALT1_TYPES		0xf
gen_arm_brev(int8_t, uint8_t, uint32_t, "", _alt1_)
gen_arm_brev(int16_t, uint16_t, uint32_t, "", _alt1_)
gen_arm_brev(int32_t, uint32_t, uint32_t, "", _alt1_)
#if TYPE_FIXED_N >= 4
gen_arm_brev_2reg(int64_t, uint64_t, uint32_t, "", _alt1_)
#endif
#endif
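
/*
 * rbit reverses a full 32-bit register, so a reversed 8- or 16-bit
 * value lands in the top bits and is shifted back down. The 2reg
 * variant reverses each 32-bit half of a 64-bit value separately and
 * swaps the halves when reassembling the result.
 */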

/*
 * BSF/BSR
 */

#if defined(HAVE_BUILTIN_CTZ)
#define libc_ffs_int8_t		if (unlikely(!o)) { *res = -1; return; } else { *res = __builtin_ctz(o); return; }
#define libc_ffs_int16_t	if (unlikely(!o)) { *res = -1; return; } else { *res = __builtin_ctz(o); return; }
#elif defined(HAVE_FFS)
#define libc_ffs_int8_t		*res = ffs(o) - 1; return;
#define libc_ffs_int16_t	*res = ffs(o) - 1; return;
#else
#define libc_ffs_int8_t
#define libc_ffs_int16_t
#endif
#if defined(HAVE_BUILTIN_CTZ) && SIZEOF_UNSIGNED >= 4
#define libc_ffs_int32_t	if (unlikely(!o)) { *res = -1; return; } else { *res = __builtin_ctz(o); return; }
#elif defined(HAVE_FFS) && SIZEOF_UNSIGNED >= 4
#define libc_ffs_int32_t	*res = ffs(o) - 1; return;
#elif defined(HAVE_FFSL)
#define libc_ffs_int32_t	*res = ffsl(o) - 1; return;
#else
#define libc_ffs_int32_t
#endif
#if defined(HAVE_BUILTIN_CTZ) && SIZEOF_UNSIGNED_LONG_LONG == 8
#define libc_ffs_int64_t	if (unlikely(!o)) { *res = -1; return; } else { *res = __builtin_ctzll(o); return; }
#define libc_ffs_int128_t	if ((uint64_t)o) { *res = __builtin_ctzll(o); return; } else if (o >> 64) { *res = __builtin_ctzll(o >> 64) + 64; return; } else { *res = -1; return; }
#elif defined(HAVE_FFSL) && SIZEOF_UNSIGNED_LONG >= 8
#define libc_ffs_int64_t	*res = ffsl(o) - 1; return;
#define libc_ffs_int128_t	if ((uint64_t)o) { *res = ffsl(o) - 1; return; } else if (o >> 64) { *res = ffsl(o >> 64) + 63; return; } else { *res = -1; return; }
#elif defined(HAVE_FFSLL) && SIZEOF_UNSIGNED_LONG_LONG >= 8
#define libc_ffs_int64_t	*res = ffsll(o) - 1; return;
#define libc_ffs_int128_t	if ((uint64_t)o) { *res = ffsll(o) - 1; return; } else if (o >> 64) { *res = ffsll(o >> 64) + 63; return; } else { *res = -1; return; }
#else
#define libc_ffs_int64_t
#define libc_ffs_int128_t
#endif

#if defined(HAVE_BUILTIN_CLZ) && SIZEOF_UNSIGNED >= 2 && !(SIZEOF_UNSIGNED & (SIZEOF_UNSIGNED - 1))
#define libc_fls_int8_t		if (unlikely(!o)) { *res = -1; return; } else { *res = ((unsigned)sizeof(unsigned) * 8 - 1) CLZ_BSR_OP __builtin_clz(o); return; }
#define libc_fls_int16_t	if (unlikely(!o)) { *res = -1; return; } else { *res = ((unsigned)sizeof(unsigned) * 8 - 1) CLZ_BSR_OP __builtin_clz(o); return; }
#elif defined(HAVE_FLS)
#define libc_fls_int8_t		*res = fls(o) - 1; return;
#define libc_fls_int16_t	*res = fls(o) - 1; return;
#else
#define libc_fls_int8_t
#define libc_fls_int16_t
#endif
#if defined(HAVE_BUILTIN_CLZ) && SIZEOF_UNSIGNED >= 4 && !(SIZEOF_UNSIGNED & (SIZEOF_UNSIGNED - 1))
#define libc_fls_int32_t	if (unlikely(!o)) { *res = -1; return; } else { *res = ((unsigned)sizeof(unsigned) * 8 - 1) CLZ_BSR_OP __builtin_clz(o); return; }
#elif defined(HAVE_FLS) && SIZEOF_UNSIGNED >= 4
#define libc_fls_int32_t	*res = fls(o) - 1; return;
#elif defined(HAVE_FLSL)
#define libc_fls_int32_t	*res = flsl(o) - 1; return;
#else
#define libc_fls_int32_t
#endif
#if defined(HAVE_BUILTIN_CLZ) && SIZEOF_UNSIGNED_LONG_LONG == 8
#define libc_fls_int64_t	if (unlikely(!o)) { *res = -1; return; } else { *res = ((unsigned)sizeof(unsigned long long) * 8 - 1) CLZ_BSR_OP __builtin_clzll(o); return; }
#define libc_fls_int128_t	if (o >> 64) { *res = (127 CLZ_BSR_OP __builtin_clzll((uint64_t)(o >> 64))); return; } else if (likely((uint64_t)o != 0)) { *res = ((unsigned)sizeof(unsigned long long) * 8 - 1) CLZ_BSR_OP __builtin_clzll((uint64_t)o); return; } else { *res = -1; return; }
#elif defined(HAVE_FLSL) && SIZEOF_UNSIGNED_LONG >= 8
#define libc_fls_int64_t	*res = flsl(o) - 1; return;
#define libc_fls_int128_t
#elif defined(HAVE_FLSLL) && SIZEOF_UNSIGNED_LONG_LONG >= 8
#define libc_fls_int64_t	*res = flsll(o) - 1; return;
#define libc_fls_int128_t
#else
#define libc_fls_int64_t
#define libc_fls_int128_t
#endif
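
/*
 * Conventions: bsf/bsr return the 0-based index of the lowest/highest
 * set bit, or -1 for a zero operand. libc's ffs()/fls() are 1-based,
 * hence the "- 1"; the clz-based variants combine width - 1 with the
 * leading-zero count via CLZ_BSR_OP, which is defined elsewhere and is
 * presumably subtraction or xor (equivalent when width - 1 is an
 * all-ones pattern, which the power-of-two check on SIZEOF_UNSIGNED
 * appears to guarantee).
 */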

#define gen_generic_bsfr_functions(type, utype) \
static maybe_inline void attr_unused cat(FIXED_unary_bsf_,type)(const utype *op, utype *res)\
{ \
	int i; \
	utype o = *op; \
	cat(libc_ffs_,type) \
	for (i = 0; i < (int)sizeof(type) * 8; i++) \
		if (o & ((utype)1 << i)) { \
			*res = (utype)i; \
			return; \
		} \
	*res = (utype)-1; \
} \
static maybe_inline void attr_unused cat(FIXED_unary_bsr_,type)(const utype *op, utype *res)\
{ \
	int i; \
	utype o = *op; \
	cat(libc_fls_,type) \
	for (i = (int)sizeof(type) * 8 - 1; i >= 0; i--) \
		if (o & ((utype)1 << i)) { \
			*res = (utype)i; \
			return; \
		} \
	*res = (utype)-1; \
}
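
/*
 * Each libc_ffs_/libc_fls_ macro expands to an early return when a
 * fast primitive is available for the type, and to nothing otherwise,
 * in which case the portable linear scan below it runs: bsf scans up
 * from bit 0, bsr scans down from the top bit, and both fall through
 * to -1 when no bit is set.
 */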

#if defined(INLINE_ASM_GCC_X86) && defined(HAVE_X86_ASSEMBLER_LZCNT)
#define FIXED_BSR_ALT1_FEATURES	(cpu_feature_mask(CPU_FEATURE_cmov) | cpu_feature_mask(CPU_FEATURE_lzcnt))
#if defined(INLINE_ASM_GCC_I386) || !defined(HAVE_ASSEMBLER___INT128)
#define FIXED_BSR_ALT1_TYPES	0xf
#else
#define FIXED_BSR_ALT1_TYPES	0x1f
#endif

#define gen_x86_lzcnt(type, utype, internal_type, asmtag) \
static ipret_inline void attr_unused cat(FIXED_unary_bsr_alt1_,type)(const utype *op, utype *res)\
{ \
	internal_type r; \
	__asm__ (" \n\
		lzcnt"#asmtag" %1, %0 \n\
	":"=r"(r):"r"X86_ASM_M((internal_type)*op):"cc"); \
	*res = (internal_type)(sizeof(internal_type) * 8 - 1 - r); \
}
#define gen_x86_lzcnt_split(type, utype, asmtag, ax, dx, n, ctd) \
static ipret_inline void attr_unused cat(FIXED_unary_bsr_alt1_,type)(const utype *op, utype *res)\
{ \
	__asm__ (" \n\
		test"#asmtag" %%"#dx", %%"#dx" \n\
		cmovz"#asmtag" %%"#ax", %%"#dx" \n\
		setz %%cl \n\
		lzcnt"#asmtag" %%"#dx", %%"#dx" \n\
		movl $"#n", %%eax \n\
		shrl %%cl, %%eax \n\
		sub"#asmtag" %%"#dx", %%"#ax" \n\
		"#ctd" \n\
	":"=A"(*res):"0"(*op):"ecx","cc"); \
}
gen_x86_lzcnt(int8_t, uint8_t, int16_t, w)
gen_x86_lzcnt(int16_t, uint16_t, int16_t, w)
gen_x86_lzcnt(int32_t, uint32_t, int32_t, l)
#if TYPE_FIXED_N >= 4
#ifdef INLINE_ASM_GCC_I386
gen_x86_lzcnt_split(int64_t, uint64_t, l, eax, edx, 63, cltd)
#else
gen_x86_lzcnt(int64_t, uint64_t, int64_t, q)
#if TYPE_FIXED_N >= 5 && defined(HAVE_ASSEMBLER___INT128)
gen_x86_lzcnt_split(int128_t, uint128_t, q, rax, rdx, 127, cqto)
#endif
#endif
#endif
#endif
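
/*
 * lzcnt of zero yields the operand width, so width - 1 - lzcnt maps a
 * zero input to -1 and any set bit to its index, with no branch. The
 * split variant handles a two-register value: if the high half is zero
 * it counts in the low half instead (cmovz), halves the 63/127 base
 * with shrl (setz put 1 into cl, and 63 >> 1 == 31), subtracts the
 * count, and sign-extends the result across the register pair with
 * cltd/cqto so that -1 survives intact.
 */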

#if defined(INLINE_ASM_GCC_ARM) && defined(HAVE_ARM_ASSEMBLER_CLZ) && defined(HAVE_ARM_ASSEMBLER_RBIT)
#define FIXED_BSF_ALT1_FEATURES	(cpu_feature_mask(CPU_FEATURE_armv6t2))
#define FIXED_BSF_ALT1_TYPES	0xf

#define gen_arm_rbit_clz(type, utype) \
static ipret_inline void attr_unused cat(FIXED_unary_bsf_alt1_,type)(const utype *op, utype *res)\
{ \
	uint32_t clz; \
	if (unlikely(!*op)) { *res = -1; return; } \
	__asm__ (ARM_ASM_PREFIX "rbit %0, %1; clz %0, %0":"=r"(clz):"r"((uint32_t)*op));\
	*res = clz; \
}
#define gen_arm_rbit_clz_split() \
static ipret_inline void attr_unused FIXED_unary_bsf_alt1_int64_t(const uint64_t *op, uint64_t *res)\
{ \
	uint32_t clz; \
	uint64_t o = *op; \
	if ((uint32_t)o) { \
		__asm__ (ARM_ASM_PREFIX "rbit %0, %1; clz %0, %0":"=r"(clz):"r"((uint32_t)o));\
		*res = clz; \
	} else { \
		uint32_t o_hi = o >> 32; \
		if (unlikely(!o_hi)) { \
			*res = -1; \
			return; \
		} \
		__asm__ (ARM_ASM_PREFIX "rbit %0, %1; clz %0, %0":"=r"(clz):"r"(o_hi));\
		*res = clz + 32; \
	} \
}
gen_arm_rbit_clz(int8_t, uint8_t)
gen_arm_rbit_clz(int16_t, uint16_t)
gen_arm_rbit_clz(int32_t, uint32_t)
#if TYPE_FIXED_N >= 4
gen_arm_rbit_clz_split()
#endif
#endif
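
/*
 * ARM has clz but no ctz instruction; ctz(x) == clz(rbit(x)), since
 * reversing the bits turns the lowest set bit into the highest one.
 * Zero is checked up front because the identity would return 32.
 */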

#if defined(INLINE_ASM_GCC_ARM) && defined(HAVE_ARM_ASSEMBLER_CLZ)
#define FIXED_BSR_ALT1_FEATURES	(cpu_feature_mask(CPU_FEATURE_armv5))
#define FIXED_BSR_ALT1_TYPES	0xf

#define gen_arm_clz(type, utype) \
static ipret_inline void attr_unused cat(FIXED_unary_bsr_alt1_,type)(const utype *op, utype *res)\
{ \
	int clz; \
	__asm__ (ARM_ASM_PREFIX "clz %0, %1":"=r"(clz):"r"((uint32_t)*op));\
	*res = 31 - clz; \
}
#define gen_arm_clz_split() \
static ipret_inline void attr_unused FIXED_unary_bsr_alt1_int64_t(const uint64_t *op, uint64_t *res)\
{ \
	int clz; \
	uint64_t o = *op; \
	uint32_t o_hi = o >> 32; \
	if (o_hi) { \
		__asm__ (ARM_ASM_PREFIX "clz %0, %1":"=r"(clz):"r"(o_hi));\
		*res = (unsigned)(63 - clz); \
	} else { \
		__asm__ (ARM_ASM_PREFIX "clz %0, %1":"=r"(clz):"r"((uint32_t)o));\
		*res = 31 - clz; \
	} \
}
gen_arm_clz(int8_t, uint8_t)
gen_arm_clz(int16_t, uint16_t)
gen_arm_clz(int32_t, uint32_t)
#if TYPE_FIXED_N >= 4
gen_arm_clz_split()
#endif
#endif

/*
 * POPCNT
 */

#if defined(HAVE_BUILTIN_POPCOUNT) && SIZEOF_UNSIGNED >= 4 && SIZEOF_UNSIGNED_LONG_LONG >= 8
#define libc_popcnt_int8_t	*res = (unsigned)__builtin_popcount(o); return;
#define libc_popcnt_int16_t	*res = (unsigned)__builtin_popcount(o); return;
#define libc_popcnt_int32_t	*res = (unsigned)__builtin_popcount(o); return;
#define libc_popcnt_int64_t	*res = (unsigned)__builtin_popcountll(o); return;
#define libc_popcnt_int128_t	*res = (unsigned)__builtin_popcountll((uint64_t)o) + (unsigned)__builtin_popcountll((uint64_t)(o >> 64)); return;
#else
#define libc_popcnt_int8_t
#define libc_popcnt_int16_t
#define libc_popcnt_int32_t
#define libc_popcnt_int64_t
#define libc_popcnt_int128_t
#endif

#define gen_generic_popcnt(type, utype) \
static maybe_inline void attr_unused cat(FIXED_unary_popcnt_,type)(const utype *op, utype *res)\
{ \
	unsigned r; \
	utype o = *op; \
	cat(libc_popcnt_,type) \
	r = 0; \
	while (o) \
		o &= o - 1, r++; \
	*res = (utype)r; \
}
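
/*
 * Kernighan's loop: o &= o - 1 clears the lowest set bit, so the
 * fallback iterates once per set bit rather than once per bit.
 * E.g. 0b101000 -> 0b100000 -> 0 counts 2 in two passes.
 */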

#if defined(INLINE_ASM_GCC_X86) && defined(HAVE_X86_ASSEMBLER_POPCNT) && !(defined(HAVE_BUILTIN_POPCOUNT) && static_test_popcnt)
#define FIXED_POPCNT_ALT1_FEATURES	cpu_feature_mask(CPU_FEATURE_popcnt)
#if defined(INLINE_ASM_GCC_I386)
#define FIXED_POPCNT_ALT1_TYPES	0xf
#else
#define FIXED_POPCNT_ALT1_TYPES	0x1f
#endif

#define gen_x86_popcnt(type, utype, internal_type, asmtag) \
static ipret_inline void attr_unused cat(FIXED_unary_popcnt_alt1_,type)(const utype *op, utype *res)\
{ \
	internal_type r; \
	__asm__ (" \n\
		popcnt"#asmtag" %1, %0 \n\
	":"=r"(r):"r"X86_ASM_M((internal_type)*op):"cc"); \
	*res = r; \
}
#define gen_x86_popcnt_split(type, utype, half, asmtag) \
static ipret_inline void attr_unused cat(FIXED_unary_popcnt_alt1_,type)(const utype *op, utype *res)\
{ \
	half r1, r2; \
	__asm__ (" \n\
		popcnt"#asmtag" %1, %0 \n\
	":"=r"(r1):"r"X86_ASM_M(cast_ptr(half *, op)[0]):"cc"); \
	__asm__ (" \n\
		popcnt"#asmtag" %1, %0 \n\
	":"=r"(r2):"r"X86_ASM_M(cast_ptr(half *, op)[1]):"cc"); \
	*res = (unsigned)r1 + (unsigned)r2; \
}
gen_x86_popcnt(int8_t, uint8_t, uint16_t, w)
gen_x86_popcnt(int16_t, uint16_t, uint16_t, w)
gen_x86_popcnt(int32_t, uint32_t, uint32_t, l)
#if TYPE_FIXED_N >= 4
#ifdef INLINE_ASM_GCC_I386
gen_x86_popcnt_split(int64_t, uint64_t, uint32_t, l)
#else
gen_x86_popcnt(int64_t, uint64_t, uint64_t, q)
#if TYPE_FIXED_N >= 5
gen_x86_popcnt_split(int128_t, uint128_t, uint64_t, q)
#endif
#endif
#endif
#endif

#if defined(INLINE_ASM_GCC_ARM) && defined(HAVE_ARM_ASSEMBLER_VFP)
#define FIXED_POPCNT_ALT1_FEATURES	(cpu_feature_mask(CPU_FEATURE_neon))
#define FIXED_POPCNT_ALT1_TYPES	0xf

#define gen_arm_popcnt(type, utype, wtag, field, vpaddl) \
static ipret_inline void attr_unused cat(FIXED_unary_popcnt_alt1_,type)(const utype *op, utype *res)\
{ \
	__asm__ volatile (ARM_ASM_PREFIX " \n\
		vld1."#wtag" d0"field", [ %1 ] \n\
		vcnt.8 d0, d0 \n\
		" vpaddl " \n\
		vst1."#wtag" d0"field", [ %0 ] \n\
	": : "r"(res), "r"(op) : "d0", "memory"); \
}

gen_arm_popcnt(int8_t, uint8_t, 8, "[0]", "")
gen_arm_popcnt(int16_t, uint16_t, 16, "[0]", "vpaddl.u8 d0, d0")
gen_arm_popcnt(int32_t, uint32_t, 32, "[0]", "vpaddl.u8 d0, d0 \n vpaddl.u16 d0, d0")
#if TYPE_FIXED_N >= 4
gen_arm_popcnt(int64_t, uint64_t, 64, "", "vpaddl.u8 d0, d0 \n vpaddl.u16 d0, d0 \n vpaddl.u32 d0, d0")
#endif
#endif
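
/*
 * NEON has no scalar popcount, but vcnt.8 counts the set bits in each
 * byte; the chain of widening pairwise adds (vpaddl.u8/.u16/.u32) then
 * folds the per-byte counts into a halfword, word, or doubleword
 * total, matching the operand width.
 */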

#if defined(INLINE_ASM_GCC_ARM64)
#define FIXED_POPCNT_ALT1_FEATURES	(cpu_feature_mask(CPU_FEATURE_neon))
#define FIXED_POPCNT_ALT1_TYPES	0x1f

#define gen_arm64_popcnt(type, utype, reg, cntw, vpaddl) \
static ipret_inline void attr_unused cat(FIXED_unary_popcnt_alt1_,type)(const utype *op, utype *res)\
{ \
	__asm__ volatile (ARM_ASM_PREFIX " \n\
		ldr "reg", [ %1 ] \n\
		cnt v0."cntw"b, v0."cntw"b \n\
		"vpaddl" \n\
		str "reg", [ %0 ] \n\
	": : "r"(res), "r"(op) : "v0", "memory"); \
}

gen_arm64_popcnt(int8_t, uint8_t, "b0", "8", "")
gen_arm64_popcnt(int16_t, uint16_t, "h0", "8", "uaddlp v0.4h, v0.8b")
gen_arm64_popcnt(int32_t, uint32_t, "s0", "8", "uaddlv h0, v0.8b")
#if TYPE_FIXED_N >= 4
gen_arm64_popcnt(int64_t, uint64_t, "d0", "8", "uaddlv h0, v0.8b")
#if TYPE_FIXED_N >= 5
gen_arm64_popcnt(int128_t, uint128_t, "q0", "16", "uaddlv h0, v0.16b")
#endif
#endif

#endif
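
/*
 * The instantiation machinery below presumably works like the other
 * Ajla arithmetic headers: for-fix.inc includes the file named by
 * file_inc once per fixed-width integer type, expanding the gen_*
 * macros above for each of them.
 */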

#define file_inc "arithm-b.inc"
#include "for-fix.inc"

#endif