2 * Copyright (C) 2024 Mikulas Patocka
4 * This file is part of Ajla.
6 * Ajla is free software: you can redistribute it and/or modify it under the
7 * terms of the GNU General Public License as published by the Free Software
8 * Foundation, either version 3 of the License, or (at your option) any later version.
11 * Ajla is distributed in the hope that it will be useful, but WITHOUT ANY
12 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
13 * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License along with
16 * Ajla. If not, see <https://www.gnu.org/licenses/>.
19 #ifndef AJLA_ARITHM_R_H
20 #define AJLA_ARITHM_R_H
/*
 * Operand-formatting helpers, passed as the `p` argument of the gen_sse_*
 * generators below.
 *   sse_one_param(x)  - the stringified operand once (two-operand SSE form).
 *   avx_two_params(x) - the stringified operand twice, joined by ", ", so the
 *                       same register appears as two consecutive operands of
 *                       a "v"-prefixed instruction.
 * stringify() is defined elsewhere; presumably it turns its argument into a
 * string literal.
 */
25 #define sse_one_param(x) stringify(x)
26 #define avx_two_params(x) stringify(x)", "stringify(x)
/*
 * gen_sse_binary(fn, type, v, instr, s, p)
 * Emits a static function named cat4(REAL_binary_,fn,_,type) taking
 * (op1, op2, res). The visible asm template:
 *   1. loads *op1 into xmm0,
 *   2. applies <v><instr>s<s> with *op2 as source and p(%%xmm0) as the
 *      remaining operand(s),
 *   3. stores xmm0 to *res.
 * v     - mnemonic prefix string ("" or "v" at the call sites in this file).
 * instr - operation stem (add, sub, mul, div at the call sites).
 * s     - type suffix, stringified into the mnemonic.
 * p     - sse_one_param or avx_two_params.
 * xmm0 is declared through X86_ASM_XMM0_CLOB. The function is declared bool;
 * its return statement is not visible in this excerpt.
 */
28 #define gen_sse_binary(fn, type, v, instr, s, p) \
29 static ipret_inline bool attr_unused cat4(REAL_binary_,fn,_,type)(const type *op1, const type *op2, type *res)\
32 "v"movs"#s" %1, %%xmm0 \n\
33 "v""#instr"s"#s" %2, "p(%%xmm0)" \n\
34 "v"movs"#s" %%xmm0, %0 \n\
35 " : "=m"(*res) : "m"(*op1), "m"(*op2) X86_ASM_XMM0_CLOB); \
39 #ifdef INLINE_ASM_GCC_LABELS
/*
 * gen_sse_logical(fn, type, v, instr, s) - variant under
 * #ifdef INLINE_ASM_GCC_LABELS.
 * Emits cat4(REAL_binary_,fn,_,type)(op1, op2, res) comparing two reals.
 * Visible behaviour: a guard on sizeof(ajla_flat_option_t) != 1 (its action is
 * not visible here), then asm that loads *op1 into xmm0 and compares it with
 * *op2 using <v>ucomis<s>. The asm has no output operands: *res, *op1 and *op2
 * are passed as "m" inputs, "memory" and "cc" are clobbered, and the trailing
 * label list names `unordered` (asm-goto operand layout).
 * instr is a setcc mnemonic at the call sites (sete/setne/setb/setbe); the
 * template lines that use it and the `unordered` label are not visible in
 * this excerpt.
 */
40 #define gen_sse_logical(fn, type, v, instr, s) \
41 static ipret_inline bool attr_unused cat4(REAL_binary_,fn,_,type)(const type *op1, const type *op2, ajla_flat_option_t *res)\
43 if (sizeof(ajla_flat_option_t) != 1) \
46 "v"movs"#s" %1, %%xmm0 \n\
47 "v"ucomis"#s" %2, %%xmm0 \n\
50 " : : "m"(*res), "m"(*op1), "m"(*op2) : "memory", "cc" X86_ASM_XMM0_CLOBC : unordered);\
/*
 * gen_sse_logical(fn, type, v, instr, s) - second definition; presumably the
 * #else branch for compilers without asm labels (the #else line is not
 * visible in this excerpt).
 * The asm loads *op1 into xmm0 and compares it with *op2 via <v>ucomis<s>.
 * It produces two byte-sized register outputs: r (%0) and unordered (%1).
 * The instructions that set them are not visible here. Afterwards the C code
 * tests `unordered`; the action taken is not visible.
 * Clobbers: "cc" and xmm0.
 */
56 #define gen_sse_logical(fn, type, v, instr, s) \
57 static ipret_inline bool attr_unused cat4(REAL_binary_,fn,_,type)(const type *op1, const type *op2, ajla_flat_option_t *res)\
59 unsigned char unordered, r; \
61 "v"movs"#s" %2, %%xmm0 \n\
62 "v"ucomis"#s" %3, %%xmm0 \n\
65 " : "=r"(r), "=r"(unordered) : "m"(*op1), "m"(*op2) : "cc" X86_ASM_XMM0_CLOBC);\
66 if (unlikely(unordered)) \
/*
 * gen_sse_neg(fn, type, v, s, p)
 * Emits cat4(REAL_unary_,fn,_,type)(op1, res), which negates a real by XOR.
 * A static constant x = -0.0 serves as the mask. The asm loads *op1 into
 * xmm0 and x into xmm1, XORs xmm1 into xmm0 with <v>xorp<s>, and stores xmm0
 * to *res. Both xmm0 and xmm1 are declared clobbered.
 */
73 #define gen_sse_neg(fn, type, v, s, p) \
74 static ipret_inline void attr_unused cat4(REAL_unary_,fn,_,type)(const type *op1, type *res)\
76 static const type x = -0.0; \
78 "v"movs"#s" %1, %%xmm0 \n\
79 "v"movs"#s" %2, %%xmm1 \n\
80 "v"xorp"#s" %%xmm1, "p(%%xmm0)" \n\
81 "v"movs"#s" %%xmm0, %0 \n\
82 " : "=m"(*res) : "m"(*op1), "m"(x) X86_ASM_XMM0_CLOB X86_ASM_XMM1_CLOBC);\
/*
 * gen_sse_sqrt(fn, type, v, s, p)
 * Emits cat4(REAL_unary_,fn,_,type)(op1, res). The asm loads *op1 into xmm0,
 * applies <v>sqrts<s> on xmm0 in place, and stores xmm0 to *res.
 * NOTE(review): xmm1 is listed as clobbered although none of the visible
 * instructions reference it. Harmless, but possibly a leftover worth
 * confirming and removing.
 */
85 #define gen_sse_sqrt(fn, type, v, s, p) \
86 static ipret_inline void attr_unused cat4(REAL_unary_,fn,_,type)(const type *op1, type *res)\
89 "v"movs"#s" %1, %%xmm0 \n\
90 "v"sqrts"#s" %%xmm0, "p(%%xmm0)" \n\
91 "v"movs"#s" %%xmm0, %0 \n\
92 " : "=m"(*res) : "m"(*op1) X86_ASM_XMM0_CLOB X86_ASM_XMM1_CLOBC);\
/*
 * gen_sse_to_int(fn, type, v, s)
 * Emits cat4(REAL_unary_,fn,_,type)(op1, res), converting a real to
 * int_default_t. The asm runs the truncating convert <v>cvtts<s>2si from *op1
 * into the register output r. The result is then compared against
 * sign_bit(int_default_t); the action taken on a match is not visible here.
 * Presumably that value is what the instruction returns when the conversion
 * is invalid - TODO confirm against the ISA manual.
 */
95 #define gen_sse_to_int(fn, type, v, s) \
96 static ipret_inline bool attr_unused cat4(REAL_unary_,fn,_,type)(const type *op1, int_default_t *res)\
100 "v"cvtts"#s"2si %1, %0 \n\
101 " : "=r"(r) : "m"(*op1)); \
102 if (unlikely(r == sign_bit(int_default_t))) \
/*
 * gen_sse_from_int(fn, type, v, s, z, p)
 * Emits cat4(REAL_unary_,fn,_,type)(op1, res), converting an int_default_t to
 * a real. The asm runs <v>cvtsi2s<s><z> from *op1 (register or memory) into
 * xmm0 and stores xmm0 to *res.
 * z is stringified and appended to the mnemonic; presumably it is the integer
 * operand-size suffix - the values passed are not visible in this excerpt.
 */
108 #define gen_sse_from_int(fn, type, v, s, z, p) \
109 static ipret_inline void attr_unused cat4(REAL_unary_,fn,_,type)(const int_default_t *op1, type *res)\
112 "v"cvtsi2s"#s""#z" %1, "p(%%xmm0)" \n\
113 "v"movs"#s" %%xmm0, %0 \n\
114 " : "=m"(*res) : "rm"(*op1) X86_ASM_XMM0_CLOB); \
/*
 * gen_sse_is_exception(fn, type, v, s)
 * Emits cat4(REAL_unary_,fn,_,type)(op1, res). After a guard on
 * sizeof(ajla_flat_option_t) != 1 (action not visible), the asm loads *op1
 * into xmm0 and compares xmm0 with itself using <v>ucomis<s>. *res is a
 * memory output; the instruction that writes it is not visible in this
 * excerpt. A self-compare presumably flags NaN inputs - TODO confirm.
 */
117 #define gen_sse_is_exception(fn, type, v, s) \
118 static ipret_inline void attr_unused cat4(REAL_unary_,fn,_,type)(const type *op1, ajla_flat_option_t *res)\
120 if (sizeof(ajla_flat_option_t) != 1) \
123 "v"movs"#s" %1, %%xmm0 \n\
124 "v"ucomis"#s" %%xmm0, %%xmm0 \n\
126 " : "=m"(*res) : "m"(*op1) : "cc" X86_ASM_XMM0_CLOBC); \
/*
 * gen_f16c_binary(fn, type, instr)
 * Emits cat4(REAL_binary_,fn,_,type)(op1, op2, res) for half-precision
 * operands. Visible asm sequence:
 *   - vpinsrw places the 16-bit *op1 / *op2 into word 0 of xmm0 / xmm1
 *     (xmm7 is named as the merge source; it is only read),
 *   - vcvtph2ps widens both to single precision,
 *   - v<instr>ss computes xmm0 = xmm0 <instr> xmm1,
 *   - vcvtps2ph with immediate 4 narrows the result back,
 *   - vpextrw stores word 0 of xmm0 to *res.
 * xmm0 and xmm1 are declared clobbered.
 */
129 #define gen_f16c_binary(fn, type, instr) \
130 static ipret_inline bool attr_unused cat4(REAL_binary_,fn,_,type)(const type *op1, const type *op2, type *res)\
133 vpinsrw $0, %1, %%xmm7, %%xmm0 \n\
134 vpinsrw $0, %2, %%xmm7, %%xmm1 \n\
135 vcvtph2ps %%xmm0, %%xmm0 \n\
136 vcvtph2ps %%xmm1, %%xmm1 \n\
137 v"#instr"ss %%xmm1, %%xmm0, %%xmm0 \n\
138 vcvtps2ph $4, %%xmm0, %%xmm0 \n\
139 vpextrw $0, %%xmm0, %0 \n\
140 " : "=m"(*res) : "m"(*op1), "m"(*op2) X86_ASM_XMM0_CLOB X86_ASM_XMM1_CLOBC);\
144 #ifdef INLINE_ASM_GCC_LABELS
/*
 * gen_f16c_logical(fn, type, instr) - variant under
 * #ifdef INLINE_ASM_GCC_LABELS.
 * Emits cat4(REAL_binary_,fn,_,type)(op1, op2, res) comparing two
 * half-precision values. After a guard on sizeof(ajla_flat_option_t) != 1
 * (action not visible), the asm inserts *op1 / *op2 into xmm0 / xmm1, widens
 * both with vcvtph2ps and compares them with vucomiss.
 * The asm has no outputs: *res, *op1 and *op2 are "m" inputs; "memory", "cc",
 * xmm0 and xmm1 are clobbered; the label list names `unordered`.
 * The lines that use `instr` and write *res are not visible in this excerpt.
 */
145 #define gen_f16c_logical(fn, type, instr) \
146 static ipret_inline bool attr_unused cat4(REAL_binary_,fn,_,type)(const type *op1, const type *op2, ajla_flat_option_t *res)\
148 if (sizeof(ajla_flat_option_t) != 1) \
151 vpinsrw $0, %1, %%xmm7, %%xmm0 \n\
152 vpinsrw $0, %2, %%xmm7, %%xmm1 \n\
153 vcvtph2ps %%xmm0, %%xmm0 \n\
154 vcvtph2ps %%xmm1, %%xmm1 \n\
155 vucomiss %%xmm1, %%xmm0 \n\
158 " : : "m"(*res), "m"(*op1), "m"(*op2) : "memory", "cc" X86_ASM_XMM0_CLOBC X86_ASM_XMM1_CLOBC : unordered);\
/*
 * gen_f16c_logical(fn, type, instr) - second definition; presumably the
 * #else branch for compilers without asm labels (the #else line is not
 * visible in this excerpt).
 * Emits cat4(REAL_binary_,fn,_,type)(op1, op2, res) comparing two
 * half-precision values. The asm inserts *op1 / *op2 into xmm0 / xmm1,
 * widens both with vcvtph2ps and compares them with vucomiss. It yields two
 * byte-sized register outputs, r (%0) and unordered (%1); the instructions
 * that set them are not visible here. The C code then tests `unordered`.
 * Clobbers: "cc", xmm0 and xmm1. xmm1 must be listed because the template
 * overwrites it; otherwise the compiler may keep a live value there across
 * the asm statement.
 */
164 #define gen_f16c_logical(fn, type, instr) \
165 static ipret_inline bool attr_unused cat4(REAL_binary_,fn,_,type)(const type *op1, const type *op2, ajla_flat_option_t *res)\
167 unsigned char unordered, r; \
169 vpinsrw $0, %2, %%xmm7, %%xmm0 \n\
170 vpinsrw $0, %3, %%xmm7, %%xmm1 \n\
171 vcvtph2ps %%xmm0, %%xmm0 \n\
172 vcvtph2ps %%xmm1, %%xmm1 \n\
173 vucomiss %%xmm1, %%xmm0 \n\
176 " : "=r"(r), "=r"(unordered) : "m"(*op1), "m"(*op2) : "cc" X86_ASM_XMM0_CLOBC X86_ASM_XMM1_CLOBC);\
177 if (unlikely(unordered)) \
/*
 * gen_f16c_to_int(fn, type)
 * Emits cat4(REAL_unary_,fn,_,type)(op1, res), converting a half-precision
 * value to int_default_t. The asm inserts *op1 into xmm0, widens it with
 * vcvtph2ps and runs the truncating convert vcvttss2si into the register
 * output r. r is then compared against sign_bit(int_default_t); the action
 * taken on a match is not visible in this excerpt.
 */
184 #define gen_f16c_to_int(fn, type) \
185 static ipret_inline bool attr_unused cat4(REAL_unary_,fn,_,type)(const type *op1, int_default_t *res)\
189 vpinsrw $0, %1, %%xmm7, %%xmm0 \n\
190 vcvtph2ps %%xmm0, %%xmm0 \n\
191 vcvttss2si %%xmm0, %0 \n\
192 " : "=r"(r) : "m"(*op1) X86_ASM_XMM0_CLOB); \
193 if (unlikely(r == sign_bit(int_default_t))) \
/*
 * gen_f16c_from_int(fn, type, z)
 * Emits cat4(REAL_unary_,fn,_,type)(op1, res), converting an int_default_t to
 * half precision. The asm runs vcvtsi2ss<z> from *op1 into xmm0 (xmm7 is the
 * merge source), narrows with vcvtps2ph (immediate 4) and stores word 0 of
 * xmm0 to *res with vpextrw.
 * z is stringified and appended to the mnemonic; presumably the integer
 * operand-size suffix.
 */
199 #define gen_f16c_from_int(fn, type, z) \
200 static ipret_inline void attr_unused cat4(REAL_unary_,fn,_,type)(const int_default_t *op1, type *res)\
203 vcvtsi2ss"#z" %1, %%xmm7, %%xmm0 \n\
204 vcvtps2ph $4, %%xmm0, %%xmm0 \n\
205 vpextrw $0, %%xmm0, %0 \n\
206 " : "=m"(*res) : "rm"(*op1) X86_ASM_XMM0_CLOB); \
/*
 * gen_vfp_binary(fn, type, op, f, s)
 * Emits cat4(REAL_binary_,fn,_,type)(op1, op2, res) using ARM VFP.
 * The asm loads [op1] into <s>0 and [op2] into <s>1, computes
 * <op>.<f> <s>0, <s>0, <s>1 and stores <s>0 to [res].
 * op - mnemonic string ("vadd", "vsub", "vmul", "vdiv" at the call sites).
 * f  - string spliced in as the type suffix after the dot.
 * s  - string spliced in as the register-name prefix, both in the template
 *      and in the clobber list.
 * All three pointers are passed in registers; "memory" is clobbered because
 * the store goes through a pointer rather than an output operand.
 */
209 #define gen_vfp_binary(fn, type, op, f, s) \
210 static ipret_inline bool attr_unused cat4(REAL_binary_,fn,_,type)(const type *op1, const type *op2, type *res)\
212 __asm__ (ARM_ASM_PREFIX " \n\
213 vldr "s"0, [ %1 ] \n\
214 vldr "s"1, [ %2 ] \n\
215 "op"."f" "s"0, "s"0, "s"1 \n\
216 vstr "s"0, [ %0 ] \n\
217 " :: "r"(res), "r"(op1), "r"(op2) : s"0", s"1", "memory"); \
/*
 * gen_vfp_unary(fn, type, op, f, s)
 * Emits cat4(REAL_unary_,fn,_,type)(op1, res) using ARM VFP.
 * The asm loads [op1] into <s>0, applies <op>.<f> in place and stores <s>0 to
 * [res]. Used with "vneg" and "vsqrt" at the call sites in this file.
 * Clobbers <s>0 and "memory".
 */
221 #define gen_vfp_unary(fn, type, op, f, s) \
222 static ipret_inline bool attr_unused cat4(REAL_unary_,fn,_,type)(const type *op1, type *res)\
224 __asm__ (ARM_ASM_PREFIX " \n\
225 vldr "s"0, [ %1 ] \n\
226 "op"."f" "s"0, "s"0 \n\
227 vstr "s"0, [ %0 ] \n\
228 " :: "r"(res), "r"(op1) : s"0", "memory"); \
232 #ifdef INLINE_ASM_GCC_LABELS
/*
 * gen_vfp_logical(fn, type, cond, f, s) - asm-goto variant, under
 * #ifdef INLINE_ASM_GCC_LABELS.
 * Emits cat4(REAL_binary_,fn,_,type)(op1, op2, res) comparing two reals.
 * The asm loads [op1] / [op2] into <s>0 / <s>1, compares them with vcmp.<f>,
 * copies the FP status flags to APSR with vmrs, and branches to the C label
 * `unordered` when V is set. Otherwise a conditional mov<cond> sets r0 to 1.
 * cond is an ARM condition code (eq, ne, mi, ls at the call sites).
 * The lines that initialise r0 and store it through res are not visible in
 * this excerpt. r0 is used as a scratch register and declared clobbered.
 */
233 #define gen_vfp_logical(fn, type, cond, f, s) \
234 static ipret_inline bool attr_unused cat4(REAL_binary_,fn,_,type)(const type *op1, const type *op2, ajla_flat_option_t *res)\
236 __asm__ goto (ARM_ASM_PREFIX " \n\
237 vldr "s"0, [ %1 ] \n\
238 vldr "s"1, [ %2 ] \n\
240 vcmp."f" "s"0, "s"1 \n\
241 vmrs APSR_nzcv, fpscr \n\
242 bvs %l[unordered] \n\
244 mov"#cond" r0, #1 \n\
246 " : : "r"(res), "r"(op1), "r"(op2) : s"0", s"1", "r0", "memory", "cc" : unordered);\
/*
 * gen_vfp_to_int(fn, type, f, s) - asm-goto variant.
 * Emits cat4(REAL_unary_,fn,_,type)(op1, res), converting a real to
 * int_default_t. Visible steps:
 *   - load [op1] into <s>0 and compare it with itself; branch to the C label
 *     `unordered` if V is set after vmrs,
 *   - vcvt.s32.<f> converts into s1,
 *   - r0 is biased by 0x80000000 and then by 1 (0x80000001 in total), which
 *     maps 0x7fffffff to 0 and 0x80000000 to 1,
 *   - bls branches to `unordered` again; presumably after a compare that
 *     rejects those two values - the compare and the move from s1 to r0 are
 *     not visible in this excerpt.
 * r0 is a scratch register and is declared clobbered.
 */
251 #define gen_vfp_to_int(fn, type, f, s) \
252 static ipret_inline bool attr_unused cat4(REAL_unary_,fn,_,type)(const type *op1, int_default_t *res)\
254 __asm__ goto (ARM_ASM_PREFIX " \n\
255 vldr "s"0, [ %1 ] \n\
256 vcmp."f" "s"0, "s"0 \n\
257 vmrs APSR_nzcv, fpscr \n\
258 bvs %l[unordered] \n\
259 vcvt.s32."f" s1, "s"0 \n\
261 add r0, r0, #0x80000000 \n\
262 add r0, r0, #0x00000001 \n\
264 bls %l[unordered] \n\
266 " : : "r"(res), "r"(op1) : s"0", s"1", "r0", "memory", "cc" : unordered);\
/*
 * gen_vfp_logical(fn, type, cond, f, s) - second definition; presumably the
 * #else branch for compilers without asm goto (the #else line is not visible
 * in this excerpt).
 * The asm loads [op1] / [op2] into <s>0 / <s>1, compares them with vcmp.<f>,
 * transfers the flags with vmrs and conditionally sets output %1 (r) to 1
 * with mov<cond>. Output %0 is `unordered`; the instructions that initialise
 * both outputs and set `unordered` are not visible here. The C code then
 * tests `unordered`.
 * NOTE(review): "r0" is in the clobber list although no visible instruction
 * names it.
 */
272 #define gen_vfp_logical(fn, type, cond, f, s) \
273 static ipret_inline bool attr_unused cat4(REAL_binary_,fn,_,type)(const type *op1, const type *op2, ajla_flat_option_t *res)\
275 unsigned unordered, r; \
276 __asm__ (ARM_ASM_PREFIX " \n\
277 vldr "s"0, [ %2 ] \n\
278 vldr "s"1, [ %3 ] \n\
281 vcmp."f" "s"0, "s"1 \n\
282 vmrs APSR_nzcv, fpscr \n\
286 mov"#cond" %1, #1 \n\
287 " : "=r"(unordered), "=r"(r) : "r"(op1), "r"(op2) : s"0", s"1", "r0", "memory", "cc");\
288 if (unlikely(unordered)) \
/*
 * gen_vfp_to_int(fn, type, f, s) - second definition; presumably the #else
 * branch for compilers without asm goto (the #else line is not visible in
 * this excerpt).
 * Emits cat4(REAL_unary_,fn,_,type)(op1, res), converting a real to
 * int_default_t. The asm loads [op1] into <s>0, compares it with itself
 * (flags moved with vmrs) and converts with vcvt.s32.<f> into s0. Outputs
 * are `unordered` (%0) and r (%1); the instructions that write them are not
 * visible here.
 * The C check rejects the result when `unordered` is set or when r is one of
 * the two saturation values: adding 0x80000001 maps 0x7fffffff to 0 and
 * 0x80000000 to 1, so the unsigned comparison must be `< 2` to catch both.
 * The previous `< 1` only caught 0x7fffffff, letting a negatively saturated
 * result through; the asm-goto variant uses `bls` for the same test.
 */
293 #define gen_vfp_to_int(fn, type, f, s) \
294 static ipret_inline bool attr_unused cat4(REAL_unary_,fn,_,type)(const type *op1, int_default_t *res)\
296 unsigned unordered; \
298 __asm__ (ARM_ASM_PREFIX " \n\
299 vldr "s"0, [ %2 ] \n\
301 vcmp."f" "s"0, "s"0 \n\
302 vmrs APSR_nzcv, fpscr \n\
305 vcvt.s32."f" s0, "s"0 \n\
307 " : "=r"(unordered), "=r"(r) : "r"(op1) : s"0", s"1", "r0", "memory", "cc");\
308 if (unlikely(unordered) || (unlikely((unsigned)r + 0x80000001U < 2)))\
/*
 * gen_vfp_from_int(fn, type, f, s)
 * Emits cat4(REAL_unary_,fn,_,type)(op1, res), converting an int_default_t to
 * a real. Visible asm: vcvt.<f>.s32 converts the integer held in s0 into
 * <s>0, then vstr stores <s>0 to [res]. The instruction that loads s0 from
 * op1 is not visible in this excerpt. Clobbers "d0" and "memory".
 */
315 #define gen_vfp_from_int(fn, type, f, s) \
316 static ipret_inline bool attr_unused cat4(REAL_unary_,fn,_,type)(const int_default_t *op1, type *res)\
320 vcvt."f".s32 "s"0, s0 \n\
321 vstr "s"0, [ %0 ] \n\
322 " : : "r"(res), "r"(op1) : "d0", "memory"); \
/*
 * gen_vfp_is_exception(fn, type, f, s)
 * Emits cat4(REAL_unary_,fn,_,type)(op1, res). The asm loads [op1] into <s>0,
 * compares it with itself using vcmp.<f> and moves the flags with vmrs. The
 * single output is `unordered`; the instructions that set it and the C code
 * that writes *res are not visible in this excerpt.
 * NOTE(review): <s>1 is in the clobber list although no visible instruction
 * uses it.
 */
326 #define gen_vfp_is_exception(fn, type, f, s) \
327 static ipret_inline void attr_unused cat4(REAL_unary_,fn,_,type)(const type *op1, ajla_flat_option_t *res)\
329 unsigned unordered; \
330 __asm__ (ARM_ASM_PREFIX " \n\
331 vldr "s"0, [ %1 ] \n\
333 vcmp."f" "s"0, "s"0 \n\
334 vmrs APSR_nzcv, fpscr \n\
337 " : "=r"(unordered) : "r"(op1) : s"0", s"1", "cc"); \
/*
 * gen_vfp_half_binary(fn, type, op)
 * Emits cat4(REAL_binary_,fn,_,type)(op1, op2, res) for half-precision
 * operands on ARM. Visible asm sequence:
 *   - vld1.16 loads [op1] into 16-bit lane 0 and [op2] into lane 2 of d0
 *     (the low halves of s0 and s1 respectively),
 *   - vcvtb.f32.f16 widens both to single precision in s0 and s1,
 *   - <op>.f32 computes s0 = s0 <op> s1,
 *   - vcvtb.f16.f32 narrows the result, and vst1.16 stores lane 0 to [res].
 * Clobbers "d0" and "memory".
 */
341 #define gen_vfp_half_binary(fn, type, op) \
342 static ipret_inline bool attr_unused cat4(REAL_binary_,fn,_,type)(const type *op1, const type *op2, type *res)\
344 __asm__ (ARM_ASM_PREFIX " \n\
345 vld1.16 d0[0], [ %1 ] \n\
346 vld1.16 d0[2], [ %2 ] \n\
347 vcvtb.f32.f16 s0, s0 \n\
348 vcvtb.f32.f16 s1, s1 \n\
349 "op".f32 s0, s0, s1 \n\
350 vcvtb.f16.f32 s0, s0 \n\
351 vst1.16 d0[0], [ %0 ] \n\
352 " :: "r"(res), "r"(op1), "r"(op2) : "d0", "memory"); \
356 #ifdef INLINE_ASM_GCC_LABELS
/*
 * gen_vfp_half_logical(fn, type, cond) - asm-goto variant, under
 * #ifdef INLINE_ASM_GCC_LABELS.
 * Emits cat4(REAL_binary_,fn,_,type)(op1, op2, res) comparing two
 * half-precision values. The asm loads [op1] / [op2] into lanes 0 / 2 of d0,
 * widens both with vcvtb.f32.f16, moves the FP flags with vmrs, branches to
 * the C label `unordered` when V is set, and otherwise conditionally sets r0
 * to 1 with mov<cond>.
 * The compare instruction, the initialisation of r0 and the store through
 * res are not visible in this excerpt. r0 is a declared scratch register.
 */
357 #define gen_vfp_half_logical(fn, type, cond) \
358 static ipret_inline bool attr_unused cat4(REAL_binary_,fn,_,type)(const type *op1, const type *op2, ajla_flat_option_t *res)\
360 __asm__ goto (ARM_ASM_PREFIX " \n\
361 vld1.16 d0[0], [ %1 ] \n\
362 vld1.16 d0[2], [ %2 ] \n\
364 vcvtb.f32.f16 s0, s0 \n\
365 vcvtb.f32.f16 s1, s1 \n\
367 vmrs APSR_nzcv, fpscr \n\
368 bvs %l[unordered] \n\
370 mov"#cond" r0, #1 \n\
372 " : : "r"(res), "r"(op1), "r"(op2) : "d0", "r0", "memory", "cc" : unordered);\
/*
 * gen_vfp_half_to_int(fn, type) - asm-goto variant.
 * Emits cat4(REAL_unary_,fn,_,type)(op1, res), converting a half-precision
 * value to int_default_t. Visible steps:
 *   - load [op1] into lane 0 of d0 and widen to single precision in s0,
 *   - after vmrs, branch to the C label `unordered` if V is set,
 *   - vcvt.s32.f32 converts s0 into s1,
 *   - r0 is biased by 0x80000001 in two adds (0x7fffffff -> 0,
 *     0x80000000 -> 1) and bls branches to `unordered`; presumably after a
 *     compare that rejects those two values.
 * The compare instructions and the move from s1 to r0 are not visible in
 * this excerpt. r0 is a declared scratch register.
 */
377 #define gen_vfp_half_to_int(fn, type) \
378 static ipret_inline bool attr_unused cat4(REAL_unary_,fn,_,type)(const type *op1, int_default_t *res)\
380 __asm__ goto (ARM_ASM_PREFIX " \n\
381 vld1.16 d0[0], [ %1 ] \n\
382 vcvtb.f32.f16 s0, s0 \n\
384 vmrs APSR_nzcv, fpscr \n\
385 bvs %l[unordered] \n\
386 vcvt.s32.f32 s1, s0 \n\
388 add r0, r0, #0x80000000 \n\
389 add r0, r0, #0x00000001 \n\
391 bls %l[unordered] \n\
393 " : : "r"(res), "r"(op1) : "d0", "r0", "memory", "cc" : unordered);\
/*
 * gen_vfp_half_logical(fn, type, cond) - second definition; presumably the
 * #else branch for compilers without asm goto (the #else line is not visible
 * in this excerpt).
 * The asm loads [op1] / [op2] into lanes 0 / 2 of d0, widens both with
 * vcvtb.f32.f16, moves the FP flags with vmrs and conditionally sets output
 * %1 (r) to 1 with mov<cond>. Output %0 is `unordered`. The compare
 * instruction and the lines that initialise the outputs are not visible
 * here. The C code then tests `unordered`.
 */
399 #define gen_vfp_half_logical(fn, type, cond) \
400 static ipret_inline bool attr_unused cat4(REAL_binary_,fn,_,type)(const type *op1, const type *op2, ajla_flat_option_t *res)\
402 unsigned unordered, r; \
403 __asm__ (ARM_ASM_PREFIX " \n\
404 vld1.16 d0[0], [ %2 ] \n\
405 vld1.16 d0[2], [ %3 ] \n\
408 vcvtb.f32.f16 s0, s0 \n\
409 vcvtb.f32.f16 s1, s1 \n\
411 vmrs APSR_nzcv, fpscr \n\
415 mov"#cond" %1, #1 \n\
416 " : "=r"(unordered), "=r"(r) : "r"(op1), "r"(op2) : "d0", "memory", "cc");\
417 if (unlikely(unordered)) \
/*
 * gen_vfp_half_to_int(fn, type) - second definition; presumably the #else
 * branch for compilers without asm goto (the #else line is not visible in
 * this excerpt).
 * Emits cat4(REAL_unary_,fn,_,type)(op1, res), converting a half-precision
 * value to int_default_t. The asm loads [op1] into lane 0 of d0, widens it to
 * single precision, moves the FP flags with vmrs and converts with
 * vcvt.s32.f32 in s0. Outputs are `unordered` (%0) and r (%1); the
 * instructions that write them are not visible here.
 * The C check rejects the result when `unordered` is set or when r is one of
 * the two saturation values: adding 0x80000001 maps 0x7fffffff to 0 and
 * 0x80000000 to 1, so the unsigned comparison must be `< 2` to catch both.
 * The previous `< 1` only caught 0x7fffffff; the asm-goto variant uses `bls`
 * for the same test.
 */
422 #define gen_vfp_half_to_int(fn, type) \
423 static ipret_inline bool attr_unused cat4(REAL_unary_,fn,_,type)(const type *op1, int_default_t *res)\
425 unsigned unordered; \
427 __asm__ (ARM_ASM_PREFIX " \n\
428 vld1.16 d0[0], [ %2 ] \n\
430 vcvtb.f32.f16 s0, s0 \n\
432 vmrs APSR_nzcv, fpscr \n\
435 vcvt.s32.f32 s0, s0 \n\
437 " : "=r"(unordered), "=r"(r) : "r"(op1) : "d0", "r0", "memory", "cc");\
438 if (unlikely(unordered) || (unlikely((unsigned)r + 0x80000001U < 2)))\
/*
 * gen_vfp_half_from_int(fn, type)
 * Emits cat4(REAL_unary_,fn,_,type)(op1, res), converting an int_default_t to
 * half precision on ARM. Visible asm: vcvt.f32.s32 converts the integer in s0
 * to single precision, vcvtb.f16.f32 narrows it, and vst1.16 stores lane 0 of
 * d0 to [res]. The instruction that loads s0 from op1 is not visible in this
 * excerpt. Clobbers "d0" and "memory".
 */
445 #define gen_vfp_half_from_int(fn, type) \
446 static ipret_inline bool attr_unused cat4(REAL_unary_,fn,_,type)(const int_default_t *op1, type *res)\
448 __asm__ (ARM_ASM_PREFIX " \n\
450 vcvt.f32.s32 s0, s0 \n\
451 vcvtb.f16.f32 s0, s0 \n\
452 vst1.16 d0[0], [ %0 ] \n\
453 " : : "r"(res), "r"(op1) : "d0", "memory"); \
458 #ifdef INT_DEFAULT_BITS
/*
 * gen_sse_ops(type, s, z)
 * Instantiates the full set of "_alt1" primitives for one real type using
 * the non-prefixed SSE encodings: v is "" and operands are formatted with
 * sse_one_param. The comparison generators receive the setcc mnemonic to
 * use (sete, setne, setb, setbe).
 * s is the mnemonic type suffix and z the integer-size suffix forwarded to
 * gen_sse_from_int.
 */
460 #define gen_sse_ops(type, s, z) \
461 gen_sse_binary(add_alt1, type, "", add, s, sse_one_param) \
462 gen_sse_binary(subtract_alt1, type, "", sub, s, sse_one_param) \
463 gen_sse_binary(multiply_alt1, type, "", mul, s, sse_one_param) \
464 gen_sse_binary(divide_alt1, type, "", div, s, sse_one_param) \
465 gen_sse_logical(equal_alt1, type, "", sete, s) \
466 gen_sse_logical(not_equal_alt1, type, "", setne, s) \
467 gen_sse_logical(less_alt1, type, "", setb, s) \
468 gen_sse_logical(less_equal_alt1, type, "", setbe, s) \
469 gen_sse_neg(neg_alt1, type, "", s, sse_one_param) \
470 gen_sse_sqrt(sqrt_alt1, type, "", s, sse_one_param) \
471 gen_sse_to_int(to_int_alt1, type, "", s) \
472 gen_sse_from_int(from_int_alt1, type, "", s, z, sse_one_param) \
473 gen_sse_is_exception(is_exception_alt1, type, "", s)
/*
 * gen_avx_ops(type, s, z)
 * Instantiates the same primitives as gen_sse_ops under "_alt2" names, using
 * the "v"-prefixed encodings: v is "v" and operands are formatted with
 * avx_two_params so the destination register is repeated.
 */
475 #define gen_avx_ops(type, s, z) \
476 gen_sse_binary(add_alt2, type, "v", add, s, avx_two_params) \
477 gen_sse_binary(subtract_alt2, type, "v", sub, s, avx_two_params) \
478 gen_sse_binary(multiply_alt2, type, "v", mul, s, avx_two_params) \
479 gen_sse_binary(divide_alt2, type, "v", div, s, avx_two_params) \
480 gen_sse_logical(equal_alt2, type, "v", sete, s) \
481 gen_sse_logical(not_equal_alt2, type, "v", setne, s) \
482 gen_sse_logical(less_alt2, type, "v", setb, s) \
483 gen_sse_logical(less_equal_alt2, type, "v", setbe, s) \
484 gen_sse_neg(neg_alt2, type, "v", s, avx_two_params) \
485 gen_sse_sqrt(sqrt_alt2, type, "v", s, avx_two_params) \
486 gen_sse_to_int(to_int_alt2, type, "v", s) \
487 gen_sse_from_int(from_int_alt2, type, "v", s, z, avx_two_params) \
488 gen_sse_is_exception(is_exception_alt2, type, "v", s)
/*
 * gen_f16c_ops(z)
 * Instantiates the F16C-based "_alt1" primitives for real16_t: four
 * arithmetic operations, four comparisons, and the to/from integer
 * conversions. No neg, sqrt or is_exception variants are generated here.
 * z is forwarded to gen_f16c_from_int as the integer-size suffix.
 */
490 #define gen_f16c_ops(z) \
491 gen_f16c_binary(add_alt1, real16_t, add) \
492 gen_f16c_binary(subtract_alt1, real16_t, sub) \
493 gen_f16c_binary(multiply_alt1, real16_t, mul) \
494 gen_f16c_binary(divide_alt1, real16_t, div) \
495 gen_f16c_logical(equal_alt1, real16_t, sete) \
496 gen_f16c_logical(not_equal_alt1, real16_t, setne) \
497 gen_f16c_logical(less_alt1, real16_t, setb) \
498 gen_f16c_logical(less_equal_alt1, real16_t, setbe) \
499 gen_f16c_to_int(to_int_alt1, real16_t) \
500 gen_f16c_from_int(from_int_alt1, real16_t, z)
/*
 * gen_vfp_ops(type, f, s)
 * Instantiates the ARM VFP "_alt1" primitives for one real type: arithmetic
 * (vadd/vsub/vmul/vdiv), unary (vneg/vsqrt), comparisons keyed by ARM
 * condition codes (eq, ne, mi, ls), integer conversions and is_exception.
 * f and s are forwarded unchanged as the type suffix and register prefix.
 */
502 #define gen_vfp_ops(type, f, s) \
503 gen_vfp_binary(add_alt1, type, "vadd", f, s) \
504 gen_vfp_binary(subtract_alt1, type, "vsub", f, s) \
505 gen_vfp_binary(multiply_alt1, type, "vmul", f, s) \
506 gen_vfp_binary(divide_alt1, type, "vdiv", f, s) \
507 gen_vfp_unary(neg_alt1, type, "vneg", f, s) \
508 gen_vfp_unary(sqrt_alt1, type, "vsqrt", f, s) \
509 gen_vfp_logical(equal_alt1, type, eq, f, s) \
510 gen_vfp_logical(not_equal_alt1, type, ne, f, s) \
511 gen_vfp_logical(less_alt1, type, mi, f, s) \
512 gen_vfp_logical(less_equal_alt1, type, ls, f, s) \
513 gen_vfp_to_int(to_int_alt1, type, f, s) \
514 gen_vfp_from_int(from_int_alt1, type, f, s) \
515 gen_vfp_is_exception(is_exception_alt1, type, f, s)
/*
 * gen_vfp_half_ops()
 * Instantiates the ARM half-precision "_alt1" primitives for real16_t: four
 * arithmetic operations, four comparisons (eq, ne, mi, ls) and the to/from
 * integer conversions. No neg, sqrt or is_exception variants are generated.
 */
517 #define gen_vfp_half_ops() \
518 gen_vfp_half_binary(add_alt1, real16_t, "vadd") \
519 gen_vfp_half_binary(subtract_alt1, real16_t, "vsub") \
520 gen_vfp_half_binary(multiply_alt1, real16_t, "vmul") \
521 gen_vfp_half_binary(divide_alt1, real16_t, "vdiv") \
522 gen_vfp_half_logical(equal_alt1, real16_t, eq) \
523 gen_vfp_half_logical(not_equal_alt1, real16_t, ne) \
524 gen_vfp_half_logical(less_alt1, real16_t, mi) \
525 gen_vfp_half_logical(less_equal_alt1, real16_t, ls) \
526 gen_vfp_half_to_int(to_int_alt1, real16_t) \
527 gen_vfp_half_from_int(from_int_alt1, real16_t)
/*
 * Empty definitions of the generator entry points, so that invoking them
 * expands to nothing. Presumably this is the alternative branch of
 * "#ifdef INT_DEFAULT_BITS"; the #else line is not visible in this excerpt.
 */
531 #define gen_sse_ops(type, s, z)
532 #define gen_avx_ops(type, s, z)
533 #define gen_f16c_ops(z)
534 #define gen_vfp_ops(type, f, s)
535 #define gen_vfp_half_ops()
/*
 * Names "arithm-r.inc" as file_inc and includes for-real.inc. Presumably
 * for-real.inc includes file_inc once per real type - TODO confirm against
 * for-real.inc.
 */
539 #define file_inc "arithm-r.inc"
540 #include "for-real.inc"