/*
 * Copyright (C) 2024 Mikulas Patocka
 *
 * This file is part of Ajla.
 *
 * Ajla is free software: you can redistribute it and/or modify it under the
 * terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * Ajla is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
 * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * Ajla. If not, see <https://www.gnu.org/licenses/>.
 */
#if GNUC_ATLEAST(3,3,0) && !defined(__OPTIMIZE_SIZE__) && (defined(UNALIGNED_ACCESS_EFFICIENT) || defined(HAVE_BUILTIN_ASSUME_ALIGNED))
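
/*
 * memcpy_fast - copy a small block whose size is usually one of a few fixed
 * values (1, 2, 4, 8 or 16 bytes).  Dispatching on the size lets the compiler
 * expand each memcpy() call inline instead of emitting a library call; the
 * generic memcpy() at the end handles every other size.
 */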
void attr_fastcall memcpy_fast(void *dest, const void *src, size_t size)
{
	if (unlikely(dest == src))
		return;		/* identical pointers: nothing to copy */
#ifdef __BIGGEST_ALIGNMENT__
#define al(n)	minimum(n, __BIGGEST_ALIGNMENT__)
#else
#define al(n)	(n)
#endif

#ifndef UNALIGNED_ACCESS_EFFICIENT
#define check_alignment(n)						\
	if (unlikely(((ptr_to_num(dest) | ptr_to_num(src)) & ((al(n)) - 1)) != 0)) break;\
	dest = __builtin_assume_aligned(dest, al(n));			\
	src = __builtin_assume_aligned(src, al(n));
#else
#define check_alignment(n)
#endif
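
	/*
	 * When unaligned access is not known to be cheap, check_alignment(n)
	 * verifies that both pointers are n-byte aligned (capped at the
	 * platform's biggest alignment); if they are not, it breaks out of
	 * the switch below and the plain memcpy() fallback is used instead.
	 */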
	switch (size) {
		case 1:
			(void)memcpy(dest, src, 1);
			return;
		case 2:
			check_alignment(2)
			(void)memcpy(dest, src, 2);
			return;
		case 4:
			check_alignment(4)
			(void)memcpy(dest, src, 4);
			return;
#if (defined(__i386__) || (defined(__SIZEOF_LONG_DOUBLE__) && __SIZEOF_LONG_DOUBLE__ == 12)) && defined(UNALIGNED_ACCESS_EFFICIENT)
		case 12:
			/* 12-byte copy: copy the high 4 bytes here and fall
			   through to the 8-byte case for the rest */
			(void)memcpy(cast_ptr(char *, dest) + 8, cast_ptr(const char *, src) + 8, 4);
#endif
		case 8:
			check_alignment(8)
			(void)memcpy(dest, src, 8);
			return;
		case 16:
			check_alignment(16)
			(void)memcpy(dest, src, 16);
			return;
	}
#undef al
#undef check_alignment
	(void)memcpy(dest, src, size);
}

#endif
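
/*
 * half_to_float - convert an IEEE 754 half-precision value (1 sign bit,
 * 5 exponent bits with bias 15, 10 mantissa bits) to a float.  If the CPU
 * can do the conversion directly (x86 F16C, ARM VFP half-precision), inline
 * assembler is used; otherwise the value is decoded arithmetically.
 * For example, 0x3c00 encodes 1.0, 0x0001 is the smallest subnormal (2^-24)
 * and 0x7c00 is infinity.
 */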
float half_to_float(uint16_t attr_unused x)
{
	float res;
	uint16_t pos;

#if defined(INLINE_ASM_GCC_X86) && defined(HAVE_X86_ASSEMBLER_F16C)
	if (likely(cpu_test_feature(CPU_FEATURE_f16c))) {
		float r;
#ifdef __SSE__
		/* the compiler can allocate xmm registers, so let it pick one */
		__asm__ ("vmovd %k1, %0; vcvtph2ps %0, %0" : "=x"(r) : "r"(x));
#else
		__asm__ ("vmovd %k1, %%xmm0; vcvtph2ps %%xmm0, %%xmm0; vmovss %%xmm0, %0" : "=m"(r) : "r"(x));
#endif
		return r;
	}
#endif
#if defined(INLINE_ASM_GCC_ARM) && defined(HAVE_ARM_ASSEMBLER_HALF_PRECISION)
	if (likely(cpu_test_feature(CPU_FEATURE_half))) {
#if defined(__SOFTFP__) || (CLANG_ATLEAST(0,0,0) && !CLANG_ATLEAST(6,0,0))
		__asm__ (ARM_ASM_PREFIX "vmov s0, %1; vcvtb.f32.f16 s0, s0; vmov %0, s0" : "=r"(res) : "r"((uint32_t)x) : "s0");
#else
		__asm__ (ARM_ASM_PREFIX "vcvtb.f32.f16 %0, %1" : "=t"(res) : "t"((uint32_t)x));
#endif
		return res;
	}
#endif
	pos = (uint16_t)(x & 0x7fff);
	if (likely((uint16_t)(pos - 0x400) < 0x7800)) {
		/* normal number */
#if defined(HAVE_UNION_FLOAT_UINT32_T) && !defined(UNUSUAL)
		union float_uint32_t u;
		u.i = ((uint32_t)(x & (uint32_t)0x8000UL) << 16) | ((pos + (uint32_t)0x1c000UL) << 13);
		return u.f;
#else
		res = (float)((x & 0x3ff) | 0x400) * (float)(1. / (1L << 25)) * (float)((int32_t)1 << (pos >> 10));
#endif
	} else if (pos < 0x400) {
		/* zero or subnormal */
		res = (float)pos * (float)(1. / (1L << 24));
	} else if (pos == 0x7c00) {
		res = HUGE_VAL;			/* infinity */
	} else {
		res = HUGE_VAL - HUGE_VAL;	/* generates a quiet NaN */
	}
#if defined(HAVE_COPYSIGNF) && (defined(__x86_64__) ^ defined(UNUSUAL_ARITHMETICS))
	res = copysignf(res, (float)(int16_t)x);
#else
	if (unlikely((int16_t)x < 0))
		res = -res;
#endif
	return res;
}
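
/*
 * float_to_half - convert a float to the nearest IEEE 754 half-precision
 * value, rounding ties to even.  As above, F16C or ARM half-precision
 * instructions are used when available; the portable path splits the value
 * into sign, exponent and mantissa and rounds the mantissa explicitly.
 */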
uint16_t float_to_half(float attr_unused x)
{
	uint16_t res;
	float a, mant;
	const float limit = (float)(1L << 16);	/* values >= 2^16 overflow to infinity */
	int ex;
	int im;

#if defined(INLINE_ASM_GCC_X86) && defined(HAVE_X86_ASSEMBLER_F16C)
	if (likely(cpu_test_feature(CPU_FEATURE_f16c))) {
		uint32_t r;
#ifdef __SSE__
		__asm__ ("vcvtps2ph $4, %1, %1; vmovd %1, %0" : "=r"(r), "+x"(x));
#else
		__asm__ ("vmovss %1, %%xmm0; vcvtps2ph $4, %%xmm0, %%xmm0; vmovd %%xmm0, %0" : "=r"(r) : "m"(x));
#endif
		return (uint16_t)r;
	}
#endif
#if defined(INLINE_ASM_GCC_ARM) && defined(HAVE_ARM_ASSEMBLER_HALF_PRECISION)
	if (likely(cpu_test_feature(CPU_FEATURE_half))) {
		uint32_t r;
#if defined(__SOFTFP__) || (CLANG_ATLEAST(0,0,0) && !CLANG_ATLEAST(6,0,0))
		__asm__ (ARM_ASM_PREFIX "vmov s0, %1; vcvtb.f16.f32 s0, s0; vmov %0, s0" : "=r"(r) : "r"(x) : "s0");
#else
		__asm__ (ARM_ASM_PREFIX "vcvtb.f16.f32 %1, %1; vmov %0, %1" : "=r"(r), "+t"(x));
#endif
		return (uint16_t)r;
	}
#endif
	res = (uint16_t)!!signbit(x) << 15;
	a = fabsf(x);
#if defined(use_is_macros) && !defined(UNUSUAL_ARITHMETICS)
	if (unlikely(isunordered(a, limit)))
#else
	if (unlikely(isnan_real32_t(a)))
#endif
	{
		/* NaN: return a quiet NaN with the original sign */
		return res | 0x7e00;
	}
#if defined(use_is_macros) && !defined(UNUSUAL_ARITHMETICS)
	if (unlikely(isgreaterequal(a, limit)))
#else
	if (unlikely(a >= limit))
#endif
	{
		/* too large to represent in half precision: return infinity */
		return res | 0x7c00;
	} else if (unlikely(a < (float)(1. / (1 << 14)))) {
		/* zero or subnormal: the result is the mantissa alone, scaled by 2^24 */
		mant = a * (float)(1L << 24);
	} else {
		/* normal number */
#if defined(HAVE_UNION_FLOAT_UINT32_T) && !defined(UNUSUAL)
		union float_uint32_t u;
		u.f = a;
		ex = (u.i >> 23) - 126;
		/* force the exponent to 0 so that mant ends up in [0.5, 1) */
		u.i = (u.i & (uint32_t)0x007fffffUL) | (uint32_t)0x3f000000UL;
		mant = u.f;
#else
		mant = frexpf(a, &ex);
#endif
		/* scale the mantissa to the 10-bit field: mant is now in [0, 1024) */
		mant = mant * (float)(1 << 11) - (float)(1 << 10);
		res += (ex + 14) << 10;
	}
	/* round mant to the nearest integer, ties to even */
#if defined(INLINE_ASM_GCC_X86) && defined(HAVE_X86_ASSEMBLER_SSE) && static_test_sse
	/* cvtss2si rounds according to MXCSR, which defaults to nearest-even */
	__asm__ (X86_ASM_V "cvtss2si %1, %0" : "=r"(im) :
		"m"(mant));
#elif defined(HAVE_LRINTF) && !defined(UNUSUAL_ARITHMETICS)
	im = (int)lrintf(mant);
#else
	im = (int)mant;		/* truncate */
	mant -= (float)im;	/* fractional part */
	if (mant > 0.5 || (unlikely(mant == 0.5) && im & 1))
		im++;
#endif
	res += im;
	return res;
}
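
/*
 * signbit_d - returns nonzero if the double has its sign bit set.  Compiled
 * only when the platform lacks a usable signbit() (the need_signbit_d case);
 * unlike a plain comparison with zero it also reports the sign of -0.0.
 */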
#ifdef need_signbit_d
int signbit_d(double d)
{
#ifdef HAVE_COPYSIGN
	return copysign(1, d) < 0;
#else
	if (likely(d > 0)) return 0;
	if (likely(d < 0)) return 1;
	/* d is zero or NaN: compare the bit pattern against -0.0 */
	{
		double nz = -0.0;
		return !memcmp(&d, &nz, sizeof(double));
	}
#endif
}
#endif
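
/*
 * decode_opcode - return a human-readable name for an opcode.  The first
 * lookup for a given opcode scans the table linearly; the result is then
 * remembered in rmap (indexed by opcode, storing table index + 1, with 0
 * meaning "not cached yet") so that later lookups are O(1).
 */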
#define DEFINE_OPCODE_START_LBL(opcode, lbl)	\
	{ (opcode) + ARG_MODE * OPCODE_MODE_MULT, stringify(lbl) },
const char attr_cold *decode_opcode(code_t o, bool allow_invalid)
{
	static const struct {
		code_t opcode;
		const char *string;
	} table[] = {
		/* one { opcode, name } pair per opcode, expanded from DEFINE_OPCODE_START_LBL */
	};
	static atomic_type code_t rmap[ARG_MODE_N * OPCODE_MODE_MULT];
	size_t i;

	if (unlikely(o >= n_array_elements(rmap)))
		goto invalid;

	if (likely(rmap[o]))
		return table[rmap[o] - 1].string;

	for (i = 0; i < n_array_elements(table); i++) {
		if (unlikely(table[i].opcode == o)) {
			/* cache the table index (+1, so that 0 means "unset") */
			rmap[o] = (code_t)(i + 1);
			return table[i].string;
		}
	}

invalid:
	if (!allow_invalid) {
		/*for (i = 0; i < n_array_elements(table); i++) {
			debug("%04x - %s", table[i].opcode, table[i].string);
		}*/
		internal(file_line, "decode_opcode: invalid opcode %04x", o);