/*
 * Copyright (C) 2024 Mikulas Patocka
 *
 * This file is part of Ajla.
 *
 * Ajla is free software: you can redistribute it and/or modify it under the
 * terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * Ajla is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
 * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * Ajla. If not, see <https://www.gnu.org/licenses/>.
 */
#include "ajla.h"

#include "asm.h"
#include "str.h"
#include "arithm-b.h"
#include "arithm-i.h"
#include "arithm-r.h"

#include "util.h"

#include <stdio.h>

#if GNUC_ATLEAST(3,3,0) && !defined(__OPTIMIZE_SIZE__) && (defined(UNALIGNED_ACCESS_EFFICIENT) || defined(HAVE_BUILTIN_ASSUME_ALIGNED))
#define MEMCPY_FAST
#endif

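/*
 * memcpy_fast: copy a block, special-casing small power-of-two sizes (plus
 * 12 bytes, used for long double on some targets) with fixed-size memcpy()
 * calls that the compiler can expand inline.  Any other size falls through
 * to an ordinary memcpy().  The fast path is compiled only when the compiler
 * is recent enough and either unaligned accesses are efficient or the
 * alignment can be promised via __builtin_assume_aligned().
 */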
void attr_fastcall memcpy_fast(void *dest, const void *src, size_t size)
{
	if (unlikely(dest == src))
		return;
#ifdef MEMCPY_FAST
#ifdef __BIGGEST_ALIGNMENT__
#define al(n) minimum(n, __BIGGEST_ALIGNMENT__)
#else
#define al(n) (n)
#endif
#ifndef UNALIGNED_ACCESS_EFFICIENT
#define check_alignment(n) \
	if (unlikely(((ptr_to_num(dest) | ptr_to_num(src)) & ((al(n)) - 1)) != 0)) break;\
	dest = __builtin_assume_aligned(dest, al(n)); \
	src = __builtin_assume_aligned(src, al(n));
#else
#define check_alignment(n)
#endif
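	/*
	 * check_alignment(n): when unaligned accesses are not known to be
	 * efficient, bail out to the generic memcpy() below (via "break")
	 * unless both pointers are n-byte aligned (capped at
	 * __BIGGEST_ALIGNMENT__), and then let the compiler assume that
	 * alignment for the fixed-size copy.
	 */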
	switch (size) {
		case 1:
			(void)memcpy(dest, src, 1);
			return;
		case 2:
			check_alignment(2)
			(void)memcpy(dest, src, 2);
			return;
		case 4:
			check_alignment(4)
			(void)memcpy(dest, src, 4);
			return;
#if (defined(__i386__) || (defined(__SIZEOF_LONG_DOUBLE__) && __SIZEOF_LONG_DOUBLE__ == 12)) && defined(UNALIGNED_ACCESS_EFFICIENT)
		case 12:
			(void)memcpy(cast_ptr(char *, dest) + 8, cast_ptr(const char *, src) + 8, 4);
#endif
			/*-fallthrough*/
		case 8:
			check_alignment(8)
			(void)memcpy(dest, src, 8);
			return;
		case 16:
			check_alignment(16)
			(void)memcpy(dest, src, 16);
			return;
	}
#undef al
#undef check_alignment
#endif
	(void)memcpy(dest, src, size);
}

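/*
 * half_to_float: convert an IEEE 754 binary16 bit pattern to float.
 * Hardware conversions (x86 F16C, ARM VFP half-precision) are used when
 * available; otherwise the bit pattern is decoded by hand, handling
 * normals, subnormals, infinities and NaNs separately.
 */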
float half_to_float(uint16_t attr_unused x)
{
#if !REAL_MASK
	return 0;
#else
	float res;
	uint16_t pos;

#if defined(INLINE_ASM_GCC_X86) && defined(HAVE_X86_ASSEMBLER_F16C)
	if (likely(cpu_test_feature(CPU_FEATURE_f16c))) {
		float r;
#ifdef __SSE__
		__asm__ ("vmovd %k1, %0; vcvtph2ps %0, %0" : "=x"(r) : "r"(x));
#else
		__asm__ ("vmovd %k1, %%xmm0; vcvtph2ps %%xmm0, %%xmm0; vmovss %%xmm0, %0" : "=m"(r) : "r"(x));
#endif
		return r;
	}
#endif

#if defined(INLINE_ASM_GCC_ARM) && defined(HAVE_ARM_ASSEMBLER_HALF_PRECISION)
	if (likely(cpu_test_feature(CPU_FEATURE_half))) {
#if defined(__SOFTFP__) || (CLANG_ATLEAST(0,0,0) && !CLANG_ATLEAST(6,0,0))
		__asm__ (ARM_ASM_PREFIX "vmov s0, %1; vcvtb.f32.f16 s0, s0; vmov %0, s0" : "=r"(res) : "r"((uint32_t)x) : "s0");
#else
		__asm__ (ARM_ASM_PREFIX "vcvtb.f32.f16 %0, %1" : "=t"(res) : "t"((uint32_t)x));
#endif
		return res;
	}
#endif

	res = 0;
	pos = x & 0x7fff;
	if (likely((uint16_t)(pos - 0x400) < 0x7800)) {
#if defined(HAVE_UNION_FLOAT_UINT32_T) && !defined(UNUSUAL)
		union {
			float f;
			uint32_t i;
		} u;
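		/*
		 * Normal half: move the sign from bit 15 to bit 31 and shift
		 * exponent+mantissa into the float's fields (<< 13).  Adding
		 * 0x1c000 (112 << 10) before the shift rebiases the exponent
		 * from 15 to 127 (127 - 15 = 112).
		 */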
		u.i = ((uint32_t)(x & (uint32_t)0x8000UL) << 16) | ((pos + (uint32_t)0x1c000UL) << 13);
		return u.f;
#else
		res = (float)((x & 0x3ff) | 0x400) * (float)(1. / (1L << 25)) * (float)((int32_t)1 << (pos >> 10));
#endif
	} else if (pos < 0x400) {
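		/*
		 * Subnormal half: the value is simply mantissa * 2^-24
		 * (mantissa / 1024 * 2^-14), so scale the raw bits directly.
		 */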
		res = (float)pos * (float)(1. / (1L << 24));
	} else if (pos == 0x7c00) {
#ifdef HUGE_VAL
		res = HUGE_VAL;
#else
		res = 1. / 0.;
#endif
	} else {
#ifdef NAN
		res = NAN;
#else
		double z = 0.;
		res = z / z;
#endif
	}
#if defined(HAVE_COPYSIGNF) && (defined(__x86_64__) ^ defined(UNUSUAL_ARITHMETICS))
	res = copysignf(res, (float)(int16_t)x);
#else
	if (unlikely((int16_t)x < 0))
		res = -res;
#endif
	return res;
#endif
}

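/*
 * float_to_half: convert a float to an IEEE 754 binary16 bit pattern,
 * rounding to nearest, again preferring the hardware conversion
 * instructions and otherwise assembling the result by hand.
 */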
uint16_t float_to_half(float attr_unused x)
{
#if !REAL_MASK
	return 0;
#else
	float a, mant;
	float limit;
	uint16_t res;

#if defined(INLINE_ASM_GCC_X86) && defined(HAVE_X86_ASSEMBLER_F16C)
	if (likely(cpu_test_feature(CPU_FEATURE_f16c))) {
		uint32_t r;
#ifdef __SSE__
		__asm__ ("vcvtps2ph $4, %1, %1; vmovd %1, %0" : "=r"(r), "+x"(x));
#else
		__asm__ ("vmovss %1, %%xmm0; vcvtps2ph $4, %%xmm0, %%xmm0; vmovd %%xmm0, %0" : "=r"(r) : "m"(x));
#endif
		return r;
	}
#endif

#if defined(INLINE_ASM_GCC_ARM) && defined(HAVE_ARM_ASSEMBLER_HALF_PRECISION)
	if (likely(cpu_test_feature(CPU_FEATURE_half))) {
		uint32_t r;
#if defined(__SOFTFP__) || (CLANG_ATLEAST(0,0,0) && !CLANG_ATLEAST(6,0,0))
		__asm__ (ARM_ASM_PREFIX "vmov s0, %1; vcvtb.f16.f32 s0, s0; vmov %0, s0" : "=r"(r) : "r"(x) : "s0");
#else
		__asm__ (ARM_ASM_PREFIX "vcvtb.f16.f32 %1, %1; vmov %0, %1" : "=r"(r), "+t"(x));
#endif
		return r;
	}
#endif

	res = (uint16_t)!!signbit(x) << 15;
	a = fabs(x);
	limit = 65520.;
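	/*
	 * The largest finite binary16 value is 65504; anything >= 65520
	 * (halfway to the next step) rounds up to infinity, so 65520 is the
	 * overflow threshold.  Comparing against it with isunordered() also
	 * catches NaN inputs.
	 */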
#if defined(use_is_macros) && !defined(UNUSUAL_ARITHMETICS)
	if (unlikely(isunordered(a, limit)))
#else
	if (unlikely(isnan_real32_t(a)))
#endif
	{
		res |= 0x200;
		goto inf;
	}
#if defined(use_is_macros) && !defined(UNUSUAL_ARITHMETICS)
	if (unlikely(isgreaterequal(a, limit)))
#else
	if (unlikely(a >= limit))
#endif
	{
	inf:
		res |= 0x7c00;
	} else if (unlikely(a < (float)(1. / (1 << 14)))) {
		mant = a * (float)(1L << 24);
		res |= 0x400;
		goto do_round;
	} else {
		int ex, im;
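		/*
		 * Split a into exponent and mantissa.  The union trick forces
		 * the exponent field to the value 2^10, leaving mant in
		 * [1024, 2048) - the 10-bit half mantissa plus the implicit
		 * leading bit - matching the portable frexpf(a, &ex) * (1 << 11)
		 * path.
		 */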
#if defined(HAVE_UNION_FLOAT_UINT32_T) && !defined(UNUSUAL)
		union {
			float f;
			uint32_t i;
		} u;
		u.f = a;
		ex = (u.i >> 23) - 126;
		u.i &= 0x007fffffUL;
		u.i |= 0x44800000UL;
		mant = u.f;
#else
		mant = frexpf(a, &ex);
		mant *= 1 << 11;
#endif
		res += (ex + 14) << 10;
	do_round:
#if defined(INLINE_ASM_GCC_X86) && defined(HAVE_X86_ASSEMBLER_SSE) && static_test_sse
		__asm__ (X86_ASM_V"cvtss2si %1, %0" : "=r"(im) :
#ifdef __SSE__
			"x"X86_ASM_M
#else
			"m"
#endif
			(mant));
#elif defined(HAVE_LRINTF) && !defined(UNUSUAL_ARITHMETICS)
		im = (int)lrintf(mant);
#else
		im = (int)mant;
		mant -= (float)im;
		if (mant > 0.5 || (unlikely(mant == 0.5) && im & 1))
			im++;
#endif
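		/*
		 * Drop the implicit-bit value 0x400 from the rounded mantissa
		 * (the subnormal path pre-set 0x400 in res to compensate) and
		 * add the remainder to res; a mantissa that rounded up past 10
		 * bits carries into the exponent field, which is exactly what
		 * is wanted.
		 */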
		im -= 0x400;
		res += im;
	}
	return res;
#endif
}

#ifdef need_signbit_d
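/*
 * signbit_d: fallback sign test for doubles, compiled only when
 * need_signbit_d is defined.  copysign(1, d) < 0 distinguishes -0.0 from
 * +0.0; without copysign() the value is printed and the leading '-' is
 * checked after the cheap d > 0 / d < 0 tests have handled everything else.
 */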
int signbit_d(double d)
{
#ifdef HAVE_COPYSIGN
	return copysign(1, d) < 0;
#else
	char s[256];
	if (likely(d > 0)) return 0;
	if (likely(d < 0)) return 1;
	sprintf(s, "%f", d);
	return s[0] == '-';
#endif
}
#endif

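/*
 * decode_opcode: map an opcode to the name of its interpreter label for
 * diagnostics.  The table is built by including ipret.inc, with
 * DEFINE_OPCODE_START_LBL expanding each entry to an { opcode, name } pair;
 * rmap lazily caches the reverse lookup, storing table index + 1 so that 0
 * means "not cached yet".
 */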
#define DEFINE_OPCODE_START_LBL(opcode, lbl)		\
	{ (opcode) + ARG_MODE * OPCODE_MODE_MULT, stringify(lbl) },
const char attr_cold *decode_opcode(code_t o, bool allow_invalid)
{
	static const struct {
		code_t opcode;
		const char *string;
	} table[] = {
#include "ipret.inc"
	};
	static atomic_type code_t rmap[ARG_MODE_N * OPCODE_MODE_MULT];

	code_t i;

	if (unlikely(o >= n_array_elements(rmap)))
		goto unknown;

	if (likely(rmap[o]))
		return table[rmap[o] - 1].string;

	for (i = 0; i < n_array_elements(table); i++) {
		if (unlikely(table[i].opcode == o)) {
			rmap[o] = i + 1;
			return table[i].string;
		}
	}

unknown:
	if (!allow_invalid) {
		/*for (i = 0; i < n_array_elements(table); i++) {
			debug("%04x - %s", table[i].opcode, table[i].string);
		}*/
		internal(file_line, "decode_opcode: invalid opcode %04x", o);
	}
	return NULL;
}