[ajla.git] / util.c
/*
 * Copyright (C) 2024 Mikulas Patocka
 *
 * This file is part of Ajla.
 *
 * Ajla is free software: you can redistribute it and/or modify it under the
 * terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * Ajla is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
 * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * Ajla. If not, see <https://www.gnu.org/licenses/>.
 */
#include "ajla.h"

#include "asm.h"
#include "str.h"
#include "arithm-b.h"
#include "arithm-i.h"
#include "arithm-r.h"

#include "util.h"

#include <stdio.h>
#if GNUC_ATLEAST(3,3,0) && !defined(__OPTIMIZE_SIZE__) && (defined(UNALIGNED_ACCESS_EFFICIENT) || defined(HAVE_BUILTIN_ASSUME_ALIGNED))
#define MEMCPY_FAST
#endif
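
/*
 * memcpy_fast - copy "size" bytes from "src" to "dest"
 *
 * A wrapper around memcpy with fast paths for small fixed sizes
 * (1, 2, 4, 8, 12 and 16 bytes; 12 matches the i386 long double size).
 * The copy is skipped entirely when dest == src.  On targets without
 * efficient unaligned access, a fast path is only taken when both
 * pointers are aligned to the copied size (capped at __BIGGEST_ALIGNMENT__).
 * When MEMCPY_FAST is not available, every copy goes through plain memcpy.
 */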
void attr_fastcall memcpy_fast(void *dest, const void *src, size_t size)
{
	if (unlikely(dest == src))
		return;
#ifdef MEMCPY_FAST
#ifdef __BIGGEST_ALIGNMENT__
#define al(n)	minimum(n, __BIGGEST_ALIGNMENT__)
#else
#define al(n)	(n)
#endif
#ifndef UNALIGNED_ACCESS_EFFICIENT
#define check_alignment(n)						\
	if (unlikely(((ptr_to_num(dest) | ptr_to_num(src)) & ((al(n)) - 1)) != 0)) break;\
	dest = __builtin_assume_aligned(dest, al(n));			\
	src = __builtin_assume_aligned(src, al(n));
#else
#define check_alignment(n)
#endif
	switch (size) {
		case 1:
			(void)memcpy(dest, src, 1);
			return;
		case 2:
			check_alignment(2)
			(void)memcpy(dest, src, 2);
			return;
		case 4:
			check_alignment(4)
			(void)memcpy(dest, src, 4);
			return;
#if (defined(__i386__) || (defined(__SIZEOF_LONG_DOUBLE__) && __SIZEOF_LONG_DOUBLE__ == 12)) && defined(UNALIGNED_ACCESS_EFFICIENT)
		case 12:
			/* copy the upper 4 bytes, then fall through to copy the lower 8 */
			(void)memcpy(cast_ptr(char *, dest) + 8, cast_ptr(const char *, src) + 8, 4);
#endif
			/*-fallthrough*/
		case 8:
			check_alignment(8)
			(void)memcpy(dest, src, 8);
			return;
		case 16:
			check_alignment(16)
			(void)memcpy(dest, src, 16);
			return;
	}
#undef al
#undef check_alignment
#endif
	(void)memcpy(dest, src, size);
}
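
/*
 * half_to_float - convert an IEEE 754 binary16 bit pattern to float
 *
 * Uses the F16C instruction vcvtph2ps on x86 or the VFP instruction
 * vcvtb.f32.f16 on ARM when the CPU supports them.  The software fallback
 * decodes normal numbers (via a union bit trick that rebases the exponent
 * by 127 - 15 = 112, or by multiplication), subnormals, infinity and NaN,
 * and finally applies the sign bit.
 */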
float half_to_float(uint16_t attr_unused x)
{
#if !REAL_MASK
	return 0;
#else
	float res;
	uint16_t pos;

#if defined(INLINE_ASM_GCC_X86) && defined(HAVE_X86_ASSEMBLER_F16C)
	if (likely(cpu_test_feature(CPU_FEATURE_f16c))) {
		float r;
#ifdef __SSE__
		__asm__ ("vmovd %k1, %0; vcvtph2ps %0, %0" : "=x"(r) : "r"(x));
#else
		__asm__ ("vmovd %k1, %%xmm0; vcvtph2ps %%xmm0, %%xmm0; vmovss %%xmm0, %0" : "=m"(r) : "r"(x));
#endif
		return r;
	}
#endif

#if defined(INLINE_ASM_GCC_ARM) && defined(HAVE_ARM_ASSEMBLER_HALF_PRECISION)
	if (likely(cpu_test_feature(CPU_FEATURE_half))) {
#if defined(__SOFTFP__) || (CLANG_ATLEAST(0,0,0) && !CLANG_ATLEAST(6,0,0))
		__asm__ (ARM_ASM_PREFIX "vmov s0, %1; vcvtb.f32.f16 s0, s0; vmov %0, s0" : "=r"(res) : "r"((uint32_t)x) : "s0");
#else
		__asm__ (ARM_ASM_PREFIX "vcvtb.f32.f16 %0, %1" : "=t"(res) : "t"((uint32_t)x));
#endif
		return res;
	}
#endif

	res = 0;
	pos = x & 0x7fff;
	if (likely((uint16_t)(pos - 0x400) < 0x7800)) {
		/* normal number */
#if defined(HAVE_UNION_FLOAT_UINT32_T) && !defined(UNUSUAL)
		union {
			float f;
			uint32_t i;
		} u;
		u.i = ((uint32_t)(x & (uint32_t)0x8000UL) << 16) | ((pos + (uint32_t)0x1c000UL) << 13);
		return u.f;
#else
		res = (float)((x & 0x3ff) | 0x400) * (float)(1. / (1L << 25)) * (float)((int32_t)1 << (pos >> 10));
#endif
	} else if (pos < 0x400) {
		/* zero or subnormal */
		res = (float)pos * (float)(1. / (1L << 24));
	} else if (pos == 0x7c00) {
		/* infinity */
#ifdef HUGE_VAL
		res = HUGE_VAL;
#else
		res = 1. / 0.;
#endif
	} else {
		/* NaN */
#ifdef NAN
		res = NAN;
#else
		double z = 0.;
		res = z / z;
#endif
	}
#if defined(HAVE_COPYSIGNF) && (defined(__x86_64__) ^ defined(UNUSUAL_ARITHMETICS))
	res = copysignf(res, (float)(int16_t)x);
#else
	if (unlikely((int16_t)x < 0))
		res = -res;
#endif
	return res;
#endif
}
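
/*
 * float_to_half - convert a float to an IEEE 754 binary16 bit pattern
 *
 * Uses vcvtps2ph (F16C) on x86 or vcvtb.f16.f32 on ARM when available.
 * The software fallback extracts the sign, converts NaN to a quiet NaN,
 * values of 65520 and above to infinity, small values to subnormals, and
 * rounds the mantissa of normal numbers to nearest (ties to even under
 * the default rounding mode).
 */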
uint16_t float_to_half(float attr_unused x)
{
#if !REAL_MASK
	return 0;
#else
	float a, mant;
	float limit;
	uint16_t res;

#if defined(INLINE_ASM_GCC_X86) && defined(HAVE_X86_ASSEMBLER_F16C)
	if (likely(cpu_test_feature(CPU_FEATURE_f16c))) {
		uint32_t r;
#ifdef __SSE__
		__asm__ ("vcvtps2ph $4, %1, %1; vmovd %1, %0" : "=r"(r), "+x"(x));
#else
		__asm__ ("vmovss %1, %%xmm0; vcvtps2ph $4, %%xmm0, %%xmm0; vmovd %%xmm0, %0" : "=r"(r) : "m"(x));
#endif
		return r;
	}
#endif

#if defined(INLINE_ASM_GCC_ARM) && defined(HAVE_ARM_ASSEMBLER_HALF_PRECISION)
	if (likely(cpu_test_feature(CPU_FEATURE_half))) {
		uint32_t r;
#if defined(__SOFTFP__) || (CLANG_ATLEAST(0,0,0) && !CLANG_ATLEAST(6,0,0))
		__asm__ (ARM_ASM_PREFIX "vmov s0, %1; vcvtb.f16.f32 s0, s0; vmov %0, s0" : "=r"(r) : "r"(x) : "s0");
#else
		__asm__ (ARM_ASM_PREFIX "vcvtb.f16.f32 %1, %1; vmov %0, %1" : "=r"(r), "+t"(x));
#endif
		return r;
	}
#endif

	res = (uint16_t)!!signbit(x) << 15;
	a = fabs(x);
	limit = 65520.;
#if defined(use_is_macros) && !defined(UNUSUAL_ARITHMETICS)
	if (unlikely(isunordered(a, limit)))
#else
	if (unlikely(isnan_real32_t(a)))
#endif
	{
		/* NaN: produce a quiet NaN, keeping the sign */
		res |= 0x200;
		goto inf;
	}
#if defined(use_is_macros) && !defined(UNUSUAL_ARITHMETICS)
	if (unlikely(isgreaterequal(a, limit)))
#else
	if (unlikely(a >= limit))
#endif
	{
		/* too large to represent: return infinity */
inf:
		res |= 0x7c00;
	} else if (unlikely(a < (float)(1. / (1 << 14)))) {
		/* subnormal or zero: scale so that the integer part is the mantissa */
		mant = a * (float)(1L << 24);
		res |= 0x400;
		goto do_round;
	} else {
		int ex, im;
#if defined(HAVE_UNION_FLOAT_UINT32_T) && !defined(UNUSUAL)
		union {
			float f;
			uint32_t i;
		} u;
		u.f = a;
		ex = (u.i >> 23) - 126;
		u.i &= 0x007fffffUL;
		u.i |= 0x44800000UL;
		mant = u.f;
#else
		mant = frexpf(a, &ex);
		mant *= 1 << 11;
#endif
		res += (ex + 14) << 10;
do_round:
#if defined(INLINE_ASM_GCC_X86) && defined(HAVE_X86_ASSEMBLER_SSE) && static_test_sse
		__asm__ (X86_ASM_V"cvtss2si %1, %0" : "=r"(im) :
#ifdef __SSE__
			"x"X86_ASM_M
#else
			"m"
#endif
			(mant));
#elif defined(HAVE_LRINTF) && !defined(UNUSUAL_ARITHMETICS)
		im = (int)lrintf(mant);
#else
		im = (int)mant;
		mant -= (float)im;
		if (mant > 0.5 || (unlikely(mant == 0.5) && im & 1))
			im++;
#endif
		im -= 0x400;
		res += im;
	}
	return res;
#endif
}
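
/*
 * signbit_d - return a nonzero value if the sign bit of "d" is set
 *
 * Compiled only when need_signbit_d is defined.  It prefers copysign();
 * the last-resort fallback prints the number and checks for a leading
 * '-', which also handles negative zero.
 */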
#ifdef need_signbit_d
int signbit_d(double d)
{
#ifdef HAVE_COPYSIGN
	return copysign(1, d) < 0;
#else
	char s[256];
	if (likely(d > 0)) return 0;
	if (likely(d < 0)) return 1;
	sprintf(s, "%f", d);
	return s[0] == '-';
#endif
}
#endif
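
/*
 * decode_opcode - return the human-readable name of bytecode opcode "o"
 *
 * The opcode -> name table is generated by expanding ipret.inc with the
 * DEFINE_OPCODE_START_LBL macro below.  Successful lookups are cached in
 * the rmap array, which stores table index + 1 (0 means "not cached yet").
 * For an opcode that is not in the table, the function returns NULL when
 * allow_invalid is true and reports an internal error otherwise.
 */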
#define DEFINE_OPCODE_START_LBL(opcode, lbl)				\
	{ (opcode) + ARG_MODE * OPCODE_MODE_MULT, stringify(lbl) },
const char attr_cold *decode_opcode(code_t o, bool allow_invalid)
{
	static const struct {
		code_t opcode;
		const char *string;
	} table[] = {
#include "ipret.inc"
	};
	static atomic_type code_t rmap[ARG_MODE_N * OPCODE_MODE_MULT];

	code_t i;

	if (unlikely(o >= n_array_elements(rmap)))
		goto unknown;

	if (likely(rmap[o]))
		return table[rmap[o] - 1].string;

	for (i = 0; i < n_array_elements(table); i++) {
		if (unlikely(table[i].opcode == o)) {
			rmap[o] = i + 1;
			return table[i].string;
		}
	}

unknown:
	if (!allow_invalid) {
		/*for (i = 0; i < n_array_elements(table); i++) {
			debug("%04x - %s", table[i].opcode, table[i].string);
		}*/
		internal(file_line, "decode_opcode: invalid opcode %04x", o);
	}
	return NULL;
}