/*
** ARM64 instruction emitter.
** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
**
** Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com.
** Sponsored by Cisco Systems, Inc.
*/
9 /* -- Constant encoding --------------------------------------------------- */
11 static uint64_t get_k64val(ASMState
*as
, IRRef ref
)
14 if (ir
->o
== IR_KINT64
) {
15 return ir_kint64(ir
)->u64
;
16 } else if (ir
->o
== IR_KGC
) {
17 return (uint64_t)ir_kgc(ir
);
18 } else if (ir
->o
== IR_KPTR
|| ir
->o
== IR_KKPTR
) {
19 return (uint64_t)ir_kptr(ir
);
21 lj_assertA(ir
->o
== IR_KINT
|| ir
->o
== IR_KNULL
,
22 "bad 64 bit const IR op %d", ir
->o
);
23 return (uint32_t)ir
->i
; /* Zero-extended. */
27 /* Encode constant in K12 format for data processing instructions. */
28 static uint32_t emit_isk12(int64_t n
)
30 uint64_t k
= n
< 0 ? ~(uint64_t)n
+1u : (uint64_t)n
;
31 uint32_t m
= n
< 0 ? 0x40000000 : 0;
33 return (uint32_t)(A64I_K12
|m
|A64F_U12(k
));
34 } else if ((k
& 0xfff000) == k
) {
35 return (uint32_t)(A64I_K12
|m
|0x400000|A64F_U12(k
>>12));
/* Count leading/trailing zeros of a non-zero 64 bit value. */
#define emit_clz64(n)	(lj_fls64(n)^63)
#define emit_ctz64(n)	lj_ffs64(n)
43 /* Encode constant in K13 format for logical data processing instructions. */
44 static uint32_t emit_isk13(uint64_t n
, int is64
)
46 /* Thanks to: https://dougallj.wordpress.com/2021/10/30/ */
47 int rot
, ones
, size
, immr
, imms
;
48 if (!is64
) n
= ((uint64_t)n
<< 32) | (uint32_t)n
;
49 if ((n
+1u) <= 1u) return 0; /* Neither all-zero nor all-ones are allowed. */
50 rot
= (n
& (n
+1u)) ? emit_ctz64(n
& (n
+1u)) : 64;
51 n
= lj_ror(n
, rot
& 63);
52 ones
= emit_ctz64(~n
);
53 size
= emit_clz64(n
) + ones
;
54 if (lj_ror(n
, size
& 63) != n
) return 0; /* Non-repeating? */
55 immr
= -rot
& (size
- 1);
56 imms
= (-(size
<< 1) | (ones
- 1)) & 63;
57 return A64I_K13
| A64F_IMMR(immr
| (size
& 64)) | A64F_IMMS(imms
);
/* Encode a double as an 8 bit FP immediate, or return ~0u if not encodable. */
static uint32_t emit_isfpk64(uint64_t n)
{
  uint64_t etop9 = ((n >> 54) & 0x1ff);
  /* Mantissa must fit in 4 bits, exponent in the small biased range. */
  if ((n << 16) == 0 && (etop9 == 0x100 || etop9 == 0x0ff)) {
    return (uint32_t)(((n >> 48) & 0x7f) | ((n >> 56) & 0x80));
  }
  return ~0u;
}
/* Encode a byte mask for MOVI (vector immediate), or return 0 if impossible. */
static uint32_t emit_isfpmovi(uint64_t n)
{
  /* Is every byte either 0x00 or 0xff? */
  if ((n & U64x(01010101,01010101)) * 0xff != n) return 0;
  /* Form 8-bit value by taking one bit from each byte. */
  n &= U64x(80402010,08040201);
  n = (n * U64x(01010101,01010101)) >> 56;
  /* Split into the format expected by movi. */
  return ((n & 0xe0) << 6) | 0x700 | (n & 0x1f);
}
80 /* -- Emit basic instructions --------------------------------------------- */
82 static void emit_dnma(ASMState
*as
, A64Ins ai
, Reg rd
, Reg rn
, Reg rm
, Reg ra
)
84 *--as
->mcp
= ai
| A64F_D(rd
) | A64F_N(rn
) | A64F_M(rm
) | A64F_A(ra
);
87 static void emit_dnm(ASMState
*as
, A64Ins ai
, Reg rd
, Reg rn
, Reg rm
)
89 *--as
->mcp
= ai
| A64F_D(rd
) | A64F_N(rn
) | A64F_M(rm
);
92 static void emit_dm(ASMState
*as
, A64Ins ai
, Reg rd
, Reg rm
)
94 *--as
->mcp
= ai
| A64F_D(rd
) | A64F_M(rm
);
97 static void emit_dn(ASMState
*as
, A64Ins ai
, Reg rd
, Reg rn
)
99 *--as
->mcp
= ai
| A64F_D(rd
) | A64F_N(rn
);
102 static void emit_nm(ASMState
*as
, A64Ins ai
, Reg rn
, Reg rm
)
104 *--as
->mcp
= ai
| A64F_N(rn
) | A64F_M(rm
);
107 static void emit_d(ASMState
*as
, A64Ins ai
, Reg rd
)
109 *--as
->mcp
= ai
| A64F_D(rd
);
112 static void emit_dl(ASMState
*as
, A64Ins ai
, Reg rd
, uint32_t l
)
114 *--as
->mcp
= ai
| A64F_D(rd
) | A64F_S19(l
>> 2);
117 static void emit_n(ASMState
*as
, A64Ins ai
, Reg rn
)
119 *--as
->mcp
= ai
| A64F_N(rn
);
122 static int emit_checkofs(A64Ins ai
, int64_t ofs
)
124 int scale
= (ai
>> 30) & 3;
125 if (ofs
< 0 || (ofs
& ((1<<scale
)-1))) {
126 return (ofs
>= -256 && ofs
<= 255) ? -1 : 0;
128 return (ofs
< (4096<<scale
)) ? 1 : 0;
132 static LJ_AINLINE
uint32_t emit_lso_pair_candidate(A64Ins ai
, int ofs
, int sc
)
135 return ai
| A64F_U12(ofs
>>sc
); /* Subsequent lj_ror checks ofs. */
136 } else if (ofs
>= -256) {
137 return (ai
^A64I_LS_U
) | A64F_S9(ofs
& 0x1ff);
139 return A64F_D(31); /* Will mismatch prev. */
143 static void emit_lso(ASMState
*as
, A64Ins ai
, Reg rd
, Reg rn
, int64_t ofs64
)
145 int ot
= emit_checkofs(ai
, ofs64
), sc
= (ai
>> 30) & 3, ofs
= (int)ofs64
;
146 lj_assertA(ot
, "load/store offset %d out of range", ofs
);
147 /* Combine LDR/STR pairs to LDP/STP. */
148 if ((sc
== 2 || sc
== 3) &&
149 (!(ai
& 0x400000) || rd
!= rn
) &&
150 as
->mcp
!= as
->mcloop
) {
151 uint32_t prev
= *as
->mcp
& ~A64F_D(31);
152 int ofsm
= ofs
- (1<<sc
), ofsp
= ofs
+ (1<<sc
);
154 if (prev
== emit_lso_pair_candidate(ai
| A64F_N(rn
), ofsm
, sc
)) {
155 aip
= (A64F_A(rd
) | A64F_D(*as
->mcp
& 31));
156 } else if (prev
== emit_lso_pair_candidate(ai
| A64F_N(rn
), ofsp
, sc
)) {
157 aip
= (A64F_D(rd
) | A64F_A(*as
->mcp
& 31));
162 if (lj_ror((unsigned int)ofsm
+ (64u<<sc
), sc
) <= 127u) {
163 *as
->mcp
= aip
| A64F_N(rn
) | (((ofsm
>> sc
) & 0x7f) << 15) |
164 (ai
^ ((ai
== A64I_LDRx
|| ai
== A64I_STRx
) ? 0x50000000 : 0x90000000));
170 *--as
->mcp
= ai
| A64F_D(rd
) | A64F_N(rn
) | A64F_U12(ofs
>> sc
);
172 *--as
->mcp
= (ai
^A64I_LS_U
) | A64F_D(rd
) | A64F_N(rn
) | A64F_S9(ofs
& 0x1ff);
/* -- Emit loads/stores --------------------------------------------------- */

/* Prefer rematerialization of BASE/L from global_State over spills. */
#define emit_canremat(ref)	((ref) <= REF_BASE)
180 /* Try to find a one-step delta relative to other consts. */
181 static int emit_kdelta(ASMState
*as
, Reg rd
, uint64_t k
, int is64
)
183 RegSet work
= (~as
->freeset
& RSET_GPR
) | RID2RSET(RID_GL
);
185 Reg r
= rset_picktop(work
);
186 IRRef ref
= regcost_ref(as
->cost
[r
]);
187 lj_assertA(r
!= rd
, "dest reg %d not free", rd
);
188 if (ref
< REF_TRUE
) {
189 uint64_t kx
= ra_iskref(ref
) ? (uint64_t)ra_krefk(as
, ref
) :
191 int64_t delta
= (int64_t)(k
- kx
);
192 if (!is64
) delta
= (int64_t)(int32_t)delta
; /* Sign-extend. */
194 emit_dm(as
, is64
|A64I_MOVw
, rd
, r
);
197 uint32_t k12
= emit_isk12(delta
< 0 ? (int64_t)(~(uint64_t)delta
+1u) : delta
);
199 emit_dn(as
, (delta
< 0 ? A64I_SUBw
: A64I_ADDw
)^is64
^k12
, rd
, r
);
202 /* Do other ops or multi-step deltas pay off? Probably not.
203 ** E.g. XOR rarely helps with pointer consts.
209 return 0; /* Failed. */
/* Offset of address k relative to the global_State of this trace. */
#define glofs(as, k) \
  ((intptr_t)((uintptr_t)(k) - (uintptr_t)&J2GG(as->J)->g))
/* Offset of address k relative to the next instruction to be emitted. */
#define mcpofs(as, k) \
  ((intptr_t)((uintptr_t)(k) - (uintptr_t)(as->mcp - 1)))
/* Check PC-relative offset fits the signed 19 bit word-scaled S19 field. */
#define checkmcpofs(as, k) \
  (A64F_S_OK(mcpofs(as, k)>>2, 19))
219 /* Try to form a const as ADR or ADRP or ADRP + ADD. */
220 static int emit_kadrp(ASMState
*as
, Reg rd
, uint64_t k
)
222 A64Ins ai
= A64I_ADR
;
223 int64_t ofs
= mcpofs(as
, k
);
224 if (!A64F_S_OK((uint64_t)ofs
, 21)) {
225 uint64_t kpage
= k
& ~0xfffull
;
226 MCode
*adrp
= as
->mcp
- 1 - (k
!= kpage
);
227 ofs
= (int64_t)(kpage
- ((uint64_t)adrp
& ~0xfffull
)) >> 12;
228 if (!A64F_S_OK(ofs
, 21))
229 return 0; /* Failed. */
231 emit_dn(as
, (A64I_ADDx
^A64I_K12
)|A64F_U12(k
- kpage
), rd
, rd
);
234 emit_dl(as
, ai
|(((uint32_t)ofs
&3)<<29), rd
, ofs
);
238 static void emit_loadk(ASMState
*as
, Reg rd
, uint64_t u64
)
240 int zeros
= 0, ones
= 0, neg
, lshift
= 0;
241 int is64
= (u64
>> 32) ? A64I_X
: 0, i
= is64
? 4 : 2;
242 /* Count non-homogeneous 16 bit fragments. */
244 uint32_t frag
= (u64
>> i
*16) & 0xffff;
245 zeros
+= (frag
!= 0);
246 ones
+= (frag
!= 0xffff);
248 neg
= ones
< zeros
; /* Use MOVN if it pays off. */
249 if ((neg
? ones
: zeros
) > 1) { /* Need 2+ ins. Try 1 ins encodings. */
250 uint32_t k13
= emit_isk13(u64
, is64
);
252 emit_dn(as
, (is64
|A64I_ORRw
)^k13
, rd
, RID_ZERO
);
255 if (emit_kdelta(as
, rd
, u64
, is64
)) {
258 if (emit_kadrp(as
, rd
, u64
)) { /* Either 1 or 2 ins. */
264 if (!is64
) u64
= (uint32_t)u64
;
267 /* Find first/last fragment to be filled. */
268 int shift
= (63-emit_clz64(u64
)) & ~15;
269 lshift
= emit_ctz64(u64
) & ~15;
270 for (; shift
> lshift
; shift
-= 16) {
271 uint32_t frag
= (u64
>> shift
) & 0xffff;
272 if (frag
== 0) continue; /* Will be correctly filled by MOVN/MOVZ. */
273 if (neg
) frag
^= 0xffff; /* MOVK requires the original value. */
274 emit_d(as
, is64
| A64I_MOVKw
| A64F_U16(frag
) | A64F_LSL16(shift
), rd
);
277 /* But MOVN needs an inverted value. */
278 emit_d(as
, is64
| (neg
? A64I_MOVNw
: A64I_MOVZw
) |
279 A64F_U16((u64
>> lshift
) & 0xffff) | A64F_LSL16(lshift
), rd
);
/* Load a 32 bit constant into a GPR. */
#define emit_loadi(as, rd, i)	emit_loadk(as, rd, (uint32_t)i)

/* Load a 64 bit constant into a GPR. */
#define emit_loadu64(as, rd, i)	emit_loadk(as, rd, i)
288 static Reg
ra_allock(ASMState
*as
, intptr_t k
, RegSet allow
);
290 /* Get/set from constant pointer. */
291 static void emit_lsptr(ASMState
*as
, A64Ins ai
, Reg r
, void *p
)
294 int64_t ofs
= glofs(as
, p
);
295 if (emit_checkofs(ai
, ofs
)) {
296 /* GL + offset, might subsequently fuse to LDP/STP. */
297 } else if (ai
== A64I_LDRx
&& checkmcpofs(as
, p
)) {
298 /* IP + offset is cheaper than allock, but address must be in range. */
299 emit_dl(as
, A64I_LDRLx
, r
, mcpofs(as
, p
));
301 } else { /* Split up into base reg + offset. */
302 int64_t i64
= i64ptr(p
);
303 base
= ra_allock(as
, (i64
& ~0x7fffull
), rset_exclude(RSET_GPR
, r
));
304 ofs
= i64
& 0x7fffull
;
306 emit_lso(as
, ai
, r
, base
, ofs
);
309 /* Load 64 bit IR constant into register. */
310 static void emit_loadk64(ASMState
*as
, Reg r
, IRIns
*ir
)
312 const uint64_t *k
= &ir_k64(ir
)->u64
;
314 if (r
>= RID_MAX_GPR
) {
315 uint32_t fpk
= emit_isfpk64(*k
);
317 emit_d(as
, A64I_FMOV_DI
| A64F_FP8(fpk
), (r
& 31));
319 } else if ((fpk
= emit_isfpmovi(*k
))) {
320 emit_d(as
, A64I_MOVI_DI
| (fpk
<< 5), (r
& 31));
325 if (emit_checkofs(A64I_LDRx
, ofs
)) {
326 emit_lso(as
, r
>= RID_MAX_GPR
? A64I_LDRd
: A64I_LDRx
,
327 (r
& 31), RID_GL
, ofs
);
328 } else if (checkmcpofs(as
, k
)) {
329 emit_dl(as
, r
>= RID_MAX_GPR
? A64I_LDRLd
: A64I_LDRLx
,
330 (r
& 31), mcpofs(as
, k
));
332 if (r
>= RID_MAX_GPR
) {
333 emit_dn(as
, A64I_FMOV_D_R
, (r
& 31), RID_TMP
);
336 emit_loadu64(as
, r
, *k
);
/* Get/set global_State fields. */
#define emit_getgl(as, r, field) \
  emit_lsptr(as, A64I_LDRx, (r), (void *)&J2G(as->J)->field)
#define emit_setgl(as, r, field) \
  emit_lsptr(as, A64I_STRx, (r), (void *)&J2G(as->J)->field)

/* Trace number is determined from pc of exit instruction. */
#define emit_setvmstate(as, i)	UNUSED(i)
349 /* -- Emit control-flow instructions -------------------------------------- */
351 /* Label for internal jumps. */
352 typedef MCode
*MCLabel
;
354 /* Return label pointing to current PC. */
355 #define emit_label(as) ((as)->mcp)
357 static void emit_cond_branch(ASMState
*as
, A64CC cond
, MCode
*target
)
359 MCode
*p
= --as
->mcp
;
360 ptrdiff_t delta
= target
- p
;
361 lj_assertA(A64F_S_OK(delta
, 19), "branch target out of range");
362 *p
= A64I_BCC
| A64F_S19(delta
) | cond
;
365 static void emit_branch(ASMState
*as
, A64Ins ai
, MCode
*target
)
367 MCode
*p
= --as
->mcp
;
368 ptrdiff_t delta
= target
- p
;
369 lj_assertA(A64F_S_OK(delta
, 26), "branch target out of range");
370 *p
= ai
| A64F_S26(delta
);
373 static void emit_tnb(ASMState
*as
, A64Ins ai
, Reg r
, uint32_t bit
, MCode
*target
)
375 MCode
*p
= --as
->mcp
;
376 ptrdiff_t delta
= target
- p
;
377 lj_assertA(bit
< 63, "bit number out of range");
378 lj_assertA(A64F_S_OK(delta
, 14), "branch target out of range");
379 if (bit
> 31) ai
|= A64I_X
;
380 *p
= ai
| A64F_BIT(bit
& 31) | A64F_S14(delta
) | r
;
383 static void emit_cnb(ASMState
*as
, A64Ins ai
, Reg r
, MCode
*target
)
385 MCode
*p
= --as
->mcp
;
386 ptrdiff_t delta
= target
- p
;
387 lj_assertA(A64F_S_OK(delta
, 19), "branch target out of range");
388 *p
= ai
| A64F_S19(delta
) | r
;
/* Emit an unconditional jump (B) to target. */
#define emit_jmp(as, target)	emit_branch(as, A64I_B, (target))
393 static void emit_call(ASMState
*as
, ASMFunction target
)
395 MCode
*p
= --as
->mcp
;
397 char *targetp
= ptrauth_auth_data((char *)target
,
398 ptrauth_key_function_pointer
, 0);
400 char *targetp
= (char *)target
;
402 ptrdiff_t delta
= targetp
- (char *)p
;
403 if (A64F_S_OK(delta
>>2, 26)) {
404 *p
= A64I_BL
| A64F_S26(delta
>>2);
405 } else { /* Target out of range: need indirect call. But don't use R0-R7. */
406 Reg r
= ra_allock(as
, i64ptr(target
),
407 RSET_RANGE(RID_X8
, RID_MAX_GPR
)-RSET_FIXED
);
408 *p
= A64I_BLR_AUTH
| A64F_N(r
);
412 /* -- Emit generic operations --------------------------------------------- */
414 /* Generic move between two regs. */
415 static void emit_movrr(ASMState
*as
, IRIns
*ir
, Reg dst
, Reg src
)
417 if (dst
>= RID_MAX_GPR
) {
418 emit_dn(as
, irt_isnum(ir
->t
) ? A64I_FMOV_D
: A64I_FMOV_S
,
419 (dst
& 31), (src
& 31));
422 if (as
->mcp
!= as
->mcloop
) { /* Swap early registers for loads/stores. */
423 MCode ins
= *as
->mcp
, swp
= (src
^dst
);
424 if ((ins
& 0xbf800000) == 0xb9000000) {
425 if (!((ins
^ (dst
<< 5)) & 0x000003e0))
426 *as
->mcp
= ins
^ (swp
<< 5); /* Swap N in load/store. */
427 if (!(ins
& 0x00400000) && !((ins
^ dst
) & 0x0000001f))
428 *as
->mcp
= ins
^ swp
; /* Swap D in store. */
431 emit_dm(as
, A64I_MOVx
, dst
, src
);
434 /* Generic load of register with base and (small) offset address. */
435 static void emit_loadofs(ASMState
*as
, IRIns
*ir
, Reg r
, Reg base
, int32_t ofs
)
437 if (r
>= RID_MAX_GPR
)
438 emit_lso(as
, irt_isnum(ir
->t
) ? A64I_LDRd
: A64I_LDRs
, (r
& 31), base
, ofs
);
440 emit_lso(as
, irt_is64(ir
->t
) ? A64I_LDRx
: A64I_LDRw
, r
, base
, ofs
);
443 /* Generic store of register with base and (small) offset address. */
444 static void emit_storeofs(ASMState
*as
, IRIns
*ir
, Reg r
, Reg base
, int32_t ofs
)
446 if (r
>= RID_MAX_GPR
)
447 emit_lso(as
, irt_isnum(ir
->t
) ? A64I_STRd
: A64I_STRs
, (r
& 31), base
, ofs
);
449 emit_lso(as
, irt_is64(ir
->t
) ? A64I_STRx
: A64I_STRw
, r
, base
, ofs
);
452 /* Emit an arithmetic operation with a constant operand. */
453 static void emit_opk(ASMState
*as
, A64Ins ai
, Reg dest
, Reg src
,
454 int32_t i
, RegSet allow
)
456 uint32_t k
= emit_isk12(i
);
458 emit_dn(as
, ai
^k
, dest
, src
);
460 emit_dnm(as
, ai
, dest
, src
, ra_allock(as
, i
, allow
));
463 /* Add offset to pointer. */
464 static void emit_addptr(ASMState
*as
, Reg r
, int32_t ofs
)
467 emit_opk(as
, ofs
< 0 ? A64I_SUBx
: A64I_ADDx
, r
, r
,
468 ofs
< 0 ? (int32_t)(~(uint32_t)ofs
+1u) : ofs
,
469 rset_exclude(RSET_GPR
, r
));
/* Grow the stack frame: SP -= ofs (negated offset via emit_addptr). */
#define emit_spsub(as, ofs)	emit_addptr(as, RID_SP, -(ofs))