/*
** ARM64 instruction emitter.
** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
**
** Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com.
** Sponsored by Cisco Systems, Inc.
*/
9 /* -- Constant encoding --------------------------------------------------- */
11 static uint64_t get_k64val(ASMState
*as
, IRRef ref
)
14 if (ir
->o
== IR_KINT64
) {
15 return ir_kint64(ir
)->u64
;
16 } else if (ir
->o
== IR_KGC
) {
17 return (uint64_t)ir_kgc(ir
);
18 } else if (ir
->o
== IR_KPTR
|| ir
->o
== IR_KKPTR
) {
19 return (uint64_t)ir_kptr(ir
);
21 lj_assertA(ir
->o
== IR_KINT
|| ir
->o
== IR_KNULL
,
22 "bad 64 bit const IR op %d", ir
->o
);
23 return (uint32_t)ir
->i
; /* Zero-extended. */
27 /* Encode constant in K12 format for data processing instructions. */
28 static uint32_t emit_isk12(int64_t n
)
30 uint64_t k
= n
< 0 ? ~(uint64_t)n
+1u : (uint64_t)n
;
31 uint32_t m
= n
< 0 ? 0x40000000 : 0;
33 return (uint32_t)(A64I_K12
|m
|A64F_U12(k
));
34 } else if ((k
& 0xfff000) == k
) {
35 return (uint32_t)(A64I_K12
|m
|0x400000|A64F_U12(k
>>12));
/* Count leading/trailing zeros of a non-zero 64 bit value. */
#define emit_clz64(n)	(lj_fls64(n)^63)
#define emit_ctz64(n)	lj_ffs64(n)
43 /* Encode constant in K13 format for logical data processing instructions. */
44 static uint32_t emit_isk13(uint64_t n
, int is64
)
46 /* Thanks to: https://dougallj.wordpress.com/2021/10/30/ */
47 int rot
, ones
, size
, immr
, imms
;
48 if (!is64
) n
= ((uint64_t)n
<< 32) | (uint32_t)n
;
49 if ((n
+1u) <= 1u) return 0; /* Neither all-zero nor all-ones are allowed. */
50 rot
= (n
& (n
+1u)) ? emit_ctz64(n
& (n
+1u)) : 64;
51 n
= lj_ror(n
, rot
& 63);
52 ones
= emit_ctz64(~n
);
53 size
= emit_clz64(n
) + ones
;
54 if (lj_ror(n
, size
& 63) != n
) return 0; /* Non-repeating? */
55 immr
= -rot
& (size
- 1);
56 imms
= (-(size
<< 1) | (ones
- 1)) & 63;
57 return A64I_K13
| A64F_IMMR(immr
| (size
& 64)) | A64F_IMMS(imms
);
/* Encode a double as an 8 bit FP immediate, or return ~0u if not encodable. */
static uint32_t emit_isfpk64(uint64_t n)
{
  uint64_t etop9 = ((n >> 54) & 0x1ff);
  /* Mantissa must fit in 4 bits, exponent in the small biased range. */
  if ((n << 16) == 0 && (etop9 == 0x100 || etop9 == 0x0ff)) {
    return (uint32_t)(((n >> 48) & 0x7f) | ((n >> 56) & 0x80));
  }
  return ~0u;
}
/* Encode a byte mask for MOVI (vector immediate), or return 0 if impossible. */
static uint32_t emit_isfpmovi(uint64_t n)
{
  /* Is every byte either 0x00 or 0xff? */
  if ((n & U64x(01010101,01010101)) * 0xff != n) return 0;
  /* Form 8-bit value by taking one bit from each byte. */
  n &= U64x(80402010,08040201);
  n = (n * U64x(01010101,01010101)) >> 56;
  /* Split into the format expected by movi. */
  return ((n & 0xe0) << 6) | 0x700 | (n & 0x1f);
}
80 /* -- Emit basic instructions --------------------------------------------- */
82 static void emit_dnma(ASMState
*as
, A64Ins ai
, Reg rd
, Reg rn
, Reg rm
, Reg ra
)
84 *--as
->mcp
= ai
| A64F_D(rd
) | A64F_N(rn
) | A64F_M(rm
) | A64F_A(ra
);
87 static void emit_dnm(ASMState
*as
, A64Ins ai
, Reg rd
, Reg rn
, Reg rm
)
89 *--as
->mcp
= ai
| A64F_D(rd
) | A64F_N(rn
) | A64F_M(rm
);
92 static void emit_dm(ASMState
*as
, A64Ins ai
, Reg rd
, Reg rm
)
94 *--as
->mcp
= ai
| A64F_D(rd
) | A64F_M(rm
);
97 static void emit_dn(ASMState
*as
, A64Ins ai
, Reg rd
, Reg rn
)
99 *--as
->mcp
= ai
| A64F_D(rd
) | A64F_N(rn
);
102 static void emit_nm(ASMState
*as
, A64Ins ai
, Reg rn
, Reg rm
)
104 *--as
->mcp
= ai
| A64F_N(rn
) | A64F_M(rm
);
107 static void emit_d(ASMState
*as
, A64Ins ai
, Reg rd
)
109 *--as
->mcp
= ai
| A64F_D(rd
);
112 static void emit_dl(ASMState
*as
, A64Ins ai
, Reg rd
, uint32_t l
)
114 *--as
->mcp
= ai
| A64F_D(rd
) | A64F_S19(l
>> 2);
117 static void emit_n(ASMState
*as
, A64Ins ai
, Reg rn
)
119 *--as
->mcp
= ai
| A64F_N(rn
);
122 static int emit_checkofs(A64Ins ai
, int64_t ofs
)
124 int scale
= (ai
>> 30) & 3;
125 if (ofs
< 0 || (ofs
& ((1<<scale
)-1))) {
126 return (ofs
>= -256 && ofs
<= 255) ? -1 : 0;
128 return (ofs
< (4096<<scale
)) ? 1 : 0;
132 static LJ_AINLINE
uint32_t emit_lso_pair_candidate(A64Ins ai
, int ofs
, int sc
)
135 return ai
| A64F_U12(ofs
>>sc
); /* Subsequent lj_ror checks ofs. */
136 } else if (ofs
>= -256) {
137 return (ai
^A64I_LS_U
) | A64F_S9(ofs
& 0x1ff);
139 return A64F_D(31); /* Will mismatch prev. */
143 static void emit_lso(ASMState
*as
, A64Ins ai
, Reg rd
, Reg rn
, int64_t ofs64
)
145 int ot
= emit_checkofs(ai
, ofs64
), sc
= (ai
>> 30) & 3, ofs
= (int)ofs64
;
146 lj_assertA(ot
, "load/store offset %d out of range", ofs
);
147 /* Combine LDR/STR pairs to LDP/STP. */
148 if ((sc
== 2 || sc
== 3) &&
149 (!(ai
& 0x400000) || rd
!= rn
) &&
150 as
->mcp
!= as
->mcloop
) {
151 uint32_t prev
= *as
->mcp
& ~A64F_D(31);
152 int ofsm
= ofs
- (1<<sc
), ofsp
= ofs
+ (1<<sc
);
154 if (prev
== emit_lso_pair_candidate(ai
| A64F_N(rn
), ofsm
, sc
)) {
155 aip
= (A64F_A(rd
) | A64F_D(*as
->mcp
& 31));
156 } else if (prev
== emit_lso_pair_candidate(ai
| A64F_N(rn
), ofsp
, sc
)) {
157 aip
= (A64F_D(rd
) | A64F_A(*as
->mcp
& 31));
162 if (lj_ror((unsigned int)ofsm
+ (64u<<sc
), sc
) <= 127u) {
163 *as
->mcp
= aip
| A64F_N(rn
) | (((ofsm
>> sc
) & 0x7f) << 15) |
164 (ai
^ ((ai
== A64I_LDRx
|| ai
== A64I_STRx
) ? 0x50000000 : 0x90000000));
170 *--as
->mcp
= ai
| A64F_D(rd
) | A64F_N(rn
) | A64F_U12(ofs
>> sc
);
172 *--as
->mcp
= (ai
^A64I_LS_U
) | A64F_D(rd
) | A64F_N(rn
) | A64F_S9(ofs
& 0x1ff);
/* -- Emit loads/stores --------------------------------------------------- */

/* Prefer rematerialization of BASE/L from global_State over spills. */
#define emit_canremat(ref)	((ref) <= REF_BASE)
180 /* Try to find a one-step delta relative to other consts. */
181 static int emit_kdelta(ASMState
*as
, Reg rd
, uint64_t k
, int is64
)
183 RegSet work
= (~as
->freeset
& RSET_GPR
) | RID2RSET(RID_GL
);
185 Reg r
= rset_picktop(work
);
186 IRRef ref
= regcost_ref(as
->cost
[r
]);
187 lj_assertA(r
!= rd
, "dest reg %d not free", rd
);
188 if (ref
< REF_TRUE
) {
189 uint64_t kx
= ra_iskref(ref
) ? (uint64_t)ra_krefk(as
, ref
) :
191 int64_t delta
= (int64_t)(k
- kx
);
192 if (!is64
) delta
= (int64_t)(int32_t)delta
; /* Sign-extend. */
194 emit_dm(as
, is64
|A64I_MOVw
, rd
, r
);
197 uint32_t k12
= emit_isk12(delta
< 0 ? (int64_t)(~(uint64_t)delta
+1u) : delta
);
199 emit_dn(as
, (delta
< 0 ? A64I_SUBw
: A64I_ADDw
)^is64
^k12
, rd
, r
);
202 /* Do other ops or multi-step deltas pay off? Probably not.
203 ** E.g. XOR rarely helps with pointer consts.
209 return 0; /* Failed. */
/* Offset of address k relative to the global_State of this trace. */
#define glofs(as, k) \
  ((intptr_t)((uintptr_t)(k) - (uintptr_t)&J2GG(as->J)->g))
/* Offset of address k relative to the next instruction to be emitted. */
#define mcpofs(as, k) \
  ((intptr_t)((uintptr_t)(k) - (uintptr_t)(as->mcp - 1)))
/* Check PC-relative offset fits the signed 19 bit word-scaled S19 field. */
#define checkmcpofs(as, k) \
  (A64F_S_OK(mcpofs(as, k)>>2, 19))
219 /* Try to form a const as ADR or ADRP or ADRP + ADD. */
220 static int emit_kadrp(ASMState
*as
, Reg rd
, uint64_t k
)
222 A64Ins ai
= A64I_ADR
;
223 int64_t ofs
= mcpofs(as
, k
);
224 if (!A64F_S_OK((uint64_t)ofs
, 21)) {
225 uint64_t kpage
= k
& ~0xfffull
;
226 MCode
*adrp
= as
->mcp
- 1 - (k
!= kpage
);
227 ofs
= (int64_t)(kpage
- ((uint64_t)adrp
& ~0xfffull
)) >> 12;
228 if (!A64F_S_OK(ofs
, 21))
229 return 0; /* Failed. */
231 emit_dn(as
, (A64I_ADDx
^A64I_K12
)|A64F_U12(k
- kpage
), rd
, rd
);
234 emit_dl(as
, ai
|(((uint32_t)ofs
&3)<<29), rd
, ofs
);
238 static void emit_loadk(ASMState
*as
, Reg rd
, uint64_t u64
)
240 int zeros
= 0, ones
= 0, neg
, lshift
= 0;
241 int is64
= (u64
>> 32) ? A64I_X
: 0, i
= is64
? 4 : 2;
242 /* Count non-homogeneous 16 bit fragments. */
244 uint32_t frag
= (u64
>> i
*16) & 0xffff;
245 zeros
+= (frag
!= 0);
246 ones
+= (frag
!= 0xffff);
248 neg
= ones
< zeros
; /* Use MOVN if it pays off. */
249 if ((neg
? ones
: zeros
) > 1) { /* Need 2+ ins. Try 1 ins encodings. */
250 uint32_t k13
= emit_isk13(u64
, is64
);
252 emit_dn(as
, (is64
|A64I_ORRw
)^k13
, rd
, RID_ZERO
);
255 if (emit_kdelta(as
, rd
, u64
, is64
)) {
258 if (emit_kadrp(as
, rd
, u64
)) { /* Either 1 or 2 ins. */
264 if (!is64
) u64
= (uint32_t)u64
;
267 /* Find first/last fragment to be filled. */
268 int shift
= (63-emit_clz64(u64
)) & ~15;
269 lshift
= emit_ctz64(u64
) & ~15;
270 for (; shift
> lshift
; shift
-= 16) {
271 uint32_t frag
= (u64
>> shift
) & 0xffff;
272 if (frag
== 0) continue; /* Will be correctly filled by MOVN/MOVZ. */
273 if (neg
) frag
^= 0xffff; /* MOVK requires the original value. */
274 emit_d(as
, is64
| A64I_MOVKw
| A64F_U16(frag
) | A64F_LSL16(shift
), rd
);
277 /* But MOVN needs an inverted value. */
278 emit_d(as
, is64
| (neg
? A64I_MOVNw
: A64I_MOVZw
) |
279 A64F_U16((u64
>> lshift
) & 0xffff) | A64F_LSL16(lshift
), rd
);
/* Load a 32 bit constant into a GPR. */
#define emit_loadi(as, rd, i)	emit_loadk(as, rd, (uint32_t)i)

/* Load a 64 bit constant into a GPR. */
#define emit_loadu64(as, rd, i)	emit_loadk(as, rd, i)
288 static Reg
ra_allock(ASMState
*as
, intptr_t k
, RegSet allow
);
290 /* Get/set from constant pointer. */
291 static void emit_lsptr(ASMState
*as
, A64Ins ai
, Reg r
, void *p
)
294 int64_t ofs
= glofs(as
, p
);
295 if (emit_checkofs(ai
, ofs
)) {
296 /* GL + offset, might subsequently fuse to LDP/STP. */
297 } else if (ai
== A64I_LDRx
&& checkmcpofs(as
, p
)) {
298 /* IP + offset is cheaper than allock, but address must be in range. */
299 emit_dl(as
, A64I_LDRLx
, r
, mcpofs(as
, p
));
301 } else { /* Split up into base reg + offset. */
302 int64_t i64
= i64ptr(p
);
303 base
= ra_allock(as
, (i64
& ~0x7fffull
), rset_exclude(RSET_GPR
, r
));
304 ofs
= i64
& 0x7fffull
;
306 emit_lso(as
, ai
, r
, base
, ofs
);
309 /* Load 64 bit IR constant into register. */
310 static void emit_loadk64(ASMState
*as
, Reg r
, IRIns
*ir
)
312 const uint64_t *k
= &ir_k64(ir
)->u64
;
314 if (r
>= RID_MAX_GPR
) {
315 uint32_t fpk
= emit_isfpk64(*k
);
317 emit_d(as
, A64I_FMOV_DI
| A64F_FP8(fpk
), (r
& 31));
319 } else if ((fpk
= emit_isfpmovi(*k
))) {
320 emit_d(as
, A64I_MOVI_DI
| (fpk
<< 5), (r
& 31));
325 if (emit_checkofs(A64I_LDRx
, ofs
)) {
326 emit_lso(as
, r
>= RID_MAX_GPR
? A64I_LDRd
: A64I_LDRx
,
327 (r
& 31), RID_GL
, ofs
);
328 } else if (checkmcpofs(as
, k
)) {
329 emit_dl(as
, r
>= RID_MAX_GPR
? A64I_LDRLd
: A64I_LDRLx
,
330 (r
& 31), mcpofs(as
, k
));
332 if (r
>= RID_MAX_GPR
) {
333 emit_dn(as
, A64I_FMOV_D_R
, (r
& 31), RID_TMP
);
336 emit_loadu64(as
, r
, *k
);
/* Get/set global_State fields. */
#define emit_getgl(as, r, field) \
  emit_lsptr(as, A64I_LDRx, (r), (void *)&J2G(as->J)->field)
#define emit_setgl(as, r, field) \
  emit_lsptr(as, A64I_STRx, (r), (void *)&J2G(as->J)->field)

/* Trace number is determined from pc of exit instruction. */
#define emit_setvmstate(as, i)	UNUSED(i)
349 /* -- Emit control-flow instructions -------------------------------------- */
351 /* Label for internal jumps. */
352 typedef MCode
*MCLabel
;
354 /* Return label pointing to current PC. */
355 #define emit_label(as) ((as)->mcp)
357 static void emit_cond_branch(ASMState
*as
, A64CC cond
, MCode
*target
)
359 MCode
*p
= --as
->mcp
;
360 ptrdiff_t delta
= target
- p
;
361 lj_assertA(A64F_S_OK(delta
, 19), "branch target out of range");
362 *p
= A64I_BCC
| A64F_S19(delta
) | cond
;
365 static void emit_branch(ASMState
*as
, A64Ins ai
, MCode
*target
)
367 MCode
*p
= --as
->mcp
;
368 ptrdiff_t delta
= target
- p
;
369 lj_assertA(A64F_S_OK(delta
, 26), "branch target out of range");
370 *p
= ai
| A64F_S26(delta
);
373 static void emit_tnb(ASMState
*as
, A64Ins ai
, Reg r
, uint32_t bit
, MCode
*target
)
375 MCode
*p
= --as
->mcp
;
376 ptrdiff_t delta
= target
- p
;
377 lj_assertA(bit
< 63, "bit number out of range");
378 lj_assertA(A64F_S_OK(delta
, 14), "branch target out of range");
379 if (bit
> 31) ai
|= A64I_X
;
380 *p
= ai
| A64F_BIT(bit
& 31) | A64F_S14(delta
) | r
;
383 static void emit_cnb(ASMState
*as
, A64Ins ai
, Reg r
, MCode
*target
)
385 MCode
*p
= --as
->mcp
;
386 ptrdiff_t delta
= target
- p
;
387 lj_assertA(A64F_S_OK(delta
, 19), "branch target out of range");
388 *p
= ai
| A64F_S19(delta
) | r
;
/* Emit an unconditional jump (B) to target. */
#define emit_jmp(as, target)	emit_branch(as, A64I_B, (target))
393 static void emit_call(ASMState
*as
, ASMFunction target
)
395 MCode
*p
= --as
->mcp
;
397 char *targetp
= ptrauth_auth_data((char *)target
,
398 ptrauth_key_function_pointer
, 0);
400 char *targetp
= (char *)target
;
402 ptrdiff_t delta
= targetp
- (char *)p
;
403 if (A64F_S_OK(delta
>>2, 26)) {
404 *p
= A64I_BL
| A64F_S26(delta
>>2);
405 } else { /* Target out of range: need indirect call. But don't use R0-R7. */
406 Reg r
= ra_allock(as
, i64ptr(target
),
407 RSET_RANGE(RID_X8
, RID_MAX_GPR
)-RSET_FIXED
);
408 *p
= A64I_BLR_AUTH
| A64F_N(r
);
412 /* -- Emit generic operations --------------------------------------------- */
414 /* Generic move between two regs. */
415 static void emit_movrr(ASMState
*as
, IRIns
*ir
, Reg dst
, Reg src
)
417 if (dst
>= RID_MAX_GPR
) {
418 emit_dn(as
, irt_isnum(ir
->t
) ? A64I_FMOV_D
: A64I_FMOV_S
,
419 (dst
& 31), (src
& 31));
422 if (as
->mcp
!= as
->mcloop
) { /* Swap early registers for loads/stores. */
423 MCode ins
= *as
->mcp
, swp
= (src
^dst
);
424 if ((ins
& 0xbf800000) == 0xb9000000) {
425 if (!((ins
^ (dst
<< 5)) & 0x000003e0))
426 *as
->mcp
= ins
^ (swp
<< 5); /* Swap N in load/store. */
427 if (!(ins
& 0x00400000) && !((ins
^ dst
) & 0x0000001f))
428 *as
->mcp
= ins
^ swp
; /* Swap D in store. */
431 emit_dm(as
, A64I_MOVx
, dst
, src
);
434 /* Generic load of register with base and (small) offset address. */
435 static void emit_loadofs(ASMState
*as
, IRIns
*ir
, Reg r
, Reg base
, int32_t ofs
)
437 if (r
>= RID_MAX_GPR
)
438 emit_lso(as
, irt_isnum(ir
->t
) ? A64I_LDRd
: A64I_LDRs
, (r
& 31), base
, ofs
);
440 emit_lso(as
, irt_is64(ir
->t
) ? A64I_LDRx
: A64I_LDRw
, r
, base
, ofs
);
443 /* Generic store of register with base and (small) offset address. */
444 static void emit_storeofs(ASMState
*as
, IRIns
*ir
, Reg r
, Reg base
, int32_t ofs
)
446 if (r
>= RID_MAX_GPR
)
447 emit_lso(as
, irt_isnum(ir
->t
) ? A64I_STRd
: A64I_STRs
, (r
& 31), base
, ofs
);
449 emit_lso(as
, irt_is64(ir
->t
) ? A64I_STRx
: A64I_STRw
, r
, base
, ofs
);
452 /* Emit an arithmetic operation with a constant operand. */
453 static void emit_opk(ASMState
*as
, A64Ins ai
, Reg dest
, Reg src
,
454 int32_t i
, RegSet allow
)
456 uint32_t k
= emit_isk12(i
);
458 emit_dn(as
, ai
^k
, dest
, src
);
460 emit_dnm(as
, ai
, dest
, src
, ra_allock(as
, i
, allow
));
463 /* Add offset to pointer. */
464 static void emit_addptr(ASMState
*as
, Reg r
, int32_t ofs
)
467 emit_opk(as
, ofs
< 0 ? A64I_SUBx
: A64I_ADDx
, r
, r
,
468 ofs
< 0 ? (int32_t)(~(uint32_t)ofs
+1u) : ofs
,
469 rset_exclude(RSET_GPR
, r
));
/* Grow the stack frame: SP -= ofs (negated offset via emit_addptr). */
#define emit_spsub(as, ofs)	emit_addptr(as, RID_SP, -(ofs))