2 ** x86/x64 IR assembler (SSA IR -> machine code).
3 ** Copyright (C) 2005-2025 Mike Pall. See Copyright Notice in luajit.h
6 /* -- Guard handling ------------------------------------------------------ */
8 /* Generate an exit stub group at the bottom of the reserved MCode memory. */
9 static MCode
*asm_exitstub_gen(ASMState
*as
, ExitNo group
)
11 ExitNo i
, groupofs
= (group
*EXITSTUBS_PER_GROUP
) & 0xff;
12 MCode
*mxp
= as
->mcbot
;
13 MCode
*mxpstart
= mxp
;
14 if (mxp
+ (2+2)*EXITSTUBS_PER_GROUP
+8+5 >= as
->mctop
)
16 /* Push low byte of exitno for each exit stub. */
17 *mxp
++ = XI_PUSHi8
; *mxp
++ = (MCode
)groupofs
;
18 for (i
= 1; i
< EXITSTUBS_PER_GROUP
; i
++) {
19 *mxp
++ = XI_JMPs
; *mxp
++ = (MCode
)((2+2)*(EXITSTUBS_PER_GROUP
- i
) - 2);
20 *mxp
++ = XI_PUSHi8
; *mxp
++ = (MCode
)(groupofs
+ i
);
22 /* Push the high byte of the exitno for each exit stub group. */
23 *mxp
++ = XI_PUSHi8
; *mxp
++ = (MCode
)((group
*EXITSTUBS_PER_GROUP
)>>8);
25 /* Store DISPATCH at original stack slot 0. Account for the two push ops. */
27 *mxp
++ = MODRM(XM_OFS8
, 0, RID_ESP
);
28 *mxp
++ = MODRM(XM_SCALE1
, RID_ESP
, RID_ESP
);
29 *mxp
++ = 2*sizeof(void *);
30 *(int32_t *)mxp
= ptr2addr(J2GG(as
->J
)->dispatch
); mxp
+= 4;
32 /* Jump to exit handler which fills in the ExitState. */
33 *mxp
++ = XI_JMP
; mxp
+= 4;
34 *((int32_t *)(mxp
-4)) = jmprel(as
->J
, mxp
, (MCode
*)(void *)lj_vm_exit_handler
);
35 /* Commit the code for this group (even if assembly fails later on). */
36 lj_mcode_commitbot(as
->J
, mxp
);
38 as
->mclim
= as
->mcbot
+ MCLIM_REDZONE
;
42 /* Setup all needed exit stubs. */
43 static void asm_exitstub_setup(ASMState
*as
, ExitNo nexits
)
46 if (nexits
>= EXITSTUBS_PER_GROUP
*LJ_MAX_EXITSTUBGR
)
47 lj_trace_err(as
->J
, LJ_TRERR_SNAPOV
);
48 for (i
= 0; i
< (nexits
+EXITSTUBS_PER_GROUP
-1)/EXITSTUBS_PER_GROUP
; i
++)
49 if (as
->J
->exitstubgroup
[i
] == NULL
)
50 as
->J
->exitstubgroup
[i
] = asm_exitstub_gen(as
, i
);
53 /* Emit conditional branch to exit for guard.
54 ** It's important to emit this *after* all registers have been allocated,
55 ** because rematerializations may invalidate the flags.
57 static void asm_guardcc(ASMState
*as
, int cc
)
59 MCode
*target
= exitstub_addr(as
->J
, as
->snapno
);
61 if (LJ_UNLIKELY(p
== as
->invmcp
)) {
63 *(int32_t *)(p
+1) = jmprel(as
->J
, p
+5, target
);
67 if (LJ_GC64
&& LJ_UNLIKELY(as
->mrm
.base
== RID_RIP
))
68 as
->mrm
.ofs
+= 2; /* Fixup RIP offset for pending fused load. */
69 emit_sjcc(as
, cc
, target
);
73 if (LJ_GC64
&& LJ_UNLIKELY(as
->mrm
.base
== RID_RIP
))
74 as
->mrm
.ofs
+= 6; /* Fixup RIP offset for pending fused load. */
75 emit_jcc(as
, cc
, target
);
78 /* -- Memory operand fusion ----------------------------------------------- */
80 /* Limit linear search to this distance. Avoids O(n^2) behavior. */
81 #define CONFLICT_SEARCH_LIM 31
83 /* Check if a reference is a signed 32 bit constant. */
84 static int asm_isk32(ASMState
*as
, IRRef ref
, int32_t *k
)
89 if (ir
->o
== IR_KNULL
|| !irt_is64(ir
->t
)) {
92 } else if (checki32((int64_t)ir_k64(ir
)->u64
)) {
93 *k
= (int32_t)ir_k64(ir
)->u64
;
97 if (ir
->o
!= IR_KINT64
) {
100 } else if (checki32((int64_t)ir_kint64(ir
)->u64
)) {
101 *k
= (int32_t)ir_kint64(ir
)->u64
;
109 /* Check if there's no conflicting instruction between curins and ref.
110 ** Also avoid fusing loads if there are multiple references.
112 static int noconflict(ASMState
*as
, IRRef ref
, IROp conflict
, int check
)
115 IRRef i
= as
->curins
;
116 if (i
> ref
+ CONFLICT_SEARCH_LIM
)
117 return 0; /* Give up, ref is too far away. */
119 if (ir
[i
].o
== conflict
)
120 return 0; /* Conflict found. */
121 else if ((check
& 1) && (ir
[i
].o
== IR_NEWREF
|| ir
[i
].o
== IR_CALLS
))
123 else if ((check
& 2) && (ir
[i
].op1
== ref
|| ir
[i
].op2
== ref
))
126 return 1; /* Ok, no conflict. */
129 /* Fuse array base into memory operand. */
130 static IRRef
asm_fuseabase(ASMState
*as
, IRRef ref
)
132 IRIns
*irb
= IR(ref
);
134 if (irb
->o
== IR_FLOAD
) {
135 IRIns
*ira
= IR(irb
->op1
);
136 lj_assertA(irb
->op2
== IRFL_TAB_ARRAY
, "expected FLOAD TAB_ARRAY");
137 /* We can avoid the FLOAD of t->array for colocated arrays. */
138 if (ira
->o
== IR_TNEW
&& ira
->op1
<= LJ_MAX_COLOSIZE
&&
139 !neverfuse(as
) && noconflict(as
, irb
->op1
, IR_NEWREF
, 0)) {
140 as
->mrm
.ofs
= (int32_t)sizeof(GCtab
); /* Ofs to colocated array. */
141 return irb
->op1
; /* Table obj. */
143 } else if (irb
->o
== IR_ADD
&& irref_isk(irb
->op2
)) {
144 /* Fuse base offset (vararg load). */
145 IRIns
*irk
= IR(irb
->op2
);
146 as
->mrm
.ofs
= irk
->o
== IR_KINT
? irk
->i
: (int32_t)ir_kint64(irk
)->u64
;
149 return ref
; /* Otherwise use the given array base. */
152 /* Fuse array reference into memory operand. */
153 static void asm_fusearef(ASMState
*as
, IRIns
*ir
, RegSet allow
)
156 lj_assertA(ir
->o
== IR_AREF
, "expected AREF");
157 as
->mrm
.base
= (uint8_t)ra_alloc1(as
, asm_fuseabase(as
, ir
->op1
), allow
);
159 if (irref_isk(ir
->op2
)) {
160 as
->mrm
.ofs
+= 8*irx
->i
;
161 as
->mrm
.idx
= RID_NONE
;
163 rset_clear(allow
, as
->mrm
.base
);
164 as
->mrm
.scale
= XM_SCALE8
;
165 /* Fuse a constant ADD (e.g. t[i+1]) into the offset.
166 ** Doesn't help much without ABCelim, but reduces register pressure.
168 if (!LJ_64
&& /* Has bad effects with negative index on x64. */
169 mayfuse(as
, ir
->op2
) && ra_noreg(irx
->r
) &&
170 irx
->o
== IR_ADD
&& irref_isk(irx
->op2
)) {
171 as
->mrm
.ofs
+= 8*IR(irx
->op2
)->i
;
172 as
->mrm
.idx
= (uint8_t)ra_alloc1(as
, irx
->op1
, allow
);
174 as
->mrm
.idx
= (uint8_t)ra_alloc1(as
, ir
->op2
, allow
);
179 /* Fuse array/hash/upvalue reference into memory operand.
180 ** Caveat: this may allocate GPRs for the base/idx registers. Be sure to
181 ** pass the final allow mask, excluding any GPRs used for other inputs.
182 ** In particular: 2-operand GPR instructions need to call ra_dest() first!
184 static void asm_fuseahuref(ASMState
*as
, IRRef ref
, RegSet allow
)
187 if (ra_noreg(ir
->r
)) {
188 switch ((IROp
)ir
->o
) {
190 if (mayfuse(as
, ref
)) {
191 asm_fusearef(as
, ir
, allow
);
196 if (mayfuse(as
, ref
)) {
197 as
->mrm
.base
= (uint8_t)ra_alloc1(as
, ir
->op1
, allow
);
198 as
->mrm
.ofs
= (int32_t)(IR(ir
->op2
)->op2
* sizeof(Node
));
199 as
->mrm
.idx
= RID_NONE
;
204 if (irref_isk(ir
->op1
)) {
205 GCfunc
*fn
= ir_kfunc(IR(ir
->op1
));
206 GCupval
*uv
= &gcref(fn
->l
.uvptr
[(ir
->op2
>> 8)])->uv
;
208 int64_t ofs
= dispofs(as
, &uv
->tv
);
209 if (checki32(ofs
) && checki32(ofs
+4)) {
210 as
->mrm
.ofs
= (int32_t)ofs
;
211 as
->mrm
.base
= RID_DISPATCH
;
212 as
->mrm
.idx
= RID_NONE
;
216 as
->mrm
.ofs
= ptr2addr(&uv
->tv
);
217 as
->mrm
.base
= as
->mrm
.idx
= RID_NONE
;
224 as
->mrm
.ofs
= (int32_t)dispofs(as
, &J2G(as
->J
)->tmptv
);
225 as
->mrm
.base
= RID_DISPATCH
;
226 as
->mrm
.idx
= RID_NONE
;
228 as
->mrm
.ofs
= igcptr(&J2G(as
->J
)->tmptv
);
229 as
->mrm
.base
= as
->mrm
.idx
= RID_NONE
;
236 as
->mrm
.base
= (uint8_t)ra_alloc1(as
, ref
, allow
);
238 as
->mrm
.idx
= RID_NONE
;
241 /* Fuse FLOAD/FREF reference into memory operand. */
242 static void asm_fusefref(ASMState
*as
, IRIns
*ir
, RegSet allow
)
244 lj_assertA(ir
->o
== IR_FLOAD
|| ir
->o
== IR_FREF
,
245 "bad IR op %d", ir
->o
);
246 as
->mrm
.idx
= RID_NONE
;
247 if (ir
->op1
== REF_NIL
) { /* FLOAD from GG_State with offset. */
249 as
->mrm
.ofs
= (int32_t)(ir
->op2
<< 2) - GG_OFS(dispatch
);
250 as
->mrm
.base
= RID_DISPATCH
;
252 as
->mrm
.ofs
= (int32_t)(ir
->op2
<< 2) + ptr2addr(J2GG(as
->J
));
253 as
->mrm
.base
= RID_NONE
;
257 as
->mrm
.ofs
= field_ofs
[ir
->op2
];
258 if (irref_isk(ir
->op1
)) {
259 IRIns
*op1
= IR(ir
->op1
);
261 if (ir
->op1
== REF_NIL
) {
262 as
->mrm
.ofs
-= GG_OFS(dispatch
);
263 as
->mrm
.base
= RID_DISPATCH
;
265 } else if (op1
->o
== IR_KPTR
|| op1
->o
== IR_KKPTR
) {
266 intptr_t ofs
= dispofs(as
, ir_kptr(op1
));
267 if (checki32(as
->mrm
.ofs
+ ofs
)) {
268 as
->mrm
.ofs
+= (int32_t)ofs
;
269 as
->mrm
.base
= RID_DISPATCH
;
274 as
->mrm
.ofs
+= op1
->i
;
275 as
->mrm
.base
= RID_NONE
;
279 as
->mrm
.base
= (uint8_t)ra_alloc1(as
, ir
->op1
, allow
);
282 /* Fuse string reference into memory operand. */
283 static void asm_fusestrref(ASMState
*as
, IRIns
*ir
, RegSet allow
)
286 lj_assertA(ir
->o
== IR_STRREF
, "bad IR op %d", ir
->o
);
287 as
->mrm
.base
= as
->mrm
.idx
= RID_NONE
;
288 as
->mrm
.scale
= XM_SCALE1
;
289 as
->mrm
.ofs
= sizeof(GCstr
);
290 if (!LJ_GC64
&& irref_isk(ir
->op1
)) {
291 as
->mrm
.ofs
+= IR(ir
->op1
)->i
;
293 Reg r
= ra_alloc1(as
, ir
->op1
, allow
);
294 rset_clear(allow
, r
);
295 as
->mrm
.base
= (uint8_t)r
;
298 if (irref_isk(ir
->op2
)) {
299 as
->mrm
.ofs
+= irr
->i
;
302 /* Fuse a constant add into the offset, e.g. string.sub(s, i+10). */
303 if (!LJ_64
&& /* Has bad effects with negative index on x64. */
304 mayfuse(as
, ir
->op2
) && irr
->o
== IR_ADD
&& irref_isk(irr
->op2
)) {
305 as
->mrm
.ofs
+= IR(irr
->op2
)->i
;
306 r
= ra_alloc1(as
, irr
->op1
, allow
);
308 r
= ra_alloc1(as
, ir
->op2
, allow
);
310 if (as
->mrm
.base
== RID_NONE
)
311 as
->mrm
.base
= (uint8_t)r
;
313 as
->mrm
.idx
= (uint8_t)r
;
317 static void asm_fusexref(ASMState
*as
, IRRef ref
, RegSet allow
)
320 as
->mrm
.idx
= RID_NONE
;
321 if (ir
->o
== IR_KPTR
|| ir
->o
== IR_KKPTR
) {
323 intptr_t ofs
= dispofs(as
, ir_kptr(ir
));
325 as
->mrm
.ofs
= (int32_t)ofs
;
326 as
->mrm
.base
= RID_DISPATCH
;
332 as
->mrm
.base
= RID_NONE
;
333 } else if (ir
->o
== IR_STRREF
) {
334 asm_fusestrref(as
, ir
, allow
);
338 if (canfuse(as
, ir
) && ir
->o
== IR_ADD
&& ra_noreg(ir
->r
)) {
339 /* Gather (base+idx*sz)+ofs as emitted by cdata ptr/array indexing. */
343 if (asm_isk32(as
, ir
->op2
, &as
->mrm
.ofs
)) { /* Recognize x+ofs. */
346 if (!(ir
->o
== IR_ADD
&& canfuse(as
, ir
) && ra_noreg(ir
->r
)))
349 as
->mrm
.scale
= XM_SCALE1
;
353 if (!(irx
->o
== IR_BSHL
|| irx
->o
== IR_ADD
)) { /* Try other operand. */
358 if (canfuse(as
, irx
) && ra_noreg(irx
->r
)) {
359 if (irx
->o
== IR_BSHL
&& irref_isk(irx
->op2
) && IR(irx
->op2
)->i
<= 3) {
360 /* Recognize idx<<b with b = 0-3, corresponding to sz = (1),2,4,8. */
362 as
->mrm
.scale
= (uint8_t)(IR(irx
->op2
)->i
<< 6);
363 } else if (irx
->o
== IR_ADD
&& irx
->op1
== irx
->op2
) {
364 /* FOLD does idx*2 ==> idx<<1 ==> idx+idx. */
366 as
->mrm
.scale
= XM_SCALE2
;
369 r
= ra_alloc1(as
, idx
, allow
);
370 rset_clear(allow
, r
);
371 as
->mrm
.idx
= (uint8_t)r
;
374 as
->mrm
.base
= (uint8_t)ra_alloc1(as
, ref
, allow
);
378 /* Fuse load of 64 bit IR constant into memory operand. */
379 static Reg
asm_fuseloadk64(ASMState
*as
, IRIns
*ir
)
381 const uint64_t *k
= &ir_k64(ir
)->u64
;
382 if (!LJ_GC64
|| checki32((intptr_t)k
)) {
383 as
->mrm
.ofs
= ptr2addr(k
);
384 as
->mrm
.base
= RID_NONE
;
386 } else if (checki32(dispofs(as
, k
))) {
387 as
->mrm
.ofs
= (int32_t)dispofs(as
, k
);
388 as
->mrm
.base
= RID_DISPATCH
;
389 } else if (checki32(mcpofs(as
, k
)) && checki32(mcpofs(as
, k
+1)) &&
390 checki32(mctopofs(as
, k
)) && checki32(mctopofs(as
, k
+1))) {
391 as
->mrm
.ofs
= (int32_t)mcpofs(as
, k
);
392 as
->mrm
.base
= RID_RIP
;
393 } else { /* Intern 64 bit constant at bottom of mcode. */
395 lj_assertA(*k
== *(uint64_t*)(as
->mctop
- ir
->i
),
396 "bad interned 64 bit constant");
398 while ((uintptr_t)as
->mcbot
& 7) *as
->mcbot
++ = XI_INT3
;
399 *(uint64_t*)as
->mcbot
= *k
;
400 ir
->i
= (int32_t)(as
->mctop
- as
->mcbot
);
402 as
->mclim
= as
->mcbot
+ MCLIM_REDZONE
;
403 lj_mcode_commitbot(as
->J
, as
->mcbot
);
405 as
->mrm
.ofs
= (int32_t)mcpofs(as
, as
->mctop
- ir
->i
);
406 as
->mrm
.base
= RID_RIP
;
409 as
->mrm
.idx
= RID_NONE
;
413 /* Fuse load into memory operand.
415 ** Important caveat: this may emit RIP-relative loads! So don't place any
416 ** code emitters between this function and the use of its result.
417 ** The only permitted exception is asm_guardcc().
419 static Reg
asm_fuseload(ASMState
*as
, IRRef ref
, RegSet allow
)
422 if (ra_hasreg(ir
->r
)) {
423 if (allow
!= RSET_EMPTY
) { /* Fast path. */
424 ra_noweak(as
, ir
->r
);
428 /* Force a spill if only memory operands are allowed (asm_x87load). */
429 as
->mrm
.base
= RID_ESP
;
430 as
->mrm
.ofs
= ra_spill(as
, ir
);
431 as
->mrm
.idx
= RID_NONE
;
434 if (ir
->o
== IR_KNUM
) {
435 RegSet avail
= as
->freeset
& ~as
->modset
& RSET_FPR
;
436 lj_assertA(allow
!= RSET_EMPTY
, "no register allowed");
437 if (!(avail
& (avail
-1))) /* Fuse if less than two regs available. */
438 return asm_fuseloadk64(as
, ir
);
439 } else if (ref
== REF_BASE
|| ir
->o
== IR_KINT64
) {
440 RegSet avail
= as
->freeset
& ~as
->modset
& RSET_GPR
;
441 lj_assertA(allow
!= RSET_EMPTY
, "no register allowed");
442 if (!(avail
& (avail
-1))) { /* Fuse if less than two regs available. */
443 if (ref
== REF_BASE
) {
445 as
->mrm
.ofs
= (int32_t)dispofs(as
, &J2G(as
->J
)->jit_base
);
446 as
->mrm
.base
= RID_DISPATCH
;
448 as
->mrm
.ofs
= ptr2addr(&J2G(as
->J
)->jit_base
);
449 as
->mrm
.base
= RID_NONE
;
451 as
->mrm
.idx
= RID_NONE
;
454 return asm_fuseloadk64(as
, ir
);
457 } else if (mayfuse(as
, ref
)) {
458 RegSet xallow
= (allow
& RSET_GPR
) ? allow
: RSET_GPR
;
459 if (ir
->o
== IR_SLOAD
) {
460 if (!(ir
->op2
& (IRSLOAD_PARENT
|IRSLOAD_CONVERT
)) &&
461 noconflict(as
, ref
, IR_RETF
, 2) &&
462 !(LJ_GC64
&& irt_isaddr(ir
->t
))) {
463 as
->mrm
.base
= (uint8_t)ra_alloc1(as
, REF_BASE
, xallow
);
464 as
->mrm
.ofs
= 8*((int32_t)ir
->op1
-1-LJ_FR2
) +
465 (!LJ_FR2
&& (ir
->op2
& IRSLOAD_FRAME
) ? 4 : 0);
466 as
->mrm
.idx
= RID_NONE
;
469 } else if (ir
->o
== IR_FLOAD
) {
470 /* Generic fusion is only ok for 32 bit operand (but see asm_comp). */
471 if ((irt_isint(ir
->t
) || irt_isu32(ir
->t
) || irt_isaddr(ir
->t
)) &&
472 noconflict(as
, ref
, IR_FSTORE
, 2)) {
473 asm_fusefref(as
, ir
, xallow
);
476 } else if (ir
->o
== IR_ALOAD
|| ir
->o
== IR_HLOAD
|| ir
->o
== IR_ULOAD
) {
477 if (noconflict(as
, ref
, ir
->o
+ IRDELTA_L2S
, 2+(ir
->o
!= IR_ULOAD
)) &&
478 !(LJ_GC64
&& irt_isaddr(ir
->t
))) {
479 asm_fuseahuref(as
, ir
->op1
, xallow
);
482 } else if (ir
->o
== IR_XLOAD
) {
483 /* Generic fusion is not ok for 8/16 bit operands (but see asm_comp).
484 ** Fusing unaligned memory operands is ok on x86 (except for SIMD types).
486 if ((!irt_typerange(ir
->t
, IRT_I8
, IRT_U16
)) &&
487 noconflict(as
, ref
, IR_XSTORE
, 2)) {
488 asm_fusexref(as
, ir
->op1
, xallow
);
491 } else if (ir
->o
== IR_VLOAD
&& IR(ir
->op1
)->o
== IR_AREF
&&
492 !(LJ_GC64
&& irt_isaddr(ir
->t
))) {
493 asm_fuseahuref(as
, ir
->op1
, xallow
);
494 as
->mrm
.ofs
+= 8 * ir
->op2
;
498 if (ir
->o
== IR_FLOAD
&& ir
->op1
== REF_NIL
) {
499 asm_fusefref(as
, ir
, RSET_EMPTY
);
502 if (!(as
->freeset
& allow
) && !emit_canremat(ref
) &&
503 (allow
== RSET_EMPTY
|| ra_hasspill(ir
->s
) || iscrossref(as
, ref
)))
505 return ra_allocref(as
, ref
, allow
);
509 /* Don't fuse a 32 bit load into a 64 bit operation. */
510 static Reg
asm_fuseloadm(ASMState
*as
, IRRef ref
, RegSet allow
, int is64
)
512 if (is64
&& !irt_is64(IR(ref
)->t
))
513 return ra_alloc1(as
, ref
, allow
);
514 return asm_fuseload(as
, ref
, allow
);
517 #define asm_fuseloadm(as, ref, allow, is64) asm_fuseload(as, (ref), (allow))
520 /* -- Calls --------------------------------------------------------------- */
522 /* Count the required number of stack slots for a call. */
523 static int asm_count_call_slots(ASMState
*as
, const CCallInfo
*ci
, IRRef
*args
)
525 uint32_t i
, nargs
= CCI_XNARGS(ci
);
529 nslots
= (int)(nargs
*2); /* Only matters for more than four args. */
531 int ngpr
= REGARG_NUMGPR
, nfpr
= REGARG_NUMFPR
;
532 for (i
= 0; i
< nargs
; i
++)
533 if (args
[i
] && irt_isfp(IR(args
[i
])->t
)) {
534 if (nfpr
> 0) nfpr
--; else nslots
+= 2;
536 if (ngpr
> 0) ngpr
--; else nslots
+= 2;
541 if ((ci
->flags
& CCI_CC_MASK
) == CCI_CC_FASTCALL
)
543 else if ((ci
->flags
& CCI_CC_MASK
) == CCI_CC_THISCALL
)
545 for (i
= 0; i
< nargs
; i
++)
546 if (args
[i
] && irt_isfp(IR(args
[i
])->t
)) {
547 nslots
+= irt_isnum(IR(args
[i
])->t
) ? 2 : 1;
549 if (ngpr
> 0) ngpr
--; else nslots
++;
555 /* Generate a call to a C function. */
556 static void asm_gencall(ASMState
*as
, const CCallInfo
*ci
, IRRef
*args
)
558 uint32_t n
, nargs
= CCI_XNARGS(ci
);
559 int32_t ofs
= STACKARG_OFS
;
561 uint32_t gprs
= REGARG_GPRS
;
562 Reg fpr
= REGARG_FIRSTFPR
;
564 MCode
*patchnfpr
= NULL
;
568 if ((ci
->flags
& CCI_CC_MASK
) != CCI_CC_CDECL
) {
569 if ((ci
->flags
& CCI_CC_MASK
) == CCI_CC_THISCALL
)
570 gprs
= (REGARG_GPRS
& 31);
571 else if ((ci
->flags
& CCI_CC_MASK
) == CCI_CC_FASTCALL
)
575 if ((void *)ci
->func
)
576 emit_call(as
, ci
->func
);
578 if ((ci
->flags
& CCI_VARARG
)) { /* Special handling for vararg calls. */
580 for (n
= 0; n
< 4 && n
< nargs
; n
++) {
581 IRIns
*ir
= IR(args
[n
]);
582 if (irt_isfp(ir
->t
)) /* Duplicate FPRs in GPRs. */
583 emit_rr(as
, XO_MOVDto
, (irt_isnum(ir
->t
) ? REX_64
: 0) | (fpr
+n
),
584 ((gprs
>> (n
*5)) & 31)); /* Either MOVD or MOVQ. */
587 patchnfpr
= --as
->mcp
; /* Indicate number of used FPRs in register al. */
588 *--as
->mcp
= XI_MOVrib
| RID_EAX
;
592 for (n
= 0; n
< nargs
; n
++) { /* Setup args. */
596 #if LJ_64 && LJ_ABI_WIN
597 /* Windows/x64 argument registers are strictly positional. */
598 r
= irt_isfp(ir
->t
) ? (fpr
<= REGARG_LASTFPR
? fpr
: 0) : (gprs
& 31);
601 /* POSIX/x64 argument registers are used in order of appearance. */
602 if (irt_isfp(ir
->t
)) {
603 r
= fpr
<= REGARG_LASTFPR
? fpr
++ : 0;
605 r
= gprs
& 31; gprs
>>= 5;
608 if (ref
&& irt_isfp(ir
->t
)) {
611 r
= gprs
& 31; gprs
>>= 5;
615 if (r
) { /* Argument is in a register. */
616 if (r
< RID_MAX_GPR
&& ref
< ASMREF_TMP1
) {
618 if (LJ_GC64
? !(ir
->o
== IR_KINT
|| ir
->o
== IR_KNULL
) : ir
->o
== IR_KINT64
)
619 emit_loadu64(as
, r
, ir_k64(ir
)->u64
);
622 emit_loadi(as
, r
, ir
->i
);
624 /* Must have been evicted. */
625 lj_assertA(rset_test(as
->freeset
, r
), "reg %d not free", r
);
626 if (ra_hasreg(ir
->r
)) {
627 ra_noweak(as
, ir
->r
);
628 emit_movrr(as
, ir
, r
, ir
->r
);
630 ra_allocref(as
, ref
, RID2RSET(r
));
633 } else if (irt_isfp(ir
->t
)) { /* FP argument is on stack. */
634 lj_assertA(!(irt_isfloat(ir
->t
) && irref_isk(ref
)),
635 "unexpected float constant");
636 if (LJ_32
&& (ofs
& 4) && irref_isk(ref
)) {
637 /* Split stores for unaligned FP consts. */
638 emit_movmroi(as
, RID_ESP
, ofs
, (int32_t)ir_knum(ir
)->u32
.lo
);
639 emit_movmroi(as
, RID_ESP
, ofs
+4, (int32_t)ir_knum(ir
)->u32
.hi
);
641 r
= ra_alloc1(as
, ref
, RSET_FPR
);
642 emit_rmro(as
, irt_isnum(ir
->t
) ? XO_MOVSDto
: XO_MOVSSto
,
645 ofs
+= (LJ_32
&& irt_isfloat(ir
->t
)) ? 4 : 8;
646 } else { /* Non-FP argument is on stack. */
647 if (LJ_32
&& ref
< ASMREF_TMP1
) {
648 emit_movmroi(as
, RID_ESP
, ofs
, ir
->i
);
650 r
= ra_alloc1(as
, ref
, RSET_GPR
);
651 emit_movtomro(as
, REX_64
+ r
, RID_ESP
, ofs
);
653 ofs
+= sizeof(intptr_t);
657 #if LJ_64 && !LJ_ABI_WIN
658 if (patchnfpr
) *patchnfpr
= fpr
- REGARG_FIRSTFPR
;
662 /* Setup result reg/sp for call. Evict scratch regs. */
663 static void asm_setupresult(ASMState
*as
, IRIns
*ir
, const CCallInfo
*ci
)
665 RegSet drop
= RSET_SCRATCH
;
666 int hiop
= ((ir
+1)->o
== IR_HIOP
&& !irt_isnil((ir
+1)->t
));
667 if ((ci
->flags
& CCI_NOFPRCLOBBER
))
669 if (ra_hasreg(ir
->r
))
670 rset_clear(drop
, ir
->r
); /* Dest reg handled below. */
671 if (hiop
&& ra_hasreg((ir
+1)->r
))
672 rset_clear(drop
, (ir
+1)->r
); /* Dest reg handled below. */
673 ra_evictset(as
, drop
); /* Evictions must be performed first. */
675 if (irt_isfp(ir
->t
)) {
676 int32_t ofs
= sps_scale(ir
->s
); /* Use spill slot or temp slots. */
678 if ((ci
->flags
& CCI_CASTU64
)) {
680 if (ra_hasreg(dest
)) {
682 ra_modified(as
, dest
);
683 emit_rr(as
, XO_MOVD
, dest
|REX_64
, RID_RET
); /* Really MOVQ. */
685 if (ofs
) emit_movtomro(as
, RID_RET
|REX_64
, RID_ESP
, ofs
);
687 ra_destreg(as
, ir
, RID_FPRET
);
690 /* Number result is in x87 st0 for x86 calling convention. */
692 if (ra_hasreg(dest
)) {
694 ra_modified(as
, dest
);
695 emit_rmro(as
, irt_isnum(ir
->t
) ? XO_MOVSD
: XO_MOVSS
,
698 if ((ci
->flags
& CCI_CASTU64
)) {
699 emit_movtomro(as
, RID_RETLO
, RID_ESP
, ofs
);
700 emit_movtomro(as
, RID_RETHI
, RID_ESP
, ofs
+4);
702 emit_rmro(as
, irt_isnum(ir
->t
) ? XO_FSTPq
: XO_FSTPd
,
703 irt_isnum(ir
->t
) ? XOg_FSTPq
: XOg_FSTPd
, RID_ESP
, ofs
);
709 lj_assertA(!irt_ispri(ir
->t
), "PRI dest");
710 ra_destreg(as
, ir
, RID_RET
);
712 } else if (LJ_32
&& irt_isfp(ir
->t
) && !(ci
->flags
& CCI_CASTU64
)) {
713 emit_x87op(as
, XI_FPOP
); /* Pop unused result from x87 st0. */
717 /* Return a constant function pointer or NULL for indirect calls. */
718 static void *asm_callx_func(ASMState
*as
, IRIns
*irf
, IRRef func
)
723 return (void *)irf
->i
;
725 if (irref_isk(func
)) {
727 if (irf
->o
== IR_KINT64
)
728 p
= (MCode
*)(void *)ir_k64(irf
)->u64
;
730 p
= (MCode
*)(void *)(uintptr_t)(uint32_t)irf
->i
;
731 if (p
- as
->mcp
== (int32_t)(p
- as
->mcp
))
732 return p
; /* Call target is still in +-2GB range. */
733 /* Avoid the indirect case of emit_call(). Try to hoist func addr. */
739 static void asm_callx(ASMState
*as
, IRIns
*ir
)
741 IRRef args
[CCI_NARGS_MAX
*2];
746 ci
.flags
= asm_callx_flags(as
, ir
);
747 asm_collectargs(as
, ir
, &ci
, args
);
748 asm_setupresult(as
, ir
, &ci
);
750 /* Have to readjust stack after non-cdecl calls due to callee cleanup. */
751 if ((ci
.flags
& CCI_CC_MASK
) != CCI_CC_CDECL
)
752 spadj
= 4 * asm_count_call_slots(as
, &ci
, args
);
754 func
= ir
->op2
; irf
= IR(func
);
755 if (irf
->o
== IR_CARG
) { func
= irf
->op1
; irf
= IR(func
); }
756 ci
.func
= (ASMFunction
)asm_callx_func(as
, irf
, func
);
757 if (!(void *)ci
.func
) {
758 /* Use a (hoistable) non-scratch register for indirect calls. */
759 RegSet allow
= (RSET_GPR
& ~RSET_SCRATCH
);
760 Reg r
= ra_alloc1(as
, func
, allow
);
761 if (LJ_32
) emit_spsub(as
, spadj
); /* Above code may cause restores! */
762 emit_rr(as
, XO_GROUP5
, XOg_CALL
, r
);
764 emit_spsub(as
, spadj
);
766 asm_gencall(as
, &ci
, args
);
769 /* -- Returns ------------------------------------------------------------- */
771 /* Return to lower frame. Guard that it goes to the right spot. */
772 static void asm_retf(ASMState
*as
, IRIns
*ir
)
774 Reg base
= ra_alloc1(as
, REF_BASE
, RSET_GPR
);
776 Reg rpc
= ra_scratch(as
, rset_exclude(RSET_GPR
, base
));
778 void *pc
= ir_kptr(IR(ir
->op2
));
779 int32_t delta
= 1+LJ_FR2
+bc_a(*((const BCIns
*)pc
- 1));
780 as
->topslot
-= (BCReg
)delta
;
781 if ((int32_t)as
->topslot
< 0) as
->topslot
= 0;
782 irt_setmark(IR(REF_BASE
)->t
); /* Children must not coalesce with BASE reg. */
783 emit_setgl(as
, base
, jit_base
);
784 emit_addptr(as
, base
, -8*delta
);
785 asm_guardcc(as
, CC_NE
);
787 emit_rmro(as
, XO_CMP
, rpc
|REX_GC64
, base
, -8);
788 emit_loadu64(as
, rpc
, u64ptr(pc
));
790 emit_gmroi(as
, XG_ARITHi(XOg_CMP
), base
, -4, ptr2addr(pc
));
794 /* -- Buffer operations --------------------------------------------------- */
797 static void asm_bufhdr_write(ASMState
*as
, Reg sb
)
799 Reg tmp
= ra_scratch(as
, rset_exclude(RSET_GPR
, sb
));
801 irgc
.ot
= IRT(0, IRT_PGC
); /* GC type. */
802 emit_storeofs(as
, &irgc
, tmp
, sb
, offsetof(SBuf
, L
));
803 emit_opgl(as
, XO_ARITH(XOg_OR
), tmp
|REX_GC64
, cur_L
);
804 emit_gri(as
, XG_ARITHi(XOg_AND
), tmp
, SBUF_MASK_FLAG
);
805 emit_loadofs(as
, &irgc
, tmp
, sb
, offsetof(SBuf
, L
));
809 /* -- Type conversions ---------------------------------------------------- */
811 static void asm_tointg(ASMState
*as
, IRIns
*ir
, Reg left
)
813 Reg tmp
= ra_scratch(as
, rset_exclude(RSET_FPR
, left
));
814 Reg dest
= ra_dest(as
, ir
, RSET_GPR
);
815 asm_guardcc(as
, CC_P
);
816 asm_guardcc(as
, CC_NE
);
817 emit_rr(as
, XO_UCOMISD
, left
, tmp
);
818 emit_rr(as
, XO_CVTSI2SD
, tmp
, dest
);
819 emit_rr(as
, XO_XORPS
, tmp
, tmp
); /* Avoid partial register stall. */
821 emit_rr(as
, XO_CVTTSD2SI
, dest
, left
);
822 /* Can't fuse since left is needed twice. */
825 static void asm_tobit(ASMState
*as
, IRIns
*ir
)
827 Reg dest
= ra_dest(as
, ir
, RSET_GPR
);
828 Reg tmp
= ra_noreg(IR(ir
->op1
)->r
) ?
829 ra_alloc1(as
, ir
->op1
, RSET_FPR
) :
830 ra_scratch(as
, RSET_FPR
);
832 emit_rr(as
, XO_MOVDto
, tmp
, dest
);
833 right
= asm_fuseload(as
, ir
->op2
, rset_exclude(RSET_FPR
, tmp
));
834 emit_mrm(as
, XO_ADDSD
, tmp
, right
);
835 ra_left(as
, tmp
, ir
->op1
);
838 static void asm_conv(ASMState
*as
, IRIns
*ir
)
840 IRType st
= (IRType
)(ir
->op2
& IRCONV_SRCMASK
);
841 int st64
= (st
== IRT_I64
|| st
== IRT_U64
|| (LJ_64
&& st
== IRT_P64
));
842 int stfp
= (st
== IRT_NUM
|| st
== IRT_FLOAT
);
843 IRRef lref
= ir
->op1
;
844 lj_assertA(irt_type(ir
->t
) != st
, "inconsistent types for CONV");
845 lj_assertA(!(LJ_32
&& (irt_isint64(ir
->t
) || st64
)),
846 "IR %04d has unsplit 64 bit type",
847 (int)(ir
- as
->ir
) - REF_BIAS
);
848 if (irt_isfp(ir
->t
)) {
849 Reg dest
= ra_dest(as
, ir
, RSET_FPR
);
850 if (stfp
) { /* FP to FP conversion. */
851 Reg left
= asm_fuseload(as
, lref
, RSET_FPR
);
852 emit_mrm(as
, st
== IRT_NUM
? XO_CVTSD2SS
: XO_CVTSS2SD
, dest
, left
);
853 if (left
== dest
) return; /* Avoid the XO_XORPS. */
854 } else if (LJ_32
&& st
== IRT_U32
) { /* U32 to FP conversion on x86. */
855 /* number = (2^52+2^51 .. u32) - (2^52+2^51) */
856 cTValue
*k
= &as
->J
->k64
[LJ_K64_TOBIT
];
857 Reg bias
= ra_scratch(as
, rset_exclude(RSET_FPR
, dest
));
858 if (irt_isfloat(ir
->t
))
859 emit_rr(as
, XO_CVTSD2SS
, dest
, dest
);
860 emit_rr(as
, XO_SUBSD
, dest
, bias
); /* Subtract 2^52+2^51 bias. */
861 emit_rr(as
, XO_XORPS
, dest
, bias
); /* Merge bias and integer. */
862 emit_rma(as
, XO_MOVSD
, bias
, k
);
864 emit_mrm(as
, XO_MOVD
, dest
, asm_fuseload(as
, lref
, RSET_GPR
));
866 } else { /* Integer to FP conversion. */
867 Reg left
= (LJ_64
&& (st
== IRT_U32
|| st
== IRT_U64
)) ?
868 ra_alloc1(as
, lref
, RSET_GPR
) :
869 asm_fuseloadm(as
, lref
, RSET_GPR
, st64
);
870 if (LJ_64
&& st
== IRT_U64
) {
871 MCLabel l_end
= emit_label(as
);
872 cTValue
*k
= &as
->J
->k64
[LJ_K64_2P64
];
873 emit_rma(as
, XO_ADDSD
, dest
, k
); /* Add 2^64 to compensate. */
874 emit_sjcc(as
, CC_NS
, l_end
);
875 emit_rr(as
, XO_TEST
, left
|REX_64
, left
); /* Check if u64 >= 2^63. */
877 emit_mrm(as
, irt_isnum(ir
->t
) ? XO_CVTSI2SD
: XO_CVTSI2SS
,
878 dest
|((LJ_64
&& (st64
|| st
== IRT_U32
)) ? REX_64
: 0), left
);
880 emit_rr(as
, XO_XORPS
, dest
, dest
); /* Avoid partial register stall. */
881 } else if (stfp
) { /* FP to integer conversion. */
882 if (irt_isguard(ir
->t
)) {
883 /* Checked conversions are only supported from number to int. */
884 lj_assertA(irt_isint(ir
->t
) && st
== IRT_NUM
,
885 "bad type for checked CONV");
886 asm_tointg(as
, ir
, ra_alloc1(as
, lref
, RSET_FPR
));
888 Reg dest
= ra_dest(as
, ir
, RSET_GPR
);
889 x86Op op
= st
== IRT_NUM
? XO_CVTTSD2SI
: XO_CVTTSS2SI
;
890 if (LJ_64
? irt_isu64(ir
->t
) : irt_isu32(ir
->t
)) {
891 /* LJ_64: For inputs >= 2^63 add -2^64, convert again. */
892 /* LJ_32: For inputs >= 2^31 add -2^31, convert again and add 2^31. */
893 Reg tmp
= ra_noreg(IR(lref
)->r
) ? ra_alloc1(as
, lref
, RSET_FPR
) :
894 ra_scratch(as
, RSET_FPR
);
895 MCLabel l_end
= emit_label(as
);
897 emit_gri(as
, XG_ARITHi(XOg_ADD
), dest
, (int32_t)0x80000000);
898 emit_rr(as
, op
, dest
|REX_64
, tmp
);
900 emit_rma(as
, XO_ADDSD
, tmp
, &as
->J
->k64
[LJ_K64_M2P64_31
]);
902 emit_rma(as
, XO_ADDSS
, tmp
, &as
->J
->k32
[LJ_K32_M2P64_31
]);
903 emit_sjcc(as
, CC_NS
, l_end
);
904 emit_rr(as
, XO_TEST
, dest
|REX_64
, dest
); /* Check if dest negative. */
905 emit_rr(as
, op
, dest
|REX_64
, tmp
);
906 ra_left(as
, tmp
, lref
);
908 if (LJ_64
&& irt_isu32(ir
->t
))
909 emit_rr(as
, XO_MOV
, dest
, dest
); /* Zero hiword. */
912 (irt_is64(ir
->t
) || irt_isu32(ir
->t
))) ? REX_64
: 0),
913 asm_fuseload(as
, lref
, RSET_FPR
));
916 } else if (st
>= IRT_I8
&& st
<= IRT_U16
) { /* Extend to 32 bit integer. */
917 Reg left
, dest
= ra_dest(as
, ir
, RSET_GPR
);
918 RegSet allow
= RSET_GPR
;
920 lj_assertA(irt_isint(ir
->t
) || irt_isu32(ir
->t
), "bad type for CONV EXT");
922 op
= XO_MOVSXb
; allow
= RSET_GPR8
; dest
|= FORCE_REX
;
923 } else if (st
== IRT_U8
) {
924 op
= XO_MOVZXb
; allow
= RSET_GPR8
; dest
|= FORCE_REX
;
925 } else if (st
== IRT_I16
) {
930 left
= asm_fuseload(as
, lref
, allow
);
931 /* Add extra MOV if source is already in wrong register. */
932 if (!LJ_64
&& left
!= RID_MRM
&& !rset_test(allow
, left
)) {
933 Reg tmp
= ra_scratch(as
, allow
);
934 emit_rr(as
, op
, dest
, tmp
);
935 emit_rr(as
, XO_MOV
, tmp
, left
);
937 emit_mrm(as
, op
, dest
, left
);
939 } else { /* 32/64 bit integer conversions. */
940 if (LJ_32
) { /* Only need to handle 32/32 bit no-op (cast) on x86. */
941 Reg dest
= ra_dest(as
, ir
, RSET_GPR
);
942 ra_left(as
, dest
, lref
); /* Do nothing, but may need to move regs. */
943 } else if (irt_is64(ir
->t
)) {
944 Reg dest
= ra_dest(as
, ir
, RSET_GPR
);
945 if (st64
|| !(ir
->op2
& IRCONV_SEXT
)) {
946 /* 64/64 bit no-op (cast) or 32 to 64 bit zero extension. */
947 ra_left(as
, dest
, lref
); /* Do nothing, but may need to move regs. */
948 } else { /* 32 to 64 bit sign extension. */
949 Reg left
= asm_fuseload(as
, lref
, RSET_GPR
);
950 emit_mrm(as
, XO_MOVSXd
, dest
|REX_64
, left
);
953 Reg dest
= ra_dest(as
, ir
, RSET_GPR
);
954 if (st64
&& !(ir
->op2
& IRCONV_NONE
)) {
955 Reg left
= asm_fuseload(as
, lref
, RSET_GPR
);
956 /* This is either a 32 bit reg/reg mov which zeroes the hiword
957 ** or a load of the loword from a 64 bit address.
959 emit_mrm(as
, XO_MOV
, dest
, left
);
960 } else { /* 32/32 bit no-op (cast). */
961 ra_left(as
, dest
, lref
); /* Do nothing, but may need to move regs. */
967 #if LJ_32 && LJ_HASFFI
968 /* No SSE conversions to/from 64 bit on x86, so resort to ugly x87 code. */
970 /* 64 bit integer to FP conversion in 32 bit mode. */
971 static void asm_conv_fp_int64(ASMState
*as
, IRIns
*ir
)
973 Reg hi
= ra_alloc1(as
, ir
->op1
, RSET_GPR
);
974 Reg lo
= ra_alloc1(as
, (ir
-1)->op1
, rset_exclude(RSET_GPR
, hi
));
975 int32_t ofs
= sps_scale(ir
->s
); /* Use spill slot or temp slots. */
977 if (ra_hasreg(dest
)) {
979 ra_modified(as
, dest
);
980 emit_rmro(as
, irt_isnum(ir
->t
) ? XO_MOVSD
: XO_MOVSS
, dest
, RID_ESP
, ofs
);
982 emit_rmro(as
, irt_isnum(ir
->t
) ? XO_FSTPq
: XO_FSTPd
,
983 irt_isnum(ir
->t
) ? XOg_FSTPq
: XOg_FSTPd
, RID_ESP
, ofs
);
984 if (((ir
-1)->op2
& IRCONV_SRCMASK
) == IRT_U64
) {
985 /* For inputs in [2^63,2^64-1] add 2^64 to compensate. */
986 MCLabel l_end
= emit_label(as
);
987 emit_rma(as
, XO_FADDq
, XOg_FADDq
, &as
->J
->k64
[LJ_K64_2P64
]);
988 emit_sjcc(as
, CC_NS
, l_end
);
989 emit_rr(as
, XO_TEST
, hi
, hi
); /* Check if u64 >= 2^63. */
991 lj_assertA(((ir
-1)->op2
& IRCONV_SRCMASK
) == IRT_I64
, "bad type for CONV");
993 emit_rmro(as
, XO_FILDq
, XOg_FILDq
, RID_ESP
, 0);
994 /* NYI: Avoid narrow-to-wide store-to-load forwarding stall. */
995 emit_rmro(as
, XO_MOVto
, hi
, RID_ESP
, 4);
996 emit_rmro(as
, XO_MOVto
, lo
, RID_ESP
, 0);
999 /* FP to 64 bit integer conversion in 32 bit mode. */
1000 static void asm_conv_int64_fp(ASMState
*as
, IRIns
*ir
)
1002 IRType st
= (IRType
)((ir
-1)->op2
& IRCONV_SRCMASK
);
1003 IRType dt
= (((ir
-1)->op2
& IRCONV_DSTMASK
) >> IRCONV_DSH
);
1005 lj_assertA(st
== IRT_NUM
|| st
== IRT_FLOAT
, "bad type for CONV");
1006 lj_assertA(dt
== IRT_I64
|| dt
== IRT_U64
, "bad type for CONV");
1007 hi
= ra_dest(as
, ir
, RSET_GPR
);
1008 lo
= ra_dest(as
, ir
-1, rset_exclude(RSET_GPR
, hi
));
1009 if (ra_used(ir
-1)) emit_rmro(as
, XO_MOV
, lo
, RID_ESP
, 0);
1010 /* NYI: Avoid wide-to-narrow store-to-load forwarding stall. */
1011 if (!(as
->flags
& JIT_F_SSE3
)) { /* Set FPU rounding mode to default. */
1012 emit_rmro(as
, XO_FLDCW
, XOg_FLDCW
, RID_ESP
, 4);
1013 emit_rmro(as
, XO_MOVto
, lo
, RID_ESP
, 4);
1014 emit_gri(as
, XG_ARITHi(XOg_AND
), lo
, 0xf3ff);
1016 if (dt
== IRT_U64
) {
1017 /* For inputs in [2^63,2^64-1] add -2^64 and convert again. */
1018 MCLabel l_pop
, l_end
= emit_label(as
);
1019 emit_x87op(as
, XI_FPOP
);
1020 l_pop
= emit_label(as
);
1021 emit_sjmp(as
, l_end
);
1022 emit_rmro(as
, XO_MOV
, hi
, RID_ESP
, 4);
1023 if ((as
->flags
& JIT_F_SSE3
))
1024 emit_rmro(as
, XO_FISTTPq
, XOg_FISTTPq
, RID_ESP
, 0);
1026 emit_rmro(as
, XO_FISTPq
, XOg_FISTPq
, RID_ESP
, 0);
1027 emit_rma(as
, XO_FADDq
, XOg_FADDq
, &as
->J
->k64
[LJ_K64_M2P64
]);
1028 emit_sjcc(as
, CC_NS
, l_pop
);
1029 emit_rr(as
, XO_TEST
, hi
, hi
); /* Check if out-of-range (2^63). */
1031 emit_rmro(as
, XO_MOV
, hi
, RID_ESP
, 4);
1032 if ((as
->flags
& JIT_F_SSE3
)) { /* Truncation is easy with SSE3. */
1033 emit_rmro(as
, XO_FISTTPq
, XOg_FISTTPq
, RID_ESP
, 0);
1034 } else { /* Otherwise set FPU rounding mode to truncate before the store. */
1035 emit_rmro(as
, XO_FISTPq
, XOg_FISTPq
, RID_ESP
, 0);
1036 emit_rmro(as
, XO_FLDCW
, XOg_FLDCW
, RID_ESP
, 0);
1037 emit_rmro(as
, XO_MOVtow
, lo
, RID_ESP
, 0);
1038 emit_rmro(as
, XO_ARITHw(XOg_OR
), lo
, RID_ESP
, 0);
1039 emit_loadi(as
, lo
, 0xc00);
1040 emit_rmro(as
, XO_FNSTCW
, XOg_FNSTCW
, RID_ESP
, 0);
1043 emit_x87op(as
, XI_FDUP
);
1044 emit_mrm(as
, st
== IRT_NUM
? XO_FLDq
: XO_FLDd
,
1045 st
== IRT_NUM
? XOg_FLDq
: XOg_FLDd
,
1046 asm_fuseload(as
, ir
->op1
, RSET_EMPTY
));
1049 static void asm_conv64(ASMState
*as
, IRIns
*ir
)
1051 if (irt_isfp(ir
->t
))
1052 asm_conv_fp_int64(as
, ir
);
1054 asm_conv_int64_fp(as
, ir
);
1058 static void asm_strto(ASMState
*as
, IRIns
*ir
)
1060 /* Force a spill slot for the destination register (if any). */
1061 const CCallInfo
*ci
= &lj_ir_callinfo
[IRCALL_lj_strscan_num
];
1063 RegSet drop
= RSET_SCRATCH
;
1064 if ((drop
& RSET_FPR
) != RSET_FPR
&& ra_hasreg(ir
->r
))
1065 rset_set(drop
, ir
->r
); /* WIN64 doesn't spill all FPRs. */
1066 ra_evictset(as
, drop
);
1067 asm_guardcc(as
, CC_E
);
1068 emit_rr(as
, XO_TEST
, RID_RET
, RID_RET
); /* Test return status. */
1069 args
[0] = ir
->op1
; /* GCstr *str */
1070 args
[1] = ASMREF_TMP1
; /* TValue *n */
1071 asm_gencall(as
, ci
, args
);
1072 /* Store the result to the spill slot or temp slots. */
1073 emit_rmro(as
, XO_LEA
, ra_releasetmp(as
, ASMREF_TMP1
)|REX_64
,
1074 RID_ESP
, sps_scale(ir
->s
));
1077 /* -- Memory references --------------------------------------------------- */
1079 /* Get pointer to TValue. */
1080 static void asm_tvptr(ASMState
*as
, Reg dest
, IRRef ref
, MSize mode
)
1082 if ((mode
& IRTMPREF_IN1
)) {
1083 IRIns
*ir
= IR(ref
);
1084 if (irt_isnum(ir
->t
)) {
1085 if (irref_isk(ref
) && !(mode
& IRTMPREF_OUT1
)) {
1086 /* Use the number constant itself as a TValue. */
1087 emit_loada(as
, dest
, ir_knum(ir
));
1090 emit_rmro(as
, XO_MOVSDto
, ra_alloc1(as
, ref
, RSET_FPR
), dest
, 0);
1093 if (irref_isk(ref
)) {
1095 lj_ir_kvalue(as
->J
->L
, &k
, ir
);
1096 emit_movmroi(as
, dest
, 4, k
.u32
.hi
);
1097 emit_movmroi(as
, dest
, 0, k
.u32
.lo
);
1099 /* TODO: 64 bit store + 32 bit load-modify-store is suboptimal. */
1100 Reg src
= ra_alloc1(as
, ref
, rset_exclude(RSET_GPR
, dest
));
1101 if (irt_is64(ir
->t
)) {
1102 emit_u32(as
, irt_toitype(ir
->t
) << 15);
1103 emit_rmro(as
, XO_ARITHi
, XOg_OR
, dest
, 4);
1105 emit_movmroi(as
, dest
, 4, (irt_toitype(ir
->t
) << 15));
1107 emit_movtomro(as
, REX_64IR(ir
, src
), dest
, 0);
1110 if (!irref_isk(ref
)) {
1111 Reg src
= ra_alloc1(as
, ref
, rset_exclude(RSET_GPR
, dest
));
1112 emit_movtomro(as
, REX_64IR(ir
, src
), dest
, 0);
1113 } else if (!irt_ispri(ir
->t
)) {
1114 emit_movmroi(as
, dest
, 0, ir
->i
);
1116 if (!(LJ_64
&& irt_islightud(ir
->t
)))
1117 emit_movmroi(as
, dest
, 4, irt_toitype(ir
->t
));
1121 emit_loada(as
, dest
, &J2G(as
->J
)->tmptv
); /* g->tmptv holds the TValue(s). */
1124 static void asm_aref(ASMState
*as
, IRIns
*ir
)
1126 Reg dest
= ra_dest(as
, ir
, RSET_GPR
);
1127 asm_fusearef(as
, ir
, RSET_GPR
);
1128 if (!(as
->mrm
.idx
== RID_NONE
&& as
->mrm
.ofs
== 0))
1129 emit_mrm(as
, XO_LEA
, dest
|REX_GC64
, RID_MRM
);
1130 else if (as
->mrm
.base
!= dest
)
1131 emit_rr(as
, XO_MOV
, dest
|REX_GC64
, as
->mrm
.base
);
1134 /* Inlined hash lookup. Specialized for key type and for const keys.
1135 ** The equivalent C code is:
1136 ** Node *n = hashkey(t, key);
1138 ** if (lj_obj_equal(&n->key, key)) return &n->val;
1139 ** } while ((n = nextnode(n)));
1142 static void asm_href(ASMState
*as
, IRIns
*ir
, IROp merge
)
1144 RegSet allow
= RSET_GPR
;
1145 int destused
= ra_used(ir
);
1146 Reg dest
= ra_dest(as
, ir
, allow
);
1147 Reg tab
= ra_alloc1(as
, ir
->op1
, rset_clear(allow
, dest
));
1148 Reg key
= RID_NONE
, tmp
= RID_NONE
;
1149 IRIns
*irkey
= IR(ir
->op2
);
1150 int isk
= irref_isk(ir
->op2
);
1151 IRType1 kt
= irkey
->t
;
1153 MCLabel l_end
, l_loop
, l_next
;
1156 rset_clear(allow
, tab
);
1157 key
= ra_alloc1(as
, ir
->op2
, irt_isnum(kt
) ? RSET_FPR
: allow
);
1158 if (LJ_GC64
|| !irt_isstr(kt
))
1159 tmp
= ra_scratch(as
, rset_exclude(allow
, key
));
1162 /* Key not found in chain: jump to exit (if merged) or load niltv. */
1163 l_end
= emit_label(as
);
1165 asm_guardcc(as
, CC_E
); /* XI_JMP is not found by lj_asm_patchexit. */
1167 emit_loada(as
, dest
, niltvg(J2G(as
->J
)));
1169 /* Follow hash chain until the end. */
1170 l_loop
= emit_sjcc_label(as
, CC_NZ
);
1171 emit_rr(as
, XO_TEST
, dest
|REX_GC64
, dest
);
1172 emit_rmro(as
, XO_MOV
, dest
|REX_GC64
, dest
, offsetof(Node
, next
));
1173 l_next
= emit_label(as
);
1175 /* Type and value comparison. */
1177 asm_guardcc(as
, CC_E
);
1179 emit_sjcc(as
, CC_E
, l_end
);
1181 if (irt_isnum(kt
)) {
1183 /* Assumes -0.0 is already canonicalized to +0.0. */
1184 emit_gmroi(as
, XG_ARITHi(XOg_CMP
), dest
, offsetof(Node
, key
.u32
.lo
),
1185 (int32_t)ir_knum(irkey
)->u32
.lo
);
1186 emit_sjcc(as
, CC_NE
, l_next
);
1187 emit_gmroi(as
, XG_ARITHi(XOg_CMP
), dest
, offsetof(Node
, key
.u32
.hi
),
1188 (int32_t)ir_knum(irkey
)->u32
.hi
);
1190 emit_sjcc(as
, CC_P
, l_next
);
1191 emit_rmro(as
, XO_UCOMISD
, key
, dest
, offsetof(Node
, key
.n
));
1192 emit_sjcc(as
, CC_AE
, l_next
);
1193 /* The type check avoids NaN penalties and complaints from Valgrind. */
1194 #if LJ_64 && !LJ_GC64
1195 emit_u32(as
, LJ_TISNUM
);
1196 emit_rmro(as
, XO_ARITHi
, XOg_CMP
, dest
, offsetof(Node
, key
.it
));
1198 emit_i8(as
, LJ_TISNUM
);
1199 emit_rmro(as
, XO_ARITHi8
, XOg_CMP
, dest
, offsetof(Node
, key
.it
));
1202 #if LJ_64 && !LJ_GC64
1203 } else if (irt_islightud(kt
)) {
1204 emit_rmro(as
, XO_CMP
, key
|REX_64
, dest
, offsetof(Node
, key
.u64
));
1207 } else if (irt_isaddr(kt
)) {
1210 k
.u64
= ((uint64_t)irt_toitype(irkey
->t
) << 47) | irkey
[1].tv
.u64
;
1211 emit_gmroi(as
, XG_ARITHi(XOg_CMP
), dest
, offsetof(Node
, key
.u32
.lo
),
1213 emit_sjcc(as
, CC_NE
, l_next
);
1214 emit_gmroi(as
, XG_ARITHi(XOg_CMP
), dest
, offsetof(Node
, key
.u32
.hi
),
1217 emit_rmro(as
, XO_CMP
, tmp
|REX_64
, dest
, offsetof(Node
, key
.u64
));
1220 lj_assertA(irt_ispri(kt
) && !irt_isnil(kt
), "bad HREF key type");
1221 emit_u32(as
, (irt_toitype(kt
)<<15)|0x7fff);
1222 emit_rmro(as
, XO_ARITHi
, XOg_CMP
, dest
, offsetof(Node
, key
.it
));
1225 if (!irt_ispri(kt
)) {
1226 lj_assertA(irt_isaddr(kt
), "bad HREF key type");
1228 emit_gmroi(as
, XG_ARITHi(XOg_CMP
), dest
, offsetof(Node
, key
.gcr
),
1229 ptr2addr(ir_kgc(irkey
)));
1231 emit_rmro(as
, XO_CMP
, key
, dest
, offsetof(Node
, key
.gcr
));
1232 emit_sjcc(as
, CC_NE
, l_next
);
1234 lj_assertA(!irt_isnil(kt
), "bad HREF key type");
1235 emit_i8(as
, irt_toitype(kt
));
1236 emit_rmro(as
, XO_ARITHi8
, XOg_CMP
, dest
, offsetof(Node
, key
.it
));
1239 emit_sfixup(as
, l_loop
);
1241 if (!isk
&& irt_isaddr(kt
)) {
1242 emit_rr(as
, XO_OR
, tmp
|REX_64
, key
);
1243 emit_loadu64(as
, tmp
, (uint64_t)irt_toitype(kt
) << 47);
1247 /* Load main position relative to tab->node into dest. */
1248 khash
= isk
? ir_khash(as
, irkey
) : 1;
1250 emit_rmro(as
, XO_MOV
, dest
|REX_GC64
, tab
, offsetof(GCtab
, node
));
1252 emit_rmro(as
, XO_ARITH(XOg_ADD
), dest
|REX_GC64
, tab
, offsetof(GCtab
,node
));
1253 emit_shifti(as
, XOg_SHL
, dest
, 3);
1254 emit_rmrxo(as
, XO_LEA
, dest
, dest
, dest
, XM_SCALE2
, 0);
1256 emit_gri(as
, XG_ARITHi(XOg_AND
), dest
, (int32_t)khash
);
1257 emit_rmro(as
, XO_MOV
, dest
, tab
, offsetof(GCtab
, hmask
));
1258 } else if (irt_isstr(kt
)) {
1259 emit_rmro(as
, XO_ARITH(XOg_AND
), dest
, key
, offsetof(GCstr
, sid
));
1260 emit_rmro(as
, XO_MOV
, dest
, tab
, offsetof(GCtab
, hmask
));
1261 } else { /* Must match with hashrot() in lj_tab.c. */
1262 emit_rmro(as
, XO_ARITH(XOg_AND
), dest
, tab
, offsetof(GCtab
, hmask
));
1263 emit_rr(as
, XO_ARITH(XOg_SUB
), dest
, tmp
);
1264 emit_shifti(as
, XOg_ROL
, tmp
, HASH_ROT3
);
1265 emit_rr(as
, XO_ARITH(XOg_XOR
), dest
, tmp
);
1267 emit_shifti(as
, XOg_ROL
, dest
, HASH_ROT2
);
1268 emit_rr(as
, XO_ARITH(XOg_SUB
), tmp
, dest
);
1269 emit_shifti(as
, XOg_ROL
, dest
, HASH_ROT1
);
1270 emit_rr(as
, XO_ARITH(XOg_XOR
), tmp
, dest
);
1271 if (irt_isnum(kt
)) {
1272 emit_rr(as
, XO_ARITH(XOg_ADD
), dest
, dest
);
1274 emit_shifti(as
, XOg_SHR
|REX_64
, dest
, 32);
1275 emit_rr(as
, XO_MOV
, tmp
, dest
);
1276 emit_rr(as
, XO_MOVDto
, key
|REX_64
, dest
);
1278 emit_rmro(as
, XO_MOV
, dest
, RID_ESP
, ra_spill(as
, irkey
)+4);
1279 emit_rr(as
, XO_MOVDto
, key
, tmp
);
1282 emit_rr(as
, XO_MOV
, tmp
, key
);
1284 emit_gri(as
, XG_ARITHi(XOg_XOR
), dest
, irt_toitype(kt
) << 15);
1285 if ((as
->flags
& JIT_F_BMI2
)) {
1287 emit_mrm(as
, XV_RORX
|VEX_64
, dest
, key
);
1289 emit_shifti(as
, XOg_SHR
|REX_64
, dest
, 32);
1290 emit_rr(as
, XO_MOV
, dest
|REX_64
, key
|REX_64
);
1293 emit_rmro(as
, XO_LEA
, dest
, key
, HASH_BIAS
);
1300 static void asm_hrefk(ASMState
*as
, IRIns
*ir
)
1302 IRIns
*kslot
= IR(ir
->op2
);
1303 IRIns
*irkey
= IR(kslot
->op1
);
1304 int32_t ofs
= (int32_t)(kslot
->op2
* sizeof(Node
));
1305 Reg dest
= ra_used(ir
) ? ra_dest(as
, ir
, RSET_GPR
) : RID_NONE
;
1306 Reg node
= ra_alloc1(as
, ir
->op1
, RSET_GPR
);
1310 lj_assertA(ofs
% sizeof(Node
) == 0, "unaligned HREFK slot");
1311 if (ra_hasreg(dest
)) {
1314 emit_gri(as
, XG_ARITHi(XOg_ADD
), dest
|REX_GC64
, ofs
);
1316 emit_rmro(as
, XO_LEA
, dest
|REX_GC64
, node
, ofs
);
1317 } else if (dest
!= node
) {
1318 emit_rr(as
, XO_MOV
, dest
|REX_GC64
, node
);
1321 asm_guardcc(as
, CC_NE
);
1323 if (!irt_ispri(irkey
->t
)) {
1324 Reg key
= ra_scratch(as
, rset_exclude(RSET_GPR
, node
));
1325 emit_rmro(as
, XO_CMP
, key
|REX_64
, node
,
1326 ofs
+ (int32_t)offsetof(Node
, key
.u64
));
1327 lj_assertA(irt_isnum(irkey
->t
) || irt_isgcv(irkey
->t
),
1328 "bad HREFK key type");
1329 /* Assumes -0.0 is already canonicalized to +0.0. */
1330 emit_loadu64(as
, key
, irt_isnum(irkey
->t
) ? ir_knum(irkey
)->u64
:
1332 ((uint64_t)irt_toitype(irkey
->t
) << 47) |
1333 (uint64_t)ir_kgc(irkey
));
1335 ((uint64_t)irt_toitype(irkey
->t
) << 32) |
1336 (uint64_t)(uint32_t)ptr2addr(ir_kgc(irkey
)));
1339 lj_assertA(!irt_isnil(irkey
->t
), "bad HREFK key type");
1341 emit_i32(as
, (irt_toitype(irkey
->t
)<<15)|0x7fff);
1342 emit_rmro(as
, XO_ARITHi
, XOg_CMP
, node
,
1343 ofs
+ (int32_t)offsetof(Node
, key
.it
));
1345 emit_i8(as
, irt_toitype(irkey
->t
));
1346 emit_rmro(as
, XO_ARITHi8
, XOg_CMP
, node
,
1347 ofs
+ (int32_t)offsetof(Node
, key
.it
));
1351 l_exit
= emit_label(as
);
1352 if (irt_isnum(irkey
->t
)) {
1353 /* Assumes -0.0 is already canonicalized to +0.0. */
1354 emit_gmroi(as
, XG_ARITHi(XOg_CMP
), node
,
1355 ofs
+ (int32_t)offsetof(Node
, key
.u32
.lo
),
1356 (int32_t)ir_knum(irkey
)->u32
.lo
);
1357 emit_sjcc(as
, CC_NE
, l_exit
);
1358 emit_gmroi(as
, XG_ARITHi(XOg_CMP
), node
,
1359 ofs
+ (int32_t)offsetof(Node
, key
.u32
.hi
),
1360 (int32_t)ir_knum(irkey
)->u32
.hi
);
1362 if (!irt_ispri(irkey
->t
)) {
1363 lj_assertA(irt_isgcv(irkey
->t
), "bad HREFK key type");
1364 emit_gmroi(as
, XG_ARITHi(XOg_CMP
), node
,
1365 ofs
+ (int32_t)offsetof(Node
, key
.gcr
),
1366 ptr2addr(ir_kgc(irkey
)));
1367 emit_sjcc(as
, CC_NE
, l_exit
);
1369 lj_assertA(!irt_isnil(irkey
->t
), "bad HREFK key type");
1370 emit_i8(as
, irt_toitype(irkey
->t
));
1371 emit_rmro(as
, XO_ARITHi8
, XOg_CMP
, node
,
1372 ofs
+ (int32_t)offsetof(Node
, key
.it
));
1377 static void asm_uref(ASMState
*as
, IRIns
*ir
)
1379 Reg dest
= ra_dest(as
, ir
, RSET_GPR
);
1380 int guarded
= (irt_t(ir
->t
) & (IRT_GUARD
|IRT_TYPE
)) == (IRT_GUARD
|IRT_PGC
);
1381 if (irref_isk(ir
->op1
) && !guarded
) {
1382 GCfunc
*fn
= ir_kfunc(IR(ir
->op1
));
1383 MRef
*v
= &gcref(fn
->l
.uvptr
[(ir
->op2
>> 8)])->uv
.v
;
1384 emit_rma(as
, XO_MOV
, dest
|REX_GC64
, v
);
1386 Reg uv
= ra_scratch(as
, RSET_GPR
);
1387 if (ir
->o
== IR_UREFC
)
1388 emit_rmro(as
, XO_LEA
, dest
|REX_GC64
, uv
, offsetof(GCupval
, tv
));
1390 emit_rmro(as
, XO_MOV
, dest
|REX_GC64
, uv
, offsetof(GCupval
, v
));
1392 asm_guardcc(as
, ir
->o
== IR_UREFC
? CC_E
: CC_NE
);
1394 emit_rmro(as
, XO_ARITHib
, XOg_CMP
, uv
, offsetof(GCupval
, closed
));
1396 if (irref_isk(ir
->op1
)) {
1397 GCfunc
*fn
= ir_kfunc(IR(ir
->op1
));
1398 GCobj
*o
= gcref(fn
->l
.uvptr
[(ir
->op2
>> 8)]);
1399 emit_loada(as
, uv
, o
);
1401 emit_rmro(as
, XO_MOV
, uv
|REX_GC64
, ra_alloc1(as
, ir
->op1
, RSET_GPR
),
1402 (int32_t)offsetof(GCfuncL
, uvptr
) +
1403 (int32_t)sizeof(MRef
) * (int32_t)(ir
->op2
>> 8));
1408 static void asm_fref(ASMState
*as
, IRIns
*ir
)
1410 Reg dest
= ra_dest(as
, ir
, RSET_GPR
);
1411 asm_fusefref(as
, ir
, RSET_GPR
);
1412 emit_mrm(as
, XO_LEA
, dest
, RID_MRM
);
1415 static void asm_strref(ASMState
*as
, IRIns
*ir
)
1417 Reg dest
= ra_dest(as
, ir
, RSET_GPR
);
1418 asm_fusestrref(as
, ir
, RSET_GPR
);
1419 if (as
->mrm
.base
== RID_NONE
)
1420 emit_loadi(as
, dest
, as
->mrm
.ofs
);
1421 else if (as
->mrm
.base
== dest
&& as
->mrm
.idx
== RID_NONE
)
1422 emit_gri(as
, XG_ARITHi(XOg_ADD
), dest
|REX_GC64
, as
->mrm
.ofs
);
1424 emit_mrm(as
, XO_LEA
, dest
|REX_GC64
, RID_MRM
);
1427 /* -- Loads and stores ---------------------------------------------------- */
1429 static void asm_fxload(ASMState
*as
, IRIns
*ir
)
1431 Reg dest
= ra_dest(as
, ir
, irt_isfp(ir
->t
) ? RSET_FPR
: RSET_GPR
);
1433 if (ir
->o
== IR_FLOAD
)
1434 asm_fusefref(as
, ir
, RSET_GPR
);
1436 asm_fusexref(as
, ir
->op1
, RSET_GPR
);
1437 /* ir->op2 is ignored -- unaligned loads are ok on x86. */
1438 switch (irt_type(ir
->t
)) {
1439 case IRT_I8
: xo
= XO_MOVSXb
; break;
1440 case IRT_U8
: xo
= XO_MOVZXb
; break;
1441 case IRT_I16
: xo
= XO_MOVSXw
; break;
1442 case IRT_U16
: xo
= XO_MOVZXw
; break;
1443 case IRT_NUM
: xo
= XO_MOVSD
; break;
1444 case IRT_FLOAT
: xo
= XO_MOVSS
; break;
1446 if (LJ_64
&& irt_is64(ir
->t
))
1449 lj_assertA(irt_isint(ir
->t
) || irt_isu32(ir
->t
) || irt_isaddr(ir
->t
),
1450 "unsplit 64 bit load");
1454 emit_mrm(as
, xo
, dest
, RID_MRM
);
1457 #define asm_fload(as, ir) asm_fxload(as, ir)
1458 #define asm_xload(as, ir) asm_fxload(as, ir)
1460 static void asm_fxstore(ASMState
*as
, IRIns
*ir
)
1462 RegSet allow
= RSET_GPR
;
1463 Reg src
= RID_NONE
, osrc
= RID_NONE
;
1465 if (ir
->r
== RID_SINK
)
1467 /* The IRT_I16/IRT_U16 stores should never be simplified for constant
1468 ** values since mov word [mem], imm16 has a length-changing prefix.
1470 if (irt_isi16(ir
->t
) || irt_isu16(ir
->t
) || irt_isfp(ir
->t
) ||
1471 !asm_isk32(as
, ir
->op2
, &k
)) {
1472 RegSet allow8
= irt_isfp(ir
->t
) ? RSET_FPR
:
1473 (irt_isi8(ir
->t
) || irt_isu8(ir
->t
)) ? RSET_GPR8
: RSET_GPR
;
1474 src
= osrc
= ra_alloc1(as
, ir
->op2
, allow8
);
1475 if (!LJ_64
&& !rset_test(allow8
, src
)) { /* Already in wrong register. */
1476 rset_clear(allow
, osrc
);
1477 src
= ra_scratch(as
, allow8
);
1479 rset_clear(allow
, src
);
1481 if (ir
->o
== IR_FSTORE
) {
1482 asm_fusefref(as
, IR(ir
->op1
), allow
);
1484 asm_fusexref(as
, ir
->op1
, allow
);
1485 if (LJ_32
&& ir
->o
== IR_HIOP
) as
->mrm
.ofs
+= 4;
1487 if (ra_hasreg(src
)) {
1489 switch (irt_type(ir
->t
)) {
1490 case IRT_I8
: case IRT_U8
: xo
= XO_MOVtob
; src
|= FORCE_REX
; break;
1491 case IRT_I16
: case IRT_U16
: xo
= XO_MOVtow
; break;
1492 case IRT_NUM
: xo
= XO_MOVSDto
; break;
1493 case IRT_FLOAT
: xo
= XO_MOVSSto
; break;
1494 #if LJ_64 && !LJ_GC64
1496 /* NYI: mask 64 bit lightuserdata. */
1497 lj_assertA(0, "store of lightuserdata");
1500 if (LJ_64
&& irt_is64(ir
->t
))
1503 lj_assertA(irt_isint(ir
->t
) || irt_isu32(ir
->t
) || irt_isaddr(ir
->t
),
1504 "unsplit 64 bit store");
1508 emit_mrm(as
, xo
, src
, RID_MRM
);
1509 if (!LJ_64
&& src
!= osrc
) {
1510 ra_noweak(as
, osrc
);
1511 emit_rr(as
, XO_MOV
, src
, osrc
);
1514 if (irt_isi8(ir
->t
) || irt_isu8(ir
->t
)) {
1516 emit_mrm(as
, XO_MOVmib
, 0, RID_MRM
);
1518 lj_assertA(irt_is64(ir
->t
) || irt_isint(ir
->t
) || irt_isu32(ir
->t
) ||
1519 irt_isaddr(ir
->t
), "bad store type");
1521 emit_mrm(as
, XO_MOVmi
, REX_64IR(ir
, 0), RID_MRM
);
1526 #define asm_fstore(as, ir) asm_fxstore(as, ir)
1527 #define asm_xstore(as, ir) asm_fxstore(as, ir)
1529 #if LJ_64 && !LJ_GC64
1530 static Reg
asm_load_lightud64(ASMState
*as
, IRIns
*ir
, int typecheck
)
1532 if (ra_used(ir
) || typecheck
) {
1533 Reg dest
= ra_dest(as
, ir
, RSET_GPR
);
1535 Reg tmp
= ra_scratch(as
, rset_exclude(RSET_GPR
, dest
));
1536 asm_guardcc(as
, CC_NE
);
1538 emit_rr(as
, XO_ARITHi8
, XOg_CMP
, tmp
);
1539 emit_shifti(as
, XOg_SAR
|REX_64
, tmp
, 47);
1540 emit_rr(as
, XO_MOV
, tmp
|REX_64
, dest
);
1549 static void asm_ahuvload(ASMState
*as
, IRIns
*ir
)
1554 lj_assertA(irt_isnum(ir
->t
) || irt_ispri(ir
->t
) || irt_isaddr(ir
->t
) ||
1555 (LJ_DUALNUM
&& irt_isint(ir
->t
)),
1556 "bad load type %d", irt_type(ir
->t
));
1557 #if LJ_64 && !LJ_GC64
1558 if (irt_islightud(ir
->t
)) {
1559 Reg dest
= asm_load_lightud64(as
, ir
, 1);
1560 if (ra_hasreg(dest
)) {
1562 asm_fuseahuref(as
, ir
->op1
, RSET_GPR
);
1563 if (ir
->o
== IR_VLOAD
) as
->mrm
.ofs
+= 8 * ir
->op2
;
1564 emit_mrm(as
, XO_MOV
, dest
|REX_64
, RID_MRM
);
1570 RegSet allow
= irt_isnum(ir
->t
) ? RSET_FPR
: RSET_GPR
;
1571 Reg dest
= ra_dest(as
, ir
, allow
);
1572 asm_fuseahuref(as
, ir
->op1
, RSET_GPR
);
1573 if (ir
->o
== IR_VLOAD
) as
->mrm
.ofs
+= 8 * ir
->op2
;
1575 if (irt_isaddr(ir
->t
)) {
1576 emit_shifti(as
, XOg_SHR
|REX_64
, dest
, 17);
1577 asm_guardcc(as
, CC_NE
);
1578 emit_i8(as
, irt_toitype(ir
->t
));
1579 emit_rr(as
, XO_ARITHi8
, XOg_CMP
, dest
);
1580 emit_i8(as
, XI_O16
);
1581 if ((as
->flags
& JIT_F_BMI2
)) {
1583 emit_mrm(as
, XV_RORX
|VEX_64
, dest
, RID_MRM
);
1585 emit_shifti(as
, XOg_ROR
|REX_64
, dest
, 47);
1586 emit_mrm(as
, XO_MOV
, dest
|REX_64
, RID_MRM
);
1591 emit_mrm(as
, dest
< RID_MAX_GPR
? XO_MOV
: XO_MOVSD
, dest
, RID_MRM
);
1593 RegSet gpr
= RSET_GPR
;
1595 if (irt_isaddr(ir
->t
)) {
1596 tmp
= ra_scratch(as
, RSET_GPR
);
1597 gpr
= rset_exclude(gpr
, tmp
);
1600 asm_fuseahuref(as
, ir
->op1
, gpr
);
1601 if (ir
->o
== IR_VLOAD
) as
->mrm
.ofs
+= 8 * ir
->op2
;
1603 /* Always do the type check, even if the load result is unused. */
1605 asm_guardcc(as
, irt_isnum(ir
->t
) ? CC_AE
: CC_NE
);
1606 if (LJ_64
&& irt_type(ir
->t
) >= IRT_NUM
) {
1607 lj_assertA(irt_isinteger(ir
->t
) || irt_isnum(ir
->t
),
1608 "bad load type %d", irt_type(ir
->t
));
1611 emit_u32(as
, LJ_TISNUM
<< 15);
1613 emit_u32(as
, LJ_TISNUM
);
1615 emit_mrm(as
, XO_ARITHi
, XOg_CMP
, RID_MRM
);
1617 } else if (irt_isaddr(ir
->t
)) {
1619 emit_i8(as
, irt_toitype(ir
->t
));
1620 emit_mrm(as
, XO_ARITHi8
, XOg_CMP
, tmp
);
1621 emit_shifti(as
, XOg_SAR
|REX_64
, tmp
, 47);
1622 emit_mrm(as
, XO_MOV
, tmp
|REX_64
, RID_MRM
);
1623 } else if (irt_isnil(ir
->t
)) {
1626 emit_mrm(as
, XO_ARITHi8
, XOg_CMP
|REX_64
, RID_MRM
);
1628 emit_u32(as
, (irt_toitype(ir
->t
) << 15) | 0x7fff);
1629 emit_mrm(as
, XO_ARITHi
, XOg_CMP
, RID_MRM
);
1632 emit_i8(as
, irt_toitype(ir
->t
));
1633 emit_mrm(as
, XO_ARITHi8
, XOg_CMP
, RID_MRM
);
1638 static void asm_ahustore(ASMState
*as
, IRIns
*ir
)
1640 if (ir
->r
== RID_SINK
)
1642 if (irt_isnum(ir
->t
)) {
1643 Reg src
= ra_alloc1(as
, ir
->op2
, RSET_FPR
);
1644 asm_fuseahuref(as
, ir
->op1
, RSET_GPR
);
1645 emit_mrm(as
, XO_MOVSDto
, src
, RID_MRM
);
1646 #if LJ_64 && !LJ_GC64
1647 } else if (irt_islightud(ir
->t
)) {
1648 Reg src
= ra_alloc1(as
, ir
->op2
, RSET_GPR
);
1649 asm_fuseahuref(as
, ir
->op1
, rset_exclude(RSET_GPR
, src
));
1650 emit_mrm(as
, XO_MOVto
, src
|REX_64
, RID_MRM
);
1653 } else if (irref_isk(ir
->op2
)) {
1655 lj_ir_kvalue(as
->J
->L
, &k
, IR(ir
->op2
));
1656 asm_fuseahuref(as
, ir
->op1
, RSET_GPR
);
1659 emit_mrm(as
, XO_MOVmi
, REX_64
, RID_MRM
);
1661 emit_u32(as
, k
.u32
.lo
);
1662 emit_mrm(as
, XO_MOVmi
, 0, RID_MRM
);
1664 emit_u32(as
, k
.u32
.hi
);
1665 emit_mrm(as
, XO_MOVmi
, 0, RID_MRM
);
1669 IRIns
*irr
= IR(ir
->op2
);
1670 RegSet allow
= RSET_GPR
;
1672 if (!irref_isk(ir
->op2
)) {
1673 src
= ra_alloc1(as
, ir
->op2
, allow
);
1674 rset_clear(allow
, src
);
1676 asm_fuseahuref(as
, ir
->op1
, allow
);
1677 if (ra_hasreg(src
)) {
1679 if (!(LJ_DUALNUM
&& irt_isinteger(ir
->t
))) {
1680 /* TODO: 64 bit store + 32 bit load-modify-store is suboptimal. */
1682 emit_u32(as
, irt_toitype(ir
->t
) << 15);
1683 emit_mrm(as
, XO_ARITHi
, XOg_OR
, RID_MRM
);
1685 emit_mrm(as
, XO_MOVto
, src
|REX_64
, RID_MRM
);
1689 emit_mrm(as
, XO_MOVto
, src
, RID_MRM
);
1690 } else if (!irt_ispri(irr
->t
)) {
1691 lj_assertA(irt_isaddr(ir
->t
) || (LJ_DUALNUM
&& irt_isinteger(ir
->t
)),
1693 emit_i32(as
, irr
->i
);
1694 emit_mrm(as
, XO_MOVmi
, 0, RID_MRM
);
1698 lj_assertA(LJ_DUALNUM
&& irt_isinteger(ir
->t
), "bad store type");
1699 emit_i32(as
, LJ_TNUMX
<< 15);
1701 emit_i32(as
, (int32_t)irt_toitype(ir
->t
));
1703 emit_mrm(as
, XO_MOVmi
, 0, RID_MRM
);
static void asm_sload(ASMState *as, IRIns *ir)
{
  int32_t ofs = 8*((int32_t)ir->op1-1-LJ_FR2) +
                (!LJ_FR2 && (ir->op2 & IRSLOAD_FRAME) ? 4 : 0);
  IRType1 t = ir->t;
  Reg base;
  lj_assertA(!(ir->op2 & IRSLOAD_PARENT),
             "bad parent SLOAD");  /* Handled by asm_head_side(). */
  lj_assertA(irt_isguard(t) || !(ir->op2 & IRSLOAD_TYPECHECK),
             "inconsistent SLOAD variant");
  lj_assertA(LJ_DUALNUM ||
             !irt_isint(t) ||
             (ir->op2 & (IRSLOAD_CONVERT|IRSLOAD_FRAME|IRSLOAD_KEYINDEX)),
             "bad SLOAD type");
  if ((ir->op2 & IRSLOAD_CONVERT) && irt_isguard(t) && irt_isint(t)) {
    Reg left = ra_scratch(as, RSET_FPR);
    asm_tointg(as, ir, left);  /* Frees dest reg. Do this before base alloc. */
    base = ra_alloc1(as, REF_BASE, RSET_GPR);
    emit_rmro(as, XO_MOVSD, left, base, ofs);
    t.irt = IRT_NUM;  /* Continue with a regular number type check. */
#if LJ_64 && !LJ_GC64
  } else if (irt_islightud(t)) {
    Reg dest = asm_load_lightud64(as, ir, (ir->op2 & IRSLOAD_TYPECHECK));
    if (ra_hasreg(dest)) {
      base = ra_alloc1(as, REF_BASE, RSET_GPR);
      emit_rmro(as, XO_MOV, dest|REX_64, base, ofs);
    }
    return;
#endif
  } else if (ra_used(ir)) {
    RegSet allow = irt_isnum(t) ? RSET_FPR : RSET_GPR;
    Reg dest = ra_dest(as, ir, allow);
    base = ra_alloc1(as, REF_BASE, RSET_GPR);
    lj_assertA(irt_isnum(t) || irt_isint(t) || irt_isaddr(t),
               "bad SLOAD type %d", irt_type(t));
    if ((ir->op2 & IRSLOAD_CONVERT)) {
      t.irt = irt_isint(t) ? IRT_NUM : IRT_INT;  /* Check for original type. */
      emit_rmro(as, irt_isint(t) ? XO_CVTSI2SD : XO_CVTTSD2SI, dest, base, ofs);
    } else {
#if LJ_GC64
      if (irt_isaddr(t)) {
        /* LJ_GC64 type check + tag removal without BMI2 and with BMI2:
        **
        **  mov r64, [addr]    rorx r64, [addr], 47
        **  ror r64, 47
        **  cmp r16, itype     cmp r16, itype
        **  jne ->exit         jne ->exit
        **  shr r64, 16        shr r64, 16
        */
        emit_shifti(as, XOg_SHR|REX_64, dest, 17);
        if ((ir->op2 & IRSLOAD_TYPECHECK)) {
          asm_guardcc(as, CC_NE);
          emit_i8(as, irt_toitype(t));
          emit_rr(as, XO_ARITHi8, XOg_CMP, dest);
          emit_i8(as, XI_O16);
        }
        if ((as->flags & JIT_F_BMI2)) {
          emit_i8(as, 47);
          emit_rmro(as, XV_RORX|VEX_64, dest, base, ofs);
        } else {
          if ((ir->op2 & IRSLOAD_TYPECHECK))
            emit_shifti(as, XOg_ROR|REX_64, dest, 47);
          else
            emit_shifti(as, XOg_SHL|REX_64, dest, 17);
          emit_rmro(as, XO_MOV, dest|REX_64, base, ofs);
        }
        return;
      }
#endif
      emit_rmro(as, irt_isnum(t) ? XO_MOVSD : XO_MOV, dest, base, ofs);
    }
  } else {
    if (!(ir->op2 & IRSLOAD_TYPECHECK))
      return;  /* No type check: avoid base alloc. */
    base = ra_alloc1(as, REF_BASE, RSET_GPR);
  }
  if ((ir->op2 & IRSLOAD_TYPECHECK)) {
    /* Need type check, even if the load result is unused. */
    asm_guardcc(as, irt_isnum(t) ? CC_AE : CC_NE);
    if ((LJ_64 && irt_type(t) >= IRT_NUM) || (ir->op2 & IRSLOAD_KEYINDEX)) {
      lj_assertA(irt_isinteger(t) || irt_isnum(t),
                 "bad SLOAD type %d", irt_type(t));
      emit_u32(as, (ir->op2 & IRSLOAD_KEYINDEX) ? LJ_KEYINDEX :
               LJ_GC64 ? (LJ_TISNUM << 15) : LJ_TISNUM);
      emit_rmro(as, XO_ARITHi, XOg_CMP, base, ofs+4);
#if LJ_GC64
    } else if (irt_isnil(t)) {
      /* LJ_GC64 type check for nil:
      **
      **   cmp qword [addr], -1
      **   jne ->exit
      */
      emit_i8(as, -1);
      emit_rmro(as, XO_ARITHi8, XOg_CMP|REX_64, base, ofs);
    } else if (irt_ispri(t)) {
      emit_u32(as, (irt_toitype(t) << 15) | 0x7fff);
      emit_rmro(as, XO_ARITHi, XOg_CMP, base, ofs+4);
    } else {
      /* LJ_GC64 type check only:
      **
      **   mov r64, [addr]
      **   sar r64, 47
      **   cmp r32, itype
      **   jne ->exit
      */
      Reg tmp = ra_scratch(as, rset_exclude(RSET_GPR, base));
      emit_i8(as, irt_toitype(t));
      emit_rr(as, XO_ARITHi8, XOg_CMP, tmp);
      emit_shifti(as, XOg_SAR|REX_64, tmp, 47);
      emit_rmro(as, XO_MOV, tmp|REX_64, base, ofs);
#else
    } else {
      emit_i8(as, irt_toitype(t));
      emit_rmro(as, XO_ARITHi8, XOg_CMP, base, ofs+4);
#endif
    }
  }
}

/* -- Allocations --------------------------------------------------------- */

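/* IR_CNEW allocates a GCcdata object via lj_mem_newgco (or lj_cdata_newv
** for VLA/VLS/aligned cdata); IR_CNEWI additionally stores a 4 or 8 byte
** immutable payload right after the GCcdata header. Machine code is emitted
** backwards, so the stores below end up after the call at runtime.
*/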
#if LJ_HASFFI
static void asm_cnew(ASMState *as, IRIns *ir)
{
  CTState *cts = ctype_ctsG(J2G(as->J));
  CTypeID id = (CTypeID)IR(ir->op1)->i;
  CTSize sz;
  CTInfo info = lj_ctype_info(cts, id, &sz);
  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_mem_newgco];
  IRRef args[4];
  lj_assertA(sz != CTSIZE_INVALID || (ir->o == IR_CNEW && ir->op2 != REF_NIL),
             "bad CNEW/CNEWI operands");

  as->gcsteps++;
  asm_setupresult(as, ir, ci);  /* GCcdata * */

  /* Initialize immutable cdata object. */
  if (ir->o == IR_CNEWI) {
    RegSet allow = (RSET_GPR & ~RSET_SCRATCH);
#if LJ_64
    Reg r64 = sz == 8 ? REX_64 : 0;
    if (irref_isk(ir->op2)) {
      IRIns *irk = IR(ir->op2);
      uint64_t k = (irk->o == IR_KINT64 ||
                    (LJ_GC64 && (irk->o == IR_KPTR || irk->o == IR_KKPTR))) ?
                   ir_k64(irk)->u64 : (uint64_t)(uint32_t)irk->i;
      if (sz == 4 || checki32((int64_t)k)) {
        emit_i32(as, (int32_t)k);
        emit_rmro(as, XO_MOVmi, r64, RID_RET, sizeof(GCcdata));
      } else {
        emit_movtomro(as, RID_ECX + r64, RID_RET, sizeof(GCcdata));
        emit_loadu64(as, RID_ECX, k);
      }
    } else {
      Reg r = ra_alloc1(as, ir->op2, allow);
      emit_movtomro(as, r + r64, RID_RET, sizeof(GCcdata));
    }
#else
    int32_t ofs = sizeof(GCcdata);
    if (sz == 8) {
      ofs += 4; ir++;
      lj_assertA(ir->o == IR_HIOP, "missing CNEWI HIOP");
    }
    do {
      if (irref_isk(ir->op2)) {
        emit_movmroi(as, RID_RET, ofs, IR(ir->op2)->i);
      } else {
        Reg r = ra_alloc1(as, ir->op2, allow);
        emit_movtomro(as, r, RID_RET, ofs);
        rset_clear(allow, r);
      }
      if (ofs == sizeof(GCcdata)) break;
      ofs -= 4; ir--;
    } while (1);
#endif
    lj_assertA(sz == 4 || sz == 8, "bad CNEWI size %d", sz);
  } else if (ir->op2 != REF_NIL) {  /* Create VLA/VLS/aligned cdata. */
    ci = &lj_ir_callinfo[IRCALL_lj_cdata_newv];
    args[0] = ASMREF_L;     /* lua_State *L */
    args[1] = ir->op1;      /* CTypeID id   */
    args[2] = ir->op2;      /* CTSize sz    */
    args[3] = ASMREF_TMP1;  /* CTSize align */
    asm_gencall(as, ci, args);
    emit_loadi(as, ra_releasetmp(as, ASMREF_TMP1), (int32_t)ctype_align(info));
    return;
  }

  /* Combine initialization of marked, gct and ctypeid. */
  emit_movtomro(as, RID_ECX, RID_RET, offsetof(GCcdata, marked));
  emit_gri(as, XG_ARITHi(XOg_OR), RID_ECX,
           (int32_t)((~LJ_TCDATA<<8)+(id<<16)));
  emit_gri(as, XG_ARITHi(XOg_AND), RID_ECX, LJ_GC_WHITES);
  emit_opgl(as, XO_MOVZXb, RID_ECX, gc.currentwhite);

  args[0] = ASMREF_L;     /* lua_State *L */
  args[1] = ASMREF_TMP1;  /* MSize size   */
  asm_gencall(as, ci, args);
  emit_loadi(as, ra_releasetmp(as, ASMREF_TMP1), (int32_t)(sz+sizeof(GCcdata)));
}
#endif

/* -- Write barriers ------------------------------------------------------ */

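/* Barrier for stores into a table: if the table is already black, clear its
** black bit and link it onto g->gc.grayagain, so the GC re-traverses it.
** The emitted code tests GCtab->marked and skips the whole sequence for
** non-black tables.
*/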
static void asm_tbar(ASMState *as, IRIns *ir)
{
  Reg tab = ra_alloc1(as, ir->op1, RSET_GPR);
  Reg tmp = ra_scratch(as, rset_exclude(RSET_GPR, tab));
  MCLabel l_end = emit_label(as);
  emit_movtomro(as, tmp|REX_GC64, tab, offsetof(GCtab, gclist));
  emit_setgl(as, tab, gc.grayagain);
  emit_getgl(as, tmp, gc.grayagain);
  emit_i8(as, ~LJ_GC_BLACK);
  emit_rmro(as, XO_ARITHib, XOg_AND, tab, offsetof(GCtab, marked));
  emit_sjcc(as, CC_Z, l_end);
  emit_i8(as, LJ_GC_BLACK);
  emit_rmro(as, XO_GROUP3b, XOg_TEST, tab, offsetof(GCtab, marked));
}

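/* Barrier for stores through upvalues (only closed upvalues, IR_UREFC).
** The slow path calls lj_gc_barrieruv(g, tv); it is skipped unless the
** upvalue is black and the stored value is white.
*/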
static void asm_obar(ASMState *as, IRIns *ir)
{
  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_gc_barrieruv];
  IRRef args[2];
  MCLabel l_end;
  Reg obj;
  /* No need for other object barriers (yet). */
  lj_assertA(IR(ir->op1)->o == IR_UREFC, "bad OBAR type");
  ra_evictset(as, RSET_SCRATCH);
  l_end = emit_label(as);
  args[0] = ASMREF_TMP1;  /* global_State *g */
  args[1] = ir->op1;      /* TValue *tv      */
  asm_gencall(as, ci, args);
  emit_loada(as, ra_releasetmp(as, ASMREF_TMP1), J2G(as->J));
  obj = IR(ir->op1)->r;
  emit_sjcc(as, CC_Z, l_end);
  emit_i8(as, LJ_GC_WHITES);
  if (irref_isk(ir->op2)) {
    GCobj *vp = ir_kgc(IR(ir->op2));
    emit_rma(as, XO_GROUP3b, XOg_TEST, &vp->gch.marked);
  } else {
    Reg val = ra_alloc1(as, ir->op2, rset_exclude(RSET_SCRATCH&RSET_GPR, obj));
    emit_rmro(as, XO_GROUP3b, XOg_TEST, val, (int32_t)offsetof(GChead, marked));
  }
  emit_sjcc(as, CC_Z, l_end);
  emit_i8(as, LJ_GC_BLACK);
  emit_rmro(as, XO_GROUP3b, XOg_TEST, obj,
            (int32_t)offsetof(GCupval, marked)-(int32_t)offsetof(GCupval, tv));
}

/* -- FP/int arithmetic and logic operations ------------------------------ */

/* Load reference onto x87 stack. Force a spill to memory if needed. */
static void asm_x87load(ASMState *as, IRRef ref)
{
  IRIns *ir = IR(ref);
  if (ir->o == IR_KNUM) {
    cTValue *tv = ir_knum(ir);
    if (tvispzero(tv))  /* Use fldz only for +0. */
      emit_x87op(as, XI_FLDZ);
    else if (tvispone(tv))
      emit_x87op(as, XI_FLD1);
    else
      emit_rma(as, XO_FLDq, XOg_FLDq, tv);
  } else if (ir->o == IR_CONV && ir->op2 == IRCONV_NUM_INT && !ra_used(ir) &&
             !irref_isk(ir->op1) && mayfuse(as, ir->op1)) {
    IRIns *iri = IR(ir->op1);
    emit_rmro(as, XO_FILDd, XOg_FILDd, RID_ESP, ra_spill(as, iri));
  } else {
    emit_mrm(as, XO_FLDq, XOg_FLDq, asm_fuseload(as, ref, RSET_EMPTY));
  }
}

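/* FP math ops: sqrt maps to SQRTSD, floor/ceil/trunc use ROUNDSD if SSE4.1
** is available or fall back to the lj_vm_*_sse helpers, everything else
** goes through a call via asm_callid.
*/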
static void asm_fpmath(ASMState *as, IRIns *ir)
{
  IRFPMathOp fpm = (IRFPMathOp)ir->op2;
  if (fpm == IRFPM_SQRT) {
    Reg dest = ra_dest(as, ir, RSET_FPR);
    Reg left = asm_fuseload(as, ir->op1, RSET_FPR);
    emit_mrm(as, XO_SQRTSD, dest, left);
  } else if (fpm <= IRFPM_TRUNC) {
    if (as->flags & JIT_F_SSE4_1) {  /* SSE4.1 has a rounding instruction. */
      Reg dest = ra_dest(as, ir, RSET_FPR);
      Reg left = asm_fuseload(as, ir->op1, RSET_FPR);
      /* ROUNDSD has a 4-byte opcode which doesn't fit in x86Op.
      ** Let's pretend it's a 3-byte opcode, and compensate afterwards.
      ** This is atrocious, but the alternatives are much worse.
      */
      /* Round down/up/trunc == 1001/1010/1011. */
      emit_i8(as, 0x09 + fpm);
      emit_mrm(as, XO_ROUNDSD, dest, left);
      if (LJ_64 && as->mcp[1] != (MCode)(XO_ROUNDSD >> 16)) {
        as->mcp[0] = as->mcp[1]; as->mcp[1] = 0x0f;  /* Swap 0F and REX. */
      }
      *--as->mcp = 0x66;  /* 1st byte of ROUNDSD opcode. */
    } else {  /* Call helper functions for SSE2 variant. */
      /* The modified regs must match with the *.dasc implementation. */
      RegSet drop = RSET_RANGE(RID_XMM0, RID_XMM3+1)|RID2RSET(RID_EAX);
      if (ra_hasreg(ir->r))
        rset_clear(drop, ir->r);  /* Dest reg handled below. */
      ra_evictset(as, drop);
      ra_destreg(as, ir, RID_XMM0);
      emit_call(as, fpm == IRFPM_FLOOR ? lj_vm_floor_sse :
                    fpm == IRFPM_CEIL ? lj_vm_ceil_sse : lj_vm_trunc_sse);
      ra_left(as, RID_XMM0, ir->op1);
    }
  } else {
    asm_callid(as, ir, IRCALL_lj_vm_floor + fpm);
  }
}

static void asm_ldexp(ASMState *as, IRIns *ir)
{
  int32_t ofs = sps_scale(ir->s);  /* Use spill slot or temp slots. */
  Reg dest = ir->r;
  if (ra_hasreg(dest)) {
    ra_free(as, dest);
    ra_modified(as, dest);
    emit_rmro(as, XO_MOVSD, dest, RID_ESP, ofs);
  }
  emit_rmro(as, XO_FSTPq, XOg_FSTPq, RID_ESP, ofs);
  emit_x87op(as, XI_FPOP1);
  emit_x87op(as, XI_FSCALE);
  asm_x87load(as, ir->op1);
  asm_x87load(as, ir->op2);
}

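/* Heuristic for commutative ops: decide whether to swap the operands so the
** (likely reusable) left operand can stay in the destination register and
** constants, loop-invariants and fusable loads end up on the right, where
** they can be folded into the instruction's memory operand.
*/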
static int asm_swapops(ASMState *as, IRIns *ir)
{
  IRIns *irl = IR(ir->op1);
  IRIns *irr = IR(ir->op2);
  lj_assertA(ra_noreg(irr->r), "bad usage");
  if (!irm_iscomm(lj_ir_mode[ir->o]))
    return 0;  /* Can't swap non-commutative operations. */
  if (irref_isk(ir->op2))
    return 0;  /* Don't swap constants to the left. */
  if (ra_hasreg(irl->r))
    return 1;  /* Swap if left already has a register. */
  if (ra_samehint(ir->r, irr->r))
    return 1;  /* Swap if dest and right have matching hints. */
  if (as->curins > as->loopref) {  /* In variant part? */
    if (ir->op2 < as->loopref && !irt_isphi(irr->t))
      return 0;  /* Keep invariants on the right. */
    if (ir->op1 < as->loopref && !irt_isphi(irl->t))
      return 1;  /* Swap invariants to the right. */
  }
  if (opisfusableload(irl->o))
    return 1;  /* Swap fusable loads to the right. */
  return 0;  /* Otherwise don't swap. */
}

static void asm_fparith(ASMState *as, IRIns *ir, x86Op xo)
{
  IRRef lref = ir->op1;
  IRRef rref = ir->op2;
  RegSet allow = RSET_FPR;
  Reg dest;
  Reg right = IR(rref)->r;
  if (ra_hasreg(right)) {
    rset_clear(allow, right);
    ra_noweak(as, right);
  }
  dest = ra_dest(as, ir, allow);
  if (lref == rref) {
    right = dest;
  } else if (ra_noreg(right)) {
    if (asm_swapops(as, ir)) {
      IRRef tmp = lref; lref = rref; rref = tmp;
    }
    right = asm_fuseload(as, rref, rset_clear(allow, dest));
  }
  emit_mrm(as, xo, dest, right);
  ra_left(as, dest, lref);
}

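/* Integer arithmetic. If the preceding compare against zero set as->flagmcp,
** the 'test r,r' it emitted is dropped and the condition code is remapped
** (e.g. 'less' to 'sign'), since the arithmetic op itself already sets the
** flags. LE/NLE can't be remapped, because they would need the overflow flag.
*/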
static void asm_intarith(ASMState *as, IRIns *ir, x86Arith xa)
{
  IRRef lref = ir->op1;
  IRRef rref = ir->op2;
  RegSet allow = RSET_GPR;
  Reg dest, right;
  int32_t k = 0;
  if (as->flagmcp == as->mcp) {  /* Drop test r,r instruction. */
    MCode *p = as->mcp + ((LJ_64 && *as->mcp < XI_TESTb) ? 3 : 2);
    MCode *q = p[0] == 0x0f ? p+1 : p;
    if ((*q & 15) < 14) {
      if ((*q & 15) >= 12) *q -= 4;  /* L <->S, NL <-> NS */
      as->flagmcp = NULL;
      as->mcp = p;
    }  /* else: cannot transform LE/NLE to cc without use of OF. */
  }
  right = IR(rref)->r;
  if (ra_hasreg(right)) {
    rset_clear(allow, right);
    ra_noweak(as, right);
  }
  dest = ra_dest(as, ir, allow);
  if (lref == rref) {
    right = dest;
  } else if (ra_noreg(right) && !asm_isk32(as, rref, &k)) {
    if (asm_swapops(as, ir)) {
      IRRef tmp = lref; lref = rref; rref = tmp;
    }
    right = asm_fuseloadm(as, rref, rset_clear(allow, dest), irt_is64(ir->t));
  }
  if (irt_isguard(ir->t))  /* For IR_ADDOV etc. */
    asm_guardcc(as, CC_O);
  if (xa != XOg_X_IMUL) {
    if (ra_hasreg(right))
      emit_mrm(as, XO_ARITH(xa), REX_64IR(ir, dest), right);
    else
      emit_gri(as, XG_ARITHi(xa), REX_64IR(ir, dest), k);
  } else if (ra_hasreg(right)) {  /* IMUL r, mrm. */
    emit_mrm(as, XO_IMUL, REX_64IR(ir, dest), right);
  } else {  /* IMUL r, r, k. */
    /* NYI: use lea/shl/add/sub (FOLD only does 2^k) depending on CPU. */
    Reg left = asm_fuseloadm(as, lref, RSET_GPR, irt_is64(ir->t));
    x86Op xo;
    if (checki8(k)) { emit_i8(as, k); xo = XO_IMULi8;
    } else { emit_i32(as, k); xo = XO_IMULi; }
    emit_mrm(as, xo, REX_64IR(ir, dest), left);
    return;
  }
  ra_left(as, dest, lref);
}

/* LEA is really a 4-operand ADD with an independent destination register,
** up to two source registers and an immediate. One register can be scaled
** by 1, 2, 4 or 8. This can be used to avoid moves or to fuse several
** simple arithmetic operations into one instruction.
**
** Currently only a few common cases are supported:
** - 3-operand ADD: y = a+b; y = a+k   with a and b already allocated
** - Left ADD fusion:    y = (a+b)+k; y = (a+k)+b
** - Right ADD fusion:   y = a+(b+k)
** The omitted variants have already been reduced by FOLD.
**
** There are more fusion opportunities, like gathering shifts or joining
** common references. But these are probably not worth the trouble, since
** array indexing is not decomposed and already makes use of all fields
** of the ModRM operand.
*/
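/* For example, with a and b already in registers and k a small constant:
**   y = a+b      -->  lea y, [a+b]
**   y = (a+k)+b  -->  lea y, [a+b+k]
*/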
static int asm_lea(ASMState *as, IRIns *ir)
{
  IRIns *irl = IR(ir->op1);
  IRIns *irr = IR(ir->op2);
  RegSet allow = RSET_GPR;
  Reg dest;
  as->mrm.base = as->mrm.idx = RID_NONE;
  as->mrm.scale = XM_SCALE1;
  as->mrm.ofs = 0;
  if (ra_hasreg(irl->r)) {
    rset_clear(allow, irl->r);
    ra_noweak(as, irl->r);
    as->mrm.base = irl->r;
    if (irref_isk(ir->op2) || ra_hasreg(irr->r)) {
      /* The PHI renaming logic does a better job in some cases. */
      if (ra_hasreg(ir->r) &&
          ((irt_isphi(irl->t) && as->phireg[ir->r] == ir->op1) ||
           (irt_isphi(irr->t) && as->phireg[ir->r] == ir->op2)))
        return 0;
      if (irref_isk(ir->op2)) {
        as->mrm.ofs = irr->i;
      } else {
        rset_clear(allow, irr->r);
        ra_noweak(as, irr->r);
        as->mrm.idx = irr->r;
      }
    } else if (irr->o == IR_ADD && mayfuse(as, ir->op2) &&
               irref_isk(irr->op2)) {
      Reg idx = ra_alloc1(as, irr->op1, allow);
      rset_clear(allow, idx);
      as->mrm.idx = (uint8_t)idx;
      as->mrm.ofs = IR(irr->op2)->i;
    } else {
      return 0;
    }
  } else if (ir->op1 != ir->op2 && irl->o == IR_ADD && mayfuse(as, ir->op1) &&
             (irref_isk(ir->op2) || irref_isk(irl->op2))) {
    Reg idx, base = ra_alloc1(as, irl->op1, allow);
    rset_clear(allow, base);
    as->mrm.base = (uint8_t)base;
    if (irref_isk(ir->op2)) {
      as->mrm.ofs = irr->i;
      idx = ra_alloc1(as, irl->op2, allow);
    } else {
      as->mrm.ofs = IR(irl->op2)->i;
      idx = ra_alloc1(as, ir->op2, allow);
    }
    rset_clear(allow, idx);
    as->mrm.idx = (uint8_t)idx;
  } else {
    return 0;
  }
  dest = ra_dest(as, ir, allow);
  emit_mrm(as, XO_LEA, dest, RID_MRM);
  return 1;  /* Success. */
}

static void asm_add(ASMState *as, IRIns *ir)
{
  if (irt_isnum(ir->t))
    asm_fparith(as, ir, XO_ADDSD);
  else if (as->flagmcp == as->mcp || irt_is64(ir->t) || !asm_lea(as, ir))
    asm_intarith(as, ir, XOg_ADD);
}

static void asm_sub(ASMState *as, IRIns *ir)
{
  if (irt_isnum(ir->t))
    asm_fparith(as, ir, XO_SUBSD);
  else  /* Note: no need for LEA trick here. i-k is encoded as i+(-k). */
    asm_intarith(as, ir, XOg_SUB);
}

static void asm_mul(ASMState *as, IRIns *ir)
{
  if (irt_isnum(ir->t))
    asm_fparith(as, ir, XO_MULSD);
  else
    asm_intarith(as, ir, XOg_X_IMUL);
}

#define asm_fpdiv(as, ir)   asm_fparith(as, ir, XO_DIVSD)

static void asm_neg_not(ASMState *as, IRIns *ir, x86Group3 xg)
{
  Reg dest = ra_dest(as, ir, RSET_GPR);
  emit_rr(as, XO_GROUP3, REX_64IR(ir, xg), dest);
  ra_left(as, dest, ir->op1);
}

static void asm_neg(ASMState *as, IRIns *ir)
{
  if (irt_isnum(ir->t))
    asm_fparith(as, ir, XO_XORPS);
  else
    asm_neg_not(as, ir, XOg_NEG);
}

#define asm_abs(as, ir)     asm_fparith(as, ir, XO_ANDPS)

static void asm_intmin_max(ASMState *as, IRIns *ir, int cc)
{
  Reg right, dest = ra_dest(as, ir, RSET_GPR);
  IRRef lref = ir->op1, rref = ir->op2;
  if (irref_isk(rref)) { lref = rref; rref = ir->op1; }
  right = ra_alloc1(as, rref, rset_exclude(RSET_GPR, dest));
  emit_rr(as, XO_CMOV + (cc<<24), REX_64IR(ir, dest), right);
  emit_rr(as, XO_CMP, REX_64IR(ir, dest), right);
  ra_left(as, dest, lref);
}

static void asm_min(ASMState *as, IRIns *ir)
{
  if (irt_isnum(ir->t))
    asm_fparith(as, ir, XO_MINSD);
  else
    asm_intmin_max(as, ir, CC_G);
}

static void asm_max(ASMState *as, IRIns *ir)
{
  if (irt_isnum(ir->t))
    asm_fparith(as, ir, XO_MAXSD);
  else
    asm_intmin_max(as, ir, CC_L);
}

/* Note: don't use LEA for overflow-checking arithmetic! */
#define asm_addov(as, ir)   asm_intarith(as, ir, XOg_ADD)
#define asm_subov(as, ir)   asm_intarith(as, ir, XOg_SUB)
#define asm_mulov(as, ir)   asm_intarith(as, ir, XOg_X_IMUL)

#define asm_bnot(as, ir)    asm_neg_not(as, ir, XOg_NOT)

static void asm_bswap(ASMState *as, IRIns *ir)
{
  Reg dest = ra_dest(as, ir, RSET_GPR);
  as->mcp = emit_op(XO_BSWAP + ((dest&7) << 24),
                    REX_64IR(ir, 0), dest, 0, as->mcp, 1);
  ra_left(as, dest, ir->op1);
}

#define asm_band(as, ir)    asm_intarith(as, ir, XOg_AND)
#define asm_bor(as, ir)     asm_intarith(as, ir, XOg_OR)
#define asm_bxor(as, ir)    asm_intarith(as, ir, XOg_XOR)

static void asm_bitshift(ASMState *as, IRIns *ir, x86Shift xs, x86Op xv)
{
  IRRef rref = ir->op2;
  IRIns *irr = IR(rref);
  Reg dest;
  if (irref_isk(rref)) {  /* Constant shifts. */
    int shift;
    dest = ra_dest(as, ir, RSET_GPR);
    shift = irr->i & (irt_is64(ir->t) ? 63 : 31);
    if (!xv && shift && (as->flags & JIT_F_BMI2)) {
      Reg left = asm_fuseloadm(as, ir->op1, RSET_GPR, irt_is64(ir->t));
      if (left != dest) {  /* BMI2 rotate right by constant. */
        emit_i8(as, xs == XOg_ROL ? -shift : shift);
        emit_mrm(as, VEX_64IR(ir, XV_RORX), dest, left);
        return;
      }
    }
    switch (shift) {
    case 0: break;
    case 1: emit_rr(as, XO_SHIFT1, REX_64IR(ir, xs), dest); break;
    default: emit_shifti(as, REX_64IR(ir, xs), dest, shift); break;
    }
  } else if ((as->flags & JIT_F_BMI2) && xv) {  /* BMI2 variable shifts. */
    Reg left, right;
    dest = ra_dest(as, ir, RSET_GPR);
    right = ra_alloc1(as, rref, RSET_GPR);
    left = asm_fuseloadm(as, ir->op1, rset_exclude(RSET_GPR, right),
                         irt_is64(ir->t));
    emit_mrm(as, VEX_64IR(ir, xv) ^ (right << 19), dest, left);
    return;
  } else {  /* Variable shifts implicitly use register cl (i.e. ecx). */
    Reg right;
    dest = ra_dest(as, ir, rset_exclude(RSET_GPR, RID_ECX));
    if (dest == RID_ECX) {
      dest = ra_scratch(as, rset_exclude(RSET_GPR, RID_ECX));
      emit_rr(as, XO_MOV, REX_64IR(ir, RID_ECX), dest);
    }
    right = IR(rref)->r;
    if (ra_noreg(right))
      right = ra_allocref(as, rref, RID2RSET(RID_ECX));
    else if (right != RID_ECX)
      ra_scratch(as, RID2RSET(RID_ECX));
    emit_rr(as, XO_SHIFTcl, REX_64IR(ir, xs), dest);
    ra_noweak(as, right);
    if (right != RID_ECX)
      emit_rr(as, XO_MOV, RID_ECX, right);
  }
  ra_left(as, dest, ir->op1);
  /*
  ** Note: avoid using the flags resulting from a shift or rotate!
  ** All of them cause a partial flag stall, except for r,1 shifts
  ** (but not rotates). And a shift count of 0 leaves the flags unmodified.
  */
}

#define asm_bshl(as, ir)    asm_bitshift(as, ir, XOg_SHL, XV_SHLX)
#define asm_bshr(as, ir)    asm_bitshift(as, ir, XOg_SHR, XV_SHRX)
#define asm_bsar(as, ir)    asm_bitshift(as, ir, XOg_SAR, XV_SARX)
#define asm_brol(as, ir)    asm_bitshift(as, ir, XOg_ROL, 0)
#define asm_bror(as, ir)    asm_bitshift(as, ir, XOg_ROR, 0)

/* -- Comparisons --------------------------------------------------------- */

/* Virtual flags for unordered FP comparisons. */
#define VCC_U   0x1000    /* Unordered. */
#define VCC_P   0x2000    /* Needs extra CC_P branch. */
#define VCC_S   0x4000    /* Swap avoids CC_P branch. */
#define VCC_PS  (VCC_P|VCC_S)

/* Map of comparisons to flags. ORDER IR. */
#define COMPFLAGS(ci, cin, cu, cf)  ((ci)+((cu)<<4)+((cin)<<8)+(cf))
static const uint16_t asm_compmap[IR_ABC+1] = {
  /*                 signed non-eq unsigned flags */
  /* LT  */ COMPFLAGS(CC_GE, CC_G,  CC_AE, VCC_PS),
  /* GE  */ COMPFLAGS(CC_L,  CC_L,  CC_B,  0),
  /* LE  */ COMPFLAGS(CC_G,  CC_G,  CC_A,  VCC_PS),
  /* GT  */ COMPFLAGS(CC_LE, CC_L,  CC_BE, 0),
  /* ULT */ COMPFLAGS(CC_AE, CC_A,  CC_AE, VCC_U),
  /* UGE */ COMPFLAGS(CC_B,  CC_B,  CC_B,  VCC_U|VCC_PS),
  /* ULE */ COMPFLAGS(CC_A,  CC_A,  CC_A,  VCC_U),
  /* UGT */ COMPFLAGS(CC_BE, CC_B,  CC_BE, VCC_U|VCC_PS),
  /* EQ  */ COMPFLAGS(CC_NE, CC_NE, CC_NE, VCC_P),
  /* NE  */ COMPFLAGS(CC_E,  CC_E,  CC_E,  VCC_U|VCC_P),
  /* ABC */ COMPFLAGS(CC_BE, CC_B,  CC_BE, VCC_U|VCC_PS)  /* Same as UGT. */
};

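/* Each entry packs four condition codes: bits 0-3 hold the (negated) signed
** condition used for the guard, bits 4-7 the unsigned/FP variant, bits 8-11
** the variant without the equality part (used for 64 bit hiword compares on
** x86), and bits 12+ the VCC_* flags controlling the extra parity branch.
*/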
/* FP and integer comparisons. */
static void asm_comp(ASMState *as, IRIns *ir)
{
  uint32_t cc = asm_compmap[ir->o];
  if (irt_isnum(ir->t)) {
    IRRef lref = ir->op1;
    IRRef rref = ir->op2;
    Reg left, right;
    MCLabel l_around;
    /*
    ** An extra CC_P branch is required to preserve ordered/unordered
    ** semantics for FP comparisons. This can be avoided by swapping
    ** the operands and inverting the condition (except for EQ and UNE).
    ** So always try to swap if possible.
    **
    ** Another option would be to swap operands to achieve better memory
    ** operand fusion. But it's unlikely that this outweighs the cost
    ** of the extra branches.
    */
    if (cc & VCC_S) {  /* Swap? */
      IRRef tmp = lref; lref = rref; rref = tmp;
      cc ^= (VCC_PS|(5<<4));  /* A <-> B, AE <-> BE, PS <-> none */
    }
    left = ra_alloc1(as, lref, RSET_FPR);
    l_around = emit_label(as);
    asm_guardcc(as, cc >> 4);
    if (cc & VCC_P) {  /* Extra CC_P branch required? */
      if (!(cc & VCC_U)) {
        asm_guardcc(as, CC_P);  /* Branch to exit for ordered comparisons. */
      } else if (l_around != as->invmcp) {
        emit_sjcc(as, CC_P, l_around);  /* Branch around for unordered. */
      } else {
        /* Patched to mcloop by asm_loop_fixup. */
        as->loopinv = 2;
        if (as->realign)
          emit_sjcc(as, CC_P, as->mcp);
        else
          emit_jcc(as, CC_P, as->mcp);
      }
    }
    right = asm_fuseload(as, rref, rset_exclude(RSET_FPR, left));
    emit_mrm(as, XO_UCOMISD, left, right);
  } else {
    IRRef lref = ir->op1, rref = ir->op2;
    IROp leftop = (IROp)(IR(lref)->o);
    Reg r64 = REX_64IR(ir, 0);
    int32_t imm = 0;
    lj_assertA(irt_is64(ir->t) || irt_isint(ir->t) ||
               irt_isu32(ir->t) || irt_isaddr(ir->t) || irt_isu8(ir->t),
               "bad comparison data type %d", irt_type(ir->t));
    /* Swap constants (only for ABC) and fusable loads to the right. */
    if (irref_isk(lref) || (!irref_isk(rref) && opisfusableload(leftop))) {
      if ((cc & 0xc) == 0xc) cc ^= 0x53;  /* L <-> G, LE <-> GE */
      else if ((cc & 0xa) == 0x2) cc ^= 0x55;  /* A <-> B, AE <-> BE */
      lref = ir->op2; rref = ir->op1;
    }
    if (asm_isk32(as, rref, &imm)) {
      IRIns *irl = IR(lref);
      /* Check whether we can use test ins. Not for unsigned, since CF=0. */
      int usetest = (imm == 0 && (cc & 0xa) != 0x2);
      if (usetest && irl->o == IR_BAND && irl+1 == ir && !ra_used(irl)) {
        /* Combine comp(BAND(ref, r/imm), 0) into test mrm, r/imm. */
        Reg right, left = RID_NONE;
        RegSet allow = RSET_GPR;
        if (!asm_isk32(as, irl->op2, &imm)) {
          left = ra_alloc1(as, irl->op2, allow);
          rset_clear(allow, left);
        } else {  /* Try to Fuse IRT_I8/IRT_U8 loads, too. See below. */
          IRIns *irll = IR(irl->op1);
          if (opisfusableload((IROp)irll->o) &&
              (irt_isi8(irll->t) || irt_isu8(irll->t))) {
            IRType1 origt = irll->t;  /* Temporarily flip types. */
            irll->t.irt = (irll->t.irt & ~IRT_TYPE) | IRT_INT;
            as->curins--;  /* Skip to BAND to avoid failing in noconflict(). */
            right = asm_fuseload(as, irl->op1, RSET_GPR);
            as->curins++;
            irll->t = origt;
            if (right != RID_MRM) goto test_nofuse;
            /* Fusion succeeded, emit test byte mrm, imm8. */
            asm_guardcc(as, cc);
            emit_i8(as, (imm & 0xff));
            emit_mrm(as, XO_GROUP3b, XOg_TEST, RID_MRM);
            return;
          }
        }
        as->curins--;  /* Skip to BAND to avoid failing in noconflict(). */
        right = asm_fuseloadm(as, irl->op1, allow, r64);
        as->curins++;  /* Undo the above. */
      test_nofuse:
        asm_guardcc(as, cc);
        if (ra_noreg(left)) {
          emit_i32(as, imm);
          emit_mrm(as, XO_GROUP3, r64 + XOg_TEST, right);
        } else {
          emit_mrm(as, XO_TEST, r64 + left, right);
        }
      } else {
        Reg left;
        if (opisfusableload((IROp)irl->o) &&
            ((irt_isu8(irl->t) && checku8(imm)) ||
             ((irt_isi8(irl->t) || irt_isi16(irl->t)) && checki8(imm)) ||
             (irt_isu16(irl->t) && checku16(imm) && checki8((int16_t)imm)))) {
          /* Only the IRT_INT case is fused by asm_fuseload.
          ** The IRT_I8/IRT_U8 loads and some IRT_I16/IRT_U16 loads
          ** are handled here.
          ** Note that cmp word [mem], imm16 should not be generated,
          ** since it has a length-changing prefix. Compares of a word
          ** against a sign-extended imm8 are ok, however.
          */
          IRType1 origt = irl->t;  /* Temporarily flip types. */
          irl->t.irt = (irl->t.irt & ~IRT_TYPE) | IRT_INT;
          left = asm_fuseload(as, lref, RSET_GPR);
          irl->t = origt;
          if (left == RID_MRM) {  /* Fusion succeeded? */
            if (irt_isu8(irl->t) || irt_isu16(irl->t))
              cc >>= 4;  /* Need unsigned compare. */
            asm_guardcc(as, cc);
            emit_i8(as, imm);
            emit_mrm(as, (irt_isi8(origt) || irt_isu8(origt)) ?
                     XO_ARITHib : XO_ARITHiw8, r64 + XOg_CMP, RID_MRM);
            return;
          }  /* Otherwise handle register case as usual. */
        }
        left = asm_fuseloadm(as, lref,
                             irt_isu8(ir->t) ? RSET_GPR8 : RSET_GPR, r64);
        asm_guardcc(as, cc);
        if (usetest && left != RID_MRM) {
          /* Use test r,r instead of cmp r,0. */
          x86Op xo = XO_TEST;
          if (irt_isu8(ir->t)) {
            lj_assertA(ir->o == IR_EQ || ir->o == IR_NE, "bad usage");
            xo = XO_TESTb;
            if (!rset_test(RSET_RANGE(RID_EAX, RID_EBX+1), left)) {
              if (LJ_64) {
                left |= FORCE_REX;
              } else {
                emit_i32(as, 0xff);
                emit_mrm(as, XO_GROUP3, XOg_TEST, left);
                return;
              }
            }
          }
          emit_rr(as, xo, r64 + left, left);
          if (irl+1 == ir)  /* Referencing previous ins? */
            as->flagmcp = as->mcp;  /* Set flag to drop test r,r if possible. */
        } else {
          emit_gmrmi(as, XG_ARITHi(XOg_CMP), r64 + left, imm);
        }
      }
    } else {
      Reg left = ra_alloc1(as, lref, RSET_GPR);
      Reg right = asm_fuseloadm(as, rref, rset_exclude(RSET_GPR, left), r64);
      asm_guardcc(as, cc);
      emit_mrm(as, XO_CMP, r64 + left, right);
    }
  }
}

#define asm_equal(as, ir)   asm_comp(as, ir)

#if LJ_32 && LJ_HASFFI
/* 64 bit integer comparisons in 32 bit mode. */
static void asm_comp_int64(ASMState *as, IRIns *ir)
{
  uint32_t cc = asm_compmap[(ir-1)->o];
  RegSet allow = RSET_GPR;
  Reg lefthi = RID_NONE, leftlo = RID_NONE;
  Reg righthi = RID_NONE, rightlo = RID_NONE;
  MCLabel l_around;
  x86ModRM mrm;

  as->curins--;  /* Skip loword ins. Avoids failing in noconflict(), too. */

  /* Allocate/fuse hiword operands. */
  if (irref_isk(ir->op2)) {
    lefthi = asm_fuseload(as, ir->op1, allow);
  } else {
    lefthi = ra_alloc1(as, ir->op1, allow);
    rset_clear(allow, lefthi);
    righthi = asm_fuseload(as, ir->op2, allow);
    if (righthi == RID_MRM) {
      if (as->mrm.base != RID_NONE) rset_clear(allow, as->mrm.base);
      if (as->mrm.idx != RID_NONE) rset_clear(allow, as->mrm.idx);
    } else {
      rset_clear(allow, righthi);
    }
  }
  mrm = as->mrm;  /* Save state for hiword instruction. */

  /* Allocate/fuse loword operands. */
  if (irref_isk((ir-1)->op2)) {
    leftlo = asm_fuseload(as, (ir-1)->op1, allow);
  } else {
    leftlo = ra_alloc1(as, (ir-1)->op1, allow);
    rset_clear(allow, leftlo);
    rightlo = asm_fuseload(as, (ir-1)->op2, allow);
  }

  /* All register allocations must be performed _before_ this point. */
  l_around = emit_label(as);
  as->invmcp = as->flagmcp = NULL;  /* Cannot use these optimizations. */

  /* Loword comparison and branch. */
  asm_guardcc(as, cc >> 4);  /* Always use unsigned compare for loword. */
  if (ra_noreg(rightlo)) {
    int32_t imm = IR((ir-1)->op2)->i;
    if (imm == 0 && ((cc >> 4) & 0xa) != 0x2 && leftlo != RID_MRM)
      emit_rr(as, XO_TEST, leftlo, leftlo);
    else
      emit_gmrmi(as, XG_ARITHi(XOg_CMP), leftlo, imm);
  } else {
    emit_mrm(as, XO_CMP, leftlo, rightlo);
  }

  /* Hiword comparison and branches. */
  if ((cc & 15) != CC_NE)
    emit_sjcc(as, CC_NE, l_around);  /* Hiword unequal: skip loword compare. */
  if ((cc & 15) != CC_E)
    asm_guardcc(as, cc >> 8);  /* Hiword compare without equality check. */
  as->mrm = mrm;  /* Restore state. */
  if (ra_noreg(righthi)) {
    int32_t imm = IR(ir->op2)->i;
    if (imm == 0 && (cc & 0xa) != 0x2 && lefthi != RID_MRM)
      emit_rr(as, XO_TEST, lefthi, lefthi);
    else
      emit_gmrmi(as, XG_ARITHi(XOg_CMP), lefthi, imm);
  } else {
    emit_mrm(as, XO_CMP, lefthi, righthi);
  }
}
#endif

/* -- Split register ops -------------------------------------------------- */

/* Hiword op of a split 32/32 or 64/64 bit op. Previous op is the loword op. */
static void asm_hiop(ASMState *as, IRIns *ir)
{
  /* HIOP is marked as a store because it needs its own DCE logic. */
  int uselo = ra_used(ir-1), usehi = ra_used(ir);  /* Loword/hiword used? */
  if (LJ_UNLIKELY(!(as->flags & JIT_F_OPT_DCE))) uselo = usehi = 1;
#if LJ_32 && LJ_HASFFI
  if ((ir-1)->o == IR_CONV) {  /* Conversions to/from 64 bit. */
    as->curins--;  /* Always skip the CONV. */
    if (usehi || uselo)
      asm_conv64(as, ir);
    return;
  } else if ((ir-1)->o <= IR_NE) {  /* 64 bit integer comparisons. ORDER IR. */
    asm_comp_int64(as, ir);
    return;
  } else if ((ir-1)->o == IR_XSTORE) {
    if ((ir-1)->r != RID_SINK)
      asm_fxstore(as, ir);
    return;
  }
#endif
  if (!usehi) return;  /* Skip unused hiword op for all remaining ops. */
  switch ((ir-1)->o) {
#if LJ_32 && LJ_HASFFI
  case IR_ADD:
    as->flagmcp = NULL;
    as->curins--;
    asm_intarith(as, ir, XOg_ADC);
    asm_intarith(as, ir-1, XOg_ADD);
    break;
  case IR_SUB:
    as->flagmcp = NULL;
    as->curins--;
    asm_intarith(as, ir, XOg_SBB);
    asm_intarith(as, ir-1, XOg_SUB);
    break;
  case IR_NEG: {
    Reg dest = ra_dest(as, ir, RSET_GPR);
    emit_rr(as, XO_GROUP3, XOg_NEG, dest);
    emit_i8(as, 0);
    emit_rr(as, XO_ARITHi8, XOg_ADC, dest);
    ra_left(as, dest, ir->op1);
    as->curins--;
    asm_neg_not(as, ir-1, XOg_NEG);
    break;
    }
  case IR_CNEWI:
    /* Nothing to do here. Handled by CNEWI itself. */
    break;
#endif
  case IR_CALLN: case IR_CALLL: case IR_CALLS: case IR_CALLXS:
    if (!uselo)
      ra_allocref(as, ir->op1, RID2RSET(RID_RETLO));  /* Mark lo op as used. */
    break;
  default: lj_assertA(0, "bad HIOP for op %d", (ir-1)->o); break;
  }
}

/* -- Profiling ----------------------------------------------------------- */

static void asm_prof(ASMState *as, IRIns *ir)
{
  UNUSED(ir);
  asm_guardcc(as, CC_NE);
  emit_i8(as, HOOK_PROFILE);
  emit_rma(as, XO_GROUP3b, XOg_TEST, &J2G(as->J)->hookmask);
}

/* -- Stack handling ------------------------------------------------------ */

/* Check Lua stack size for overflow. Use exit handler as fallback. */
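/* The check computes L->maxstack - BASE and compares it against 8*topslot.
** A scratch register from 'allow' is used if one is free; otherwise eax is
** spilled to [esp] and restored around the check.
*/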
static void asm_stack_check(ASMState *as, BCReg topslot,
                            IRIns *irp, RegSet allow, ExitNo exitno)
{
  /* Try to get an unused temp. register, otherwise spill/restore eax. */
  Reg pbase = irp ? irp->r : RID_BASE;
  Reg r = allow ? rset_pickbot(allow) : RID_EAX;
  emit_jcc(as, CC_B, exitstub_addr(as->J, exitno));
  if (allow == RSET_EMPTY)  /* Restore temp. register. */
    emit_rmro(as, XO_MOV, r|REX_64, RID_ESP, 0);
  else
    ra_modified(as, r);
  emit_gri(as, XG_ARITHi(XOg_CMP), r|REX_GC64, (int32_t)(8*topslot));
  if (ra_hasreg(pbase) && pbase != r)
    emit_rr(as, XO_ARITH(XOg_SUB), r|REX_GC64, pbase);
  else
#if LJ_GC64
    emit_rmro(as, XO_ARITH(XOg_SUB), r|REX_64, RID_DISPATCH,
              (int32_t)dispofs(as, &J2G(as->J)->jit_base));
#else
    emit_rmro(as, XO_ARITH(XOg_SUB), r, RID_NONE,
              ptr2addr(&J2G(as->J)->jit_base));
#endif
  emit_rmro(as, XO_MOV, r|REX_GC64, r, offsetof(lua_State, maxstack));
  emit_getgl(as, r, cur_L);
  if (allow == RSET_EMPTY)  /* Spill temp. register. */
    emit_rmro(as, XO_MOVto, r|REX_64, RID_ESP, 0);
}

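/* The snapshot restore below writes each modified slot back as a TValue:
** FP numbers with a single 8 byte store, everything else as a 32 bit value
** plus a 32 bit type tag at [BASE+ofs+4] (or a merged 64 bit tagged value
** for LJ_GC64 constants).
*/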
/* Restore Lua stack from on-trace state. */
static void asm_stack_restore(ASMState *as, SnapShot *snap)
{
  SnapEntry *map = &as->T->snapmap[snap->mapofs];
#if !LJ_FR2 || defined(LUA_USE_ASSERT)
  SnapEntry *flinks = &as->T->snapmap[snap_nextofs(as->T, snap)-1-LJ_FR2];
#endif
  MSize n, nent = snap->nent;
  /* Store the value of all modified slots to the Lua stack. */
  for (n = 0; n < nent; n++) {
    SnapEntry sn = map[n];
    BCReg s = snap_slot(sn);
    int32_t ofs = 8*((int32_t)s-1-LJ_FR2);
    IRRef ref = snap_ref(sn);
    IRIns *ir = IR(ref);
    if ((sn & SNAP_NORESTORE))
      continue;
    if ((sn & SNAP_KEYINDEX)) {
      emit_movmroi(as, RID_BASE, ofs+4, LJ_KEYINDEX);
      if (irref_isk(ref)) {
        emit_movmroi(as, RID_BASE, ofs, ir->i);
      } else {
        Reg src = ra_alloc1(as, ref, rset_exclude(RSET_GPR, RID_BASE));
        emit_movtomro(as, src, RID_BASE, ofs);
      }
    } else if (irt_isnum(ir->t)) {
      Reg src = ra_alloc1(as, ref, RSET_FPR);
      emit_rmro(as, XO_MOVSDto, src, RID_BASE, ofs);
    } else {
      lj_assertA(irt_ispri(ir->t) || irt_isaddr(ir->t) ||
                 (LJ_DUALNUM && irt_isinteger(ir->t)),
                 "restore of IR type %d", irt_type(ir->t));
      if (!irref_isk(ref)) {
        Reg src = ra_alloc1(as, ref, rset_exclude(RSET_GPR, RID_BASE));
#if LJ_GC64
        if (irt_is64(ir->t)) {
          /* TODO: 64 bit store + 32 bit load-modify-store is suboptimal. */
          emit_u32(as, irt_toitype(ir->t) << 15);
          emit_rmro(as, XO_ARITHi, XOg_OR, RID_BASE, ofs+4);
        } else if (LJ_DUALNUM && irt_isinteger(ir->t)) {
          emit_movmroi(as, RID_BASE, ofs+4, LJ_TISNUM << 15);
        } else {
          emit_movmroi(as, RID_BASE, ofs+4, (irt_toitype(ir->t)<<15)|0x7fff);
        }
#endif
        emit_movtomro(as, REX_64IR(ir, src), RID_BASE, ofs);
#if LJ_GC64
      } else {
        TValue k;
        lj_ir_kvalue(as->J->L, &k, ir);
        if (tvisnil(&k)) {
          emit_i32(as, -1);
          emit_rmro(as, XO_MOVmi, REX_64, RID_BASE, ofs);
        } else {
          emit_movmroi(as, RID_BASE, ofs+4, k.u32.hi);
          emit_movmroi(as, RID_BASE, ofs, k.u32.lo);
        }
#else
      } else if (!irt_ispri(ir->t)) {
        emit_movmroi(as, RID_BASE, ofs, ir->i);
#endif
      }
      if ((sn & (SNAP_CONT|SNAP_FRAME))) {
#if !LJ_FR2
        if (s != 0)  /* Do not overwrite link to previous frame. */
          emit_movmroi(as, RID_BASE, ofs+4, (int32_t)(*flinks--));
#endif
#if !LJ_GC64
      } else {
        if (!(LJ_64 && irt_islightud(ir->t)))
          emit_movmroi(as, RID_BASE, ofs+4, irt_toitype(ir->t));
#endif
      }
    }
  }
  lj_assertA(map + nent == flinks, "inconsistent frames in snapshot");
}

/* -- GC handling --------------------------------------------------------- */

/* Check GC threshold and do one or more GC steps. */
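/* The fast path only loads g->gc.total and compares it against
** g->gc.threshold; the call to lj_gc_step_jit and the trace exit (taken
** when the GC is in the atomic or finalize phase) sit behind a forward
** branch that skips them in the common case.
*/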
static void asm_gc_check(ASMState *as)
{
  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_gc_step_jit];
  IRRef args[2];
  MCLabel l_end;
  Reg tmp;
  ra_evictset(as, RSET_SCRATCH);
  l_end = emit_label(as);
  /* Exit trace if in GCSatomic or GCSfinalize. Avoids syncing GC objects. */
  asm_guardcc(as, CC_NE);  /* Assumes asm_snap_prep() already done. */
  emit_rr(as, XO_TEST, RID_RET, RID_RET);
  args[0] = ASMREF_TMP1;  /* global_State *g */
  args[1] = ASMREF_TMP2;  /* MSize steps     */
  asm_gencall(as, ci, args);
  tmp = ra_releasetmp(as, ASMREF_TMP1);
#if LJ_GC64
  emit_rmro(as, XO_LEA, tmp|REX_64, RID_DISPATCH, GG_DISP2G);
#else
  emit_loada(as, tmp, J2G(as->J));
#endif
  emit_loadi(as, ra_releasetmp(as, ASMREF_TMP2), as->gcsteps);
  /* Jump around GC step if GC total < GC threshold. */
  emit_sjcc(as, CC_B, l_end);
  emit_opgl(as, XO_ARITH(XOg_CMP), tmp|REX_GC64, gc.threshold);
  emit_getgl(as, tmp, gc.total);
  as->gcsteps = 0;
  checkmclim(as);
}

/* -- Loop handling ------------------------------------------------------- */

/* Fixup the loop branch. */
static void asm_loop_fixup(ASMState *as)
{
  MCode *p = as->mctop;
  MCode *target = as->mcp;
  if (as->realign) {  /* Realigned loops use short jumps. */
    as->realign = NULL;  /* Stop another retry. */
    lj_assertA(((intptr_t)target & 15) == 0, "loop realign failed");
    if (as->loopinv) {  /* Inverted loop branch? */
      /* asm_guardcc already inverted the jcc and patched the jmp. */
      p -= 5;
      lj_assertA(target - p >= -128, "loop realign failed");
      p[-1] = (MCode)(target - p);  /* Patch sjcc. */
      if (as->loopinv == 2)
        p[-3] = (MCode)(target - p + 2);  /* Patch opt. short jp. */
    } else {
      lj_assertA(target - p >= -128, "loop realign failed");
      p[-1] = (MCode)(int8_t)(target - p);  /* Patch short jmp. */
      p[-2] = XI_JMPs;
    }
  } else {
    MCode *newloop;
    p[-5] = XI_JMP;
    if (as->loopinv) {  /* Inverted loop branch? */
      /* asm_guardcc already inverted the jcc and patched the jmp. */
      p -= 5;
      *(int32_t *)(p-4) = (int32_t)(target - p);  /* Patch jcc. */
      if (as->loopinv == 2) {
        *(int32_t *)(p-10) = (int32_t)(target - p + 6);  /* Patch opt. jp. */
      }
    } else {  /* Otherwise just patch jmp. */
      *(int32_t *)(p-4) = (int32_t)(target - p);
      newloop = p-5;
    }
    /* Realign small loops and shorten the loop branch. */
    if (newloop >= p - 128) {
      as->realign = newloop;  /* Force a retry and remember alignment. */
      as->curins = as->stopins;  /* Abort asm_trace now. */
      as->T->nins = as->orignins;  /* Remove any added renames. */
    }
  }
}

/* Fixup the tail of the loop. */
static void asm_loop_tail_fixup(ASMState *as)
{
  UNUSED(as);  /* Nothing to do. */
}

/* -- Head of trace ------------------------------------------------------- */

/* Coalesce BASE register for a root trace. */
static void asm_head_root_base(ASMState *as)
{
  IRIns *ir = IR(REF_BASE);
  Reg r = ir->r;
  if (ra_hasreg(r)) {
    ra_free(as, r);
    if (rset_test(as->modset, r) || irt_ismarked(ir->t))
      ir->r = RID_INIT;  /* No inheritance for modified BASE register. */
    if (r != RID_BASE)
      emit_rr(as, XO_MOV, r|REX_GC64, RID_BASE);
  }
}

/* Coalesce or reload BASE register for a side trace. */
static Reg asm_head_side_base(ASMState *as, IRIns *irp)
{
  IRIns *ir = IR(REF_BASE);
  Reg r = ir->r;
  if (ra_hasreg(r)) {
    ra_free(as, r);
    if (rset_test(as->modset, r) || irt_ismarked(ir->t))
      ir->r = RID_INIT;  /* No inheritance for modified BASE register. */
    if (irp->r == r) {
      return r;  /* Same BASE register already coalesced. */
    } else if (ra_hasreg(irp->r) && rset_test(as->freeset, irp->r)) {
      /* Move from coalesced parent reg. */
      emit_rr(as, XO_MOV, r|REX_GC64, irp->r);
      return irp->r;
    } else {
      emit_getgl(as, r, jit_base);  /* Otherwise reload BASE. */
    }
  }
  return RID_NONE;
}

/* -- Tail of trace ------------------------------------------------------- */

/* Fixup the tail code. */
static void asm_tail_fixup(ASMState *as, TraceNo lnk)
{
  /* Note: don't use as->mcp swap + emit_*: emit_op overwrites more bytes. */
  MCode *p = as->mctop;
  MCode *target, *q;
  int32_t spadj = as->T->spadjust;
  if (spadj == 0) {
    p -= LJ_64 ? 7 : 6;
  } else {
    MCode *p1;
    /* Patch stack adjustment. */
    if (checki8(spadj)) {
      p -= 3;
      p1 = p-6;
      *p1 = (MCode)spadj;
    } else {
      p1 = p-9;
      *(int32_t *)p1 = spadj;
    }
    if (LJ_64) {
      p1[-3] = 0x48;  /* REX.W prefix for rsp. */
    }
    p1[-2] = (MCode)(checki8(spadj) ? XI_ARITHi8 : XI_ARITHi);
    p1[-1] = MODRM(XM_REG, XOg_ADD, RID_ESP);
  }
  /* Patch exit branch. */
  target = lnk ? traceref(as->J, lnk)->mcode : (MCode *)lj_vm_exit_interp;
  *(int32_t *)(p-4) = jmprel(as->J, p, target);
  p[-5] = XI_JMP;
  /* Drop unused mcode tail. Fill with NOPs to make the prefetcher happy. */
  for (q = as->mctop-1; q >= p; q--)
    *q = XI_NOP;
  as->mctop = p;
}

/* Prepare tail of code. */
static void asm_tail_prep(ASMState *as)
{
  MCode *p = as->mctop;
  /* Realign and leave room for backwards loop branch or exit branch. */
  if (as->realign) {
    int i = ((int)(intptr_t)as->realign) & 15;
    /* Fill unused mcode tail with NOPs to make the prefetcher happy. */
    while (i-- > 0)
      *--p = XI_NOP;
    as->mctop = p;
    p -= (as->loopinv ? 5 : 2);  /* Space for short/near jmp. */
  } else {
    p -= 5;  /* Space for exit branch (near jmp). */
  }
  if (as->loopref) {
    as->invmcp = as->mcp = p;
  } else {
    /* Leave room for ESP adjustment: add esp, imm or lea esp, [esp+imm] */
    as->mcp = p - (LJ_64 ? 7 : 6);
    as->invmcp = NULL;
  }
}

/* -- Trace setup --------------------------------------------------------- */

/* Ensure there are enough stack slots for call arguments. */
static Reg asm_setup_call_slots(ASMState *as, IRIns *ir, const CCallInfo *ci)
{
  IRRef args[CCI_NARGS_MAX*2];
  int nslots;
  asm_collectargs(as, ir, ci, args);
  nslots = asm_count_call_slots(as, ci, args);
  if (nslots > as->evenspill)  /* Leave room for args in stack slots. */
    as->evenspill = nslots;
#if LJ_64
  return irt_isfp(ir->t) ? REGSP_HINT(RID_FPRET) : REGSP_HINT(RID_RET);
#else
  return irt_isfp(ir->t) ? REGSP_INIT : REGSP_HINT(RID_RET);
#endif
}

/* Target-specific setup. */
static void asm_setup_target(ASMState *as)
{
  asm_exitstub_setup(as, as->T->nsnap);
}

/* -- Trace patching ------------------------------------------------------ */

static const uint8_t map_op1[256] = {
  0x92,0x92,0x92,0x92,0x52,0x45,0x51,0x51,0x92,0x92,0x92,0x92,0x52,0x45,0x51,0x20,
  0x92,0x92,0x92,0x92,0x52,0x45,0x51,0x51,0x92,0x92,0x92,0x92,0x52,0x45,0x51,0x51,
  0x92,0x92,0x92,0x92,0x52,0x45,0x10,0x51,0x92,0x92,0x92,0x92,0x52,0x45,0x10,0x51,
  0x92,0x92,0x92,0x92,0x52,0x45,0x10,0x51,0x92,0x92,0x92,0x92,0x52,0x45,0x10,0x51,
#if LJ_64
  0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x14,0x14,0x14,0x14,0x14,0x14,0x14,0x14,
#else
  0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,
#endif
  0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,
  0x51,0x51,0x92,0x92,0x10,0x10,0x12,0x11,0x45,0x86,0x52,0x93,0x51,0x51,0x51,0x51,
  0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,
  0x93,0x86,0x93,0x93,0x92,0x92,0x92,0x92,0x92,0x92,0x92,0x92,0x92,0x92,0x92,0x92,
  0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x47,0x51,0x51,0x51,0x51,0x51,
#if LJ_64
  0x59,0x59,0x59,0x59,0x51,0x51,0x51,0x51,0x52,0x45,0x51,0x51,0x51,0x51,0x51,0x51,
#else
  0x55,0x55,0x55,0x55,0x51,0x51,0x51,0x51,0x52,0x45,0x51,0x51,0x51,0x51,0x51,0x51,
#endif
  0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x05,0x05,0x05,0x05,0x05,0x05,0x05,0x05,
  0x93,0x93,0x53,0x51,0x70,0x71,0x93,0x86,0x54,0x51,0x53,0x51,0x51,0x52,0x51,0x51,
  0x92,0x92,0x92,0x92,0x52,0x52,0x51,0x51,0x92,0x92,0x92,0x92,0x92,0x92,0x92,0x92,
  0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x45,0x45,0x47,0x52,0x51,0x51,0x51,0x51,
  0x10,0x51,0x10,0x10,0x51,0x51,0x63,0x66,0x51,0x51,0x51,0x51,0x51,0x51,0x92,0x92
};

static const uint8_t map_op2[256] = {
  0x93,0x93,0x93,0x93,0x52,0x52,0x52,0x52,0x52,0x52,0x51,0x52,0x51,0x93,0x52,0x94,
  0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,
  0x53,0x53,0x53,0x53,0x53,0x53,0x53,0x53,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,
  0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x34,0x51,0x35,0x51,0x51,0x51,0x51,0x51,
  0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,
  0x53,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,
  0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,
  0x94,0x54,0x54,0x54,0x93,0x93,0x93,0x52,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,
  0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46,
  0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,
  0x52,0x52,0x52,0x93,0x94,0x93,0x51,0x51,0x52,0x52,0x52,0x93,0x94,0x93,0x93,0x93,
  0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x94,0x93,0x93,0x93,0x93,0x93,
  0x93,0x93,0x94,0x93,0x94,0x94,0x94,0x93,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,
  0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,
  0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,
  0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x52
};

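/* asm_x86_inslen() below is a minimal x86/x64 instruction length decoder.
** The two tables classify each opcode byte: the high nibble of an entry
** selects the decoding action (done, prefix, 0F escape, group 3, VEX,
** ModRM follows, ...) and the low bits contribute to the length.
*/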
static uint32_t asm_x86_inslen(const uint8_t* p)
{
  uint32_t result = 0;
  uint32_t prefixes = 0;
  uint32_t x = map_op1[*p];
  for (;;) {
    switch (x >> 4) {
    case 0: return result + x + (prefixes & 4);
    case 1: prefixes |= x; x = map_op1[*++p]; result++; break;
    case 2: x = map_op2[*++p]; break;
    case 3: p++; goto mrm;
    case 4: result -= (prefixes & 2);  /* fallthrough */
    case 5: return result + (x & 15);
    case 6:  /* Group 3. */
      if (p[1] & 0x38) x = 2;
      else if ((prefixes & 2) && (x == 0x66)) x = 4;
      goto mrm;
    case 7: /* VEX c4/c5. */
      if (LJ_32 && p[1] < 0xc0) {
        x = map_op1[*++p];
        goto mrm;
      }
      if (x == 0x70) {
        x = *++p & 0x1f;
        result++;
        if (x >= 2) {
          p += 2;
          result += 2;
          goto mrm;
        }
      }
      p++;
      result++;
      x = map_op2[*++p];
      break;
    case 8: result -= (prefixes & 2);  /* fallthrough */
    case 9: mrm:  /* ModR/M and possibly SIB. */
      result += (x & 15);
      x = *++p;
      switch (x >> 6) {
      case 0: if ((x & 7) == 5) return result + 4; break;
      case 1: result++; break;
      case 2: result += 4; break;
      case 3: return result;
      }
      if ((x & 7) == 4) {
        result++;
        if (x < 0x40 && (p[1] & 7) == 5) result += 4;
      }
      return result;
    }
  }
}

/* Patch exit jumps of existing machine code to a new target. */
void lj_asm_patchexit(jit_State *J, GCtrace *T, ExitNo exitno, MCode *target)
{
  MCode *p = T->mcode;
  MCode *mcarea = lj_mcode_patch(J, p, 0);
  MSize len = T->szmcode;
  MCode *px = exitstub_addr(J, exitno) - 6;
  MCode *pe = p+len-6;
  MCode *pgc = NULL;
#if LJ_GC64
  uint32_t statei = (uint32_t)(GG_OFS(g.vmstate) - GG_OFS(dispatch));
#else
  uint32_t statei = u32ptr(&J2G(J)->vmstate);
#endif
  if (len > 5 && p[len-5] == XI_JMP && p+len-6 + *(int32_t *)(p+len-4) == px)
    *(int32_t *)(p+len-4) = jmprel(J, p+len, target);
  /* Do not patch parent exit for a stack check. Skip beyond vmstate update. */
  for (; p < pe; p += asm_x86_inslen(p)) {
    intptr_t ofs = LJ_GC64 ? (p[0] & 0xf0) == 0x40 : LJ_64;
    if (*(uint32_t *)(p+2+ofs) == statei && p[ofs+LJ_GC64-LJ_64] == XI_MOVmi)
      break;
  }
  lj_assertJ(p < pe, "instruction length decoder failed");
  for (; p < pe; p += asm_x86_inslen(p)) {
    if ((*(uint16_t *)p & 0xf0ff) == 0x800f && p + *(int32_t *)(p+2) == px &&
        p >= pgc) {
      *(int32_t *)(p+2) = jmprel(J, p+6, target);
    } else if (*p == XI_CALL &&
               (void *)(p+5+*(int32_t *)(p+1)) == (void *)lj_gc_step_jit) {
      pgc = p+7;  /* Do not patch GC check exit. */
    }
  }
  lj_mcode_sync(T->mcode, T->mcode + T->szmcode);
  lj_mcode_patch(J, mcarea, 1);
}