/* gitweb page residue, not part of the program text:
 *   fix various codegen bugs on arm64
 *   [qbe.git] / amd64 / emit.c
 *   blob 8f3618843f273953aa23337c1aba500137048b9a
 */
1 #include "all.h"
4 typedef struct E E;
6 struct E {
7 FILE *f;
8 Fn *fn;
9 int fp;
10 uint64_t fsz;
11 int nclob;
/* Comparison table in X-macro form: X(c, s) is expanded once per
 * comparison, c being the comparison constant and s the condition
 * suffix appended to "set" (in omap[]) and to "j" (in
 * amd64_emitfn()); floating-point comparisons are listed at an
 * offset of NCmpI */
#define CMP(X) \
	X(Ciule,       "be") \
	X(Ciult,       "b")  \
	X(Cisle,       "le") \
	X(Cislt,       "l")  \
	X(Cisgt,       "g")  \
	X(Cisge,       "ge") \
	X(Ciugt,       "a")  \
	X(Ciuge,       "ae") \
	X(Cieq,        "z")  \
	X(Cine,        "nz") \
	X(NCmpI+Cfle,  "be") \
	X(NCmpI+Cflt,  "b")  \
	X(NCmpI+Cfgt,  "a")  \
	X(NCmpI+Cfge,  "ae") \
	X(NCmpI+Cfeq,  "z")  \
	X(NCmpI+Cfne,  "nz") \
	X(NCmpI+Cfo,   "np") \
	X(NCmpI+Cfuo,  "p")
enum {
	/* operand sizes; they double as the column index
	 * into the rows of rname[] */
	SLong  = 0,
	SWord  = 1,
	SShort = 2,
	SByte  = 3,

	/* wildcard classes for the cls field of omap[] */
	Ki = -1, /* matches Kw and Kl */
	Ka = -2, /* matches all classes */
};
44 /* Instruction format strings:
46 * if the format string starts with -, the instruction
47 * is assumed to be 3-address and is put in 2-address
48 * mode using an extra mov if necessary
50 * if the format string starts with +, the same as the
51 * above applies, but commutativity is also assumed
53 * %k is used to set the class of the instruction,
54 * it'll expand to "l", "q", "ss", "sd", depending
55 * on the instruction class
56 * %0 designates the first argument
57 * %1 designates the second argument
58 * %= designates the result
60 * if %k is not used, a prefix to 0, 1, or = must be
61 * added, it can be:
62 * M - memory reference
63 * L - long (64 bits)
64 * W - word (32 bits)
65 * H - short (16 bits)
66 * B - byte (8 bits)
67 * S - single precision float
68 * D - double precision float
70 static struct {
71 short op;
72 short cls;
73 char *fmt;
74 } omap[] = {
75 { Oadd, Ka, "+add%k %1, %=" },
76 { Osub, Ka, "-sub%k %1, %=" },
77 { Oand, Ki, "+and%k %1, %=" },
78 { Oor, Ki, "+or%k %1, %=" },
79 { Oxor, Ki, "+xor%k %1, %=" },
80 { Osar, Ki, "-sar%k %B1, %=" },
81 { Oshr, Ki, "-shr%k %B1, %=" },
82 { Oshl, Ki, "-shl%k %B1, %=" },
83 { Omul, Ki, "+imul%k %1, %=" },
84 { Omul, Ks, "+mulss %1, %=" },
85 { Omul, Kd, "+mulsd %1, %=" },
86 { Odiv, Ka, "-div%k %1, %=" },
87 { Ostorel, Ka, "movq %L0, %M1" },
88 { Ostorew, Ka, "movl %W0, %M1" },
89 { Ostoreh, Ka, "movw %H0, %M1" },
90 { Ostoreb, Ka, "movb %B0, %M1" },
91 { Ostores, Ka, "movss %S0, %M1" },
92 { Ostored, Ka, "movsd %D0, %M1" },
93 { Oload, Ka, "mov%k %M0, %=" },
94 { Oloadsw, Kl, "movslq %M0, %L=" },
95 { Oloadsw, Kw, "movl %M0, %W=" },
96 { Oloaduw, Ki, "movl %M0, %W=" },
97 { Oloadsh, Ki, "movsw%k %M0, %=" },
98 { Oloaduh, Ki, "movzw%k %M0, %=" },
99 { Oloadsb, Ki, "movsb%k %M0, %=" },
100 { Oloadub, Ki, "movzb%k %M0, %=" },
101 { Oextsw, Kl, "movslq %W0, %L=" },
102 { Oextuw, Kl, "movl %W0, %W=" },
103 { Oextsh, Ki, "movsw%k %H0, %=" },
104 { Oextuh, Ki, "movzw%k %H0, %=" },
105 { Oextsb, Ki, "movsb%k %B0, %=" },
106 { Oextub, Ki, "movzb%k %B0, %=" },
108 { Oexts, Kd, "cvtss2sd %0, %=" },
109 { Otruncd, Ks, "cvtsd2ss %0, %=" },
110 { Ostosi, Ki, "cvttss2si%k %0, %=" },
111 { Odtosi, Ki, "cvttsd2si%k %0, %=" },
112 { Oswtof, Ka, "cvtsi2%k %W0, %=" },
113 { Osltof, Ka, "cvtsi2%k %L0, %=" },
114 { Ocast, Ki, "movq %D0, %L=" },
115 { Ocast, Ka, "movq %L0, %D=" },
117 { Oaddr, Ki, "lea%k %M0, %=" },
118 { Oswap, Ki, "xchg%k %0, %1" },
119 { Osign, Kl, "cqto" },
120 { Osign, Kw, "cltd" },
121 { Oxdiv, Ki, "div%k %0" },
122 { Oxidiv, Ki, "idiv%k %0" },
123 { Oxcmp, Ks, "ucomiss %S0, %S1" },
124 { Oxcmp, Kd, "ucomisd %D0, %D1" },
125 { Oxcmp, Ki, "cmp%k %0, %1" },
126 { Oxtest, Ki, "test%k %0, %1" },
127 #define X(c, s) \
128 { Oflag+c, Ki, "set" s " %B=\n\tmovzb%k %B=, %=" },
129 CMP(X)
130 #undef X
131 { NOp, 0, 0 }
134 static char *rname[][4] = {
135 [RAX] = {"rax", "eax", "ax", "al"},
136 [RBX] = {"rbx", "ebx", "bx", "bl"},
137 [RCX] = {"rcx", "ecx", "cx", "cl"},
138 [RDX] = {"rdx", "edx", "dx", "dl"},
139 [RSI] = {"rsi", "esi", "si", "sil"},
140 [RDI] = {"rdi", "edi", "di", "dil"},
141 [RBP] = {"rbp", "ebp", "bp", "bpl"},
142 [RSP] = {"rsp", "esp", "sp", "spl"},
143 [R8 ] = {"r8" , "r8d", "r8w", "r8b"},
144 [R9 ] = {"r9" , "r9d", "r9w", "r9b"},
145 [R10] = {"r10", "r10d", "r10w", "r10b"},
146 [R11] = {"r11", "r11d", "r11w", "r11b"},
147 [R12] = {"r12", "r12d", "r12w", "r12b"},
148 [R13] = {"r13", "r13d", "r13w", "r13b"},
149 [R14] = {"r14", "r14d", "r14w", "r14b"},
150 [R15] = {"r15", "r15d", "r15w", "r15b"},
154 static int
155 slot(Ref r, E *e)
157 int s;
159 s = rsval(r);
160 assert(s <= e->fn->slot);
161 /* specific to NAlign == 3 */
162 if (s < 0) {
163 if (e->fp == RSP)
164 return 4*-s - 8 + e->fsz + e->nclob*8;
165 else
166 return 4*-s;
168 else if (e->fp == RSP)
169 return 4*s + e->nclob*8;
170 else if (e->fn->vararg)
171 return -176 + -4 * (e->fn->slot - s);
172 else
173 return -4 * (e->fn->slot - s);
176 static void
177 emitcon(Con *con, E *e)
179 char *p, *l;
181 switch (con->type) {
182 case CAddr:
183 l = str(con->sym.id);
184 p = l[0] == '"' ? "" : T.assym;
185 if (con->sym.type == SThr) {
186 if (T.apple)
187 fprintf(e->f, "%s%s@TLVP", p, l);
188 else
189 fprintf(e->f, "%%fs:%s%s@tpoff", p, l);
190 } else
191 fprintf(e->f, "%s%s", p, l);
192 if (con->bits.i)
193 fprintf(e->f, "%+"PRId64, con->bits.i);
194 break;
195 case CBits:
196 fprintf(e->f, "%"PRId64, con->bits.i);
197 break;
198 default:
199 die("unreachable");
203 static char *
204 regtoa(int reg, int sz)
206 static char buf[6];
208 assert(reg <= XMM15);
209 if (reg >= XMM0) {
210 sprintf(buf, "xmm%d", reg-XMM0);
211 return buf;
212 } else
213 return rname[reg][sz];
216 static Ref
217 getarg(char c, Ins *i)
219 switch (c) {
220 case '0':
221 return i->arg[0];
222 case '1':
223 return i->arg[1];
224 case '=':
225 return i->to;
226 default:
227 die("invalid arg letter %c", c);
231 static void emitins(Ins, E *);
233 static void
234 emitcopy(Ref r1, Ref r2, int k, E *e)
236 Ins icp;
238 icp.op = Ocopy;
239 icp.arg[0] = r2;
240 icp.to = r1;
241 icp.cls = k;
242 emitins(icp, e);
245 static void
246 emitf(char *s, Ins *i, E *e)
248 static char clstoa[][3] = {"l", "q", "ss", "sd"};
249 char c;
250 int sz;
251 Ref ref;
252 Mem *m;
253 Con off;
255 switch (*s) {
256 case '+':
257 if (req(i->arg[1], i->to)) {
258 ref = i->arg[0];
259 i->arg[0] = i->arg[1];
260 i->arg[1] = ref;
262 /* fall through */
263 case '-':
264 assert((!req(i->arg[1], i->to) || req(i->arg[0], i->to)) &&
265 "cannot convert to 2-address");
266 emitcopy(i->to, i->arg[0], i->cls, e);
267 s++;
268 break;
271 fputc('\t', e->f);
272 Next:
273 while ((c = *s++) != '%')
274 if (!c) {
275 fputc('\n', e->f);
276 return;
277 } else
278 fputc(c, e->f);
279 switch ((c = *s++)) {
280 case '%':
281 fputc('%', e->f);
282 break;
283 case 'k':
284 fputs(clstoa[i->cls], e->f);
285 break;
286 case '0':
287 case '1':
288 case '=':
289 sz = KWIDE(i->cls) ? SLong : SWord;
290 s--;
291 goto Ref;
292 case 'D':
293 case 'S':
294 sz = SLong; /* does not matter for floats */
295 Ref:
296 c = *s++;
297 ref = getarg(c, i);
298 switch (rtype(ref)) {
299 case RTmp:
300 assert(isreg(ref));
301 fprintf(e->f, "%%%s", regtoa(ref.val, sz));
302 break;
303 case RSlot:
304 fprintf(e->f, "%d(%%%s)",
305 slot(ref, e),
306 regtoa(e->fp, SLong)
308 break;
309 case RMem:
310 Mem:
311 m = &e->fn->mem[ref.val];
312 if (rtype(m->base) == RSlot) {
313 off.type = CBits;
314 off.bits.i = slot(m->base, e);
315 addcon(&m->offset, &off, 1);
316 m->base = TMP(e->fp);
318 if (m->offset.type != CUndef)
319 emitcon(&m->offset, e);
320 fputc('(', e->f);
321 if (!req(m->base, R))
322 fprintf(e->f, "%%%s",
323 regtoa(m->base.val, SLong)
325 else if (m->offset.type == CAddr)
326 fprintf(e->f, "%%rip");
327 if (!req(m->index, R))
328 fprintf(e->f, ", %%%s, %d",
329 regtoa(m->index.val, SLong),
330 m->scale
332 fputc(')', e->f);
333 break;
334 case RCon:
335 fputc('$', e->f);
336 emitcon(&e->fn->con[ref.val], e);
337 break;
338 default:
339 die("unreachable");
341 break;
342 case 'L':
343 sz = SLong;
344 goto Ref;
345 case 'W':
346 sz = SWord;
347 goto Ref;
348 case 'H':
349 sz = SShort;
350 goto Ref;
351 case 'B':
352 sz = SByte;
353 goto Ref;
354 case 'M':
355 c = *s++;
356 ref = getarg(c, i);
357 switch (rtype(ref)) {
358 case RMem:
359 goto Mem;
360 case RSlot:
361 fprintf(e->f, "%d(%%%s)",
362 slot(ref, e),
363 regtoa(e->fp, SLong)
365 break;
366 case RCon:
367 off = e->fn->con[ref.val];
368 emitcon(&off, e);
369 if (off.type == CAddr)
370 if (off.sym.type != SThr || T.apple)
371 fprintf(e->f, "(%%rip)");
372 break;
373 case RTmp:
374 assert(isreg(ref));
375 fprintf(e->f, "(%%%s)", regtoa(ref.val, SLong));
376 break;
377 default:
378 die("unreachable");
380 break;
381 default:
382 die("invalid format specifier %%%c", c);
384 goto Next;
387 static void *negmask[4] = {
388 [Ks] = (uint32_t[4]){ 0x80000000 },
389 [Kd] = (uint64_t[2]){ 0x8000000000000000 },
392 static void
393 emitins(Ins i, E *e)
395 Ref r;
396 int64_t val;
397 int o, t0;
398 Ins ineg;
399 Con *con;
400 char *sym;
402 switch (i.op) {
403 default:
404 Table:
405 /* most instructions are just pulled out of
406 * the table omap[], some special cases are
407 * detailed below */
408 for (o=0;; o++) {
409 /* this linear search should really be a binary
410 * search */
411 if (omap[o].op == NOp)
412 die("no match for %s(%c)",
413 optab[i.op].name, "wlsd"[i.cls]);
414 if (omap[o].op == i.op)
415 if (omap[o].cls == i.cls
416 || (omap[o].cls == Ki && KBASE(i.cls) == 0)
417 || (omap[o].cls == Ka))
418 break;
420 emitf(omap[o].fmt, &i, e);
421 break;
422 case Onop:
423 /* just do nothing for nops, they are inserted
424 * by some passes */
425 break;
426 case Omul:
427 /* here, we try to use the 3-addresss form
428 * of multiplication when possible */
429 if (rtype(i.arg[1]) == RCon) {
430 r = i.arg[0];
431 i.arg[0] = i.arg[1];
432 i.arg[1] = r;
434 if (KBASE(i.cls) == 0 /* only available for ints */
435 && rtype(i.arg[0]) == RCon
436 && rtype(i.arg[1]) == RTmp) {
437 emitf("imul%k %0, %1, %=", &i, e);
438 break;
440 goto Table;
441 case Osub:
442 /* we have to use the negation trick to handle
443 * some 3-address subtractions */
444 if (req(i.to, i.arg[1]) && !req(i.arg[0], i.to)) {
445 ineg = (Ins){Oneg, i.cls, i.to, {i.to}};
446 emitins(ineg, e);
447 emitf("add%k %0, %=", &i, e);
448 break;
450 goto Table;
451 case Oneg:
452 if (!req(i.to, i.arg[0]))
453 emitf("mov%k %0, %=", &i, e);
454 if (KBASE(i.cls) == 0)
455 emitf("neg%k %=", &i, e);
456 else
457 fprintf(e->f,
458 "\txorp%c %sfp%d(%%rip), %%%s\n",
459 "xxsd"[i.cls],
460 T.asloc,
461 stashbits(negmask[i.cls], 16),
462 regtoa(i.to.val, SLong)
464 break;
465 case Odiv:
466 /* use xmm15 to adjust the instruction when the
467 * conversion to 2-address in emitf() would fail */
468 if (req(i.to, i.arg[1])) {
469 i.arg[1] = TMP(XMM0+15);
470 emitf("mov%k %=, %1", &i, e);
471 emitf("mov%k %0, %=", &i, e);
472 i.arg[0] = i.to;
474 goto Table;
475 case Ocopy:
476 /* copies are used for many things; see my note
477 * to understand how to load big constants:
478 * https://c9x.me/notes/2015-09-19.html */
479 assert(rtype(i.to) != RMem);
480 if (req(i.to, R) || req(i.arg[0], R))
481 break;
482 if (req(i.to, i.arg[0]))
483 break;
484 t0 = rtype(i.arg[0]);
485 if (i.cls == Kl
486 && t0 == RCon
487 && e->fn->con[i.arg[0].val].type == CBits) {
488 val = e->fn->con[i.arg[0].val].bits.i;
489 if (isreg(i.to))
490 if (val >= 0 && val <= UINT32_MAX) {
491 emitf("movl %W0, %W=", &i, e);
492 break;
494 if (rtype(i.to) == RSlot)
495 if (val < INT32_MIN || val > INT32_MAX) {
496 emitf("movl %0, %=", &i, e);
497 emitf("movl %0>>32, 4+%=", &i, e);
498 break;
501 if (isreg(i.to)
502 && t0 == RCon
503 && e->fn->con[i.arg[0].val].type == CAddr) {
504 emitf("lea%k %M0, %=", &i, e);
505 break;
507 if (rtype(i.to) == RSlot
508 && (t0 == RSlot || t0 == RMem)) {
509 i.cls = KWIDE(i.cls) ? Kd : Ks;
510 i.arg[1] = TMP(XMM0+15);
511 emitf("mov%k %0, %1", &i, e);
512 emitf("mov%k %1, %=", &i, e);
513 break;
515 /* conveniently, the assembler knows if it
516 * should use movabsq when reading movq */
517 emitf("mov%k %0, %=", &i, e);
518 break;
519 case Oaddr:
520 if (!T.apple
521 && rtype(i.arg[0]) == RCon
522 && e->fn->con[i.arg[0].val].sym.type == SThr) {
523 /* derive the symbol address from the TCB
524 * address at offset 0 of %fs */
525 assert(isreg(i.to));
526 con = &e->fn->con[i.arg[0].val];
527 sym = str(con->sym.id);
528 emitf("movq %%fs:0, %L=", &i, e);
529 fprintf(e->f, "\tleaq %s%s@tpoff",
530 sym[0] == '"' ? "" : T.assym, sym);
531 if (con->bits.i)
532 fprintf(e->f, "%+"PRId64,
533 con->bits.i);
534 fprintf(e->f, "(%%%s), %%%s\n",
535 regtoa(i.to.val, SLong),
536 regtoa(i.to.val, SLong));
537 break;
539 goto Table;
540 case Ocall:
541 /* calls simply have a weird syntax in AT&T
542 * assembly... */
543 switch (rtype(i.arg[0])) {
544 case RCon:
545 fprintf(e->f, "\tcallq ");
546 emitcon(&e->fn->con[i.arg[0].val], e);
547 fprintf(e->f, "\n");
548 break;
549 case RTmp:
550 emitf("callq *%L0", &i, e);
551 break;
552 default:
553 die("invalid call argument");
555 break;
556 case Osalloc:
557 /* there is no good reason why this is here
558 * maybe we should split Osalloc in 2 different
559 * instructions depending on the result
561 assert(e->fp == RBP);
562 emitf("subq %L0, %%rsp", &i, e);
563 if (!req(i.to, R))
564 emitcopy(i.to, TMP(RSP), Kl, e);
565 break;
566 case Oswap:
567 if (KBASE(i.cls) == 0)
568 goto Table;
569 /* for floats, there is no swap instruction
570 * so we use xmm15 as a temporary
572 emitcopy(TMP(XMM0+15), i.arg[0], i.cls, e);
573 emitcopy(i.arg[0], i.arg[1], i.cls, e);
574 emitcopy(i.arg[1], TMP(XMM0+15), i.cls, e);
575 break;
576 case Odbgloc:
577 emitdbgloc(i.arg[0].val, i.arg[1].val, e->f);
578 break;
582 static void
583 framesz(E *e)
585 uint64_t i, o, f;
587 /* specific to NAlign == 3 */
588 o = 0;
589 if (!e->fn->leaf) {
590 for (i=0, o=0; i<NCLR; i++)
591 o ^= e->fn->reg >> amd64_sysv_rclob[i];
592 o &= 1;
594 f = e->fn->slot;
595 f = (f + 3) & -4;
596 if (f > 0
597 && e->fp == RSP
598 && e->fn->salign == 4)
599 f += 2;
600 e->fsz = 4*f + 8*o + 176*e->fn->vararg;
603 void
604 amd64_emitfn(Fn *fn, FILE *f)
606 static char *ctoa[] = {
607 #define X(c, s) [c] = s,
608 CMP(X)
609 #undef X
611 static int id0;
612 Blk *b, *s;
613 Ins *i, itmp;
614 int *r, c, o, n, lbl;
615 E *e;
617 e = &(E){.f = f, .fn = fn};
618 emitfnlnk(fn->name, &fn->lnk, f);
619 fputs("\tendbr64\n", f);
620 if (!fn->leaf || fn->vararg || fn->dynalloc) {
621 e->fp = RBP;
622 fputs("\tpushq %rbp\n\tmovq %rsp, %rbp\n", f);
623 } else
624 e->fp = RSP;
625 framesz(e);
626 if (e->fsz)
627 fprintf(f, "\tsubq $%"PRIu64", %%rsp\n", e->fsz);
628 if (fn->vararg) {
629 o = -176;
630 for (r=amd64_sysv_rsave; r<&amd64_sysv_rsave[6]; r++, o+=8)
631 fprintf(f, "\tmovq %%%s, %d(%%rbp)\n", rname[*r][0], o);
632 for (n=0; n<8; ++n, o+=16)
633 fprintf(f, "\tmovaps %%xmm%d, %d(%%rbp)\n", n, o);
635 for (r=amd64_sysv_rclob; r<&amd64_sysv_rclob[NCLR]; r++)
636 if (fn->reg & BIT(*r)) {
637 itmp.arg[0] = TMP(*r);
638 emitf("pushq %L0", &itmp, e);
639 e->nclob++;
642 for (lbl=0, b=fn->start; b; b=b->link) {
643 if (lbl || b->npred > 1)
644 fprintf(f, "%sbb%d:\n", T.asloc, id0+b->id);
645 for (i=b->ins; i!=&b->ins[b->nins]; i++)
646 emitins(*i, e);
647 lbl = 1;
648 switch (b->jmp.type) {
649 case Jhlt:
650 fprintf(f, "\tud2\n");
651 break;
652 case Jret0:
653 if (fn->dynalloc)
654 fprintf(f,
655 "\tmovq %%rbp, %%rsp\n"
656 "\tsubq $%"PRIu64", %%rsp\n",
657 e->fsz + e->nclob * 8);
658 for (r=&amd64_sysv_rclob[NCLR]; r>amd64_sysv_rclob;)
659 if (fn->reg & BIT(*--r)) {
660 itmp.arg[0] = TMP(*r);
661 emitf("popq %L0", &itmp, e);
663 if (e->fp == RBP)
664 fputs("\tleave\n", f);
665 else if (e->fsz)
666 fprintf(f,
667 "\taddq $%"PRIu64", %%rsp\n",
668 e->fsz);
669 fputs("\tret\n", f);
670 break;
671 case Jjmp:
672 Jmp:
673 if (b->s1 != b->link)
674 fprintf(f, "\tjmp %sbb%d\n",
675 T.asloc, id0+b->s1->id);
676 else
677 lbl = 0;
678 break;
679 default:
680 c = b->jmp.type - Jjf;
681 if (0 <= c && c <= NCmp) {
682 if (b->link == b->s2) {
683 s = b->s1;
684 b->s1 = b->s2;
685 b->s2 = s;
686 } else
687 c = cmpneg(c);
688 fprintf(f, "\tj%s %sbb%d\n", ctoa[c],
689 T.asloc, id0+b->s2->id);
690 goto Jmp;
692 die("unhandled jump %d", b->jmp.type);
695 id0 += fn->nblk;
696 if (!T.apple)
697 elf_emitfnfin(fn->name, f);