fix allocation ordering bug in rega
[qbe.git] / amd64 / isel.c
blob8c89378fcf0295de3d7a4ba14d4a8177813b1c14
1 #include "all.h"
2 #include <limits.h>
4 /* For x86_64, do the following:
6 * - check that constants are used only in
7 * places allowed
8 * - ensure immediates always fit in 32b
9 * - expose machine register contraints
10 * on instructions like division.
11 * - implement fast locals (the streak of
12 * constant allocX in the first basic block)
13 * - recognize complex addressing modes
15 * Invariant: the use counts that are used
16 * in sel() must be sound. This
17 * is not so trivial, maybe the
18 * dce should be moved out...
21 typedef struct ANum ANum;
23 struct ANum {
24 char n, l, r;
25 Ins *i;
28 static int amatch(Addr *, Ref, int, ANum *, Fn *);
30 static int
31 noimm(Ref r, Fn *fn)
33 int64_t val;
35 if (rtype(r) != RCon)
36 return 0;
37 switch (fn->con[r.val].type) {
38 case CAddr:
39 /* we only support the 'small'
40 * code model of the ABI, this
41 * means that we can always
42 * address data with 32bits
44 return 0;
45 case CBits:
46 val = fn->con[r.val].bits.i;
47 return (val < INT32_MIN || val > INT32_MAX);
48 default:
49 die("invalid constant");
53 static int
54 rslot(Ref r, Fn *fn)
56 if (rtype(r) != RTmp)
57 return -1;
58 return fn->tmp[r.val].slot;
61 static void
62 fixarg(Ref *r, int k, Ins *i, Fn *fn)
64 char buf[32];
65 Addr a, *m;
66 Con cc, *c;
67 Ref r0, r1, r2, r3;
68 int s, n, op;
70 r1 = r0 = *r;
71 s = rslot(r0, fn);
72 op = i ? i->op : Ocopy;
73 if (KBASE(k) == 1 && rtype(r0) == RCon) {
74 /* load floating points from memory
75 * slots, they can't be used as
76 * immediates
78 r1 = MEM(fn->nmem);
79 vgrow(&fn->mem, ++fn->nmem);
80 memset(&a, 0, sizeof a);
81 a.offset.type = CAddr;
82 n = stashbits(&fn->con[r0.val].bits, KWIDE(k) ? 8 : 4);
83 sprintf(buf, "\"%sfp%d\"", T.asloc, n);
84 a.offset.label = intern(buf);
85 fn->mem[fn->nmem-1] = a;
87 else if (op != Ocopy && k == Kl && noimm(r0, fn)) {
88 /* load constants that do not fit in
89 * a 32bit signed integer into a
90 * long temporary
92 r1 = newtmp("isel", Kl, fn);
93 emit(Ocopy, Kl, r1, r0, R);
95 else if (s != -1) {
96 /* load fast locals' addresses into
97 * temporaries right before the
98 * instruction
100 r1 = newtmp("isel", Kl, fn);
101 emit(Oaddr, Kl, r1, SLOT(s), R);
103 else if (!(isstore(op) && r == &i->arg[1])
104 && !isload(op) && op != Ocall && rtype(r0) == RCon
105 && fn->con[r0.val].type == CAddr) {
106 /* apple as does not support 32-bit
107 * absolute addressing, use a rip-
108 * relative leaq instead
110 r1 = newtmp("isel", Kl, fn);
111 emit(Oaddr, Kl, r1, r0, R);
113 else if (rtype(r0) == RMem) {
114 /* eliminate memory operands of
115 * the form $foo(%rip, ...)
117 m = &fn->mem[r0.val];
118 if (req(m->base, R))
119 if (m->offset.type == CAddr) {
120 r0 = newtmp("isel", Kl, fn);
121 emit(Oaddr, Kl, r0, newcon(&m->offset, fn), R);
122 m->offset.type = CUndef;
123 m->base = r0;
125 } else if (T.apple && rtype(r0) == RCon
126 && (c = &fn->con[r0.val])->type == CAddr
127 && c->reloc == RelThr) {
128 r1 = newtmp("isel", Kl, fn);
129 if (c->bits.i) {
130 r2 = newtmp("isel", Kl, fn);
131 cc = (Con){.type = CBits};
132 cc.bits.i = c->bits.i;
133 r3 = newcon(&cc, fn);
134 emit(Oadd, Kl, r1, r2, r3);
135 } else
136 r2 = r1;
137 emit(Ocopy, Kl, r2, TMP(RAX), R);
138 r2 = newtmp("isel", Kl, fn);
139 r3 = newtmp("isel", Kl, fn);
140 emit(Ocall, 0, R, r3, CALL(17));
141 emit(Ocopy, Kl, TMP(RDI), r2, R);
142 emit(Oload, Kl, r3, r2, R);
143 cc = *c;
144 cc.bits.i = 0;
145 r3 = newcon(&cc, fn);
146 emit(Oload, Kl, r2, r3, R);
148 *r = r1;
151 static void
152 seladdr(Ref *r, ANum *an, Fn *fn)
154 Addr a;
155 Ref r0;
157 r0 = *r;
158 if (rtype(r0) == RTmp) {
159 memset(&a, 0, sizeof a);
160 if (!amatch(&a, r0, an[r0.val].n, an, fn))
161 return;
162 if (!req(a.base, R))
163 if (a.offset.type == CAddr) {
164 /* apple as does not support
165 * $foo(%r0, %r1, M); try to
166 * rewrite it or bail out if
167 * impossible
169 if (!req(a.index, R) || rtype(a.base) != RTmp)
170 return;
171 else {
172 a.index = a.base;
173 a.scale = 1;
174 a.base = R;
177 chuse(r0, -1, fn);
178 vgrow(&fn->mem, ++fn->nmem);
179 fn->mem[fn->nmem-1] = a;
180 chuse(a.base, +1, fn);
181 chuse(a.index, +1, fn);
182 *r = MEM(fn->nmem-1);
186 static int
187 cmpswap(Ref arg[2], int op)
189 switch (op) {
190 case NCmpI+Cflt:
191 case NCmpI+Cfle:
192 return 1;
193 case NCmpI+Cfgt:
194 case NCmpI+Cfge:
195 return 0;
197 return rtype(arg[0]) == RCon;
200 static void
201 selcmp(Ref arg[2], int k, int swap, Fn *fn)
203 Ref r;
204 Ins *icmp;
206 if (swap) {
207 r = arg[1];
208 arg[1] = arg[0];
209 arg[0] = r;
211 emit(Oxcmp, k, R, arg[1], arg[0]);
212 icmp = curi;
213 if (rtype(arg[0]) == RCon) {
214 assert(k != Kw);
215 icmp->arg[1] = newtmp("isel", k, fn);
216 emit(Ocopy, k, icmp->arg[1], arg[0], R);
217 fixarg(&curi->arg[0], k, curi, fn);
219 fixarg(&icmp->arg[0], k, icmp, fn);
220 fixarg(&icmp->arg[1], k, icmp, fn);
223 static void
224 sel(Ins i, ANum *an, Fn *fn)
226 Ref r0, r1, tmp[7];
227 int x, j, k, kc, sh, swap;
228 Ins *i0, *i1;
230 if (rtype(i.to) == RTmp)
231 if (!isreg(i.to) && !isreg(i.arg[0]) && !isreg(i.arg[1]))
232 if (fn->tmp[i.to.val].nuse == 0) {
233 chuse(i.arg[0], -1, fn);
234 chuse(i.arg[1], -1, fn);
235 return;
237 i0 = curi;
238 k = i.cls;
239 switch (i.op) {
240 case Odiv:
241 case Orem:
242 case Oudiv:
243 case Ourem:
244 if (KBASE(k) == 1)
245 goto Emit;
246 if (i.op == Odiv || i.op == Oudiv)
247 r0 = TMP(RAX), r1 = TMP(RDX);
248 else
249 r0 = TMP(RDX), r1 = TMP(RAX);
250 emit(Ocopy, k, i.to, r0, R);
251 emit(Ocopy, k, R, r1, R);
252 if (rtype(i.arg[1]) == RCon) {
253 /* immediates not allowed for
254 * divisions in x86
256 r0 = newtmp("isel", k, fn);
257 } else
258 r0 = i.arg[1];
259 if (fn->tmp[r0.val].slot != -1)
260 err("unlikely argument %%%s in %s",
261 fn->tmp[r0.val].name, optab[i.op].name);
262 if (i.op == Odiv || i.op == Orem) {
263 emit(Oxidiv, k, R, r0, R);
264 emit(Osign, k, TMP(RDX), TMP(RAX), R);
265 } else {
266 emit(Oxdiv, k, R, r0, R);
267 emit(Ocopy, k, TMP(RDX), CON_Z, R);
269 emit(Ocopy, k, TMP(RAX), i.arg[0], R);
270 fixarg(&curi->arg[0], k, curi, fn);
271 if (rtype(i.arg[1]) == RCon)
272 emit(Ocopy, k, r0, i.arg[1], R);
273 break;
274 case Osar:
275 case Oshr:
276 case Oshl:
277 r0 = i.arg[1];
278 if (rtype(r0) == RCon)
279 goto Emit;
280 if (fn->tmp[r0.val].slot != -1)
281 err("unlikely argument %%%s in %s",
282 fn->tmp[r0.val].name, optab[i.op].name);
283 i.arg[1] = TMP(RCX);
284 emit(Ocopy, Kw, R, TMP(RCX), R);
285 emiti(i);
286 i1 = curi;
287 emit(Ocopy, Kw, TMP(RCX), r0, R);
288 fixarg(&i1->arg[0], argcls(&i, 0), i1, fn);
289 break;
290 case Ouwtof:
291 r0 = newtmp("utof", Kl, fn);
292 emit(Osltof, k, i.to, r0, R);
293 emit(Oextuw, Kl, r0, i.arg[0], R);
294 fixarg(&curi->arg[0], k, curi, fn);
295 break;
296 case Oultof:
297 /* %mask =l and %arg.0, 1
298 %isbig =l shr %arg.0, 63
299 %divided =l shr %arg.0, %isbig
300 %or =l or %mask, %divided
301 %float =d sltof %or
302 %cast =l cast %float
303 %addend =l shl %isbig, 52
304 %sum =l add %cast, %addend
305 %result =d cast %sum
307 r0 = newtmp("utof", k, fn);
308 if (k == Ks)
309 kc = Kw, sh = 23;
310 else
311 kc = Kl, sh = 52;
312 for (j=0; j<4; j++)
313 tmp[j] = newtmp("utof", Kl, fn);
314 for (; j<7; j++)
315 tmp[j] = newtmp("utof", kc, fn);
316 emit(Ocast, k, i.to, tmp[6], R);
317 emit(Oadd, kc, tmp[6], tmp[4], tmp[5]);
318 emit(Oshl, kc, tmp[5], tmp[1], getcon(sh, fn));
319 emit(Ocast, kc, tmp[4], r0, R);
320 emit(Osltof, k, r0, tmp[3], R);
321 emit(Oor, Kl, tmp[3], tmp[0], tmp[2]);
322 emit(Oshr, Kl, tmp[2], i.arg[0], tmp[1]);
323 sel(*curi++, 0, fn);
324 emit(Oshr, Kl, tmp[1], i.arg[0], getcon(63, fn));
325 fixarg(&curi->arg[0], Kl, curi, fn);
326 emit(Oand, Kl, tmp[0], i.arg[0], getcon(1, fn));
327 fixarg(&curi->arg[0], Kl, curi, fn);
328 break;
329 case Ostoui:
330 i.op = Ostosi;
331 kc = Ks;
332 tmp[4] = getcon(0xdf000000, fn);
333 goto Oftoui;
334 case Odtoui:
335 i.op = Odtosi;
336 kc = Kd;
337 tmp[4] = getcon(0xc3e0000000000000, fn);
338 Oftoui:
339 if (k == Kw)
340 goto Emit;
341 r0 = newtmp("ftou", kc, fn);
342 for (j=0; j<4; j++)
343 tmp[j] = newtmp("ftou", Kl, fn);
344 emit(Oor, Kl, i.to, tmp[0], tmp[3]);
345 emit(Oand, Kl, tmp[3], tmp[2], tmp[1]);
346 emit(i.op, Kl, tmp[2], r0, R);
347 emit(Oadd, kc, r0, tmp[4], i.arg[0]);
348 i1 = curi; /* fixarg() can change curi */
349 fixarg(&i1->arg[0], kc, i1, fn);
350 fixarg(&i1->arg[1], kc, i1, fn);
351 emit(Osar, Kl, tmp[1], tmp[0], getcon(63, fn));
352 emit(i.op, Kl, tmp[0], i.arg[0], R);
353 fixarg(&curi->arg[0], Kl, curi, fn);
354 break;
355 case Onop:
356 break;
357 case Ostored:
358 case Ostores:
359 case Ostorel:
360 case Ostorew:
361 case Ostoreh:
362 case Ostoreb:
363 if (rtype(i.arg[0]) == RCon) {
364 if (i.op == Ostored)
365 i.op = Ostorel;
366 if (i.op == Ostores)
367 i.op = Ostorew;
369 seladdr(&i.arg[1], an, fn);
370 goto Emit;
371 case_Oload:
372 seladdr(&i.arg[0], an, fn);
373 goto Emit;
374 case Ocall:
375 case Osalloc:
376 case Ocopy:
377 case Oadd:
378 case Osub:
379 case Oneg:
380 case Omul:
381 case Oand:
382 case Oor:
383 case Oxor:
384 case Oxtest:
385 case Ostosi:
386 case Odtosi:
387 case Oswtof:
388 case Osltof:
389 case Oexts:
390 case Otruncd:
391 case Ocast:
392 case_OExt:
393 Emit:
394 emiti(i);
395 i1 = curi; /* fixarg() can change curi */
396 fixarg(&i1->arg[0], argcls(&i, 0), i1, fn);
397 fixarg(&i1->arg[1], argcls(&i, 1), i1, fn);
398 break;
399 case Oalloc4:
400 case Oalloc8:
401 case Oalloc16:
402 salloc(i.to, i.arg[0], fn);
403 break;
404 default:
405 if (isext(i.op))
406 goto case_OExt;
407 if (isload(i.op))
408 goto case_Oload;
409 if (iscmp(i.op, &kc, &x)) {
410 switch (x) {
411 case NCmpI+Cfeq:
412 /* zf is set when operands are
413 * unordered, so we may have to
414 * check pf
416 r0 = newtmp("isel", Kw, fn);
417 r1 = newtmp("isel", Kw, fn);
418 emit(Oand, Kw, i.to, r0, r1);
419 emit(Oflagfo, k, r1, R, R);
420 i.to = r0;
421 break;
422 case NCmpI+Cfne:
423 r0 = newtmp("isel", Kw, fn);
424 r1 = newtmp("isel", Kw, fn);
425 emit(Oor, Kw, i.to, r0, r1);
426 emit(Oflagfuo, k, r1, R, R);
427 i.to = r0;
428 break;
430 swap = cmpswap(i.arg, x);
431 if (swap)
432 x = cmpop(x);
433 emit(Oflag+x, k, i.to, R, R);
434 selcmp(i.arg, kc, swap, fn);
435 break;
437 die("unknown instruction %s", optab[i.op].name);
440 while (i0>curi && --i0) {
441 assert(rslot(i0->arg[0], fn) == -1);
442 assert(rslot(i0->arg[1], fn) == -1);
446 static Ins *
447 flagi(Ins *i0, Ins *i)
449 while (i>i0) {
450 i--;
451 if (amd64_op[i->op].zflag)
452 return i;
453 if (amd64_op[i->op].lflag)
454 continue;
455 return 0;
457 return 0;
460 static void
461 seljmp(Blk *b, Fn *fn)
463 Ref r;
464 int c, k, swap;
465 Ins *fi;
466 Tmp *t;
468 if (b->jmp.type == Jret0 || b->jmp.type == Jjmp)
469 return;
470 assert(b->jmp.type == Jjnz);
471 r = b->jmp.arg;
472 t = &fn->tmp[r.val];
473 b->jmp.arg = R;
474 assert(rtype(r) == RTmp);
475 if (b->s1 == b->s2) {
476 chuse(r, -1, fn);
477 b->jmp.type = Jjmp;
478 b->s2 = 0;
479 return;
481 fi = flagi(b->ins, &b->ins[b->nins]);
482 if (!fi || !req(fi->to, r)) {
483 selcmp((Ref[2]){r, CON_Z}, Kw, 0, fn); /* todo, long jnz */
484 b->jmp.type = Jjf + Cine;
486 else if (iscmp(fi->op, &k, &c)
487 && c != NCmpI+Cfeq /* see sel() */
488 && c != NCmpI+Cfne) {
489 swap = cmpswap(fi->arg, c);
490 if (swap)
491 c = cmpop(c);
492 if (t->nuse == 1) {
493 selcmp(fi->arg, k, swap, fn);
494 *fi = (Ins){.op = Onop};
496 b->jmp.type = Jjf + c;
498 else if (fi->op == Oand && t->nuse == 1
499 && (rtype(fi->arg[0]) == RTmp ||
500 rtype(fi->arg[1]) == RTmp)) {
501 fi->op = Oxtest;
502 fi->to = R;
503 b->jmp.type = Jjf + Cine;
504 if (rtype(fi->arg[1]) == RCon) {
505 r = fi->arg[1];
506 fi->arg[1] = fi->arg[0];
507 fi->arg[0] = r;
510 else {
511 /* since flags are not tracked in liveness,
512 * the result of the flag-setting instruction
513 * has to be marked as live
515 if (t->nuse == 1)
516 emit(Ocopy, Kw, R, r, R);
517 b->jmp.type = Jjf + Cine;
521 static int
522 aref(Ref r, ANum *ai)
524 switch (rtype(r)) {
525 case RCon:
526 return 2;
527 case RTmp:
528 return ai[r.val].n;
529 default:
530 die("constant or temporary expected");
534 static int
535 ascale(Ref r, Con *con)
537 int64_t n;
539 if (rtype(r) != RCon)
540 return 0;
541 if (con[r.val].type != CBits)
542 return 0;
543 n = con[r.val].bits.i;
544 return n == 1 || n == 2 || n == 4 || n == 8;
547 static void
548 anumber(ANum *ai, Blk *b, Con *con)
550 /* This should be made obsolete by a proper
551 * reassoc pass.
553 * Rules:
555 * RTmp(_) -> 0 tmp
556 * ( RTmp(_) -> 1 slot )
557 * RCon(_) -> 2 con
558 * 0 * 2 -> 3 s * i (when constant is 1,2,4,8)
560 static char add[10][10] = {
561 [2] [4] = 4, [4] [2] = 4,
562 [2] [6] = 6, [6] [2] = 6,
563 [2] [7] = 7, [7] [2] = 7,
564 [0] [2] = 4, [2] [0] = 4, /* 4: o + b */
565 [0] [0] = 5, /* 5: b + s * i */
566 [0] [3] = 5, [3] [0] = 5,
567 [2] [3] = 6, [3] [2] = 6, /* 6: o + s * i */
568 [2] [5] = 7, [5] [2] = 7, /* 7: o + b + s * i */
569 [0] [6] = 7, [6] [0] = 7,
570 [4] [3] = 7, [3] [4] = 7,
572 int a, a1, a2, n1, n2, t1, t2;
573 Ins *i;
575 for (i=b->ins; i<&b->ins[b->nins]; i++) {
576 if (rtype(i->to) == RTmp)
577 ai[i->to.val].i = i;
578 if (i->op != Oadd && i->op != Omul)
579 continue;
580 a1 = aref(i->arg[0], ai);
581 a2 = aref(i->arg[1], ai);
582 t1 = a1 != 1 && a1 != 2;
583 t2 = a2 != 1 && a2 != 2;
584 if (i->op == Oadd) {
585 a = add[n1 = a1][n2 = a2];
586 if (t1 && a < add[0][a2])
587 a = add[n1 = 0][n2 = a2];
588 if (t2 && a < add[a1][0])
589 a = add[n1 = a1][n2 = 0];
590 if (t1 && t2 && a < add[0][0])
591 a = add[n1 = 0][n2 = 0];
592 } else {
593 n1 = n2 = a = 0;
594 if (ascale(i->arg[0], con) && t2)
595 a = 3, n1 = 2, n2 = 0;
596 if (t1 && ascale(i->arg[1], con))
597 a = 3, n1 = 0, n2 = 2;
599 ai[i->to.val].n = a;
600 ai[i->to.val].l = n1;
601 ai[i->to.val].r = n2;
605 static int
606 amatch(Addr *a, Ref r, int n, ANum *ai, Fn *fn)
608 Ins *i;
609 int nl, nr, t, s;
610 Ref al, ar;
612 if (rtype(r) == RCon) {
613 if (!addcon(&a->offset, &fn->con[r.val]))
614 err("unlikely sum of $%s and $%s",
615 str(a->offset.label), str(fn->con[r.val].label));
616 return 1;
618 assert(rtype(r) == RTmp);
619 i = ai[r.val].i;
620 nl = ai[r.val].l;
621 nr = ai[r.val].r;
622 if (i) {
623 if (nl > nr) {
624 al = i->arg[1];
625 ar = i->arg[0];
626 t = nl, nl = nr, nr = t;
627 } else {
628 al = i->arg[0];
629 ar = i->arg[1];
632 switch (n) {
633 case 3: /* s * i */
634 a->index = al;
635 a->scale = fn->con[ar.val].bits.i;
636 return 0;
637 case 5: /* b + s * i */
638 switch (nr) {
639 case 0:
640 if (fn->tmp[ar.val].slot != -1) {
641 al = i->arg[1];
642 ar = i->arg[0];
644 a->index = ar;
645 a->scale = 1;
646 break;
647 case 3:
648 amatch(a, ar, nr, ai, fn);
649 break;
651 r = al;
652 /* fall through */
653 case 0:
654 s = fn->tmp[r.val].slot;
655 if (s != -1)
656 r = SLOT(s);
657 a->base = r;
658 return n || s != -1;
659 case 2: /* constants */
660 case 4: /* o + b */
661 case 6: /* o + s * i */
662 case 7: /* o + b + s * i */
663 amatch(a, ar, nr, ai, fn);
664 amatch(a, al, nl, ai, fn);
665 return 1;
666 default:
667 die("unreachable");
671 /* instruction selection
672 * requires use counts (as given by parsing)
674 void
675 amd64_isel(Fn *fn)
677 Blk *b, **sb;
678 Ins *i;
679 Phi *p;
680 uint a;
681 int n, al;
682 int64_t sz;
683 ANum *ainfo;
685 /* assign slots to fast allocs */
686 b = fn->start;
687 /* specific to NAlign == 3 */ /* or change n=4 and sz /= 4 below */
688 for (al=Oalloc, n=4; al<=Oalloc1; al++, n*=2)
689 for (i=b->ins; i<&b->ins[b->nins]; i++)
690 if (i->op == al) {
691 if (rtype(i->arg[0]) != RCon)
692 break;
693 sz = fn->con[i->arg[0].val].bits.i;
694 if (sz < 0 || sz >= INT_MAX-15)
695 err("invalid alloc size %"PRId64, sz);
696 sz = (sz + n-1) & -n;
697 sz /= 4;
698 if (sz > INT_MAX - fn->slot)
699 die("alloc too large");
700 fn->tmp[i->to.val].slot = fn->slot;
701 fn->slot += sz;
702 *i = (Ins){.op = Onop};
705 /* process basic blocks */
706 n = fn->ntmp;
707 ainfo = emalloc(n * sizeof ainfo[0]);
708 for (b=fn->start; b; b=b->link) {
709 curi = &insb[NIns];
710 for (sb=(Blk*[3]){b->s1, b->s2, 0}; *sb; sb++)
711 for (p=(*sb)->phi; p; p=p->link) {
712 for (a=0; p->blk[a] != b; a++)
713 assert(a+1 < p->narg);
714 fixarg(&p->arg[a], p->cls, 0, fn);
716 memset(ainfo, 0, n * sizeof ainfo[0]);
717 anumber(ainfo, b, fn->con);
718 seljmp(b, fn);
719 for (i=&b->ins[b->nins]; i!=b->ins;)
720 sel(*--i, ainfo, fn);
721 b->nins = &insb[NIns] - curi;
722 idup(&b->ins, curi, b->nins);
724 free(ainfo);
726 if (debug['I']) {
727 fprintf(stderr, "\n> After instruction selection:\n");
728 printfn(fn, stderr);