add support for thread-local storage
[qbe.git] / amd64 / isel.c
bloba56244151e2fdde0947cca5eac20860e33a894ec
1 #include "all.h"
2 #include <limits.h>
4 /* For x86_64, do the following:
6 * - check that constants are used only in
7 * places allowed
8 * - ensure immediates always fit in 32b
9 * - expose machine register constraints
10 * on instructions like division.
11 * - implement fast locals (the streak of
12 * constant allocX in the first basic block)
13 * - recognize complex addressing modes
15 * Invariant: the use counts that are used
16 * in sel() must be sound. This
17 * is not so trivial, maybe the
18 * dce should be moved out...
21 typedef struct ANum ANum;

/* Per-temporary addressing information used by anumber()
 * and amatch():
 *   n    pattern number matched for this temporary
 *        (see the rule table in anumber())
 *   l, r pattern numbers chosen for the left/right
 *        arguments of the defining instruction
 *   i    the instruction defining the temporary,
 *        or null when there is none in this block
 */
23 struct ANum {
24 char n, l, r;
25 Ins *i;

/* try to fold the expression tree rooted at a ref into an
 * x86-64 addressing mode; defined below */
28 static int amatch(Addr *, Ref, int, ANum *, Fn *);
30 static int
31 noimm(Ref r, Fn *fn)
33 int64_t val;
35 if (rtype(r) != RCon)
36 return 0;
37 switch (fn->con[r.val].type) {
38 case CAddr:
39 /* we only support the 'small'
40 * code model of the ABI, this
41 * means that we can always
42 * address data with 32bits
44 return 0;
45 case CBits:
46 val = fn->con[r.val].bits.i;
47 return (val < INT32_MIN || val > INT32_MAX);
48 default:
49 die("invalid constant");
53 static int
54 rslot(Ref r, Fn *fn)
56 if (rtype(r) != RTmp)
57 return -1;
58 return fn->tmp[r.val].slot;
/* Legalize the operand *r (of class k) for instruction i
 * (i may be null, meaning the ref is used by a phi copy).
 * Emits fix-up instructions when the operand cannot be
 * used directly on x86-64; on return *r is the possibly
 * rewritten ref. */
61 static void
62 fixarg(Ref *r, int k, Ins *i, Fn *fn)
64 char buf[32];
65 Addr a, *m;
66 Ref r0, r1;
67 int s, n, op;
69 r1 = r0 = *r;
70 s = rslot(r0, fn);
71 op = i ? i->op : Ocopy;
/* case 1: floating-point constant operand */
72 if (KBASE(k) == 1 && rtype(r0) == RCon) {
73 /* load floating points from memory
74 * slots, they can't be used as
75 * immediates
*/
77 r1 = MEM(fn->nmem);
78 vgrow(&fn->mem, ++fn->nmem);
79 memset(&a, 0, sizeof a);
80 a.offset.type = CAddr;
/* stash the bits in the data section and address
 * them through a compiler-generated label */
81 n = stashbits(&fn->con[r0.val].bits, KWIDE(k) ? 8 : 4);
82 sprintf(buf, "\"%sfp%d\"", T.asloc, n);
83 a.offset.label = intern(buf);
84 fn->mem[fn->nmem-1] = a;
/* case 2: wide integer immediate */
86 else if (op != Ocopy && k == Kl && noimm(r0, fn)) {
87 /* load constants that do not fit in
88 * a 32bit signed integer into a
89 * long temporary
*/
91 r1 = newtmp("isel", Kl, fn);
92 emit(Ocopy, Kl, r1, r0, R);
/* case 3: address of a fast local */
94 else if (s != -1) {
95 /* load fast locals' addresses into
96 * temporaries right before the
97 * instruction
*/
99 r1 = newtmp("isel", Kl, fn);
100 emit(Oaddr, Kl, r1, SLOT(s), R);
/* case 4: bare symbolic address used as a value
 * (stores to the address and loads/calls are exempt) */
102 else if (!(isstore(op) && r == &i->arg[1])
103 && !isload(op) && op != Ocall && rtype(r0) == RCon
104 && fn->con[r0.val].type == CAddr) {
105 /* apple as does not support 32-bit
106 * absolute addressing, use a rip-
107 * relative leaq instead
*/
109 r1 = newtmp("isel", Kl, fn);
110 emit(Oaddr, Kl, r1, r0, R);
/* case 5: memory operand with symbolic offset and
 * no base register */
112 else if (rtype(r0) == RMem) {
113 /* eliminate memory operands of
114 * the form $foo(%rip, ...)
*/
116 m = &fn->mem[r0.val];
117 if (req(m->base, R))
118 if (m->offset.type == CAddr) {
119 r0 = newtmp("isel", Kl, fn);
120 emit(Oaddr, Kl, r0, newcon(&m->offset, fn), R);
121 m->offset.type = CUndef;
122 m->base = r0;
125 *r = r1;
/* Try to replace the temporary *r by a full x86-64
 * addressing mode built from its defining expression
 * tree (see anumber/amatch).  On success *r becomes an
 * RMem ref and the use counts of the refs involved are
 * adjusted accordingly. */
128 static void
129 seladdr(Ref *r, ANum *an, Fn *fn)
131 Addr a;
132 Ref r0;
134 r0 = *r;
135 if (rtype(r0) == RTmp) {
136 memset(&a, 0, sizeof a);
137 if (!amatch(&a, r0, an[r0.val].n, an, fn))
138 return;
139 if (!req(a.base, R))
140 if (a.offset.type == CAddr) {
141 /* apple as does not support
142 * $foo(%r0, %r1, M); try to
143 * rewrite it or bail out if
144 * impossible
*/
146 if (!req(a.index, R) || rtype(a.base) != RTmp)
147 return;
148 else {
/* move the base into the (free) index
 * position with scale 1 */
149 a.index = a.base;
150 a.scale = 1;
151 a.base = R;
/* r0 is folded away; its components gain a use */
154 chuse(r0, -1, fn);
155 vgrow(&fn->mem, ++fn->nmem);
156 fn->mem[fn->nmem-1] = a;
157 chuse(a.base, +1, fn);
158 chuse(a.index, +1, fn);
159 *r = MEM(fn->nmem-1);
163 static int
164 cmpswap(Ref arg[2], int op)
166 switch (op) {
167 case NCmpI+Cflt:
168 case NCmpI+Cfle:
169 return 1;
170 case NCmpI+Cfgt:
171 case NCmpI+Cfge:
172 return 0;
174 return rtype(arg[0]) == RCon;
/* Emit an x86 compare of arg[0] against arg[1] in class
 * k.  When swap is set the operands are exchanged first
 * (the caller has already flipped the predicate).  A
 * constant left operand of a long compare is loaded into
 * a temporary since cmp cannot take it there. */
177 static void
178 selcmp(Ref arg[2], int k, int swap, Fn *fn)
180 Ref r;
181 Ins *icmp;
183 if (swap) {
184 r = arg[1];
185 arg[1] = arg[0];
186 arg[0] = r;
/* note the argument order: Oxcmp takes them reversed */
188 emit(Oxcmp, k, R, arg[1], arg[0]);
189 icmp = curi;
190 if (rtype(arg[0]) == RCon) {
/* a word-sized constant here would have been
 * swapped into second position by cmpswap() */
191 assert(k != Kw);
192 icmp->arg[1] = newtmp("isel", k, fn);
193 emit(Ocopy, k, icmp->arg[1], arg[0], R);
194 fixarg(&curi->arg[0], k, curi, fn);
196 fixarg(&icmp->arg[0], k, icmp, fn);
197 fixarg(&icmp->arg[1], k, icmp, fn);
/* Select x86-64 instructions for the IL instruction i.
 * Code is emitted in reverse order through emit() into
 * the global buffer (curi).  an holds the addressing
 * numbers computed by anumber() for the current block. */
200 static void
201 sel(Ins i, ANum *an, Fn *fn)
203 Ref r0, r1, tmp[7];
204 int x, j, k, kc, sh, swap;
205 Ins *i0, *i1;
/* dead-code elimination: drop an instruction whose
 * non-register result is never used (use counts must
 * be sound, see the header comment) */
207 if (rtype(i.to) == RTmp)
208 if (!isreg(i.to) && !isreg(i.arg[0]) && !isreg(i.arg[1]))
209 if (fn->tmp[i.to.val].nuse == 0) {
210 chuse(i.arg[0], -1, fn);
211 chuse(i.arg[1], -1, fn);
212 return;
214 i0 = curi;
215 k = i.cls;
216 switch (i.op) {
217 case Odiv:
218 case Orem:
219 case Oudiv:
220 case Ourem:
/* float division needs no fixups */
221 if (KBASE(k) == 1)
222 goto Emit;
/* integer division: quotient lands in rax,
 * remainder in rdx */
223 if (i.op == Odiv || i.op == Oudiv)
224 r0 = TMP(RAX), r1 = TMP(RDX);
225 else
226 r0 = TMP(RDX), r1 = TMP(RAX);
227 emit(Ocopy, k, i.to, r0, R);
228 emit(Ocopy, k, R, r1, R);
229 if (rtype(i.arg[1]) == RCon) {
230 /* immediates not allowed for
231 * divisions in x86
*/
233 r0 = newtmp("isel", k, fn);
234 } else
235 r0 = i.arg[1];
236 if (fn->tmp[r0.val].slot != -1)
237 err("unlikely argument %%%s in %s",
238 fn->tmp[r0.val].name, optab[i.op].name);
/* signed: sign-extend rax into rdx; unsigned:
 * zero rdx */
239 if (i.op == Odiv || i.op == Orem) {
240 emit(Oxidiv, k, R, r0, R);
241 emit(Osign, k, TMP(RDX), TMP(RAX), R);
242 } else {
243 emit(Oxdiv, k, R, r0, R);
244 emit(Ocopy, k, TMP(RDX), CON_Z, R);
246 emit(Ocopy, k, TMP(RAX), i.arg[0], R);
247 fixarg(&curi->arg[0], k, curi, fn);
248 if (rtype(i.arg[1]) == RCon)
249 emit(Ocopy, k, r0, i.arg[1], R);
250 break;
251 case Osar:
252 case Oshr:
253 case Oshl:
/* variable shift counts must live in cl */
254 r0 = i.arg[1];
255 if (rtype(r0) == RCon)
256 goto Emit;
257 if (fn->tmp[r0.val].slot != -1)
258 err("unlikely argument %%%s in %s",
259 fn->tmp[r0.val].name, optab[i.op].name);
260 i.arg[1] = TMP(RCX);
261 emit(Ocopy, Kw, R, TMP(RCX), R);
262 emiti(i);
263 i1 = curi;
264 emit(Ocopy, Kw, TMP(RCX), r0, R);
265 fixarg(&i1->arg[0], argcls(&i, 0), i1, fn);
266 break;
267 case Ouwtof:
/* unsigned word to float: zero-extend to long,
 * then use the signed long conversion */
268 r0 = newtmp("utof", Kl, fn);
269 emit(Osltof, k, i.to, r0, R);
270 emit(Oextuw, Kl, r0, i.arg[0], R);
271 fixarg(&curi->arg[0], k, curi, fn);
272 break;
273 case Oultof:
274 /* %mask =l and %arg.0, 1
275 %isbig =l shr %arg.0, 63
276 %divided =l shr %arg.0, %isbig
277 %or =l or %mask, %divided
278 %float =d sltof %or
279 %cast =l cast %float
280 %addend =l shl %isbig, 52
281 %sum =l add %cast, %addend
282 %result =d cast %sum
*/
284 r0 = newtmp("utof", k, fn);
285 if (k == Ks)
286 kc = Kw, sh = 23;
287 else
288 kc = Kl, sh = 52;
289 for (j=0; j<4; j++)
290 tmp[j] = newtmp("utof", Kl, fn);
291 for (; j<7; j++)
292 tmp[j] = newtmp("utof", kc, fn);
293 emit(Ocast, k, i.to, tmp[6], R);
294 emit(Oadd, kc, tmp[6], tmp[4], tmp[5]);
295 emit(Oshl, kc, tmp[5], tmp[1], getcon(sh, fn));
296 emit(Ocast, kc, tmp[4], r0, R);
297 emit(Osltof, k, r0, tmp[3], R);
298 emit(Oor, Kl, tmp[3], tmp[0], tmp[2]);
299 emit(Oshr, Kl, tmp[2], i.arg[0], tmp[1]);
/* the shr just emitted has a variable count and
 * needs its own selection pass */
300 sel(*curi++, 0, fn);
301 emit(Oshr, Kl, tmp[1], i.arg[0], getcon(63, fn));
302 fixarg(&curi->arg[0], Kl, curi, fn);
303 emit(Oand, Kl, tmp[0], i.arg[0], getcon(1, fn));
304 fixarg(&curi->arg[0], Kl, curi, fn);
305 break;
306 case Ostoui:
307 i.op = Ostosi;
308 kc = Ks;
309 tmp[4] = getcon(0xdf000000, fn);
310 goto Oftoui;
311 case Odtoui:
312 i.op = Odtosi;
313 kc = Kd;
314 tmp[4] = getcon(0xc3e0000000000000, fn);
315 Oftoui:
/* word-sized result fits in a plain signed
 * conversion */
316 if (k == Kw)
317 goto Emit;
318 r0 = newtmp("ftou", kc, fn);
319 for (j=0; j<4; j++)
320 tmp[j] = newtmp("ftou", Kl, fn);
/* select between the direct conversion and the
 * bias-adjusted one using the sign as a mask */
321 emit(Oor, Kl, i.to, tmp[0], tmp[3]);
322 emit(Oand, Kl, tmp[3], tmp[2], tmp[1]);
323 emit(i.op, Kl, tmp[2], r0, R);
324 emit(Oadd, kc, r0, tmp[4], i.arg[0]);
325 i1 = curi; /* fixarg() can change curi */
326 fixarg(&i1->arg[0], kc, i1, fn);
327 fixarg(&i1->arg[1], kc, i1, fn);
328 emit(Osar, Kl, tmp[1], tmp[0], getcon(63, fn));
329 emit(i.op, Kl, tmp[0], i.arg[0], R);
330 fixarg(&curi->arg[0], Kl, curi, fn);
331 break;
332 case Onop:
333 break;
334 case Ostored:
335 case Ostores:
336 case Ostorel:
337 case Ostorew:
338 case Ostoreh:
339 case Ostoreb:
/* float stores of constants become integer
 * stores of the same bits */
340 if (rtype(i.arg[0]) == RCon) {
341 if (i.op == Ostored)
342 i.op = Ostorel;
343 if (i.op == Ostores)
344 i.op = Ostorew;
346 seladdr(&i.arg[1], an, fn);
347 goto Emit;
348 case_Oload:
349 seladdr(&i.arg[0], an, fn);
350 goto Emit;
351 case Ocall:
352 case Osalloc:
353 case Ocopy:
354 case Oadd:
355 case Osub:
356 case Oneg:
357 case Omul:
358 case Oand:
359 case Oor:
360 case Oxor:
361 case Oxtest:
362 case Ostosi:
363 case Odtosi:
364 case Oswtof:
365 case Osltof:
366 case Oexts:
367 case Otruncd:
368 case Ocast:
369 case_OExt:
370 Emit:
371 emiti(i);
372 i1 = curi; /* fixarg() can change curi */
373 fixarg(&i1->arg[0], argcls(&i, 0), i1, fn);
374 fixarg(&i1->arg[1], argcls(&i, 1), i1, fn);
375 break;
376 case Oalloc4:
377 case Oalloc8:
378 case Oalloc16:
379 salloc(i.to, i.arg[0], fn);
380 break;
381 default:
382 if (isext(i.op))
383 goto case_OExt;
384 if (isload(i.op))
385 goto case_Oload;
386 if (iscmp(i.op, &kc, &x)) {
387 switch (x) {
388 case NCmpI+Cfeq:
389 /* zf is set when operands are
390 * unordered, so we may have to
391 * check pf
*/
393 r0 = newtmp("isel", Kw, fn);
394 r1 = newtmp("isel", Kw, fn);
395 emit(Oand, Kw, i.to, r0, r1);
396 emit(Oflagfo, k, r1, R, R);
397 i.to = r0;
398 break;
399 case NCmpI+Cfne:
400 r0 = newtmp("isel", Kw, fn);
401 r1 = newtmp("isel", Kw, fn);
402 emit(Oor, Kw, i.to, r0, r1);
403 emit(Oflagfuo, k, r1, R, R);
404 i.to = r0;
405 break;
407 swap = cmpswap(i.arg, x);
408 if (swap)
409 x = cmpop(x);
410 emit(Oflag+x, k, i.to, R, R);
411 selcmp(i.arg, kc, swap, fn);
412 break;
414 die("unknown instruction %s", optab[i.op].name);
/* sanity check: no stack-slot refs may survive in
 * the instructions emitted above */
417 while (i0>curi && --i0) {
418 assert(rslot(i0->arg[0], fn) == -1);
419 assert(rslot(i0->arg[1], fn) == -1);
423 static Ins *
424 flagi(Ins *i0, Ins *i)
426 while (i>i0) {
427 i--;
428 if (amd64_op[i->op].zflag)
429 return i;
430 if (amd64_op[i->op].lflag)
431 continue;
432 return 0;
434 return 0;
/* Lower the jump of block b.  Jnz jumps are turned into
 * conditional flag jumps, reusing a preceding compare or
 * and instruction when its result feeds the jump and the
 * flags are still valid (see flagi()). */
437 static void
438 seljmp(Blk *b, Fn *fn)
440 Ref r;
441 int c, k, swap;
442 Ins *fi;
443 Tmp *t;
445 if (b->jmp.type == Jret0 || b->jmp.type == Jjmp)
446 return;
447 assert(b->jmp.type == Jjnz);
448 r = b->jmp.arg;
449 t = &fn->tmp[r.val];
450 b->jmp.arg = R;
451 assert(rtype(r) == RTmp);
/* both targets equal: degrade to a plain jump */
452 if (b->s1 == b->s2) {
453 chuse(r, -1, fn);
454 b->jmp.type = Jjmp;
455 b->s2 = 0;
456 return;
458 fi = flagi(b->ins, &b->ins[b->nins]);
/* no usable flag-setting instruction: compare the
 * condition against zero ourselves */
459 if (!fi || !req(fi->to, r)) {
460 selcmp((Ref[2]){r, CON_Z}, Kw, 0, fn); /* todo, long jnz */
461 b->jmp.type = Jjf + Cine;
/* reuse a comparison; Cfeq/Cfne need the pf fixup
 * done in sel() and cannot be folded here */
463 else if (iscmp(fi->op, &k, &c)
464 && c != NCmpI+Cfeq /* see sel() */
465 && c != NCmpI+Cfne) {
466 swap = cmpswap(fi->arg, c);
467 if (swap)
468 c = cmpop(c);
/* the compare result is only used by the jump:
 * drop the materializing instruction */
469 if (t->nuse == 1) {
470 selcmp(fi->arg, k, swap, fn);
471 *fi = (Ins){.op = Onop};
473 b->jmp.type = Jjf + c;
/* an and feeding only the jump becomes a test */
475 else if (fi->op == Oand && t->nuse == 1
476 && (rtype(fi->arg[0]) == RTmp ||
477 rtype(fi->arg[1]) == RTmp)) {
478 fi->op = Oxtest;
479 fi->to = R;
480 b->jmp.type = Jjf + Cine;
/* test wants any immediate in second position */
481 if (rtype(fi->arg[1]) == RCon) {
482 r = fi->arg[1];
483 fi->arg[1] = fi->arg[0];
484 fi->arg[0] = r;
487 else {
488 /* since flags are not tracked in liveness,
489 * the result of the flag-setting instruction
490 * has to be marked as live
*/
492 if (t->nuse == 1)
493 emit(Ocopy, Kw, R, r, R);
494 b->jmp.type = Jjf + Cine;
498 static int
499 aref(Ref r, ANum *ai)
501 switch (rtype(r)) {
502 case RCon:
503 return 2;
504 case RTmp:
505 return ai[r.val].n;
506 default:
507 die("constant or temporary expected");
511 static int
512 ascale(Ref r, Con *con)
514 int64_t n;
516 if (rtype(r) != RCon)
517 return 0;
518 if (con[r.val].type != CBits)
519 return 0;
520 n = con[r.val].bits.i;
521 return n == 1 || n == 2 || n == 4 || n == 8;
/* Number every temporary of block b with the addressing
 * pattern its defining expression matches (stored in
 * ai[].n) and record the pattern chosen for each side of
 * the defining add/mul (ai[].l / ai[].r).  amatch() later
 * replays these numbers to build the actual Addr. */
524 static void
525 anumber(ANum *ai, Blk *b, Con *con)
527 /* This should be made obsolete by a proper
528 * reassoc pass.
530 * Rules:
532 * RTmp(_) -> 0 tmp
533 * ( RTmp(_) -> 1 slot )
534 * RCon(_) -> 2 con
535 * 0 * 2 -> 3 s * i (when constant is 1,2,4,8)
*/
537 static char add[10][10] = {
538 [2] [4] = 4, [4] [2] = 4,
539 [2] [6] = 6, [6] [2] = 6,
540 [2] [7] = 7, [7] [2] = 7,
541 [0] [2] = 4, [2] [0] = 4, /* 4: o + b */
542 [0] [0] = 5, /* 5: b + s * i */
543 [0] [3] = 5, [3] [0] = 5,
544 [2] [3] = 6, [3] [2] = 6, /* 6: o + s * i */
545 [2] [5] = 7, [5] [2] = 7, /* 7: o + b + s * i */
546 [0] [6] = 7, [6] [0] = 7,
547 [4] [3] = 7, [3] [4] = 7,
549 int a, a1, a2, n1, n2, t1, t2;
550 Ins *i;
552 for (i=b->ins; i<&b->ins[b->nins]; i++) {
553 if (rtype(i->to) == RTmp)
554 ai[i->to.val].i = i;
555 if (i->op != Oadd && i->op != Omul)
556 continue;
557 a1 = aref(i->arg[0], ai);
558 a2 = aref(i->arg[1], ai);
/* t1/t2: the argument is a plain temporary and may
 * be demoted to pattern 0 to enable a better match */
559 t1 = a1 != 1 && a1 != 2;
560 t2 = a2 != 1 && a2 != 2;
561 if (i->op == Oadd) {
/* pick the combination with the highest pattern
 * number, trying demoted arguments as well */
562 a = add[n1 = a1][n2 = a2];
563 if (t1 && a < add[0][a2])
564 a = add[n1 = 0][n2 = a2];
565 if (t2 && a < add[a1][0])
566 a = add[n1 = a1][n2 = 0];
567 if (t1 && t2 && a < add[0][0])
568 a = add[n1 = 0][n2 = 0];
569 } else {
/* mul matches only tmp * scale-constant */
570 n1 = n2 = a = 0;
571 if (ascale(i->arg[0], con) && t2)
572 a = 3, n1 = 2, n2 = 0;
573 if (t1 && ascale(i->arg[1], con))
574 a = 3, n1 = 0, n2 = 2;
576 ai[i->to.val].n = a;
577 ai[i->to.val].l = n1;
578 ai[i->to.val].r = n2;
/* Recursively fold the expression tree rooted at r
 * (matched as pattern n by anumber()) into addressing
 * mode a.  Returns non-zero when the resulting mode is
 * worth using. */
582 static int
583 amatch(Addr *a, Ref r, int n, ANum *ai, Fn *fn)
585 Ins *i;
586 int nl, nr, t, s;
587 Ref al, ar;
/* constants accumulate into the displacement */
589 if (rtype(r) == RCon) {
590 if (!addcon(&a->offset, &fn->con[r.val]))
591 err("unlikely sum of $%s and $%s",
592 str(a->offset.label), str(fn->con[r.val].label));
593 return 1;
595 assert(rtype(r) == RTmp);
596 i = ai[r.val].i;
597 nl = ai[r.val].l;
598 nr = ai[r.val].r;
599 if (i) {
/* canonicalize so the lower-numbered pattern is
 * on the left */
600 if (nl > nr) {
601 al = i->arg[1];
602 ar = i->arg[0];
603 t = nl, nl = nr, nr = t;
604 } else {
605 al = i->arg[0];
606 ar = i->arg[1];
609 switch (n) {
610 case 3: /* s * i */
611 a->index = al;
612 a->scale = fn->con[ar.val].bits.i;
613 return 0;
614 case 5: /* b + s * i */
615 switch (nr) {
616 case 0:
/* a slot temporary cannot be the index;
 * swap the roles of the two arguments */
617 if (fn->tmp[ar.val].slot != -1) {
618 al = i->arg[1];
619 ar = i->arg[0];
621 a->index = ar;
622 a->scale = 1;
623 break;
624 case 3:
625 amatch(a, ar, nr, ai, fn);
626 break;
/* the left side becomes the base below */
628 r = al;
629 /* fall through */
630 case 0:
631 s = fn->tmp[r.val].slot;
632 if (s != -1)
633 r = SLOT(s);
634 a->base = r;
635 return n || s != -1;
636 case 2: /* constants */
637 case 4: /* o + b */
638 case 6: /* o + s * i */
639 case 7: /* o + b + s * i */
640 amatch(a, ar, nr, ai, fn);
641 amatch(a, al, nl, ai, fn);
642 return 1;
643 default:
644 die("unreachable");
648 /* instruction selection
649 * requires use counts (as given by parsing)
*/
651 void
652 amd64_isel(Fn *fn)
654 Blk *b, **sb;
655 Ins *i;
656 Phi *p;
657 uint a;
658 int n, al;
659 int64_t sz;
660 ANum *ainfo;
662 /* assign slots to fast allocs */
663 b = fn->start;
664 /* specific to NAlign == 3 */ /* or change n=4 and sz /= 4 below */
/* walk the first block once per alignment class; a
 * non-constant size ends the streak of fast allocs */
665 for (al=Oalloc, n=4; al<=Oalloc1; al++, n*=2)
666 for (i=b->ins; i<&b->ins[b->nins]; i++)
667 if (i->op == al) {
668 if (rtype(i->arg[0]) != RCon)
669 break;
670 sz = fn->con[i->arg[0].val].bits.i;
671 if (sz < 0 || sz >= INT_MAX-15)
672 err("invalid alloc size %"PRId64, sz);
/* round up to the alignment, then count slots
 * in 4-byte units */
673 sz = (sz + n-1) & -n;
674 sz /= 4;
675 if (sz > INT_MAX - fn->slot)
676 die("alloc too large");
677 fn->tmp[i->to.val].slot = fn->slot;
678 fn->slot += sz;
679 *i = (Ins){.op = Onop};
682 /* process basic blocks */
683 n = fn->ntmp;
684 ainfo = emalloc(n * sizeof ainfo[0]);
685 for (b=fn->start; b; b=b->link) {
686 curi = &insb[NIns];
/* legalize the phi arguments this block feeds into
 * its successors */
687 for (sb=(Blk*[3]){b->s1, b->s2, 0}; *sb; sb++)
688 for (p=(*sb)->phi; p; p=p->link) {
689 for (a=0; p->blk[a] != b; a++)
690 assert(a+1 < p->narg);
691 fixarg(&p->arg[a], p->cls, 0, fn);
693 memset(ainfo, 0, n * sizeof ainfo[0]);
694 anumber(ainfo, b, fn->con);
695 seljmp(b, fn);
/* select bottom-up so emitted (reversed) code ends
 * in the right order */
696 for (i=&b->ins[b->nins]; i!=b->ins;)
697 sel(*--i, ainfo, fn);
698 b->nins = &insb[NIns] - curi;
699 idup(&b->ins, curi, b->nins);
701 free(ainfo);
703 if (debug['I']) {
704 fprintf(stderr, "\n> After instruction selection:\n");
705 printfn(fn, stderr);