[qbe.git] / isel.c
#include "all.h"
#include <limits.h>

/* For x86_64, do the following:
 *
 * - lower calls
 * - check that constants are used only in
 *   places allowed
 * - ensure immediates always fit in 32b
 * - explicit machine register constraints
 *   on instructions like division.
 * - implement fast locals (the streak of
 *   constant allocX in the first basic block)
 * - recognize complex addressing modes
 *
 * Invariant: the use counts that are used
 *            in sel() must be sound.  This
 *            is not so trivial, maybe the
 *            dce should be moved out...
 */
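
/* per-temporary information used when matching
 * addressing modes:
 *   n     addressing "number" computed by anumber()
 *   l, r  numbers retained for the left/right arguments
 *   i     defining instruction of the temporary
 *   mem   memory reference cached by seladdr()
 */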
typedef struct ANum ANum;

struct ANum {
	char n, l, r;
	Ins *i;
	Ref mem;
};

static void amatch(Addr *, Ref, ANum *, Fn *, int);
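
/* map a floating-point comparison code to the integer
 * condition code used for setcc/jcc; the unsigned codes
 * are used because comiss/comisd set CF/ZF/PF the way an
 * unsigned compare would, with PF flagging the unordered
 * case */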
static int
fcmptoi(int fc)
{
	switch (fc) {
	default: die("invalid fp comparison %d", fc);
	case FCle: return ICule;
	case FClt: return ICult;
	case FCgt: return ICugt;
	case FCge: return ICuge;
	case FCne: return ICne;
	case FCeq: return ICeq;
	case FCo: return ICxnp;
	case FCuo: return ICxp;
	}
}
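
/* decode a comparison opcode: returns nonzero when op is a
 * comparison, storing its operand class in *pk and its
 * condition code in *pc */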
static int
iscmp(int op, int *pk, int *pc)
{
	int k, c;

	if (Ocmpw <= op && op <= Ocmpw1) {
		c = op - Ocmpw;
		k = Kw;
	}
	else if (Ocmpl <= op && op <= Ocmpl1) {
		c = op - Ocmpl;
		k = Kl;
	}
	else if (Ocmps <= op && op <= Ocmps1) {
		c = fcmptoi(op - Ocmps);
		k = Ks;
	}
	else if (Ocmpd <= op && op <= Ocmpd1) {
		c = fcmptoi(op - Ocmpd);
		k = Kd;
	}
	else
		return 0;
	if (pk)
		*pk = k;
	if (pc)
		*pc = c;
	return 1;
}
static int
noimm(Ref r, Fn *fn)
{
	int64_t val;

	if (rtype(r) != RCon)
		return 0;
	switch (fn->con[r.val].type) {
	case CAddr:
		/* we only support the 'small'
		 * code model of the ABI, this
		 * means that we can always
		 * address data with 32bits
		 */
		return 0;
	case CBits:
		val = fn->con[r.val].bits.i;
		return (val < INT32_MIN || val > INT32_MAX);
	default:
		die("invalid constant");
	}
}
static int
rslot(Ref r, Fn *fn)
{
	if (rtype(r) != RTmp)
		return -1;
	return fn->tmp[r.val].slot;
}

static int
argcls(Ins *i, int n)
{
	return opdesc[i->op].argcls[n][i->cls];
}
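
/* rewrite *r so it is a legal argument of class k for the
 * current instruction: floating-point constants become
 * memory references, wide integer constants are loaded into
 * a temporary, and fast-local slots are materialized with an
 * Oaddr right before the use */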
static void
fixarg(Ref *r, int k, int phi, Fn *fn)
{
	Addr a;
	Ref r0, r1;
	int s, n;

	r1 = r0 = *r;
	s = rslot(r0, fn);
	if (KBASE(k) == 1 && rtype(r0) == RCon) {
		/* load floating points from memory
		 * slots, they can't be used as
		 * immediates
		 */
		r1 = MEM(fn->nmem);
		vgrow(&fn->mem, ++fn->nmem);
		memset(&a, 0, sizeof a);
		a.offset.type = CAddr;
		a.offset.local = 1;
		n = stashfp(fn->con[r0.val].bits.i, KWIDE(k));
		sprintf(a.offset.label, "fp%d", n);
		fn->mem[fn->nmem-1] = a;
	}
	else if (!phi && k == Kl && noimm(r0, fn)) {
		/* load constants that do not fit in
		 * a 32bit signed integer into a
		 * long temporary
		 */
		r1 = newtmp("isel", Kl, fn);
		emit(Ocopy, Kl, r1, r0, R);
	}
	else if (s != -1) {
		/* load fast locals' addresses into
		 * temporaries right before the
		 * instruction
		 */
		r1 = newtmp("isel", Kl, fn);
		emit(Oaddr, Kl, r1, SLOT(s), R);
	}
	*r = r1;
}
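
/* fold the address computation feeding a load/store into a
 * memory reference built by amatch(); the result is cached
 * in an[].mem when it uses no temporaries, so later uses of
 * the same address can share it */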
static void
seladdr(Ref *r, ANum *an, Fn *fn)
{
	Addr a;
	Ref r0, r1;

	r0 = *r;
	if (rtype(r0) == RTmp) {
		chuse(r0, -1, fn);
		r1 = an[r0.val].mem;
		if (req(r1, R)) {
			amatch(&a, r0, an, fn, 1);
			vgrow(&fn->mem, ++fn->nmem);
			fn->mem[fn->nmem-1] = a;
			r1 = MEM(fn->nmem-1);
			chuse(a.base, +1, fn);
			chuse(a.index, +1, fn);
			if (rtype(a.base) != RTmp)
			if (rtype(a.index) != RTmp)
				an[r0.val].mem = r1;
		}
		*r = r1;
	}
}
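
/* emit the cmp for a comparison; if arg[0] is a constant the
 * two operands are swapped so the immediate ends up in the
 * allowed position, and callers compensate by flipping the
 * condition with icmpop() */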
static void
selcmp(Ref arg[2], int k, Fn *fn)
{
	Ref r, *iarg;

	if (rtype(arg[0]) == RCon) {
		r = arg[1];
		arg[1] = arg[0];
		arg[0] = r;
	}
	assert(rtype(arg[0]) != RCon);
	emit(Oxcmp, k, R, arg[1], arg[0]);
	iarg = curi->arg;
	fixarg(&iarg[0], k, 0, fn);
	fixarg(&iarg[1], k, 0, fn);
}
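
/* select code for a single instruction; instructions are
 * emitted backwards into the scratch buffer (curi), and
 * instructions whose result is unused are dropped here */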
static void
sel(Ins i, ANum *an, Fn *fn)
{
	Ref r0, r1, *iarg;
	int x, k, kc;
	int64_t sz;
	Ins *i0;

	if (rtype(i.to) == RTmp)
	if (!isreg(i.to) && !isreg(i.arg[0]) && !isreg(i.arg[1]))
	if (fn->tmp[i.to.val].nuse == 0) {
		chuse(i.arg[0], -1, fn);
		chuse(i.arg[1], -1, fn);
		return;
	}
	i0 = curi;
	k = i.cls;
	switch (i.op) {
	case Odiv:
	case Orem:
	case Oudiv:
	case Ourem:
		if (i.op == Odiv || i.op == Oudiv)
			r0 = TMP(RAX), r1 = TMP(RDX);
		else
			r0 = TMP(RDX), r1 = TMP(RAX);
		emit(Ocopy, k, i.to, r0, R);
		emit(Ocopy, k, R, r1, R);
		if (rtype(i.arg[1]) == RCon) {
			/* immediates not allowed for
			 * divisions in x86
			 */
			r0 = newtmp("isel", k, fn);
		} else
			r0 = i.arg[1];
		if (fn->tmp[r0.val].slot != -1)
			err("unlikely argument %%%s in %s",
				fn->tmp[r0.val].name, opdesc[i.op].name);
		if (i.op == Odiv || i.op == Orem) {
			emit(Oxidiv, k, R, r0, R);
			emit(Osign, k, TMP(RDX), TMP(RAX), R);
		} else {
			emit(Oxdiv, k, R, r0, R);
			emit(Ocopy, k, TMP(RDX), CON_Z, R);
		}
		emit(Ocopy, k, TMP(RAX), i.arg[0], R);
		fixarg(&curi->arg[0], k, 0, fn);
		if (rtype(i.arg[1]) == RCon)
			emit(Ocopy, k, r0, i.arg[1], R);
		break;
	case Osar:
	case Oshr:
	case Oshl:
		if (rtype(i.arg[1]) == RCon)
			goto Emit;
		r0 = i.arg[1];
		i.arg[1] = TMP(RCX);
		emit(Ocopy, Kw, R, TMP(RCX), R);
		emiti(i);
		emit(Ocopy, Kw, TMP(RCX), r0, R);
		break;
	case Onop:
		break;
	case Ostored:
	case Ostores:
	case Ostorel:
	case Ostorew:
	case Ostoreh:
	case Ostoreb:
		if (rtype(i.arg[0]) == RCon) {
			if (i.op == Ostored)
				i.op = Ostorel;
			if (i.op == Ostores)
				i.op = Ostorew;
		}
		seladdr(&i.arg[1], an, fn);
		goto Emit;
	case_Oload:
		seladdr(&i.arg[0], an, fn);
		goto Emit;
	case Ocall:
	case Osalloc:
	case Ocopy:
	case Oadd:
	case Osub:
	case Omul:
	case Oand:
	case Oor:
	case Oxor:
	case Oxtest:
	case Ostosi:
	case Odtosi:
	case Oswtof:
	case Osltof:
	case Oexts:
	case Otruncd:
	case Ocast:
	case_OExt:
Emit:
		emiti(i);
		iarg = curi->arg;
		fixarg(&iarg[0], argcls(&i, 0), 0, fn);
		fixarg(&iarg[1], argcls(&i, 1), 0, fn);
		break;
	case Oalloc:
	case Oalloc+1:
	case Oalloc+2: /* == Oalloc1 */
		/* we need to make sure
		 * the stack remains aligned
		 * (rsp = 0) mod 16
		 */
		if (rtype(i.arg[0]) == RCon) {
			sz = fn->con[i.arg[0].val].bits.i;
			if (sz < 0 || sz >= INT_MAX-15)
				err("invalid alloc size %"PRId64, sz);
			sz = (sz + 15) & -16;
			emit(Osalloc, Kl, i.to, getcon(sz, fn), R);
		} else {
			/* r0 = (i.arg[0] + 15) & -16 */
			r0 = newtmp("isel", Kl, fn);
			r1 = newtmp("isel", Kl, fn);
			emit(Osalloc, Kl, i.to, r0, R);
			emit(Oand, Kl, r0, r1, getcon(-16, fn));
			emit(Oadd, Kl, r1, i.arg[0], getcon(15, fn));
			if (fn->tmp[i.arg[0].val].slot != -1)
				err("unlikely argument %%%s in %s",
					fn->tmp[i.arg[0].val].name, opdesc[i.op].name);
		}
		break;
	default:
		if (isext(i.op))
			goto case_OExt;
		if (isload(i.op))
			goto case_Oload;
		if (iscmp(i.op, &kc, &x)) {
			if (rtype(i.arg[0]) == RCon)
				x = icmpop(x);
			emit(Oxset+x, k, i.to, R, R);
			selcmp(i.arg, kc, fn);
			break;
		}
		die("unknown instruction %s", opdesc[i.op].name);
	}

	while (i0 > curi && --i0) {
		assert(rslot(i0->arg[0], fn) == -1);
		assert(rslot(i0->arg[1], fn) == -1);
	}
}
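
/* scan backwards from i for the last instruction that sets
 * the machine flags (sflag); gives up and returns 0 if an
 * intervening instruction clobbers them (!lflag) */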
static Ins *
flagi(Ins *i0, Ins *i)
{
	while (i>i0) {
		i--;
		if (opdesc[i->op].sflag)
			return i;
		if (opdesc[i->op].lflag)
			continue;
		return 0;
	}
	return 0;
}
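
/* lower the block terminator: a jnz is turned into a
 * conditional jump, reusing the flags of a preceding compare
 * or and/test when that instruction is still the last one to
 * write the flags */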
static void
seljmp(Blk *b, Fn *fn)
{
	Ref r;
	int c, k;
	Ins *fi;
	Tmp *t;

	if (b->jmp.type == Jret0 || b->jmp.type == Jjmp)
		return;
	assert(b->jmp.type == Jjnz);
	r = b->jmp.arg;
	t = &fn->tmp[r.val];
	b->jmp.arg = R;
	assert(!req(r, R) && rtype(r) != RCon);
	if (b->s1 == b->s2) {
		chuse(r, -1, fn);
		b->jmp.type = Jjmp;
		b->s2 = 0;
		return;
	}
	fi = flagi(b->ins, &b->ins[b->nins]);
	if (fi && req(fi->to, r)) {
		if (iscmp(fi->op, &k, &c)) {
			if (rtype(fi->arg[0]) == RCon)
				c = icmpop(c);
			b->jmp.type = Jxjc + c;
			if (t->nuse == 1) {
				selcmp(fi->arg, k, fn);
				*fi = (Ins){.op = Onop};
			}
			return;
		}
		if (fi->op == Oand && t->nuse == 1
		&& (rtype(fi->arg[0]) == RTmp ||
		    rtype(fi->arg[1]) == RTmp)) {
			fi->op = Oxtest;
			fi->to = R;
			b->jmp.type = Jxjc + ICne;
			if (rtype(fi->arg[1]) == RCon) {
				r = fi->arg[1];
				fi->arg[1] = fi->arg[0];
				fi->arg[0] = r;
			}
			return;
		}
		/* since flags are not tracked in liveness,
		 * the result of the flag-setting instruction
		 * has to be marked as live
		 */
		if (t->nuse == 1)
			emit(Ocopy, Kw, R, r, R);
		b->jmp.type = Jxjc + ICne;
		return;
	}
	selcmp((Ref[2]){r, CON_Z}, Kw, fn); /* todo, add long branch if non-zero */
	b->jmp.type = Jxjc + ICne;
}
static int
aref(Ref r, ANum *ai)
{
	switch (rtype(r)) {
	case RCon:
		return 2;
	case RTmp:
		return ai[r.val].n;
	default:
		die("constant or temporary expected");
	}
}
static int
ascale(Ref r, Con *con)
{
	int64_t n;

	if (rtype(r) != RCon)
		return 0;
	if (con[r.val].type != CBits)
		return 0;
	n = con[r.val].bits.i;
	return n == 1 || n == 2 || n == 4 || n == 8;
}
static void
anumber(ANum *ai, Blk *b, Con *con)
{
	/* This should be made obsolete by a proper
	 * reassoc pass.
	 *
	 * Rules:
	 *
	 *   RTmp(_) -> 0    tmp
	 *   ( RTmp(_) -> 1  slot )
	 *   RCon(_) -> 2    con
	 *   0 * 2   -> 3    s * i (when constant is 1,2,4,8)
	 */
	static char add[10][10] = {
		[2] [2] = 2,              /* folding */
		[2] [5] = 5, [5] [2] = 5,
		[2] [6] = 6, [6] [2] = 6,
		[2] [7] = 7, [7] [2] = 7,
		[0] [0] = 4,              /* 4: b + s * i */
		[0] [3] = 4, [3] [0] = 4,
		[2] [3] = 5, [3] [2] = 5, /* 5: o + s * i */
		[0] [2] = 6, [2] [0] = 6, /* 6: o + b */
		[2] [4] = 7, [4] [2] = 7, /* 7: o + b + s * i */
		[0] [5] = 7, [5] [0] = 7,
		[6] [3] = 7, [3] [6] = 7,
	};
	int a, a1, a2, n1, n2, t1, t2;
	Ins *i;

	for (i=b->ins; i-b->ins < b->nins; i++) {
		if (rtype(i->to) == RTmp)
			ai[i->to.val].i = i;
		if (i->op != Oadd && i->op != Omul)
			continue;
		a1 = aref(i->arg[0], ai);
		a2 = aref(i->arg[1], ai);
		t1 = a1 != 1 && a1 != 2;
		t2 = a2 != 1 && a2 != 2;
		if (i->op == Oadd) {
			a = add[n1 = a1][n2 = a2];
			if (t1 && a < add[0][a2])
				a = add[n1 = 0][n2 = a2];
			if (t2 && a < add[a1][0])
				a = add[n1 = a1][n2 = 0];
			if (t1 && t2 && a < add[0][0])
				a = add[n1 = 0][n2 = 0];
		} else {
			n1 = n2 = a = 0;
			if (ascale(i->arg[0], con) && t2)
				a = 3, n1 = 2, n2 = 0;
			if (t1 && ascale(i->arg[1], con))
				a = 3, n1 = 0, n2 = 2;
		}
		ai[i->to.val].n = a;
		ai[i->to.val].l = n1;
		ai[i->to.val].r = n2;
	}
}
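
/* rebuild an x86 addressing mode (Addr) from the numbering
 * computed by anumber(); e.g. number 7 (o + b + s * i)
 * corresponds to the operand form o(b,i,s) */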
static void
amatch(Addr *a, Ref r, ANum *ai, Fn *fn, int top)
{
	Ins *i;
	int nl, nr, t, s;
	Ref al, ar;

	if (top)
		memset(a, 0, sizeof *a);
	if (rtype(r) == RCon) {
		addcon(&a->offset, &fn->con[r.val]);
		return;
	}
	assert(rtype(r) == RTmp);
	i = ai[r.val].i;
	nl = ai[r.val].l;
	nr = ai[r.val].r;
	if (i) {
		if (nl > nr) {
			al = i->arg[1];
			ar = i->arg[0];
			t = nl, nl = nr, nr = t;
		} else {
			al = i->arg[0];
			ar = i->arg[1];
		}
	}
	switch (ai[r.val].n) {
	case 3: /* s * i */
		if (!top) {
			a->index = al;
			a->scale = fn->con[ar.val].bits.i;
		} else
			a->base = r;
		break;
	case 4: /* b + s * i */
		switch (nr) {
		case 0:
			if (fn->tmp[ar.val].slot != -1) {
				al = i->arg[1];
				ar = i->arg[0];
			}
			a->index = ar;
			a->scale = 1;
			break;
		case 3:
			amatch(a, ar, ai, fn, 0);
			break;
		}
		r = al;
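		/* fall through: al is then used as the base */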
	case 0:
		s = fn->tmp[r.val].slot;
		if (s != -1)
			r = SLOT(s);
		a->base = r;
		break;
	case 2: /* constants */
	case 5: /* o + s * i */
	case 6: /* o + b */
	case 7: /* o + b + s * i */
		amatch(a, ar, ai, fn, 0);
		amatch(a, al, ai, fn, 0);
		break;
	default:
		die("unreachable");
	}
}
/* instruction selection
 * requires use counts (as given by parsing)
 */
void
isel(Fn *fn)
{
	Blk *b, **sb;
	Ins *i;
	Phi *p;
	uint a;
	int n, al;
	int64_t sz;
	ANum *ainfo;

	/* assign slots to fast allocs */
	b = fn->start;
	/* specific to NAlign == 3 */ /* or change n=4 and sz /= 4 below */
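	/* fn->slot counts 4-byte units: alloc sizes are rounded
	 * up to the op's alignment and then divided by 4 */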
	for (al=Oalloc, n=4; al<=Oalloc1; al++, n*=2)
		for (i=b->ins; i-b->ins < b->nins; i++)
			if (i->op == al) {
				if (rtype(i->arg[0]) != RCon)
					break;
				sz = fn->con[i->arg[0].val].bits.i;
				if (sz < 0 || sz >= INT_MAX-15)
					err("invalid alloc size %"PRId64, sz);
				sz = (sz + n-1) & -n;
				sz /= 4;
				fn->tmp[i->to.val].slot = fn->slot;
				fn->slot += sz;
				*i = (Ins){.op = Onop};
			}

	/* process basic blocks */
	n = fn->ntmp;
	ainfo = emalloc(n * sizeof ainfo[0]);
	for (b=fn->start; b; b=b->link) {
		curi = &insb[NIns];
		for (sb=(Blk*[3]){b->s1, b->s2, 0}; *sb; sb++)
			for (p=(*sb)->phi; p; p=p->link) {
				for (a=0; p->blk[a] != b; a++)
					assert(a+1 < p->narg);
				fixarg(&p->arg[a], p->cls, 1, fn);
			}
		memset(ainfo, 0, n * sizeof ainfo[0]);
		anumber(ainfo, b, fn->con);
		seljmp(b, fn);
		for (i=&b->ins[b->nins]; i!=b->ins;)
			sel(*--i, ainfo, fn);
		b->nins = &insb[NIns] - curi;
		idup(&b->ins, curi, b->nins);
	}
	free(ainfo);

	if (debug['I']) {
		fprintf(stderr, "\n> After instruction selection:\n");
		printfn(fn, stderr);
	}
}