/* qbe — amd64/isel.c
 * (from commit "cleanup amd64 constant addressing")
 */
#include "all.h"
#include <limits.h>
/* For x86_64, do the following:
 *
 * - check that constants are used only in
 *   places allowed
 * - ensure immediates always fit in 32b
 * - expose machine register constraints
 *   on instructions like division.
 * - implement fast locals (the streak of
 *   constant allocX in the first basic block)
 * - recognize complex addressing modes
 *
 * Invariant: the use counts that are used
 *            in sel() must be sound.  This
 *            is not so trivial, maybe the
 *            dce should be moved out...
 */
21 typedef struct ANum ANum;
23 struct ANum {
24 char n, l, r;
25 Ins *i;
28 static int amatch(Addr *, Ref, int, ANum *, Fn *);
30 static int
31 noimm(Ref r, Fn *fn)
33 int64_t val;
35 if (rtype(r) != RCon)
36 return 0;
37 switch (fn->con[r.val].type) {
38 case CAddr:
39 /* we only support the 'small'
40 * code model of the ABI, this
41 * means that we can always
42 * address data with 32bits
44 return 0;
45 case CBits:
46 val = fn->con[r.val].bits.i;
47 return (val < INT32_MIN || val > INT32_MAX);
48 default:
49 die("invalid constant");
53 static int
54 rslot(Ref r, Fn *fn)
56 if (rtype(r) != RTmp)
57 return -1;
58 return fn->tmp[r.val].slot;
61 static void
62 fixarg(Ref *r, int k, int op, Fn *fn)
64 char buf[32];
65 Addr a;
66 Ref r0, r1;
67 int s, n, cpy, mem;
69 r1 = r0 = *r;
70 s = rslot(r0, fn);
71 cpy = op == Ocopy || op == -1;
72 mem = isstore(op) || isload(op) || op == Ocall;
73 if (KBASE(k) == 1 && rtype(r0) == RCon) {
74 /* load floating points from memory
75 * slots, they can't be used as
76 * immediates
78 r1 = MEM(fn->nmem);
79 vgrow(&fn->mem, ++fn->nmem);
80 memset(&a, 0, sizeof a);
81 a.offset.type = CAddr;
82 a.offset.local = 1;
83 n = gasstash(&fn->con[r0.val].bits, KWIDE(k) ? 8 : 4);
84 sprintf(buf, "fp%d", n);
85 a.offset.label = intern(buf);
86 fn->mem[fn->nmem-1] = a;
88 else if (!cpy && k == Kl && noimm(r0, fn)) {
89 /* load constants that do not fit in
90 * a 32bit signed integer into a
91 * long temporary
93 r1 = newtmp("isel", Kl, fn);
94 emit(Ocopy, Kl, r1, r0, R);
96 else if (s != -1) {
97 /* load fast locals' addresses into
98 * temporaries right before the
99 * instruction
101 r1 = newtmp("isel", Kl, fn);
102 emit(Oaddr, Kl, r1, SLOT(s), R);
104 else if (!mem && rtype(r0) == RCon
105 && fn->con[r0.val].type == CAddr) {
106 /* apple as does not support 32-bit
107 * absolute addressing, use a rip-
108 * relative leaq instead
110 r1 = newtmp("isel", Kl, fn);
111 emit(Oaddr, Kl, r1, r0, R);
113 *r = r1;
116 static void
117 seladdr(Ref *r, ANum *an, Fn *fn)
119 Addr a;
120 Ref r0;
122 r0 = *r;
123 if (rtype(r0) == RTmp) {
124 memset(&a, 0, sizeof a);
125 if (!amatch(&a, r0, an[r0.val].n, an, fn))
126 return;
127 if (!req(a.base, R))
128 if (a.offset.type == CAddr) {
129 /* apple as does not support
130 * $foo(%r0, %r1, M); try to
131 * rewrite it or bail out if
132 * impossible
134 if (!req(a.index, R))
135 return;
136 else {
137 a.index = a.base;
138 a.scale = 1;
139 a.base = R;
142 chuse(r0, -1, fn);
143 vgrow(&fn->mem, ++fn->nmem);
144 fn->mem[fn->nmem-1] = a;
145 chuse(a.base, +1, fn);
146 chuse(a.index, +1, fn);
147 *r = MEM(fn->nmem-1);
151 static int
152 selcmp(Ref arg[2], int k, Fn *fn)
154 int swap;
155 Ref r, *iarg;
157 swap = rtype(arg[0]) == RCon;
158 if (swap) {
159 r = arg[1];
160 arg[1] = arg[0];
161 arg[0] = r;
163 emit(Oxcmp, k, R, arg[1], arg[0]);
164 iarg = curi->arg;
165 if (rtype(arg[0]) == RCon) {
166 assert(k == Kl);
167 iarg[1] = newtmp("isel", k, fn);
168 emit(Ocopy, k, iarg[1], arg[0], R);
170 fixarg(&iarg[0], k, Oxcmp, fn);
171 fixarg(&iarg[1], k, Oxcmp, fn);
172 return swap;
175 static void
176 sel(Ins i, ANum *an, Fn *fn)
178 Ref r0, r1, *iarg;
179 int x, k, kc;
180 int64_t sz;
181 Ins *i0, *i1;
183 if (rtype(i.to) == RTmp)
184 if (!isreg(i.to) && !isreg(i.arg[0]) && !isreg(i.arg[1]))
185 if (fn->tmp[i.to.val].nuse == 0) {
186 chuse(i.arg[0], -1, fn);
187 chuse(i.arg[1], -1, fn);
188 return;
190 i0 = curi;
191 k = i.cls;
192 switch (i.op) {
193 case Odiv:
194 case Orem:
195 case Oudiv:
196 case Ourem:
197 if (KBASE(k) == 1)
198 goto Emit;
199 if (i.op == Odiv || i.op == Oudiv)
200 r0 = TMP(RAX), r1 = TMP(RDX);
201 else
202 r0 = TMP(RDX), r1 = TMP(RAX);
203 emit(Ocopy, k, i.to, r0, R);
204 emit(Ocopy, k, R, r1, R);
205 if (rtype(i.arg[1]) == RCon) {
206 /* immediates not allowed for
207 * divisions in x86
209 r0 = newtmp("isel", k, fn);
210 } else
211 r0 = i.arg[1];
212 if (fn->tmp[r0.val].slot != -1)
213 err("unlikely argument %%%s in %s",
214 fn->tmp[r0.val].name, optab[i.op].name);
215 if (i.op == Odiv || i.op == Orem) {
216 emit(Oxidiv, k, R, r0, R);
217 emit(Osign, k, TMP(RDX), TMP(RAX), R);
218 } else {
219 emit(Oxdiv, k, R, r0, R);
220 emit(Ocopy, k, TMP(RDX), CON_Z, R);
222 emit(Ocopy, k, TMP(RAX), i.arg[0], R);
223 fixarg(&curi->arg[0], k, Ocopy, fn);
224 if (rtype(i.arg[1]) == RCon)
225 emit(Ocopy, k, r0, i.arg[1], R);
226 break;
227 case Osar:
228 case Oshr:
229 case Oshl:
230 if (rtype(i.arg[1]) == RCon)
231 goto Emit;
232 r0 = i.arg[1];
233 i.arg[1] = TMP(RCX);
234 emit(Ocopy, Kw, R, TMP(RCX), R);
235 emiti(i);
236 emit(Ocopy, Kw, TMP(RCX), r0, R);
237 break;
238 case Onop:
239 break;
240 case Ostored:
241 case Ostores:
242 case Ostorel:
243 case Ostorew:
244 case Ostoreh:
245 case Ostoreb:
246 if (rtype(i.arg[0]) == RCon) {
247 if (i.op == Ostored)
248 i.op = Ostorel;
249 if (i.op == Ostores)
250 i.op = Ostorew;
252 seladdr(&i.arg[1], an, fn);
253 goto Emit;
254 case_Oload:
255 seladdr(&i.arg[0], an, fn);
256 goto Emit;
257 case Ocall:
258 case Osalloc:
259 case Ocopy:
260 case Oadd:
261 case Osub:
262 case Omul:
263 case Oand:
264 case Oor:
265 case Oxor:
266 case Oxtest:
267 case Ostosi:
268 case Odtosi:
269 case Oswtof:
270 case Osltof:
271 case Oexts:
272 case Otruncd:
273 case Ocast:
274 case_OExt:
275 Emit:
276 emiti(i);
277 iarg = curi->arg; /* fixarg() can change curi */
278 fixarg(&iarg[0], argcls(&i, 0), i.op, fn);
279 fixarg(&iarg[1], argcls(&i, 1), i.op, fn);
280 break;
281 case Oalloc:
282 case Oalloc+1:
283 case Oalloc+2: /* == Oalloc1 */
284 /* we need to make sure
285 * the stack remains aligned
286 * (rsp = 0) mod 16
288 fn->dynalloc = 1;
289 if (rtype(i.arg[0]) == RCon) {
290 sz = fn->con[i.arg[0].val].bits.i;
291 if (sz < 0 || sz >= INT_MAX-15)
292 err("invalid alloc size %"PRId64, sz);
293 sz = (sz + 15) & -16;
294 emit(Osalloc, Kl, i.to, getcon(sz, fn), R);
295 } else {
296 /* r0 = (i.arg[0] + 15) & -16 */
297 r0 = newtmp("isel", Kl, fn);
298 r1 = newtmp("isel", Kl, fn);
299 emit(Osalloc, Kl, i.to, r0, R);
300 emit(Oand, Kl, r0, r1, getcon(-16, fn));
301 emit(Oadd, Kl, r1, i.arg[0], getcon(15, fn));
302 if (fn->tmp[i.arg[0].val].slot != -1)
303 err("unlikely argument %%%s in %s",
304 fn->tmp[i.arg[0].val].name, optab[i.op].name);
306 break;
307 default:
308 if (isext(i.op))
309 goto case_OExt;
310 if (isload(i.op))
311 goto case_Oload;
312 if (iscmp(i.op, &kc, &x)) {
313 emit(Oflag+x, k, i.to, R, R);
314 i1 = curi;
315 if (selcmp(i.arg, kc, fn))
316 i1->op = Oflag + cmpop(x);
317 break;
319 die("unknown instruction %s", optab[i.op].name);
322 while (i0 > curi && --i0) {
323 assert(rslot(i0->arg[0], fn) == -1);
324 assert(rslot(i0->arg[1], fn) == -1);
328 static Ins *
329 flagi(Ins *i0, Ins *i)
331 while (i>i0) {
332 i--;
333 if (amd64_op[i->op].zflag)
334 return i;
335 if (amd64_op[i->op].lflag)
336 continue;
337 return 0;
339 return 0;
342 static void
343 seljmp(Blk *b, Fn *fn)
345 Ref r;
346 int c, k;
347 Ins *fi;
348 Tmp *t;
350 if (b->jmp.type == Jret0 || b->jmp.type == Jjmp)
351 return;
352 assert(b->jmp.type == Jjnz);
353 r = b->jmp.arg;
354 t = &fn->tmp[r.val];
355 b->jmp.arg = R;
356 assert(!req(r, R) && rtype(r) != RCon);
357 if (b->s1 == b->s2) {
358 chuse(r, -1, fn);
359 b->jmp.type = Jjmp;
360 b->s2 = 0;
361 return;
363 fi = flagi(b->ins, &b->ins[b->nins]);
364 if (!fi || !req(fi->to, r)) {
365 selcmp((Ref[2]){r, CON_Z}, Kw, fn); /* todo, long jnz */
366 b->jmp.type = Jjf + Cine;
368 else if (iscmp(fi->op, &k, &c)) {
369 if (t->nuse == 1) {
370 if (selcmp(fi->arg, k, fn))
371 c = cmpop(c);
372 *fi = (Ins){.op = Onop};
374 b->jmp.type = Jjf + c;
376 else if (fi->op == Oand && t->nuse == 1
377 && (rtype(fi->arg[0]) == RTmp ||
378 rtype(fi->arg[1]) == RTmp)) {
379 fi->op = Oxtest;
380 fi->to = R;
381 b->jmp.type = Jjf + Cine;
382 if (rtype(fi->arg[1]) == RCon) {
383 r = fi->arg[1];
384 fi->arg[1] = fi->arg[0];
385 fi->arg[0] = r;
388 else {
389 /* since flags are not tracked in liveness,
390 * the result of the flag-setting instruction
391 * has to be marked as live
393 if (t->nuse == 1)
394 emit(Ocopy, Kw, R, r, R);
395 b->jmp.type = Jjf + Cine;
399 static int
400 aref(Ref r, ANum *ai)
402 switch (rtype(r)) {
403 case RCon:
404 return 2;
405 case RTmp:
406 return ai[r.val].n;
407 default:
408 die("constant or temporary expected");
412 static int
413 ascale(Ref r, Con *con)
415 int64_t n;
417 if (rtype(r) != RCon)
418 return 0;
419 if (con[r.val].type != CBits)
420 return 0;
421 n = con[r.val].bits.i;
422 return n == 1 || n == 2 || n == 4 || n == 8;
425 static void
426 anumber(ANum *ai, Blk *b, Con *con)
428 /* This should be made obsolete by a proper
429 * reassoc pass.
431 * Rules:
433 * RTmp(_) -> 0 tmp
434 * ( RTmp(_) -> 1 slot )
435 * RCon(_) -> 2 con
436 * 0 * 2 -> 3 s * i (when constant is 1,2,4,8)
438 static char add[10][10] = {
439 [2] [2] = 2, /* folding */
440 [2] [4] = 4, [4] [2] = 4,
441 [2] [6] = 6, [6] [2] = 6,
442 [2] [7] = 7, [7] [2] = 7,
443 [0] [2] = 4, [2] [0] = 4, /* 4: o + b */
444 [0] [0] = 5, /* 5: b + s * i */
445 [0] [3] = 5, [3] [0] = 5,
446 [2] [3] = 6, [3] [2] = 6, /* 6: o + s * i */
447 [2] [5] = 7, [5] [2] = 7, /* 7: o + b + s * i */
448 [0] [6] = 7, [6] [0] = 7,
449 [4] [3] = 7, [3] [4] = 7,
451 int a, a1, a2, n1, n2, t1, t2;
452 Ins *i;
454 for (i=b->ins; i<&b->ins[b->nins]; i++) {
455 if (rtype(i->to) == RTmp)
456 ai[i->to.val].i = i;
457 if (i->op != Oadd && i->op != Omul)
458 continue;
459 a1 = aref(i->arg[0], ai);
460 a2 = aref(i->arg[1], ai);
461 t1 = a1 != 1 && a1 != 2;
462 t2 = a2 != 1 && a2 != 2;
463 if (i->op == Oadd) {
464 a = add[n1 = a1][n2 = a2];
465 if (t1 && a < add[0][a2])
466 a = add[n1 = 0][n2 = a2];
467 if (t2 && a < add[a1][0])
468 a = add[n1 = a1][n2 = 0];
469 if (t1 && t2 && a < add[0][0])
470 a = add[n1 = 0][n2 = 0];
471 } else {
472 n1 = n2 = a = 0;
473 if (ascale(i->arg[0], con) && t2)
474 a = 3, n1 = 2, n2 = 0;
475 if (t1 && ascale(i->arg[1], con))
476 a = 3, n1 = 0, n2 = 2;
478 ai[i->to.val].n = a;
479 ai[i->to.val].l = n1;
480 ai[i->to.val].r = n2;
484 static int
485 amatch(Addr *a, Ref r, int n, ANum *ai, Fn *fn)
487 Ins *i;
488 int nl, nr, t, s;
489 Ref al, ar;
491 if (rtype(r) == RCon) {
492 addcon(&a->offset, &fn->con[r.val]);
493 return 1;
495 assert(rtype(r) == RTmp);
496 i = ai[r.val].i;
497 nl = ai[r.val].l;
498 nr = ai[r.val].r;
499 if (i) {
500 if (nl > nr) {
501 al = i->arg[1];
502 ar = i->arg[0];
503 t = nl, nl = nr, nr = t;
504 } else {
505 al = i->arg[0];
506 ar = i->arg[1];
509 switch (n) {
510 case 3: /* s * i */
511 a->index = al;
512 a->scale = fn->con[ar.val].bits.i;
513 return 0;
514 case 5: /* b + s * i */
515 switch (nr) {
516 case 0:
517 if (fn->tmp[ar.val].slot != -1) {
518 al = i->arg[1];
519 ar = i->arg[0];
521 a->index = ar;
522 a->scale = 1;
523 break;
524 case 3:
525 amatch(a, ar, nr, ai, fn);
526 break;
528 r = al;
529 /* fall through */
530 case 0:
531 s = fn->tmp[r.val].slot;
532 if (s != -1)
533 r = SLOT(s);
534 a->base = r;
535 return n || s != -1;
536 case 2: /* constants */
537 case 4: /* o + b */
538 case 6: /* o + s * i */
539 case 7: /* o + b + s * i */
540 amatch(a, ar, nr, ai, fn);
541 amatch(a, al, nl, ai, fn);
542 return 1;
543 default:
544 die("unreachable");
548 /* instruction selection
549 * requires use counts (as given by parsing)
551 void
552 amd64_isel(Fn *fn)
554 Blk *b, **sb;
555 Ins *i;
556 Phi *p;
557 uint a;
558 int n, al;
559 int64_t sz;
560 ANum *ainfo;
562 /* assign slots to fast allocs */
563 b = fn->start;
564 /* specific to NAlign == 3 */ /* or change n=4 and sz /= 4 below */
565 for (al=Oalloc, n=4; al<=Oalloc1; al++, n*=2)
566 for (i=b->ins; i<&b->ins[b->nins]; i++)
567 if (i->op == al) {
568 if (rtype(i->arg[0]) != RCon)
569 break;
570 sz = fn->con[i->arg[0].val].bits.i;
571 if (sz < 0 || sz >= INT_MAX-15)
572 err("invalid alloc size %"PRId64, sz);
573 sz = (sz + n-1) & -n;
574 sz /= 4;
575 fn->tmp[i->to.val].slot = fn->slot;
576 fn->slot += sz;
577 *i = (Ins){.op = Onop};
580 /* process basic blocks */
581 n = fn->ntmp;
582 ainfo = emalloc(n * sizeof ainfo[0]);
583 for (b=fn->start; b; b=b->link) {
584 curi = &insb[NIns];
585 for (sb=(Blk*[3]){b->s1, b->s2, 0}; *sb; sb++)
586 for (p=(*sb)->phi; p; p=p->link) {
587 for (a=0; p->blk[a] != b; a++)
588 assert(a+1 < p->narg);
589 fixarg(&p->arg[a], p->cls, -1, fn);
591 memset(ainfo, 0, n * sizeof ainfo[0]);
592 anumber(ainfo, b, fn->con);
593 seljmp(b, fn);
594 for (i=&b->ins[b->nins]; i!=b->ins;)
595 sel(*--i, ainfo, fn);
596 b->nins = &insb[NIns] - curi;
597 idup(&b->ins, curi, b->nins);
599 free(ainfo);
601 if (debug['I']) {
602 fprintf(stderr, "\n> After instruction selection:\n");
603 printfn(fn, stderr);