drop imul rewriting
[qbe.git] / rega.c
blob8e601c9b07b7e96ba665cf186851fec59826d295
1 #include "all.h"
3 #ifdef TEST_PMOV
4 #undef assert
5 #define assert(x) assert_test(#x, x)
6 #endif
8 typedef struct RMap RMap;
10 struct RMap {
11 int t[Tmp0];
12 int r[Tmp0];
13 int w[Tmp0]; /* wait list, for unmatched hints */
14 BSet b[1];
15 int n;
18 static bits regu; /* registers used */
19 static Tmp *tmp; /* function temporaries */
20 static Mem *mem; /* function mem references */
21 static struct {
22 Ref src, dst;
23 int cls;
24 } pm[Tmp0]; /* parallel move constructed */
25 static int npm; /* size of pm */
26 static int loop; /* current loop level */
28 static uint stmov; /* stats: added moves */
29 static uint stblk; /* stats: added blocks */
31 static int *
32 hint(int t)
34 return &tmp[phicls(t, tmp)].hint.r;
37 static void
38 sethint(int t, int r)
40 Tmp *p;
42 p = &tmp[phicls(t, tmp)];
43 if (p->hint.r == -1 || p->hint.w > loop) {
44 p->hint.r = r;
45 p->hint.w = loop;
46 tmp[t].visit = -1;
50 static void
51 rcopy(RMap *ma, RMap *mb)
53 memcpy(ma->t, mb->t, sizeof ma->t);
54 memcpy(ma->r, mb->r, sizeof ma->r);
55 memcpy(ma->w, mb->w, sizeof ma->w);
56 bscopy(ma->b, mb->b);
57 ma->n = mb->n;
60 static int
61 rfind(RMap *m, int t)
63 int i;
65 for (i=0; i<m->n; i++)
66 if (m->t[i] == t)
67 return m->r[i];
68 return -1;
71 static Ref
72 rref(RMap *m, int t)
74 int r, s;
76 r = rfind(m, t);
77 if (r == -1) {
78 s = tmp[t].slot;
79 assert(s != -1 && "should have spilled");
80 return SLOT(s);
81 } else
82 return TMP(r);
85 static void
86 radd(RMap *m, int t, int r)
88 assert((t >= Tmp0 || t == r) && "invalid temporary");
89 assert(((T.gpr0 <= r && r < T.gpr0 + T.ngpr)
90 || (T.fpr0 <= r && r < T.fpr0 + T.nfpr))
91 && "invalid register");
92 assert(!bshas(m->b, t) && "temporary has mapping");
93 assert(!bshas(m->b, r) && "register already allocated");
94 assert(m->n <= T.ngpr+T.nfpr && "too many mappings");
95 bsset(m->b, t);
96 bsset(m->b, r);
97 m->t[m->n] = t;
98 m->r[m->n] = r;
99 m->n++;
100 regu |= BIT(r);
103 static Ref
104 ralloctry(RMap *m, int t, int try)
106 bits regs;
107 int h, r, r0, r1;
109 if (t < Tmp0) {
110 assert(bshas(m->b, t));
111 return TMP(t);
113 if (bshas(m->b, t)) {
114 r = rfind(m, t);
115 assert(r != -1);
116 return TMP(r);
118 r = tmp[t].visit;
119 if (r == -1 || bshas(m->b, r))
120 r = *hint(t);
121 if (r == -1 || bshas(m->b, r)) {
122 if (try)
123 return R;
124 regs = tmp[phicls(t, tmp)].hint.m;
125 regs |= m->b->t[0];
126 if (KBASE(tmp[t].cls) == 0) {
127 r0 = T.gpr0;
128 r1 = r0 + T.ngpr;
129 } else {
130 r0 = T.fpr0;
131 r1 = r0 + T.nfpr;
133 for (r=r0; r<r1; r++)
134 if (!(regs & BIT(r)))
135 goto Found;
136 for (r=r0; r<r1; r++)
137 if (!bshas(m->b, r))
138 goto Found;
139 die("no more regs");
141 Found:
142 radd(m, t, r);
143 sethint(t, r);
144 tmp[t].visit = r;
145 h = *hint(t);
146 if (h != -1 && h != r)
147 m->w[h] = t;
148 return TMP(r);
151 static inline Ref
152 ralloc(RMap *m, int t)
154 return ralloctry(m, t, 0);
157 static int
158 rfree(RMap *m, int t)
160 int i, r;
162 assert(t >= Tmp0 || !(BIT(t) & T.rglob));
163 if (!bshas(m->b, t))
164 return -1;
165 for (i=0; m->t[i] != t; i++)
166 assert(i+1 < m->n);
167 r = m->r[i];
168 bsclr(m->b, t);
169 bsclr(m->b, r);
170 m->n--;
171 memmove(&m->t[i], &m->t[i+1], (m->n-i) * sizeof m->t[0]);
172 memmove(&m->r[i], &m->r[i+1], (m->n-i) * sizeof m->r[0]);
173 assert(t >= Tmp0 || t == r);
174 return r;
177 static void
178 mdump(RMap *m)
180 int i;
182 for (i=0; i<m->n; i++)
183 if (m->t[i] >= Tmp0)
184 fprintf(stderr, " (%s, R%d)",
185 tmp[m->t[i]].name,
186 m->r[i]);
187 fprintf(stderr, "\n");
190 static void
191 pmadd(Ref src, Ref dst, int k)
193 if (npm == Tmp0)
194 die("cannot have more moves than registers");
195 pm[npm].src = src;
196 pm[npm].dst = dst;
197 pm[npm].cls = k;
198 npm++;
201 enum PMStat { ToMove, Moving, Moved };
203 static int
204 pmrec(enum PMStat *status, int i, int *k)
206 int j, c;
208 /* note, this routine might emit
209 * too many large instructions
211 if (req(pm[i].src, pm[i].dst)) {
212 status[i] = Moved;
213 return -1;
215 assert(KBASE(pm[i].cls) == KBASE(*k));
216 assert((Kw|Kl) == Kl && (Ks|Kd) == Kd);
217 *k |= pm[i].cls;
218 for (j=0; j<npm; j++)
219 if (req(pm[j].dst, pm[i].src))
220 break;
221 switch (j == npm ? Moved : status[j]) {
222 case Moving:
223 c = j; /* start of cycle */
224 emit(Oswap, *k, R, pm[i].src, pm[i].dst);
225 break;
226 case ToMove:
227 status[i] = Moving;
228 c = pmrec(status, j, k);
229 if (c == i) {
230 c = -1; /* end of cycle */
231 break;
233 if (c != -1) {
234 emit(Oswap, *k, R, pm[i].src, pm[i].dst);
235 break;
237 /* fall through */
238 case Moved:
239 c = -1;
240 emit(Ocopy, pm[i].cls, pm[i].dst, pm[i].src, R);
241 break;
242 default:
243 die("unreachable");
245 status[i] = Moved;
246 return c;
249 static void
250 pmgen()
252 int i;
253 enum PMStat *status;
255 status = alloc(npm * sizeof status[0]);
256 assert(!npm || status[npm-1] == ToMove);
257 for (i=0; i<npm; i++)
258 if (status[i] == ToMove)
259 pmrec(status, i, (int[]){pm[i].cls});
262 static void
263 move(int r, Ref to, RMap *m)
265 int n, t, r1;
267 r1 = req(to, R) ? -1 : rfree(m, to.val);
268 if (bshas(m->b, r)) {
269 /* r is used and not by to */
270 assert(r1 != r);
271 for (n=0; m->r[n] != r; n++)
272 assert(n+1 < m->n);
273 t = m->t[n];
274 rfree(m, t);
275 bsset(m->b, r);
276 ralloc(m, t);
277 bsclr(m->b, r);
279 t = req(to, R) ? r : to.val;
280 radd(m, t, r);
283 static int
284 regcpy(Ins *i)
286 return i->op == Ocopy && isreg(i->arg[0]);
289 static Ins *
290 dopm(Blk *b, Ins *i, RMap *m)
292 RMap m0;
293 int n, r, r1, t, s;
294 Ins *i1, *ip;
295 bits def;
297 m0 = *m; /* okay since we don't use m0.b */
298 m0.b->t = 0;
299 i1 = ++i;
300 do {
301 i--;
302 move(i->arg[0].val, i->to, m);
303 } while (i != b->ins && regcpy(i-1));
304 assert(m0.n <= m->n);
305 if (i != b->ins && (i-1)->op == Ocall) {
306 def = T.retregs((i-1)->arg[1], 0) | T.rglob;
307 for (r=0; T.rsave[r]>=0; r++)
308 if (!(BIT(T.rsave[r]) & def))
309 move(T.rsave[r], R, m);
311 for (npm=0, n=0; n<m->n; n++) {
312 t = m->t[n];
313 s = tmp[t].slot;
314 r1 = m->r[n];
315 r = rfind(&m0, t);
316 if (r != -1)
317 pmadd(TMP(r1), TMP(r), tmp[t].cls);
318 else if (s != -1)
319 pmadd(TMP(r1), SLOT(s), tmp[t].cls);
321 for (ip=i; ip<i1; ip++) {
322 if (!req(ip->to, R))
323 rfree(m, ip->to.val);
324 r = ip->arg[0].val;
325 if (rfind(m, r) == -1)
326 radd(m, r, r);
328 pmgen();
329 return i;
332 static int
333 prio1(Ref r1, Ref r2)
335 /* trivial heuristic to begin with,
336 * later we can use the distance to
337 * the definition instruction
339 (void) r2;
340 return *hint(r1.val) != -1;
343 static void
344 insert(Ref *r, Ref **rs, int p)
346 int i;
348 rs[i = p] = r;
349 while (i-- > 0 && prio1(*r, *rs[i])) {
350 rs[i+1] = rs[i];
351 rs[i] = r;
355 static void
356 doblk(Blk *b, RMap *cur)
358 int t, x, r, rf, rt, nr;
359 bits rs;
360 Ins *i, *i1;
361 Mem *m;
362 Ref *ra[4];
364 if (rtype(b->jmp.arg) == RTmp)
365 b->jmp.arg = ralloc(cur, b->jmp.arg.val);
366 curi = &insb[NIns];
367 for (i1=&b->ins[b->nins]; i1!=b->ins;) {
368 emiti(*--i1);
369 i = curi;
370 rf = -1;
371 switch (i->op) {
372 case Ocall:
373 rs = T.argregs(i->arg[1], 0) | T.rglob;
374 for (r=0; T.rsave[r]>=0; r++)
375 if (!(BIT(T.rsave[r]) & rs))
376 rfree(cur, T.rsave[r]);
377 break;
378 case Ocopy:
379 if (regcpy(i)) {
380 curi++;
381 i1 = dopm(b, i1, cur);
382 stmov += i+1 - curi;
383 continue;
385 if (isreg(i->to))
386 if (rtype(i->arg[0]) == RTmp)
387 sethint(i->arg[0].val, i->to.val);
388 /* fall through */
389 default:
390 if (!req(i->to, R)) {
391 assert(rtype(i->to) == RTmp);
392 r = i->to.val;
393 if (r < Tmp0 && (BIT(r) & T.rglob))
394 break;
395 rf = rfree(cur, r);
396 if (rf == -1) {
397 assert(!isreg(i->to));
398 curi++;
399 continue;
401 i->to = TMP(rf);
403 break;
405 for (x=0, nr=0; x<2; x++)
406 switch (rtype(i->arg[x])) {
407 case RMem:
408 m = &mem[i->arg[x].val];
409 if (rtype(m->base) == RTmp)
410 insert(&m->base, ra, nr++);
411 if (rtype(m->index) == RTmp)
412 insert(&m->index, ra, nr++);
413 break;
414 case RTmp:
415 insert(&i->arg[x], ra, nr++);
416 break;
418 for (r=0; r<nr; r++)
419 *ra[r] = ralloc(cur, ra[r]->val);
420 if (i->op == Ocopy && req(i->to, i->arg[0]))
421 curi++;
423 /* try to change the register of a hinted
424 * temporary if rf is available */
425 if (rf != -1 && (t = cur->w[rf]) != 0)
426 if (!bshas(cur->b, rf) && *hint(t) == rf
427 && (rt = rfree(cur, t)) != -1) {
428 tmp[t].visit = -1;
429 ralloc(cur, t);
430 assert(bshas(cur->b, rf));
431 emit(Ocopy, tmp[t].cls, TMP(rt), TMP(rf), R);
432 stmov += 1;
433 cur->w[rf] = 0;
434 for (r=0; r<nr; r++)
435 if (req(*ra[r], TMP(rt)))
436 *ra[r] = TMP(rf);
437 /* one could iterate this logic with
438 * the newly freed rt, but in this case
439 * the above loop must be changed */
442 b->nins = &insb[NIns] - curi;
443 idup(&b->ins, curi, b->nins);
446 /* qsort() comparison function to peel
447 * loop nests from inside out */
448 static int
449 carve(const void *a, const void *b)
451 Blk *ba, *bb;
453 /* todo, evaluate if this order is really
454 * better than the simple postorder */
455 ba = *(Blk**)a;
456 bb = *(Blk**)b;
457 if (ba->loop == bb->loop)
458 return ba->id > bb->id ? -1 : ba->id < bb->id;
459 return ba->loop > bb->loop ? -1 : +1;
462 /* comparison function to order temporaries
463 * for allocation at the end of blocks */
464 static int
465 prio2(int t1, int t2)
467 if ((tmp[t1].visit ^ tmp[t2].visit) < 0) /* != signs */
468 return tmp[t1].visit != -1 ? +1 : -1;
469 if ((*hint(t1) ^ *hint(t2)) < 0)
470 return *hint(t1) != -1 ? +1 : -1;
471 return tmp[t1].cost - tmp[t2].cost;
474 /* register allocation
475 * depends on rpo, phi, cost, (and obviously spill)
477 void
478 rega(Fn *fn)
480 int j, t, r, x, rl[Tmp0];
481 Blk *b, *b1, *s, ***ps, *blist, **blk, **bp;
482 RMap *end, *beg, cur, old, *m;
483 Ins *i;
484 Phi *p;
485 uint u, n;
486 Ref src, dst;
488 /* 1. setup */
489 stmov = 0;
490 stblk = 0;
491 regu = 0;
492 tmp = fn->tmp;
493 mem = fn->mem;
494 blk = alloc(fn->nblk * sizeof blk[0]);
495 end = alloc(fn->nblk * sizeof end[0]);
496 beg = alloc(fn->nblk * sizeof beg[0]);
497 for (n=0; n<fn->nblk; n++) {
498 bsinit(end[n].b, fn->ntmp);
499 bsinit(beg[n].b, fn->ntmp);
501 bsinit(cur.b, fn->ntmp);
502 bsinit(old.b, fn->ntmp);
504 loop = INT_MAX;
505 for (t=0; t<fn->ntmp; t++) {
506 tmp[t].hint.r = t < Tmp0 ? t : -1;
507 tmp[t].hint.w = loop;
508 tmp[t].visit = -1;
510 for (bp=blk, b=fn->start; b; b=b->link)
511 *bp++ = b;
512 qsort(blk, fn->nblk, sizeof blk[0], carve);
513 for (b=fn->start, i=b->ins; i<&b->ins[b->nins]; i++)
514 if (i->op != Ocopy || !isreg(i->arg[0]))
515 break;
516 else {
517 assert(rtype(i->to) == RTmp);
518 sethint(i->to.val, i->arg[0].val);
521 /* 2. assign registers */
522 for (bp=blk; bp<&blk[fn->nblk]; bp++) {
523 b = *bp;
524 n = b->id;
525 loop = b->loop;
526 cur.n = 0;
527 bszero(cur.b);
528 memset(cur.w, 0, sizeof cur.w);
529 for (x=0, t=Tmp0; bsiter(b->out, &t); t++) {
530 j = x++;
531 rl[j] = t;
532 while (j-- > 0 && prio2(t, rl[j]) > 0) {
533 rl[j+1] = rl[j];
534 rl[j] = t;
537 for (r=0; bsiter(b->out, &r) && r<Tmp0; r++)
538 radd(&cur, r, r);
539 for (j=0; j<x; j++)
540 ralloctry(&cur, rl[j], 1);
541 for (j=0; j<x; j++)
542 ralloc(&cur, rl[j]);
543 rcopy(&end[n], &cur);
544 doblk(b, &cur);
545 bscopy(b->in, cur.b);
546 for (p=b->phi; p; p=p->link)
547 if (rtype(p->to) == RTmp)
548 bsclr(b->in, p->to.val);
549 rcopy(&beg[n], &cur);
552 /* 3. emit copies shared by multiple edges
553 * to the same block */
554 for (s=fn->start; s; s=s->link) {
555 if (s->npred <= 1)
556 continue;
557 m = &beg[s->id];
559 /* rl maps a register that is live at the
560 * beginning of s to the one used in all
561 * predecessors (if any, -1 otherwise) */
562 memset(rl, 0, sizeof rl);
564 /* to find the register of a phi in a
565 * predecessor, we have to find the
566 * corresponding argument */
567 for (p=s->phi; p; p=p->link) {
568 if (rtype(p->to) != RTmp
569 || (r=rfind(m, p->to.val)) == -1)
570 continue;
571 for (u=0; u<p->narg; u++) {
572 b = p->blk[u];
573 src = p->arg[u];
574 if (rtype(src) != RTmp)
575 continue;
576 x = rfind(&end[b->id], src.val);
577 if (x == -1) /* spilled */
578 continue;
579 rl[r] = (!rl[r] || rl[r] == x) ? x : -1;
581 if (rl[r] == 0)
582 rl[r] = -1;
585 /* process non-phis temporaries */
586 for (j=0; j<m->n; j++) {
587 t = m->t[j];
588 r = m->r[j];
589 if (rl[r] || t < Tmp0 /* todo, remove this */)
590 continue;
591 for (bp=s->pred; bp<&s->pred[s->npred]; bp++) {
592 x = rfind(&end[(*bp)->id], t);
593 if (x == -1) /* spilled */
594 continue;
595 rl[r] = (!rl[r] || rl[r] == x) ? x : -1;
597 if (rl[r] == 0)
598 rl[r] = -1;
601 npm = 0;
602 for (j=0; j<m->n; j++) {
603 t = m->t[j];
604 r = m->r[j];
605 x = rl[r];
606 assert(x != 0 || t < Tmp0 /* todo, ditto */);
607 if (x > 0 && !bshas(m->b, x)) {
608 pmadd(TMP(x), TMP(r), tmp[t].cls);
609 m->r[j] = x;
610 bsset(m->b, x);
613 curi = &insb[NIns];
614 pmgen();
615 j = &insb[NIns] - curi;
616 if (j == 0)
617 continue;
618 stmov += j;
619 s->nins += j;
620 i = alloc(s->nins * sizeof(Ins));
621 icpy(icpy(i, curi, j), s->ins, s->nins-j);
622 s->ins = i;
625 if (debug['R']) {
626 fprintf(stderr, "\n> Register mappings:\n");
627 for (n=0; n<fn->nblk; n++) {
628 b = fn->rpo[n];
629 fprintf(stderr, "\t%-10s beg", b->name);
630 mdump(&beg[n]);
631 fprintf(stderr, "\t end");
632 mdump(&end[n]);
634 fprintf(stderr, "\n");
637 /* 4. emit remaining copies in new blocks */
638 blist = 0;
639 for (b=fn->start;; b=b->link) {
640 ps = (Blk**[3]){&b->s1, &b->s2, (Blk*[1]){0}};
641 for (; (s=**ps); ps++) {
642 npm = 0;
643 for (p=s->phi; p; p=p->link) {
644 dst = p->to;
645 assert(rtype(dst)==RSlot || rtype(dst)==RTmp);
646 if (rtype(dst) == RTmp) {
647 r = rfind(&beg[s->id], dst.val);
648 if (r == -1)
649 continue;
650 dst = TMP(r);
652 for (u=0; p->blk[u]!=b; u++)
653 assert(u+1 < p->narg);
654 src = p->arg[u];
655 if (rtype(src) == RTmp)
656 src = rref(&end[b->id], src.val);
657 pmadd(src, dst, p->cls);
659 for (t=Tmp0; bsiter(s->in, &t); t++) {
660 src = rref(&end[b->id], t);
661 dst = rref(&beg[s->id], t);
662 pmadd(src, dst, tmp[t].cls);
664 curi = &insb[NIns];
665 pmgen();
666 if (curi == &insb[NIns])
667 continue;
668 b1 = newblk();
669 b1->loop = (b->loop+s->loop) / 2;
670 b1->link = blist;
671 blist = b1;
672 fn->nblk++;
673 strf(b1->name, "%s_%s", b->name, s->name);
674 b1->nins = &insb[NIns] - curi;
675 stmov += b1->nins;
676 stblk += 1;
677 idup(&b1->ins, curi, b1->nins);
678 b1->jmp.type = Jjmp;
679 b1->s1 = s;
680 **ps = b1;
682 if (!b->link) {
683 b->link = blist;
684 break;
687 for (b=fn->start; b; b=b->link)
688 b->phi = 0;
689 fn->reg = regu;
691 if (debug['R']) {
692 fprintf(stderr, "\n> Register allocation statistics:\n");
693 fprintf(stderr, "\tnew moves: %d\n", stmov);
694 fprintf(stderr, "\tnew blocks: %d\n", stblk);
695 fprintf(stderr, "\n> After register allocation:\n");
696 printfn(fn, stderr);