new hlt block terminator
[qbe.git] / rega.c
blobc97d1e198a8a4fdf0f3bb1ca67f95ecb4812d9a0
1 #include "all.h"
3 #ifdef TEST_PMOV
4 #undef assert
5 #define assert(x) assert_test(#x, x)
6 #endif
8 typedef struct RMap RMap;
10 struct RMap {
11 int t[Tmp0];
12 int r[Tmp0];
13 int w[Tmp0]; /* wait list, for unmatched hints */
14 BSet b[1];
15 int n;
18 static bits regu; /* registers used */
19 static Tmp *tmp; /* function temporaries */
20 static Mem *mem; /* function mem references */
21 static struct {
22 Ref src, dst;
23 int cls;
24 } pm[Tmp0]; /* parallel move constructed */
25 static int npm; /* size of pm */
26 static int loop; /* current loop level */
28 static uint stmov; /* stats: added moves */
29 static uint stblk; /* stats: added blocks */
31 static int *
32 hint(int t)
34 return &tmp[phicls(t, tmp)].hint.r;
37 static void
38 sethint(int t, int r)
40 Tmp *p;
42 p = &tmp[phicls(t, tmp)];
43 if (p->hint.r == -1 || p->hint.w > loop) {
44 p->hint.r = r;
45 p->hint.w = loop;
46 tmp[t].visit = -1;
50 static void
51 rcopy(RMap *ma, RMap *mb)
53 memcpy(ma->t, mb->t, sizeof ma->t);
54 memcpy(ma->r, mb->r, sizeof ma->r);
55 memcpy(ma->w, mb->w, sizeof ma->w);
56 bscopy(ma->b, mb->b);
57 ma->n = mb->n;
60 static int
61 rfind(RMap *m, int t)
63 int i;
65 for (i=0; i<m->n; i++)
66 if (m->t[i] == t)
67 return m->r[i];
68 return -1;
71 static Ref
72 rref(RMap *m, int t)
74 int r, s;
76 r = rfind(m, t);
77 if (r == -1) {
78 s = tmp[t].slot;
79 assert(s != -1 && "should have spilled");
80 return SLOT(s);
81 } else
82 return TMP(r);
85 static void
86 radd(RMap *m, int t, int r)
88 assert((t >= Tmp0 || t == r) && "invalid temporary");
89 assert(((T.gpr0 <= r && r < T.gpr0 + T.ngpr)
90 || (T.fpr0 <= r && r < T.fpr0 + T.nfpr))
91 && "invalid register");
92 assert(!bshas(m->b, t) && "temporary has mapping");
93 assert(!bshas(m->b, r) && "register already allocated");
94 assert(m->n <= T.ngpr+T.nfpr && "too many mappings");
95 bsset(m->b, t);
96 bsset(m->b, r);
97 m->t[m->n] = t;
98 m->r[m->n] = r;
99 m->n++;
100 regu |= BIT(r);
103 static Ref
104 ralloctry(RMap *m, int t, int try)
106 bits regs;
107 int h, r, r0, r1;
109 if (t < Tmp0) {
110 assert(bshas(m->b, t));
111 return TMP(t);
113 if (bshas(m->b, t)) {
114 r = rfind(m, t);
115 assert(r != -1);
116 return TMP(r);
118 r = tmp[t].visit;
119 if (r == -1 || bshas(m->b, r))
120 r = *hint(t);
121 if (r == -1 || bshas(m->b, r)) {
122 if (try)
123 return R;
124 regs = tmp[phicls(t, tmp)].hint.m;
125 regs |= m->b->t[0];
126 if (KBASE(tmp[t].cls) == 0) {
127 r0 = T.gpr0;
128 r1 = r0 + T.ngpr;
129 } else {
130 r0 = T.fpr0;
131 r1 = r0 + T.nfpr;
133 for (r=r0; r<r1; r++)
134 if (!(regs & BIT(r)))
135 goto Found;
136 for (r=r0; r<r1; r++)
137 if (!bshas(m->b, r))
138 goto Found;
139 die("no more regs");
141 Found:
142 radd(m, t, r);
143 sethint(t, r);
144 tmp[t].visit = r;
145 h = *hint(t);
146 if (h != -1 && h != r)
147 m->w[h] = t;
148 return TMP(r);
151 static inline Ref
152 ralloc(RMap *m, int t)
154 return ralloctry(m, t, 0);
157 static int
158 rfree(RMap *m, int t)
160 int i, r;
162 assert(t >= Tmp0 || !(BIT(t) & T.rglob));
163 if (!bshas(m->b, t))
164 return -1;
165 for (i=0; m->t[i] != t; i++)
166 assert(i+1 < m->n);
167 r = m->r[i];
168 bsclr(m->b, t);
169 bsclr(m->b, r);
170 m->n--;
171 memmove(&m->t[i], &m->t[i+1], (m->n-i) * sizeof m->t[0]);
172 memmove(&m->r[i], &m->r[i+1], (m->n-i) * sizeof m->r[0]);
173 assert(t >= Tmp0 || t == r);
174 return r;
177 static void
178 mdump(RMap *m)
180 int i;
182 for (i=0; i<m->n; i++)
183 if (m->t[i] >= Tmp0)
184 fprintf(stderr, " (%s, R%d)",
185 tmp[m->t[i]].name,
186 m->r[i]);
187 fprintf(stderr, "\n");
190 static void
191 pmadd(Ref src, Ref dst, int k)
193 if (npm == Tmp0)
194 die("cannot have more moves than registers");
195 pm[npm].src = src;
196 pm[npm].dst = dst;
197 pm[npm].cls = k;
198 npm++;
201 enum PMStat { ToMove, Moving, Moved };
203 static int
204 pmrec(enum PMStat *status, int i, int *k)
206 int j, c;
208 /* note, this routine might emit
209 * too many large instructions
211 if (req(pm[i].src, pm[i].dst)) {
212 status[i] = Moved;
213 return -1;
215 assert(KBASE(pm[i].cls) == KBASE(*k));
216 assert((Kw|Kl) == Kl && (Ks|Kd) == Kd);
217 *k |= pm[i].cls;
218 for (j=0; j<npm; j++)
219 if (req(pm[j].dst, pm[i].src))
220 break;
221 switch (j == npm ? Moved : status[j]) {
222 case Moving:
223 c = j; /* start of cycle */
224 emit(Oswap, *k, R, pm[i].src, pm[i].dst);
225 break;
226 case ToMove:
227 status[i] = Moving;
228 c = pmrec(status, j, k);
229 if (c == i) {
230 c = -1; /* end of cycle */
231 break;
233 if (c != -1) {
234 emit(Oswap, *k, R, pm[i].src, pm[i].dst);
235 break;
237 /* fall through */
238 case Moved:
239 c = -1;
240 emit(Ocopy, pm[i].cls, pm[i].dst, pm[i].src, R);
241 break;
243 status[i] = Moved;
244 return c;
247 static void
248 pmgen()
250 int i;
251 enum PMStat *status;
253 status = alloc(npm * sizeof status[0]);
254 assert(!npm || status[npm-1] == ToMove);
255 for (i=0; i<npm; i++)
256 if (status[i] == ToMove)
257 pmrec(status, i, (int[]){pm[i].cls});
260 static void
261 move(int r, Ref to, RMap *m)
263 int n, t, r1;
265 r1 = req(to, R) ? -1 : rfree(m, to.val);
266 if (bshas(m->b, r)) {
267 /* r is used and not by to */
268 assert(r1 != r);
269 for (n=0; m->r[n] != r; n++)
270 assert(n+1 < m->n);
271 t = m->t[n];
272 rfree(m, t);
273 bsset(m->b, r);
274 ralloc(m, t);
275 bsclr(m->b, r);
277 t = req(to, R) ? r : to.val;
278 radd(m, t, r);
281 static int
282 regcpy(Ins *i)
284 return i->op == Ocopy && isreg(i->arg[0]);
287 static Ins *
288 dopm(Blk *b, Ins *i, RMap *m)
290 RMap m0;
291 int n, r, r1, t, s;
292 Ins *i1, *ip;
293 bits def;
295 m0 = *m; /* okay since we don't use m0.b */
296 m0.b->t = 0;
297 i1 = ++i;
298 do {
299 i--;
300 move(i->arg[0].val, i->to, m);
301 } while (i != b->ins && regcpy(i-1));
302 assert(m0.n <= m->n);
303 if (i != b->ins && (i-1)->op == Ocall) {
304 def = T.retregs((i-1)->arg[1], 0) | T.rglob;
305 for (r=0; T.rsave[r]>=0; r++)
306 if (!(BIT(T.rsave[r]) & def))
307 move(T.rsave[r], R, m);
309 for (npm=0, n=0; n<m->n; n++) {
310 t = m->t[n];
311 s = tmp[t].slot;
312 r1 = m->r[n];
313 r = rfind(&m0, t);
314 if (r != -1)
315 pmadd(TMP(r1), TMP(r), tmp[t].cls);
316 else if (s != -1)
317 pmadd(TMP(r1), SLOT(s), tmp[t].cls);
319 for (ip=i; ip<i1; ip++) {
320 if (!req(ip->to, R))
321 rfree(m, ip->to.val);
322 r = ip->arg[0].val;
323 if (rfind(m, r) == -1)
324 radd(m, r, r);
326 pmgen();
327 return i;
330 static int
331 prio1(Ref r1, Ref r2)
333 /* trivial heuristic to begin with,
334 * later we can use the distance to
335 * the definition instruction
337 (void) r2;
338 return *hint(r1.val) != -1;
341 static void
342 insert(Ref *r, Ref **rs, int p)
344 int i;
346 rs[i = p] = r;
347 while (i-- > 0 && prio1(*r, *rs[i])) {
348 rs[i+1] = rs[i];
349 rs[i] = r;
353 static void
354 doblk(Blk *b, RMap *cur)
356 int t, x, r, rf, rt, nr;
357 bits rs;
358 Ins *i, *i1;
359 Mem *m;
360 Ref *ra[4];
362 if (rtype(b->jmp.arg) == RTmp)
363 b->jmp.arg = ralloc(cur, b->jmp.arg.val);
364 curi = &insb[NIns];
365 for (i1=&b->ins[b->nins]; i1!=b->ins;) {
366 emiti(*--i1);
367 i = curi;
368 rf = -1;
369 switch (i->op) {
370 case Ocall:
371 rs = T.argregs(i->arg[1], 0) | T.rglob;
372 for (r=0; T.rsave[r]>=0; r++)
373 if (!(BIT(T.rsave[r]) & rs))
374 rfree(cur, T.rsave[r]);
375 break;
376 case Ocopy:
377 if (regcpy(i)) {
378 curi++;
379 i1 = dopm(b, i1, cur);
380 stmov += i+1 - curi;
381 continue;
383 if (isreg(i->to))
384 if (rtype(i->arg[0]) == RTmp)
385 sethint(i->arg[0].val, i->to.val);
386 /* fall through */
387 default:
388 if (!req(i->to, R)) {
389 assert(rtype(i->to) == RTmp);
390 r = i->to.val;
391 if (r < Tmp0 && (BIT(r) & T.rglob))
392 break;
393 rf = rfree(cur, r);
394 if (rf == -1) {
395 assert(!isreg(i->to));
396 curi++;
397 continue;
399 i->to = TMP(rf);
401 break;
403 for (x=0, nr=0; x<2; x++)
404 switch (rtype(i->arg[x])) {
405 case RMem:
406 m = &mem[i->arg[x].val];
407 if (rtype(m->base) == RTmp)
408 insert(&m->base, ra, nr++);
409 if (rtype(m->index) == RTmp)
410 insert(&m->index, ra, nr++);
411 break;
412 case RTmp:
413 insert(&i->arg[x], ra, nr++);
414 break;
416 for (r=0; r<nr; r++)
417 *ra[r] = ralloc(cur, ra[r]->val);
418 if (i->op == Ocopy && req(i->to, i->arg[0]))
419 curi++;
421 /* try to change the register of a hinted
422 * temporary if rf is available */
423 if (rf != -1 && (t = cur->w[rf]) != 0)
424 if (!bshas(cur->b, rf) && *hint(t) == rf
425 && (rt = rfree(cur, t)) != -1) {
426 tmp[t].visit = -1;
427 ralloc(cur, t);
428 assert(bshas(cur->b, rf));
429 emit(Ocopy, tmp[t].cls, TMP(rt), TMP(rf), R);
430 stmov += 1;
431 cur->w[rf] = 0;
432 for (r=0; r<nr; r++)
433 if (req(*ra[r], TMP(rt)))
434 *ra[r] = TMP(rf);
435 /* one could iterate this logic with
436 * the newly freed rt, but in this case
437 * the above loop must be changed */
440 b->nins = &insb[NIns] - curi;
441 idup(&b->ins, curi, b->nins);
444 /* qsort() comparison function to peel
445 * loop nests from inside out */
446 static int
447 carve(const void *a, const void *b)
449 Blk *ba, *bb;
451 /* todo, evaluate if this order is really
452 * better than the simple postorder */
453 ba = *(Blk**)a;
454 bb = *(Blk**)b;
455 if (ba->loop == bb->loop)
456 return ba->id > bb->id ? -1 : ba->id < bb->id;
457 return ba->loop > bb->loop ? -1 : +1;
460 /* comparison function to order temporaries
461 * for allocation at the end of blocks */
462 static int
463 prio2(int t1, int t2)
465 if ((tmp[t1].visit ^ tmp[t2].visit) < 0) /* != signs */
466 return tmp[t1].visit != -1 ? +1 : -1;
467 if ((*hint(t1) ^ *hint(t2)) < 0)
468 return *hint(t1) != -1 ? +1 : -1;
469 return tmp[t1].cost - tmp[t2].cost;
472 /* register allocation
473 * depends on rpo, phi, cost, (and obviously spill)
475 void
476 rega(Fn *fn)
478 int j, t, r, x, rl[Tmp0];
479 Blk *b, *b1, *s, ***ps, *blist, **blk, **bp;
480 RMap *end, *beg, cur, old, *m;
481 Ins *i;
482 Phi *p;
483 uint u, n;
484 Ref src, dst;
486 /* 1. setup */
487 stmov = 0;
488 stblk = 0;
489 regu = 0;
490 tmp = fn->tmp;
491 mem = fn->mem;
492 blk = alloc(fn->nblk * sizeof blk[0]);
493 end = alloc(fn->nblk * sizeof end[0]);
494 beg = alloc(fn->nblk * sizeof beg[0]);
495 for (n=0; n<fn->nblk; n++) {
496 bsinit(end[n].b, fn->ntmp);
497 bsinit(beg[n].b, fn->ntmp);
499 bsinit(cur.b, fn->ntmp);
500 bsinit(old.b, fn->ntmp);
502 loop = INT_MAX;
503 for (t=0; t<fn->ntmp; t++) {
504 tmp[t].hint.r = t < Tmp0 ? t : -1;
505 tmp[t].hint.w = loop;
506 tmp[t].visit = -1;
508 for (bp=blk, b=fn->start; b; b=b->link)
509 *bp++ = b;
510 qsort(blk, fn->nblk, sizeof blk[0], carve);
511 for (b=fn->start, i=b->ins; i<&b->ins[b->nins]; i++)
512 if (i->op != Ocopy || !isreg(i->arg[0]))
513 break;
514 else {
515 assert(rtype(i->to) == RTmp);
516 sethint(i->to.val, i->arg[0].val);
519 /* 2. assign registers */
520 for (bp=blk; bp<&blk[fn->nblk]; bp++) {
521 b = *bp;
522 n = b->id;
523 loop = b->loop;
524 cur.n = 0;
525 bszero(cur.b);
526 memset(cur.w, 0, sizeof cur.w);
527 for (x=0, t=Tmp0; bsiter(b->out, &t); t++) {
528 j = x++;
529 rl[j] = t;
530 while (j-- > 0 && prio2(t, rl[j]) > 0) {
531 rl[j+1] = rl[j];
532 rl[j] = t;
535 for (r=0; bsiter(b->out, &r) && r<Tmp0; r++)
536 radd(&cur, r, r);
537 for (j=0; j<x; j++)
538 ralloctry(&cur, rl[j], 1);
539 for (j=0; j<x; j++)
540 ralloc(&cur, rl[j]);
541 rcopy(&end[n], &cur);
542 doblk(b, &cur);
543 bscopy(b->in, cur.b);
544 for (p=b->phi; p; p=p->link)
545 if (rtype(p->to) == RTmp)
546 bsclr(b->in, p->to.val);
547 rcopy(&beg[n], &cur);
550 /* 3. emit copies shared by multiple edges
551 * to the same block */
552 for (s=fn->start; s; s=s->link) {
553 if (s->npred <= 1)
554 continue;
555 m = &beg[s->id];
557 /* rl maps a register that is live at the
558 * beginning of s to the one used in all
559 * predecessors (if any, -1 otherwise) */
560 memset(rl, 0, sizeof rl);
562 /* to find the register of a phi in a
563 * predecessor, we have to find the
564 * corresponding argument */
565 for (p=s->phi; p; p=p->link) {
566 if (rtype(p->to) != RTmp
567 || (r=rfind(m, p->to.val)) == -1)
568 continue;
569 for (u=0; u<p->narg; u++) {
570 b = p->blk[u];
571 src = p->arg[u];
572 if (rtype(src) != RTmp)
573 continue;
574 x = rfind(&end[b->id], src.val);
575 if (x == -1) /* spilled */
576 continue;
577 rl[r] = (!rl[r] || rl[r] == x) ? x : -1;
579 if (rl[r] == 0)
580 rl[r] = -1;
583 /* process non-phis temporaries */
584 for (j=0; j<m->n; j++) {
585 t = m->t[j];
586 r = m->r[j];
587 if (rl[r] || t < Tmp0 /* todo, remove this */)
588 continue;
589 for (bp=s->pred; bp<&s->pred[s->npred]; bp++) {
590 x = rfind(&end[(*bp)->id], t);
591 if (x == -1) /* spilled */
592 continue;
593 rl[r] = (!rl[r] || rl[r] == x) ? x : -1;
595 if (rl[r] == 0)
596 rl[r] = -1;
599 npm = 0;
600 for (j=0; j<m->n; j++) {
601 t = m->t[j];
602 r = m->r[j];
603 x = rl[r];
604 assert(x != 0 || t < Tmp0 /* todo, ditto */);
605 if (x > 0 && !bshas(m->b, x)) {
606 pmadd(TMP(x), TMP(r), tmp[t].cls);
607 m->r[j] = x;
608 bsset(m->b, x);
611 curi = &insb[NIns];
612 pmgen();
613 j = &insb[NIns] - curi;
614 if (j == 0)
615 continue;
616 stmov += j;
617 s->nins += j;
618 i = alloc(s->nins * sizeof(Ins));
619 icpy(icpy(i, curi, j), s->ins, s->nins-j);
620 s->ins = i;
623 if (debug['R']) {
624 fprintf(stderr, "\n> Register mappings:\n");
625 for (n=0; n<fn->nblk; n++) {
626 b = fn->rpo[n];
627 fprintf(stderr, "\t%-10s beg", b->name);
628 mdump(&beg[n]);
629 fprintf(stderr, "\t end");
630 mdump(&end[n]);
632 fprintf(stderr, "\n");
635 /* 4. emit remaining copies in new blocks */
636 blist = 0;
637 for (b=fn->start;; b=b->link) {
638 ps = (Blk**[3]){&b->s1, &b->s2, (Blk*[1]){0}};
639 for (; (s=**ps); ps++) {
640 npm = 0;
641 for (p=s->phi; p; p=p->link) {
642 dst = p->to;
643 assert(rtype(dst)==RSlot || rtype(dst)==RTmp);
644 if (rtype(dst) == RTmp) {
645 r = rfind(&beg[s->id], dst.val);
646 if (r == -1)
647 continue;
648 dst = TMP(r);
650 for (u=0; p->blk[u]!=b; u++)
651 assert(u+1 < p->narg);
652 src = p->arg[u];
653 if (rtype(src) == RTmp)
654 src = rref(&end[b->id], src.val);
655 pmadd(src, dst, p->cls);
657 for (t=Tmp0; bsiter(s->in, &t); t++) {
658 src = rref(&end[b->id], t);
659 dst = rref(&beg[s->id], t);
660 pmadd(src, dst, tmp[t].cls);
662 curi = &insb[NIns];
663 pmgen();
664 if (curi == &insb[NIns])
665 continue;
666 b1 = blknew();
667 b1->loop = (b->loop+s->loop) / 2;
668 b1->link = blist;
669 blist = b1;
670 fn->nblk++;
671 (void)!snprintf(b1->name, sizeof(b1->name), "%s_%s", b->name, s->name);
672 b1->nins = &insb[NIns] - curi;
673 stmov += b1->nins;
674 stblk += 1;
675 idup(&b1->ins, curi, b1->nins);
676 b1->jmp.type = Jjmp;
677 b1->s1 = s;
678 **ps = b1;
680 if (!b->link) {
681 b->link = blist;
682 break;
685 for (b=fn->start; b; b=b->link)
686 b->phi = 0;
687 fn->reg = regu;
689 if (debug['R']) {
690 fprintf(stderr, "\n> Register allocation statistics:\n");
691 fprintf(stderr, "\tnew moves: %d\n", stmov);
692 fprintf(stderr, "\tnew blocks: %d\n", stblk);
693 fprintf(stderr, "\n> After register allocation:\n");
694 printfn(fn, stderr);