improve global registers handling
[qbe.git] / spill.c
blob0872fd543e2e6d8ddfa6e7b8634fcab19f83eead
1 #include "all.h"
3 static void
4 aggreg(Blk *hd, Blk *b)
6 int k;
8 /* aggregate looping information at
9 * loop headers */
10 bsunion(hd->gen, b->gen);
11 for (k=0; k<2; k++)
12 if (b->nlive[k] > hd->nlive[k])
13 hd->nlive[k] = b->nlive[k];
16 static void
17 tmpuse(Ref r, int use, int loop, Fn *fn)
19 Mem *m;
20 Tmp *t;
22 if (rtype(r) == RMem) {
23 m = &fn->mem[r.val];
24 tmpuse(m->base, 1, loop, fn);
25 tmpuse(m->index, 1, loop, fn);
27 else if (rtype(r) == RTmp && r.val >= Tmp0) {
28 t = &fn->tmp[r.val];
29 t->nuse += use;
30 t->ndef += !use;
31 t->cost += loop;
35 /* evaluate spill costs of temporaries,
36 * this also fills usage information
37 * requires rpo, preds
39 void
40 fillcost(Fn *fn)
42 int n;
43 uint a;
44 Blk *b;
45 Ins *i;
46 Tmp *t;
47 Phi *p;
49 loopiter(fn, aggreg);
50 if (debug['S']) {
51 fprintf(stderr, "\n> Loop information:\n");
52 for (b=fn->start; b; b=b->link) {
53 for (a=0; a<b->npred; ++a)
54 if (b->id <= b->pred[a]->id)
55 break;
56 if (a != b->npred) {
57 fprintf(stderr, "\t%-10s", b->name);
58 fprintf(stderr, " (% 3d ", b->nlive[0]);
59 fprintf(stderr, "% 3d) ", b->nlive[1]);
60 dumpts(b->gen, fn->tmp, stderr);
64 for (t=fn->tmp; t-fn->tmp < fn->ntmp; t++) {
65 t->cost = t-fn->tmp < Tmp0 ? UINT_MAX : 0;
66 t->nuse = 0;
67 t->ndef = 0;
69 for (b=fn->start; b; b=b->link) {
70 for (p=b->phi; p; p=p->link) {
71 /* todo, the cost computation
72 * for p->to is not great... */
73 tmpuse(p->to, 0, 0, fn);
74 for (a=0; a<p->narg; a++) {
75 n = p->blk[a]->loop;
76 assert(b->npred==p->narg && "wrong cfg");
77 n /= b->npred;
78 tmpuse(p->arg[a], 1, n, fn);
81 n = b->loop;
82 for (i=b->ins; i-b->ins < b->nins; i++) {
83 tmpuse(i->to, 0, n, fn);
84 tmpuse(i->arg[0], 1, n, fn);
85 tmpuse(i->arg[1], 1, n, fn);
87 tmpuse(b->jmp.arg, 1, n, fn);
89 if (debug['S']) {
90 fprintf(stderr, "\n> Spill costs:\n");
91 for (n=Tmp0; n<fn->ntmp; n++)
92 fprintf(stderr, "\t%-10s %d\n",
93 fn->tmp[n].name,
94 fn->tmp[n].cost);
95 fprintf(stderr, "\n");
99 static BSet *fst; /* temps to prioritize in registers (for tcmp1) */
100 static Tmp *tmp; /* current temporaries (for tcmpX) */
101 static int ntmp; /* current # of temps (for limit) */
102 static int locs; /* stack size used by locals */
103 static int slot4; /* next slot of 4 bytes */
104 static int slot8; /* ditto, 8 bytes */
105 static BSet mask[2][1]; /* class masks */
107 static int
108 tcmp0(const void *pa, const void *pb)
110 uint ca, cb;
112 ca = tmp[*(int *)pa].cost;
113 cb = tmp[*(int *)pb].cost;
114 return (cb < ca) ? -1 : (cb > ca);
117 static int
118 tcmp1(const void *pa, const void *pb)
120 int c;
122 c = bshas(fst, *(int *)pb) - bshas(fst, *(int *)pa);
123 return c ? c : tcmp0(pa, pb);
126 static Ref
127 slot(int t)
129 int s;
131 assert(t >= Tmp0 && "cannot spill register");
132 s = tmp[t].slot;
133 if (s == -1) {
134 /* specific to NAlign == 3 */
135 /* nice logic to pack stack slots
136 * on demand, there can be only
137 * one hole and slot4 points to it
139 * invariant: slot4 <= slot8
141 if (KWIDE(tmp[t].cls)) {
142 s = slot8;
143 if (slot4 == slot8)
144 slot4 += 2;
145 slot8 += 2;
146 } else {
147 s = slot4;
148 if (slot4 == slot8) {
149 slot8 += 2;
150 slot4 += 1;
151 } else
152 slot4 = slot8;
154 s += locs;
155 tmp[t].slot = s;
157 return SLOT(s);
160 static void
161 limit(BSet *b, int k, BSet *f)
163 static int *tarr, maxt;
164 int i, t, nt;
166 nt = bscount(b);
167 if (nt <= k)
168 return;
169 if (nt > maxt) {
170 free(tarr);
171 tarr = emalloc(nt * sizeof tarr[0]);
172 maxt = nt;
174 for (i=0, t=0; bsiter(b, &t); t++) {
175 bsclr(b, t);
176 tarr[i++] = t;
178 if (!f)
179 qsort(tarr, nt, sizeof tarr[0], tcmp0);
180 else {
181 fst = f;
182 qsort(tarr, nt, sizeof tarr[0], tcmp1);
184 for (i=0; i<k && i<nt; i++)
185 bsset(b, tarr[i]);
186 for (; i<nt; i++)
187 slot(tarr[i]);
190 static void
191 limit2(BSet *b1, int k1, int k2, BSet *fst)
193 BSet b2[1];
195 bsinit(b2, ntmp); /* todo, free those */
196 bscopy(b2, b1);
197 bsinter(b1, mask[0]);
198 bsinter(b2, mask[1]);
199 limit(b1, NIReg - k1, fst);
200 limit(b2, NFReg - k2, fst);
201 bsunion(b1, b2);
204 static void
205 sethint(BSet *u, bits r)
207 int t;
209 for (t=Tmp0; bsiter(u, &t); t++)
210 tmp[phicls(t, tmp)].hint.m |= r;
213 static void
214 reloads(BSet *u, BSet *v)
216 int t;
218 for (t=Tmp0; bsiter(u, &t); t++)
219 if (!bshas(v, t))
220 emit(Oload, tmp[t].cls, TMP(t), slot(t), R);
223 static void
224 store(Ref r, int s)
226 if (s != -1)
227 emit(Ostorew + tmp[r.val].cls, 0, R, r, SLOT(s));
230 static int
231 regcpy(Ins *i)
233 return i->op == Ocopy && isreg(i->arg[0]);
236 static Ins *
237 dopm(Blk *b, Ins *i, BSet *v)
239 int n, t;
240 BSet u[1];
241 Ins *i1;
242 bits r;
244 bsinit(u, ntmp); /* todo, free those */
245 /* consecutive copies from
246 * registers need to be handled
247 * as one large instruction
249 * fixme: there is an assumption
250 * that calls are always followed
251 * by copy instructions here, this
252 * might not be true if previous
253 * passes change
255 i1 = ++i;
256 do {
257 i--;
258 t = i->to.val;
259 if (!req(i->to, R))
260 if (bshas(v, t)) {
261 bsclr(v, t);
262 store(i->to, tmp[t].slot);
264 bsset(v, i->arg[0].val);
265 } while (i != b->ins && regcpy(i-1));
266 bscopy(u, v);
267 if (i != b->ins && (i-1)->op == Ocall) {
268 v->t[0] &= ~retregs((i-1)->arg[1], 0);
269 limit2(v, NISave, NFSave, 0);
270 for (r=0, n=0; n<NRSave; n++)
271 r |= BIT(rsave[n]);
272 v->t[0] |= argregs((i-1)->arg[1], 0);
273 } else {
274 limit2(v, 0, 0, 0);
275 r = v->t[0];
277 sethint(v, r);
278 reloads(u, v);
280 emiti(*--i1);
281 while (i1 != i);
282 return i;
285 /* spill code insertion
286 * requires spill costs, rpo, liveness
288 * Note: this will replace liveness
289 * information (in, out) with temporaries
290 * that must be in registers at block
291 * borders
293 * Be careful with:
294 * - Ocopy instructions to ensure register
295 * constraints
297 void
298 spill(Fn *fn)
300 Blk *b, *s1, *s2, *hd, **bp;
301 int j, l, t, k, lvarg[2];
302 uint n;
303 BSet u[1], v[1], w[1];
304 Ins *i;
305 Phi *p;
306 Mem *m;
307 bits r;
309 tmp = fn->tmp;
310 ntmp = fn->ntmp;
311 bsinit(u, ntmp);
312 bsinit(v, ntmp);
313 bsinit(w, ntmp);
314 bsinit(mask[0], ntmp);
315 bsinit(mask[1], ntmp);
316 locs = fn->slot;
317 slot4 = 0;
318 slot8 = 0;
319 for (t=0; t<ntmp; t++) {
320 k = 0;
321 if (t >= XMM0 && t < XMM0 + NFReg)
322 k = 1;
323 else if (t >= Tmp0)
324 k = KBASE(tmp[t].cls);
325 bsset(mask[k], t);
328 for (bp=&fn->rpo[fn->nblk]; bp!=fn->rpo;) {
329 b = *--bp;
330 /* invariant: all bocks with bigger rpo got
331 * their in,out updated. */
333 /* 1. find temporaries in registers at
334 * the end of the block (put them in v) */
335 curi = 0;
336 s1 = b->s1;
337 s2 = b->s2;
338 hd = 0;
339 if (s1 && s1->id <= b->id)
340 hd = s1;
341 if (s2 && s2->id <= b->id)
342 if (!hd || s2->id >= hd->id)
343 hd = s2;
344 if (hd) {
345 /* back-edge */
346 bszero(v);
347 hd->gen->t[0] |= RGLOB; /* don't spill registers */
348 for (k=0; k<2; k++) {
349 n = k == 0 ? NIReg : NFReg;
350 bscopy(u, b->out);
351 bsinter(u, mask[k]);
352 bscopy(w, u);
353 bsinter(u, hd->gen);
354 bsdiff(w, hd->gen);
355 if (bscount(u) < n) {
356 j = bscount(w); /* live through */
357 l = hd->nlive[k];
358 limit(w, n - (l - j), 0);
359 bsunion(u, w);
360 } else
361 limit(u, n, 0);
362 bsunion(v, u);
364 } else if (s1) {
365 liveon(v, b, s1);
366 if (s2) {
367 liveon(u, b, s2);
368 bscopy(w, u);
369 bsinter(w, v);
370 bsunion(v, u);
372 limit2(v, 0, 0, w);
373 } else {
374 bscopy(v, b->out);
375 if (rtype(b->jmp.arg) == RCall)
376 v->t[0] |= retregs(b->jmp.arg, 0);
378 for (t=Tmp0; bsiter(b->out, &t); t++)
379 if (!bshas(v, t))
380 slot(t);
381 bscopy(b->out, v);
383 /* 2. process the block instructions */
384 r = v->t[0] & (BIT(Tmp0)-1);
385 curi = &insb[NIns];
386 for (i=&b->ins[b->nins]; i!=b->ins;) {
387 i--;
388 if (regcpy(i)) {
389 i = dopm(b, i, v);
390 continue;
392 bszero(w);
393 if (!req(i->to, R)) {
394 assert(rtype(i->to) == RTmp);
395 t = i->to.val;
396 if (bshas(v, t))
397 bsclr(v, t);
398 else {
399 /* make sure we have a reg
400 * for the result */
401 bsset(v, t);
402 bsset(w, t);
405 j = opdesc[i->op].nmem;
406 for (n=0; n<2; n++)
407 if (rtype(i->arg[n]) == RMem)
408 j--;
409 for (n=0; n<2; n++)
410 switch (rtype(i->arg[n])) {
411 case RMem:
412 t = i->arg[n].val;
413 m = &fn->mem[t];
414 if (rtype(m->base) == RTmp) {
415 bsset(v, m->base.val);
416 bsset(w, m->base.val);
418 if (rtype(m->index) == RTmp) {
419 bsset(v, m->index.val);
420 bsset(w, m->index.val);
422 break;
423 case RTmp:
424 t = i->arg[n].val;
425 lvarg[n] = bshas(v, t);
426 bsset(v, t);
427 if (j-- <= 0)
428 bsset(w, t);
429 break;
431 bscopy(u, v);
432 limit2(v, 0, 0, w);
433 for (n=0; n<2; n++)
434 if (rtype(i->arg[n]) == RTmp) {
435 t = i->arg[n].val;
436 if (!bshas(v, t)) {
437 /* do not reload if the
438 * the temporary was dead
440 if (!lvarg[n])
441 bsclr(u, t);
442 i->arg[n] = slot(t);
445 reloads(u, v);
446 if (!req(i->to, R)) {
447 t = i->to.val;
448 store(i->to, tmp[t].slot);
449 bsclr(v, t);
451 emiti(*i);
452 r = v->t[0] & (BIT(Tmp0)-1);
453 if (r)
454 sethint(v, r);
456 assert(r == RGLOB || b == fn->start);
458 for (p=b->phi; p; p=p->link) {
459 assert(rtype(p->to) == RTmp);
460 t = p->to.val;
461 if (bshas(v, t)) {
462 bsclr(v, t);
463 store(p->to, tmp[t].slot);
464 } else if (bshas(b->in, t))
465 /* only if the phi is live */
466 p->to = slot(p->to.val);
468 bscopy(b->in, v);
469 b->nins = &insb[NIns] - curi;
470 idup(&b->ins, curi, b->nins);
473 /* align the locals to a 16 byte boundary */
474 /* specific to NAlign == 3 */
475 slot8 += slot8 & 3;
476 fn->slot += slot8;
478 if (debug['S']) {
479 fprintf(stderr, "\n> Block information:\n");
480 for (b=fn->start; b; b=b->link) {
481 fprintf(stderr, "\t%-10s (% 5d) ", b->name, b->loop);
482 dumpts(b->out, fn->tmp, stderr);
484 fprintf(stderr, "\n> After spilling:\n");
485 printfn(fn, stderr);