/* qbe mem.c — memory optimizations (slot promotion, slot coalescing)
 * blob 3265b394f53c0caf18289e671d5a872ac48809bc
 * (from commit "cheaper mul by small constants on amd64") */
#include "all.h"
/* forward declarations for the slot-coalescing
 * data structures defined below */
typedef struct Range Range;
typedef struct Store Store;
typedef struct Slot Slot;
7 /* require use, maintains use counts */
8 void
9 promote(Fn *fn)
11 Blk *b;
12 Ins *i, *l;
13 Tmp *t;
14 Use *u, *ue;
15 int s, k;
17 /* promote uniform stack slots to temporaries */
18 b = fn->start;
19 for (i=b->ins; i<&b->ins[b->nins]; i++) {
20 if (Oalloc > i->op || i->op > Oalloc1)
21 continue;
22 /* specific to NAlign == 3 */
23 assert(rtype(i->to) == RTmp);
24 t = &fn->tmp[i->to.val];
25 if (t->ndef != 1)
26 goto Skip;
27 k = -1;
28 s = -1;
29 for (u=t->use; u<&t->use[t->nuse]; u++) {
30 if (u->type != UIns)
31 goto Skip;
32 l = u->u.ins;
33 if (isload(l->op))
34 if (s == -1 || s == loadsz(l)) {
35 s = loadsz(l);
36 continue;
38 if (isstore(l->op))
39 if (req(i->to, l->arg[1]) && !req(i->to, l->arg[0]))
40 if (s == -1 || s == storesz(l))
41 if (k == -1 || k == optab[l->op].argcls[0][0]) {
42 s = storesz(l);
43 k = optab[l->op].argcls[0][0];
44 continue;
46 goto Skip;
48 /* get rid of the alloc and replace uses */
49 *i = (Ins){.op = Onop};
50 t->ndef--;
51 ue = &t->use[t->nuse];
52 for (u=t->use; u!=ue; u++) {
53 l = u->u.ins;
54 if (isstore(l->op)) {
55 l->cls = k;
56 l->op = Ocopy;
57 l->to = l->arg[1];
58 l->arg[1] = R;
59 t->nuse--;
60 t->ndef++;
61 } else {
62 if (k == -1)
63 err("slot %%%s is read but never stored to",
64 fn->tmp[l->arg[0].val].name);
65 /* try to turn loads into copies so we
66 * can eliminate them later */
67 switch(l->op) {
68 case Oloadsw:
69 case Oloaduw:
70 if (k == Kl)
71 goto Extend;
72 /* fall through */
73 case Oload:
74 if (KBASE(k) != KBASE(l->cls))
75 l->op = Ocast;
76 else
77 l->op = Ocopy;
78 break;
79 default:
80 Extend:
81 l->op = Oextsb + (l->op - Oloadsb);
82 break;
86 Skip:;
88 if (debug['M']) {
89 fprintf(stderr, "\n> After slot promotion:\n");
90 printfn(fn, stderr);
/* [a, b) with 0 <= a; b == 0 encodes
 * the empty range (see radd/rovlap) */
struct Range {
	int a, b;
};
99 struct Store {
100 int ip;
101 Ins *i;
104 struct Slot {
105 int t;
106 int sz;
107 bits m;
108 bits l;
109 Range r;
110 Slot *s;
111 Store *st;
112 int nst;
115 static inline int
116 rin(Range r, int n)
118 return r.a <= n && n < r.b;
121 static inline int
122 rovlap(Range r0, Range r1)
124 return r0.b && r1.b && r0.a < r1.b && r1.a < r0.b;
127 static void
128 radd(Range *r, int n)
130 if (!r->b)
131 *r = (Range){n, n+1};
132 else if (n < r->a)
133 r->a = n;
134 else if (n >= r->b)
135 r->b = n+1;
138 static int
139 slot(Slot **ps, int64_t *off, Ref r, Fn *fn, Slot *sl)
141 Alias a;
142 Tmp *t;
144 getalias(&a, r, fn);
145 if (a.type != ALoc)
146 return 0;
147 t = &fn->tmp[a.base];
148 if (t->visit < 0)
149 return 0;
150 *off = a.offset;
151 *ps = &sl[t->visit];
152 return 1;
155 static void
156 load(Ref r, bits x, int ip, Fn *fn, Slot *sl)
158 int64_t off;
159 Slot *s;
161 if (slot(&s, &off, r, fn, sl)) {
162 s->l |= x << off;
163 s->l &= s->m;
164 if (s->l)
165 radd(&s->r, ip);
169 static void
170 store(Ref r, bits x, int ip, Ins *i, Fn *fn, Slot *sl)
172 int64_t off;
173 Slot *s;
175 if (slot(&s, &off, r, fn, sl)) {
176 if (s->l) {
177 radd(&s->r, ip);
178 s->l &= ~(x << off);
179 } else {
180 vgrow(&s->st, ++s->nst);
181 s->st[s->nst-1].ip = ip;
182 s->st[s->nst-1].i = i;
187 static int
188 scmp(const void *pa, const void *pb)
190 Slot *a, *b;
192 a = (Slot *)pa, b = (Slot *)pb;
193 if (a->sz != b->sz)
194 return b->sz - a->sz;
195 return a->r.a - b->r.a;
198 static void
199 maxrpo(Blk *hd, Blk *b)
201 if (hd->loop < (int)b->id)
202 hd->loop = b->id;
205 void
206 coalesce(Fn *fn)
208 Range r, *br;
209 Slot *s, *s0, *sl;
210 Blk *b, **ps, *succ[3];
211 Ins *i, **bl;
212 Use *u;
213 Tmp *t, *ts;
214 Ref *arg;
215 bits x;
216 int64_t off0, off1;
217 int n, m, ip, sz, nsl, nbl, *stk;
218 uint total, freed, fused;
220 /* minimize the stack usage
221 * by coalescing slots
223 nsl = 0;
224 sl = vnew(0, sizeof sl[0], PHeap);
225 for (n=Tmp0; n<fn->ntmp; n++) {
226 t = &fn->tmp[n];
227 t->visit = -1;
228 if (t->alias.type == ALoc)
229 if (t->alias.slot == &t->alias)
230 if (t->bid == fn->start->id)
231 if (t->alias.u.loc.sz != -1) {
232 t->visit = nsl;
233 vgrow(&sl, ++nsl);
234 s = &sl[nsl-1];
235 s->t = n;
236 s->sz = t->alias.u.loc.sz;
237 s->m = t->alias.u.loc.m;
238 s->s = 0;
239 s->st = vnew(0, sizeof s->st[0], PHeap);
240 s->nst = 0;
244 /* one-pass liveness analysis */
245 for (b=fn->start; b; b=b->link)
246 b->loop = -1;
247 loopiter(fn, maxrpo);
248 nbl = 0;
249 bl = vnew(0, sizeof bl[0], PHeap);
250 br = emalloc(fn->nblk * sizeof br[0]);
251 ip = INT_MAX - 1;
252 for (n=fn->nblk-1; n>=0; n--) {
253 b = fn->rpo[n];
254 succ[0] = b->s1;
255 succ[1] = b->s2;
256 succ[2] = 0;
257 br[n].b = ip--;
258 for (s=sl; s<&sl[nsl]; s++) {
259 s->l = 0;
260 for (ps=succ; *ps; ps++) {
261 m = (*ps)->id;
262 if (m > n && rin(s->r, br[m].a)) {
263 s->l = s->m;
264 radd(&s->r, ip);
268 if (b->jmp.type == Jretc)
269 load(b->jmp.arg, -1, --ip, fn, sl);
270 for (i=&b->ins[b->nins]; i!=b->ins;) {
271 --i;
272 arg = i->arg;
273 if (i->op == Oargc) {
274 load(arg[1], -1, --ip, fn, sl);
276 if (isload(i->op)) {
277 x = BIT(loadsz(i)) - 1;
278 load(arg[0], x, --ip, fn, sl);
280 if (isstore(i->op)) {
281 x = BIT(storesz(i)) - 1;
282 store(arg[1], x, ip--, i, fn, sl);
284 if (i->op == Oblit0) {
285 assert((i+1)->op == Oblit1);
286 assert(rtype((i+1)->arg[0]) == RInt);
287 sz = abs(rsval((i+1)->arg[0]));
288 x = sz >= NBit ? (bits)-1 : BIT(sz) - 1;
289 store(arg[1], x, ip--, i, fn, sl);
290 load(arg[0], x, ip, fn, sl);
291 vgrow(&bl, ++nbl);
292 bl[nbl-1] = i;
295 for (s=sl; s<&sl[nsl]; s++)
296 if (s->l) {
297 radd(&s->r, ip);
298 if (b->loop != -1) {
299 assert(b->loop >= n);
300 radd(&s->r, br[b->loop].b - 1);
303 br[n].a = ip;
305 free(br);
307 /* kill dead stores */
308 for (s=sl; s<&sl[nsl]; s++)
309 for (n=0; n<s->nst; n++)
310 if (!rin(s->r, s->st[n].ip)) {
311 i = s->st[n].i;
312 if (i->op == Oblit0)
313 *(i+1) = (Ins){.op = Onop};
314 *i = (Ins){.op = Onop};
317 /* kill slots with an empty live range */
318 total = 0;
319 freed = 0;
320 stk = vnew(0, sizeof stk[0], PHeap);
321 n = 0;
322 for (s=s0=sl; s<&sl[nsl]; s++) {
323 total += s->sz;
324 if (!s->r.b) {
325 vfree(s->st);
326 vgrow(&stk, ++n);
327 stk[n-1] = s->t;
328 freed += s->sz;
329 } else
330 *s0++ = *s;
332 nsl = s0-sl;
333 if (debug['M']) {
334 fputs("\n> Slot coalescing:\n", stderr);
335 if (n) {
336 fputs("\tkill [", stderr);
337 for (m=0; m<n; m++)
338 fprintf(stderr, " %%%s",
339 fn->tmp[stk[m]].name);
340 fputs(" ]\n", stderr);
343 while (n--) {
344 t = &fn->tmp[stk[n]];
345 assert(t->ndef == 1 && t->def);
346 i = t->def;
347 if (isload(i->op)) {
348 i->op = Ocopy;
349 i->arg[0] = UNDEF;
350 continue;
352 *i = (Ins){.op = Onop};
353 for (u=t->use; u<&t->use[t->nuse]; u++) {
354 if (u->type == UJmp) {
355 b = fn->rpo[u->bid];
356 assert(isret(b->jmp.type));
357 b->jmp.type = Jret0;
358 b->jmp.arg = R;
359 continue;
361 assert(u->type == UIns);
362 i = u->u.ins;
363 if (!req(i->to, R)) {
364 assert(rtype(i->to) == RTmp);
365 vgrow(&stk, ++n);
366 stk[n-1] = i->to.val;
367 } else if (isarg(i->op)) {
368 assert(i->op == Oargc);
369 i->arg[1] = CON_Z; /* crash */
370 } else {
371 if (i->op == Oblit0)
372 *(i+1) = (Ins){.op = Onop};
373 *i = (Ins){.op = Onop};
377 vfree(stk);
379 /* fuse slots by decreasing size */
380 qsort(sl, nsl, sizeof *sl, scmp);
381 fused = 0;
382 for (n=0; n<nsl; n++) {
383 s0 = &sl[n];
384 if (s0->s)
385 continue;
386 s0->s = s0;
387 r = s0->r;
388 for (s=s0+1; s<&sl[nsl]; s++) {
389 if (s->s || !s->r.b)
390 goto Skip;
391 if (rovlap(r, s->r))
392 /* O(n); can be approximated
393 * by 'goto Skip;' if need be
395 for (m=n; &sl[m]<s; m++)
396 if (sl[m].s == s0)
397 if (rovlap(sl[m].r, s->r))
398 goto Skip;
399 radd(&r, s->r.a);
400 radd(&r, s->r.b - 1);
401 s->s = s0;
402 fused += s->sz;
403 Skip:;
407 /* substitute fused slots */
408 for (s=sl; s<&sl[nsl]; s++) {
409 t = &fn->tmp[s->t];
410 /* the visit link is stale,
411 * reset it before the slot()
412 * calls below
414 t->visit = s-sl;
415 assert(t->ndef == 1 && t->def);
416 if (s->s == s)
417 continue;
418 *t->def = (Ins){.op = Onop};
419 ts = &fn->tmp[s->s->t];
420 assert(t->bid == ts->bid);
421 if (t->def < ts->def) {
422 /* make sure the slot we
423 * selected has a def that
424 * dominates its new uses
426 *t->def = *ts->def;
427 *ts->def = (Ins){.op = Onop};
428 ts->def = t->def;
430 for (u=t->use; u<&t->use[t->nuse]; u++) {
431 if (u->type == UJmp) {
432 b = fn->rpo[u->bid];
433 b->jmp.arg = TMP(s->s->t);
434 continue;
436 assert(u->type == UIns);
437 arg = u->u.ins->arg;
438 for (n=0; n<2; n++)
439 if (req(arg[n], TMP(s->t)))
440 arg[n] = TMP(s->s->t);
444 /* fix newly overlapping blits */
445 for (n=0; n<nbl; n++) {
446 i = bl[n];
447 if (i->op == Oblit0)
448 if (slot(&s, &off0, i->arg[0], fn, sl))
449 if (slot(&s0, &off1, i->arg[1], fn, sl))
450 if (s->s == s0->s) {
451 if (off0 < off1) {
452 sz = rsval((i+1)->arg[0]);
453 assert(sz >= 0);
454 (i+1)->arg[0] = INT(-sz);
455 } else if (off0 == off1) {
456 *i = (Ins){.op = Onop};
457 *(i+1) = (Ins){.op = Onop};
461 vfree(bl);
463 if (debug['M']) {
464 for (s0=sl; s0<&sl[nsl]; s0++) {
465 if (s0->s != s0)
466 continue;
467 fprintf(stderr, "\tfuse (% 3db) [", s0->sz);
468 for (s=s0; s<&sl[nsl]; s++) {
469 if (s->s != s0)
470 continue;
471 fprintf(stderr, " %%%s", fn->tmp[s->t].name);
472 if (s->r.b)
473 fprintf(stderr, "[%d,%d)",
474 s->r.a-ip, s->r.b-ip);
475 else
476 fputs("{}", stderr);
478 fputs(" ]\n", stderr);
480 fprintf(stderr, "\tsums %u/%u/%u (killed/fused/total)\n\n",
481 freed, fused, total);
482 printfn(fn, stderr);
485 for (s=sl; s<&sl[nsl]; s++)
486 vfree(s->st);
487 vfree(sl);