/* qbe: amd64/sysv.c */
1 #include "all.h"
3 typedef struct AClass AClass;
4 typedef struct RAlloc RAlloc;
6 struct AClass {
7 Typ *type;
8 int inmem;
9 int align;
10 uint size;
11 int cls[2];
12 Ref ref[2];
15 struct RAlloc {
16 Ins i;
17 RAlloc *link;
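/* note: AClass records how one value is passed or returned: inmem is
 * set when it must go through the stack, and cls[0]/cls[1] hold the
 * register class of each eightbyte of an aggregate up to 16 bytes.
 * classify() below implements the SysV merging rule: an eightbyte is
 * sse (Kd) only if every field in it is a float; any integer field
 * forces the whole eightbyte into a gp register (Kl). */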
static void
classify(AClass *a, Typ *t, uint s)
{
	Field *f;
	int *cls;
	uint n, s1;

	for (n=0, s1=s; n<t->nunion; n++, s=s1)
		for (f=t->fields[n]; f->type!=FEnd; f++) {
			assert(s <= 16);
			cls = &a->cls[s/8];
			switch (f->type) {
			case FEnd:
				die("unreachable");
			case FPad:
				/* don't change anything */
				s += f->len;
				break;
			case Fs:
			case Fd:
				if (*cls == Kx)
					*cls = Kd;
				s += f->len;
				break;
			case Fb:
			case Fh:
			case Fw:
			case Fl:
				*cls = Kl;
				s += f->len;
				break;
			case FTyp:
				classify(a, &typ[f->len], s);
				s += typ[f->len].size;
				break;
			}
		}
}
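/* for example, struct { float x; float y; } fills one eightbyte with
 * cls[0] == Kd and travels in an sse register, while
 * struct { float x; int y; } gets cls[0] == Kl because the integer
 * member taints the eightbyte. */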
static void
typclass(AClass *a, Typ *t)
{
	uint sz, al;

	sz = t->size;
	al = 1u << t->align;

	/* the ABI requires sizes to be rounded
	 * up to the nearest multiple of 8, moreover
	 * it makes it easy to load and store structures
	 * in registers
	 */
	if (al < 8)
		al = 8;
	sz = (sz + al-1) & -al;

	a->type = t;
	a->size = sz;
	a->align = t->align;

	if (t->isdark || sz > 16 || sz == 0) {
		/* large or unaligned structures are
		 * required to be passed in memory
		 */
		a->inmem = 1;
		return;
	}

	a->cls[0] = Kx;
	a->cls[1] = Kx;
	a->inmem = 0;
	classify(a, t, 0);
}
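/* retr() picks the return registers for a small aggregate and encodes
 * the choice in the RCall format documented further down: bits 0-1
 * count gp results (rax, then rdx), bits 2-3 count sse results
 * (xmm0, then xmm1). */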
static int
retr(Ref reg[2], AClass *aret)
{
	static int retreg[2][2] = {{RAX, RDX}, {XMM0, XMM0+1}};
	int n, k, ca, nr[2];

	nr[0] = nr[1] = 0;
	ca = 0;
	for (n=0; (uint)n*8<aret->size; n++) {
		k = KBASE(aret->cls[n]);
		reg[n] = TMP(retreg[k][nr[k]++]);
		ca += 1 << (2 * k);
	}
	return ca;
}
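/* selret() rewrites the return of a block in place: the jump becomes a
 * plain Jret0 and explicit copies (or, for in-memory aggregates, a
 * blit through the hidden pointer) move the value into the ABI return
 * registers; the register counts are stashed in jmp.arg. */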
static void
selret(Blk *b, Fn *fn)
{
	int j, k, ca;
	Ref r, r0, reg[2];
	AClass aret;

	j = b->jmp.type;

	if (!isret(j) || j == Jret0)
		return;

	r0 = b->jmp.arg;
	b->jmp.type = Jret0;

	if (j == Jretc) {
		typclass(&aret, &typ[fn->retty]);
		if (aret.inmem) {
			assert(rtype(fn->retr) == RTmp);
			emit(Ocopy, Kl, TMP(RAX), fn->retr, R);
			emit(Oblit1, 0, R, INT(aret.type->size), R);
			emit(Oblit0, 0, R, r0, fn->retr);
			ca = 1;
		} else {
			ca = retr(reg, &aret);
			if (aret.size > 8) {
				r = newtmp("abi", Kl, fn);
				emit(Oload, Kl, reg[1], r, R);
				emit(Oadd, Kl, r, r0, getcon(8, fn));
			}
			emit(Oload, Kl, reg[0], r0, R);
		}
	} else {
		k = j - Jretw;
		if (KBASE(k) == 0) {
			emit(Ocopy, k, TMP(RAX), r0, R);
			ca = 1;
		} else {
			emit(Ocopy, k, TMP(XMM0), r0, R);
			ca = 1 << 2;
		}
	}

	b->jmp.arg = CALL(ca);
}
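/* argsclass() runs the classification over a run of arg/par
 * instructions, spilling to the stack once the six gp or eight sse
 * argument registers run out, and returns the register counts packed
 * in the RCall layout documented below. */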
static int
argsclass(Ins *i0, Ins *i1, AClass *ac, int op, AClass *aret, Ref *env)
{
	int varc, envc, nint, ni, nsse, ns, n, *pn;
	AClass *a;
	Ins *i;

	if (aret && aret->inmem)
		nint = 5; /* hidden argument */
	else
		nint = 6;
	nsse = 8;
	varc = 0;
	envc = 0;
	for (i=i0, a=ac; i<i1; i++, a++)
		switch (i->op - op + Oarg) {
		case Oarg:
			if (KBASE(i->cls) == 0)
				pn = &nint;
			else
				pn = &nsse;
			if (*pn > 0) {
				--*pn;
				a->inmem = 0;
			} else
				a->inmem = 2;
			a->align = 3;
			a->size = 8;
			a->cls[0] = i->cls;
			break;
		case Oargc:
			n = i->arg[0].val;
			typclass(a, &typ[n]);
			if (a->inmem)
				continue;
			ni = ns = 0;
			for (n=0; (uint)n*8<a->size; n++)
				if (KBASE(a->cls[n]) == 0)
					ni++;
				else
					ns++;
			if (nint >= ni && nsse >= ns) {
				nint -= ni;
				nsse -= ns;
			} else
				a->inmem = 1;
			break;
		case Oarge:
			envc = 1;
			if (op == Opar)
				*env = i->to;
			else
				*env = i->arg[0];
			break;
		case Oargv:
			varc = 1;
			break;
		default:
			die("unreachable");
		}

	if (varc && envc)
		err("sysv abi does not support variadic env calls");

	return ((varc|envc) << 12) | ((6-nint) << 4) | ((8-nsse) << 8);
}
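/* note: the first six entries of amd64_sysv_rsave are the gp argument
 * registers in ABI order (rdi, rsi, rdx, rcx, r8, r9), which is what
 * lets rarg() below index straight into the table. */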
int amd64_sysv_rsave[] = {
	RDI, RSI, RDX, RCX, R8, R9, R10, R11, RAX,
	XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
	XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, -1
};
int amd64_sysv_rclob[] = {RBX, R12, R13, R14, R15, -1};

MAKESURE(sysv_arrays_ok,
	sizeof amd64_sysv_rsave == (NGPS+NFPS+1) * sizeof(int) &&
	sizeof amd64_sysv_rclob == (NCLR+1) * sizeof(int)
);
/* layout of call's second argument (RCall)
 *
 *  29     12    8    4  3  0
 *  |0...00|x|xxxx|xxxx|xx|xx|                  range
 *          |    |    |  |  ` gp regs returned (0..2)
 *          |    |    |  ` sse regs returned   (0..2)
 *          |    |    ` gp regs passed         (0..6)
 *          |    ` sse regs passed             (0..8)
 *          ` 1 if rax is used to pass data    (0..1)
 */
bits
amd64_sysv_retregs(Ref r, int p[2])
{
	bits b;
	int ni, nf;

	assert(rtype(r) == RCall);
	b = 0;
	ni = r.val & 3;
	nf = (r.val >> 2) & 3;
	if (ni >= 1)
		b |= BIT(RAX);
	if (ni >= 2)
		b |= BIT(RDX);
	if (nf >= 1)
		b |= BIT(XMM0);
	if (nf >= 2)
		b |= BIT(XMM1);
	if (p) {
		p[0] = ni;
		p[1] = nf;
	}
	return b;
}
bits
amd64_sysv_argregs(Ref r, int p[2])
{
	bits b;
	int j, ni, nf, ra;

	assert(rtype(r) == RCall);
	b = 0;
	ni = (r.val >> 4) & 15;
	nf = (r.val >> 8) & 15;
	ra = (r.val >> 12) & 1;
	for (j=0; j<ni; j++)
		b |= BIT(amd64_sysv_rsave[j]);
	for (j=0; j<nf; j++)
		b |= BIT(XMM0+j);
	if (p) {
		p[0] = ni + ra;
		p[1] = nf;
	}
	return b | (ra ? BIT(RAX) : 0);
}
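/* rarg() hands out the next free argument register of the requested
 * class, bumping the gp or sse counter as it goes. */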
static Ref
rarg(int ty, int *ni, int *ns)
{
	if (KBASE(ty) == 0)
		return TMP(amd64_sysv_rsave[(*ni)++]);
	else
		return TMP(XMM0 + (*ns)++);
}
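/* note: emit() prepends at curi, so selcall() builds the lowered
 * sequence bottom-up: the Ocall appears before the argument copies in
 * the code below, yet executes after them. */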
static void
selcall(Fn *fn, Ins *i0, Ins *i1, RAlloc **rap)
{
	Ins *i;
	AClass *ac, *a, aret;
	int ca, ni, ns, al;
	uint stk, off;
	Ref r, r1, r2, reg[2], env;
	RAlloc *ra;

	env = R;
	ac = alloc((i1-i0) * sizeof ac[0]);

	if (!req(i1->arg[1], R)) {
		assert(rtype(i1->arg[1]) == RType);
		typclass(&aret, &typ[i1->arg[1].val]);
		ca = argsclass(i0, i1, ac, Oarg, &aret, &env);
	} else
		ca = argsclass(i0, i1, ac, Oarg, 0, &env);

	for (stk=0, a=&ac[i1-i0]; a>ac;)
		if ((--a)->inmem) {
			if (a->align > 4)
				err("sysv abi requires alignments of 16 or less");
			stk += a->size;
			if (a->align == 4)
				stk += stk & 15;
		}
	stk += stk & 15;
	if (stk) {
		r = getcon(-(int64_t)stk, fn);
		emit(Osalloc, Kl, R, r, R);
	}

	if (!req(i1->arg[1], R)) {
		if (aret.inmem) {
			/* get the return location from rax,
			 * it saves one callee-save reg */
			r1 = newtmp("abi", Kl, fn);
			emit(Ocopy, Kl, i1->to, TMP(RAX), R);
			ca += 1;
		} else {
			/* todo, may read out of bounds.
			 * gcc did this up until 5.2, but
			 * this should still be fixed.
			 */
			if (aret.size > 8) {
				r = newtmp("abi", Kl, fn);
				aret.ref[1] = newtmp("abi", aret.cls[1], fn);
				emit(Ostorel, 0, R, aret.ref[1], r);
				emit(Oadd, Kl, r, i1->to, getcon(8, fn));
			}
			aret.ref[0] = newtmp("abi", aret.cls[0], fn);
			emit(Ostorel, 0, R, aret.ref[0], i1->to);
			ca += retr(reg, &aret);
			if (aret.size > 8)
				emit(Ocopy, aret.cls[1], aret.ref[1], reg[1], R);
			emit(Ocopy, aret.cls[0], aret.ref[0], reg[0], R);
			r1 = i1->to;
		}
		/* allocate return pad */
		ra = alloc(sizeof *ra);
		/* specific to NAlign == 3 */
		al = aret.align >= 2 ? aret.align - 2 : 0;
		ra->i = (Ins){Oalloc+al, Kl, r1, {getcon(aret.size, fn)}};
		ra->link = (*rap);
		*rap = ra;
	} else {
		ra = 0;
		if (KBASE(i1->cls) == 0) {
			emit(Ocopy, i1->cls, i1->to, TMP(RAX), R);
			ca += 1;
		} else {
			emit(Ocopy, i1->cls, i1->to, TMP(XMM0), R);
			ca += 1 << 2;
		}
	}

	emit(Ocall, i1->cls, R, i1->arg[0], CALL(ca));

	if (!req(R, env))
		emit(Ocopy, Kl, TMP(RAX), env, R);
	else if ((ca >> 12) & 1) /* vararg call */
		emit(Ocopy, Kw, TMP(RAX), getcon((ca >> 8) & 15, fn), R);

	ni = ns = 0;
	if (ra && aret.inmem)
		emit(Ocopy, Kl, rarg(Kl, &ni, &ns), ra->i.to, R); /* pass hidden argument */

	for (i=i0, a=ac; i<i1; i++, a++) {
		if (i->op >= Oarge || a->inmem)
			continue;
		r1 = rarg(a->cls[0], &ni, &ns);
		if (i->op == Oargc) {
			if (a->size > 8) {
				r2 = rarg(a->cls[1], &ni, &ns);
				r = newtmp("abi", Kl, fn);
				emit(Oload, a->cls[1], r2, r, R);
				emit(Oadd, Kl, r, i->arg[1], getcon(8, fn));
			}
			emit(Oload, a->cls[0], r1, i->arg[1], R);
		} else
			emit(Ocopy, i->cls, r1, i->arg[0], R);
	}

	if (!stk)
		return;

	r = newtmp("abi", Kl, fn);
	for (i=i0, a=ac, off=0; i<i1; i++, a++) {
		if (i->op >= Oarge || !a->inmem)
			continue;
		r1 = newtmp("abi", Kl, fn);
		if (i->op == Oargc) {
			if (a->align == 4)
				off += off & 15;
			emit(Oblit1, 0, R, INT(a->type->size), R);
			emit(Oblit0, 0, R, i->arg[1], r1);
		} else
			emit(Ostorel, 0, R, i->arg[0], r1);
		emit(Oadd, Kl, r1, r, getcon(off, fn));
		off += a->size;
	}
	emit(Osalloc, Kl, r, getcon(stk, fn), R);
}
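/* selpar() mirrors selcall() on the callee side: register parameters
 * are copied out of their ABI registers, and in-memory parameters get
 * negative slot numbers pointing into the caller's frame. */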
static int
selpar(Fn *fn, Ins *i0, Ins *i1)
{
	AClass *ac, *a, aret;
	Ins *i;
	int ni, ns, s, al, fa;
	Ref r, env;

	env = R;
	ac = alloc((i1-i0) * sizeof ac[0]);
	curi = &insb[NIns];
	ni = ns = 0;

	if (fn->retty >= 0) {
		typclass(&aret, &typ[fn->retty]);
		fa = argsclass(i0, i1, ac, Opar, &aret, &env);
	} else
		fa = argsclass(i0, i1, ac, Opar, 0, &env);
	fn->reg = amd64_sysv_argregs(CALL(fa), 0);

	for (i=i0, a=ac; i<i1; i++, a++) {
		if (i->op != Oparc || a->inmem)
			continue;
		if (a->size > 8) {
			r = newtmp("abi", Kl, fn);
			a->ref[1] = newtmp("abi", Kl, fn);
			emit(Ostorel, 0, R, a->ref[1], r);
			emit(Oadd, Kl, r, i->to, getcon(8, fn));
		}
		a->ref[0] = newtmp("abi", Kl, fn);
		emit(Ostorel, 0, R, a->ref[0], i->to);
		/* specific to NAlign == 3 */
		al = a->align >= 2 ? a->align - 2 : 0;
		emit(Oalloc+al, Kl, i->to, getcon(a->size, fn), R);
	}

	if (fn->retty >= 0 && aret.inmem) {
		r = newtmp("abi", Kl, fn);
		emit(Ocopy, Kl, r, rarg(Kl, &ni, &ns), R);
		fn->retr = r;
	}

	for (i=i0, a=ac, s=4; i<i1; i++, a++) {
		switch (a->inmem) {
		case 1:
			if (a->align > 4)
				err("sysv abi requires alignments of 16 or less");
			if (a->align == 4)
				s = (s+3) & -4;
			fn->tmp[i->to.val].slot = -s;
			s += a->size / 4;
			continue;
		case 2:
			emit(Oload, i->cls, i->to, SLOT(-s), R);
			s += 2;
			continue;
		}
		if (i->op == Opare)
			continue;
		r = rarg(a->cls[0], &ni, &ns);
		if (i->op == Oparc) {
			emit(Ocopy, a->cls[0], a->ref[0], r, R);
			if (a->size > 8) {
				r = rarg(a->cls[1], &ni, &ns);
				emit(Ocopy, a->cls[1], a->ref[1], r, R);
			}
		} else
			emit(Ocopy, i->cls, i->to, r, R);
	}

	if (!req(R, env))
		emit(Ocopy, Kl, env, TMP(RAX), R);

	return fa | (s*4)<<12;
}
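/* split() moves the instructions emitted so far into a fresh block
 * linked right after b; selvaarg() uses it to carve the three-way
 * branch it needs out of a single straight-line block. */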
static Blk *
split(Fn *fn, Blk *b)
{
	Blk *bn;

	++fn->nblk;
	bn = newblk();
	bn->nins = &insb[NIns] - curi;
	idup(&bn->ins, curi, bn->nins);
	curi = &insb[NIns];
	bn->visit = ++b->visit;
	strf(bn->name, "%s.%d", b->name, b->visit);
	bn->loop = b->loop;
	bn->link = b->link;
	b->link = bn;
	return bn;
}
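/* chpred() patches the phi nodes of b after its predecessor bp has
 * been renamed to bp1 by a split. */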
static void
chpred(Blk *b, Blk *bp, Blk *bp1)
{
	Phi *p;
	uint a;

	for (p=b->phi; p; p=p->link) {
		for (a=0; p->blk[a]!=bp; a++)
			assert(a+1<p->narg);
		p->blk[a] = bp1;
	}
}
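/* note: the SysV va_list is { uint gp_offset; uint fp_offset;
 * void *overflow_arg_area; void *reg_save_area; }, so the offsets
 * 0, 4, 8, and 16 used below address these four fields; gp_offset
 * tops out at 48 (6 gp regs * 8) and fp_offset at 176 (48 + 8 sse
 * regs * 16). */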
static void
selvaarg(Fn *fn, Blk *b, Ins *i)
{
	Ref loc, lreg, lstk, nr, r0, r1, c4, c8, c16, c, ap;
	Blk *b0, *bstk, *breg;
	int isint;

	c4 = getcon(4, fn);
	c8 = getcon(8, fn);
	c16 = getcon(16, fn);
	ap = i->arg[0];
	isint = KBASE(i->cls) == 0;

	/* @b [...]
	       r0 =l add ap, (0 or 4)
	       nr =l loadsw r0
	       r1 =w cultw nr, (48 or 176)
	       jnz r1, @breg, @bstk
	   @breg
	       r0 =l add ap, 16
	       r1 =l loadl r0
	       lreg =l add r1, nr
	       r0 =w add nr, (8 or 16)
	       r1 =l add ap, (0 or 4)
	       storew r0, r1
	   @bstk
	       r0 =l add ap, 8
	       lstk =l loadl r0
	       r1 =l add lstk, 8
	       storel r1, r0
	   @b0
	       %loc =l phi @breg %lreg, @bstk %lstk
	       i->to =(i->cls) load %loc
	*/

	loc = newtmp("abi", Kl, fn);
	emit(Oload, i->cls, i->to, loc, R);
	b0 = split(fn, b);
	b0->jmp = b->jmp;
	b0->s1 = b->s1;
	b0->s2 = b->s2;
	if (b->s1)
		chpred(b->s1, b, b0);
	if (b->s2 && b->s2 != b->s1)
		chpred(b->s2, b, b0);

	lreg = newtmp("abi", Kl, fn);
	nr = newtmp("abi", Kl, fn);
	r0 = newtmp("abi", Kw, fn);
	r1 = newtmp("abi", Kl, fn);
	emit(Ostorew, Kw, R, r0, r1);
	emit(Oadd, Kl, r1, ap, isint ? CON_Z : c4);
	emit(Oadd, Kw, r0, nr, isint ? c8 : c16);
	r0 = newtmp("abi", Kl, fn);
	r1 = newtmp("abi", Kl, fn);
	emit(Oadd, Kl, lreg, r1, nr);
	emit(Oload, Kl, r1, r0, R);
	emit(Oadd, Kl, r0, ap, c16);
	breg = split(fn, b);
	breg->jmp.type = Jjmp;
	breg->s1 = b0;

	lstk = newtmp("abi", Kl, fn);
	r0 = newtmp("abi", Kl, fn);
	r1 = newtmp("abi", Kl, fn);
	emit(Ostorel, Kw, R, r1, r0);
	emit(Oadd, Kl, r1, lstk, c8);
	emit(Oload, Kl, lstk, r0, R);
	emit(Oadd, Kl, r0, ap, c8);
	bstk = split(fn, b);
	bstk->jmp.type = Jjmp;
	bstk->s1 = b0;

	b0->phi = alloc(sizeof *b0->phi);
	*b0->phi = (Phi){
		.cls = Kl, .to = loc,
		.narg = 2,
		.blk = vnew(2, sizeof b0->phi->blk[0], PFn),
		.arg = vnew(2, sizeof b0->phi->arg[0], PFn),
	};
	b0->phi->blk[0] = bstk;
	b0->phi->blk[1] = breg;
	b0->phi->arg[0] = lstk;
	b0->phi->arg[1] = lreg;
	r0 = newtmp("abi", Kl, fn);
	r1 = newtmp("abi", Kw, fn);
	b->jmp.type = Jjnz;
	b->jmp.arg = r1;
	b->s1 = breg;
	b->s2 = bstk;
	c = getcon(isint ? 48 : 176, fn);
	emit(Ocmpw+Ciult, Kw, r1, nr, c);
	emit(Oloadsw, Kl, nr, r0, R);
	emit(Oadd, Kl, r0, ap, isint ? CON_Z : c4);
}
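/* selvastart() fills in the va_list: gp holds the byte offset of the
 * first unnamed gp argument into the register save area (placed at
 * -176(%rbp) below), fp the same for sse arguments, and sp locates
 * the overflow area in the caller's frame. */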
static void
selvastart(Fn *fn, int fa, Ref ap)
{
	Ref r0, r1;
	int gp, fp, sp;

	gp = ((fa >> 4) & 15) * 8;
	fp = 48 + ((fa >> 8) & 15) * 16;
	sp = fa >> 12;
	r0 = newtmp("abi", Kl, fn);
	r1 = newtmp("abi", Kl, fn);
	emit(Ostorel, Kw, R, r1, r0);
	emit(Oadd, Kl, r1, TMP(RBP), getcon(-176, fn));
	emit(Oadd, Kl, r0, ap, getcon(16, fn));
	r0 = newtmp("abi", Kl, fn);
	r1 = newtmp("abi", Kl, fn);
	emit(Ostorel, Kw, R, r1, r0);
	emit(Oadd, Kl, r1, TMP(RBP), getcon(sp, fn));
	emit(Oadd, Kl, r0, ap, getcon(8, fn));
	r0 = newtmp("abi", Kl, fn);
	emit(Ostorew, Kw, R, getcon(fp, fn), r0);
	emit(Oadd, Kl, r0, ap, getcon(4, fn));
	emit(Ostorew, Kw, R, getcon(gp, fn), ap);
}
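/* amd64_sysv_abi() is the pass entry point: it lowers parameters in
 * the start block first, then walks the remaining blocks, handling the
 * start block last so the return pads accumulated in ral end up
 * allocated in the entry block. */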
void
amd64_sysv_abi(Fn *fn)
{
	Blk *b;
	Ins *i, *i0, *ip;
	RAlloc *ral;
	int n, fa;

	for (b=fn->start; b; b=b->link)
		b->visit = 0;

	/* lower parameters */
	for (b=fn->start, i=b->ins; i<&b->ins[b->nins]; i++)
		if (!ispar(i->op))
			break;
	fa = selpar(fn, b->ins, i);
	n = b->nins - (i - b->ins) + (&insb[NIns] - curi);
	i0 = alloc(n * sizeof(Ins));
	ip = icpy(ip = i0, curi, &insb[NIns] - curi);
	ip = icpy(ip, i, &b->ins[b->nins] - i);
	b->nins = n;
	b->ins = i0;

	/* lower calls, returns, and vararg instructions */
	ral = 0;
	b = fn->start;
	do {
		if (!(b = b->link))
			b = fn->start; /* do it last */
		if (b->visit)
			continue;
		curi = &insb[NIns];
		selret(b, fn);
		for (i=&b->ins[b->nins]; i!=b->ins;)
			switch ((--i)->op) {
			default:
				emiti(*i);
				break;
			case Ocall:
				for (i0=i; i0>b->ins; i0--)
					if (!isarg((i0-1)->op))
						break;
				selcall(fn, i0, i, &ral);
				i = i0;
				break;
			case Ovastart:
				selvastart(fn, fa, i->arg[0]);
				break;
			case Ovaarg:
				selvaarg(fn, b, i);
				break;
			case Oarg:
			case Oargc:
				die("unreachable");
			}
		if (b == fn->start)
			for (; ral; ral=ral->link)
				emiti(ral->i);
		b->nins = &insb[NIns] - curi;
		idup(&b->ins, curi, b->nins);
	} while (b != fn->start);

	if (debug['A']) {
		fprintf(stderr, "\n> After ABI lowering:\n");
		printfn(fn, stderr);
	}
}