From 626f0b278137ff6f8b7d910d9b3fc3cbdfbb39fc Mon Sep 17 00:00:00 2001 From: Quentin Carbonneaux Date: Tue, 20 Aug 2024 15:20:42 +0200 Subject: [PATCH] skip preludes for some leaf fns When rbp is not necessary to compile a leaf function, we skip saving and restoring it. --- all.h | 2 + amd64/emit.c | 238 +++++++++++++++++++++++++++++++++++------------------------ amd64/isel.c | 1 + parse.c | 2 + 4 files changed, 145 insertions(+), 98 deletions(-) diff --git a/all.h b/all.h index 3479d27..97cc41c 100644 --- a/all.h +++ b/all.h @@ -393,8 +393,10 @@ struct Fn { Blk **rpo; bits reg; int slot; + int salign; char vararg; char dynalloc; + char leaf; char name[NString]; Lnk lnk; }; diff --git a/amd64/emit.c b/amd64/emit.c index 00dd80f..8f36188 100644 --- a/amd64/emit.c +++ b/amd64/emit.c @@ -1,6 +1,16 @@ #include "all.h" +typedef struct E E; + +struct E { + FILE *f; + Fn *fn; + int fp; + uint64_t fsz; + int nclob; +}; + #define CMP(X) \ X(Ciule, "be") \ X(Ciult, "b") \ @@ -142,23 +152,29 @@ static char *rname[][4] = { static int -slot(Ref r, Fn *fn) +slot(Ref r, E *e) { int s; s = rsval(r); - assert(s <= fn->slot); + assert(s <= e->fn->slot); /* specific to NAlign == 3 */ - if (s < 0) - return -4 * s; - else if (fn->vararg) - return -176 + -4 * (fn->slot - s); + if (s < 0) { + if (e->fp == RSP) + return 4*-s - 8 + e->fsz + e->nclob*8; + else + return 4*-s; + } + else if (e->fp == RSP) + return 4*s + e->nclob*8; + else if (e->fn->vararg) + return -176 + -4 * (e->fn->slot - s); else - return -4 * (fn->slot - s); + return -4 * (e->fn->slot - s); } static void -emitcon(Con *con, FILE *f) +emitcon(Con *con, E *e) { char *p, *l; @@ -168,16 +184,16 @@ emitcon(Con *con, FILE *f) p = l[0] == '"' ? "" : T.assym; if (con->sym.type == SThr) { if (T.apple) - fprintf(f, "%s%s@TLVP", p, l); + fprintf(e->f, "%s%s@TLVP", p, l); else - fprintf(f, "%%fs:%s%s@tpoff", p, l); + fprintf(e->f, "%%fs:%s%s@tpoff", p, l); } else - fprintf(f, "%s%s", p, l); + fprintf(e->f, "%s%s", p, l); if (con->bits.i) - fprintf(f, "%+"PRId64, con->bits.i); + fprintf(e->f, "%+"PRId64, con->bits.i); break; case CBits: - fprintf(f, "%"PRId64, con->bits.i); + fprintf(e->f, "%"PRId64, con->bits.i); break; default: die("unreachable"); @@ -212,10 +228,10 @@ getarg(char c, Ins *i) } } -static void emitins(Ins, Fn *, FILE *); +static void emitins(Ins, E *); static void -emitcopy(Ref r1, Ref r2, int k, Fn *fn, FILE *f) +emitcopy(Ref r1, Ref r2, int k, E *e) { Ins icp; @@ -223,11 +239,11 @@ emitcopy(Ref r1, Ref r2, int k, Fn *fn, FILE *f) icp.arg[0] = r2; icp.to = r1; icp.cls = k; - emitins(icp, fn, f); + emitins(icp, e); } static void -emitf(char *s, Ins *i, Fn *fn, FILE *f) +emitf(char *s, Ins *i, E *e) { static char clstoa[][3] = {"l", "q", "ss", "sd"}; char c; @@ -247,25 +263,25 @@ emitf(char *s, Ins *i, Fn *fn, FILE *f) case '-': assert((!req(i->arg[1], i->to) || req(i->arg[0], i->to)) && "cannot convert to 2-address"); - emitcopy(i->to, i->arg[0], i->cls, fn, f); + emitcopy(i->to, i->arg[0], i->cls, e); s++; break; } - fputc('\t', f); + fputc('\t', e->f); Next: while ((c = *s++) != '%') if (!c) { - fputc('\n', f); + fputc('\n', e->f); return; } else - fputc(c, f); + fputc(c, e->f); switch ((c = *s++)) { case '%': - fputc('%', f); + fputc('%', e->f); break; case 'k': - fputs(clstoa[i->cls], f); + fputs(clstoa[i->cls], e->f); break; case '0': case '1': @@ -282,37 +298,42 @@ Next: switch (rtype(ref)) { case RTmp: assert(isreg(ref)); - fprintf(f, "%%%s", regtoa(ref.val, sz)); + fprintf(e->f, "%%%s", regtoa(ref.val, sz)); break; case RSlot: - fprintf(f, "%d(%%rbp)", slot(ref, fn)); + fprintf(e->f, "%d(%%%s)", + slot(ref, e), + regtoa(e->fp, SLong) + ); break; case RMem: Mem: - m = &fn->mem[ref.val]; + m = &e->fn->mem[ref.val]; if (rtype(m->base) == RSlot) { off.type = CBits; - off.bits.i = slot(m->base, fn); + off.bits.i = slot(m->base, e); addcon(&m->offset, &off, 1); - m->base = TMP(RBP); + m->base = TMP(e->fp); } if (m->offset.type != CUndef) - emitcon(&m->offset, f); - fputc('(', f); + emitcon(&m->offset, e); + fputc('(', e->f); if (!req(m->base, R)) - fprintf(f, "%%%s", regtoa(m->base.val, SLong)); + fprintf(e->f, "%%%s", + regtoa(m->base.val, SLong) + ); else if (m->offset.type == CAddr) - fprintf(f, "%%rip"); + fprintf(e->f, "%%rip"); if (!req(m->index, R)) - fprintf(f, ", %%%s, %d", + fprintf(e->f, ", %%%s, %d", regtoa(m->index.val, SLong), m->scale ); - fputc(')', f); + fputc(')', e->f); break; case RCon: - fputc('$', f); - emitcon(&fn->con[ref.val], f); + fputc('$', e->f); + emitcon(&e->fn->con[ref.val], e); break; default: die("unreachable"); @@ -337,18 +358,21 @@ Next: case RMem: goto Mem; case RSlot: - fprintf(f, "%d(%%rbp)", slot(ref, fn)); + fprintf(e->f, "%d(%%%s)", + slot(ref, e), + regtoa(e->fp, SLong) + ); break; case RCon: - off = fn->con[ref.val]; - emitcon(&off, f); + off = e->fn->con[ref.val]; + emitcon(&off, e); if (off.type == CAddr) if (off.sym.type != SThr || T.apple) - fprintf(f, "(%%rip)"); + fprintf(e->f, "(%%rip)"); break; case RTmp: assert(isreg(ref)); - fprintf(f, "(%%%s)", regtoa(ref.val, SLong)); + fprintf(e->f, "(%%%s)", regtoa(ref.val, SLong)); break; default: die("unreachable"); @@ -366,7 +390,7 @@ static void *negmask[4] = { }; static void -emitins(Ins i, Fn *fn, FILE *f) +emitins(Ins i, E *e) { Ref r; int64_t val; @@ -393,7 +417,7 @@ emitins(Ins i, Fn *fn, FILE *f) || (omap[o].cls == Ka)) break; } - emitf(omap[o].fmt, &i, fn, f); + emitf(omap[o].fmt, &i, e); break; case Onop: /* just do nothing for nops, they are inserted @@ -410,7 +434,7 @@ emitins(Ins i, Fn *fn, FILE *f) if (KBASE(i.cls) == 0 /* only available for ints */ && rtype(i.arg[0]) == RCon && rtype(i.arg[1]) == RTmp) { - emitf("imul%k %0, %1, %=", &i, fn, f); + emitf("imul%k %0, %1, %=", &i, e); break; } goto Table; @@ -419,18 +443,18 @@ emitins(Ins i, Fn *fn, FILE *f) * some 3-address subtractions */ if (req(i.to, i.arg[1]) && !req(i.arg[0], i.to)) { ineg = (Ins){Oneg, i.cls, i.to, {i.to}}; - emitins(ineg, fn, f); - emitf("add%k %0, %=", &i, fn, f); + emitins(ineg, e); + emitf("add%k %0, %=", &i, e); break; } goto Table; case Oneg: if (!req(i.to, i.arg[0])) - emitf("mov%k %0, %=", &i, fn, f); + emitf("mov%k %0, %=", &i, e); if (KBASE(i.cls) == 0) - emitf("neg%k %=", &i, fn, f); + emitf("neg%k %=", &i, e); else - fprintf(f, + fprintf(e->f, "\txorp%c %sfp%d(%%rip), %%%s\n", "xxsd"[i.cls], T.asloc, @@ -443,8 +467,8 @@ emitins(Ins i, Fn *fn, FILE *f) * conversion to 2-address in emitf() would fail */ if (req(i.to, i.arg[1])) { i.arg[1] = TMP(XMM0+15); - emitf("mov%k %=, %1", &i, fn, f); - emitf("mov%k %0, %=", &i, fn, f); + emitf("mov%k %=, %1", &i, e); + emitf("mov%k %0, %=", &i, e); i.arg[0] = i.to; } goto Table; @@ -460,53 +484,54 @@ emitins(Ins i, Fn *fn, FILE *f) t0 = rtype(i.arg[0]); if (i.cls == Kl && t0 == RCon - && fn->con[i.arg[0].val].type == CBits) { - val = fn->con[i.arg[0].val].bits.i; + && e->fn->con[i.arg[0].val].type == CBits) { + val = e->fn->con[i.arg[0].val].bits.i; if (isreg(i.to)) if (val >= 0 && val <= UINT32_MAX) { - emitf("movl %W0, %W=", &i, fn, f); + emitf("movl %W0, %W=", &i, e); break; } if (rtype(i.to) == RSlot) if (val < INT32_MIN || val > INT32_MAX) { - emitf("movl %0, %=", &i, fn, f); - emitf("movl %0>>32, 4+%=", &i, fn, f); + emitf("movl %0, %=", &i, e); + emitf("movl %0>>32, 4+%=", &i, e); break; } } if (isreg(i.to) && t0 == RCon - && fn->con[i.arg[0].val].type == CAddr) { - emitf("lea%k %M0, %=", &i, fn, f); + && e->fn->con[i.arg[0].val].type == CAddr) { + emitf("lea%k %M0, %=", &i, e); break; } if (rtype(i.to) == RSlot && (t0 == RSlot || t0 == RMem)) { i.cls = KWIDE(i.cls) ? Kd : Ks; i.arg[1] = TMP(XMM0+15); - emitf("mov%k %0, %1", &i, fn, f); - emitf("mov%k %1, %=", &i, fn, f); + emitf("mov%k %0, %1", &i, e); + emitf("mov%k %1, %=", &i, e); break; } /* conveniently, the assembler knows if it * should use movabsq when reading movq */ - emitf("mov%k %0, %=", &i, fn, f); + emitf("mov%k %0, %=", &i, e); break; case Oaddr: if (!T.apple && rtype(i.arg[0]) == RCon - && fn->con[i.arg[0].val].sym.type == SThr) { + && e->fn->con[i.arg[0].val].sym.type == SThr) { /* derive the symbol address from the TCB * address at offset 0 of %fs */ assert(isreg(i.to)); - con = &fn->con[i.arg[0].val]; + con = &e->fn->con[i.arg[0].val]; sym = str(con->sym.id); - emitf("movq %%fs:0, %L=", &i, fn, f); - fprintf(f, "\tleaq %s%s@tpoff", + emitf("movq %%fs:0, %L=", &i, e); + fprintf(e->f, "\tleaq %s%s@tpoff", sym[0] == '"' ? "" : T.assym, sym); if (con->bits.i) - fprintf(f, "%+"PRId64, con->bits.i); - fprintf(f, "(%%%s), %%%s\n", + fprintf(e->f, "%+"PRId64, + con->bits.i); + fprintf(e->f, "(%%%s), %%%s\n", regtoa(i.to.val, SLong), regtoa(i.to.val, SLong)); break; @@ -517,12 +542,12 @@ emitins(Ins i, Fn *fn, FILE *f) * assembly... */ switch (rtype(i.arg[0])) { case RCon: - fprintf(f, "\tcallq "); - emitcon(&fn->con[i.arg[0].val], f); - fprintf(f, "\n"); + fprintf(e->f, "\tcallq "); + emitcon(&e->fn->con[i.arg[0].val], e); + fprintf(e->f, "\n"); break; case RTmp: - emitf("callq *%L0", &i, fn, f); + emitf("callq *%L0", &i, e); break; default: die("invalid call argument"); @@ -533,9 +558,10 @@ emitins(Ins i, Fn *fn, FILE *f) * maybe we should split Osalloc in 2 different * instructions depending on the result */ - emitf("subq %L0, %%rsp", &i, fn, f); + assert(e->fp == RBP); + emitf("subq %L0, %%rsp", &i, e); if (!req(i.to, R)) - emitcopy(i.to, TMP(RSP), Kl, fn, f); + emitcopy(i.to, TMP(RSP), Kl, e); break; case Oswap: if (KBASE(i.cls) == 0) @@ -543,27 +569,35 @@ emitins(Ins i, Fn *fn, FILE *f) /* for floats, there is no swap instruction * so we use xmm15 as a temporary */ - emitcopy(TMP(XMM0+15), i.arg[0], i.cls, fn, f); - emitcopy(i.arg[0], i.arg[1], i.cls, fn, f); - emitcopy(i.arg[1], TMP(XMM0+15), i.cls, fn, f); + emitcopy(TMP(XMM0+15), i.arg[0], i.cls, e); + emitcopy(i.arg[0], i.arg[1], i.cls, e); + emitcopy(i.arg[1], TMP(XMM0+15), i.cls, e); break; case Odbgloc: - emitdbgloc(i.arg[0].val, i.arg[1].val, f); + emitdbgloc(i.arg[0].val, i.arg[1].val, e->f); break; } } -static uint64_t -framesz(Fn *fn) +static void +framesz(E *e) { uint64_t i, o, f; /* specific to NAlign == 3 */ - for (i=0, o=0; ireg >> amd64_sysv_rclob[i]); - f = fn->slot; + o = 0; + if (!e->fn->leaf) { + for (i=0, o=0; ifn->reg >> amd64_sysv_rclob[i]; + o &= 1; + } + f = e->fn->slot; f = (f + 3) & -4; - return 4*f + 8*o + 176*fn->vararg; + if (f > 0 + && e->fp == RSP + && e->fn->salign == 4) + f += 2; + e->fsz = 4*f + 8*o + 176*e->fn->vararg; } void @@ -578,13 +612,19 @@ amd64_emitfn(Fn *fn, FILE *f) Blk *b, *s; Ins *i, itmp; int *r, c, o, n, lbl; - uint64_t fs; + E *e; + e = &(E){.f = f, .fn = fn}; emitfnlnk(fn->name, &fn->lnk, f); - fputs("\tendbr64\n\tpushq %rbp\n\tmovq %rsp, %rbp\n", f); - fs = framesz(fn); - if (fs) - fprintf(f, "\tsubq $%"PRIu64", %%rsp\n", fs); + fputs("\tendbr64\n", f); + if (!fn->leaf || fn->vararg || fn->dynalloc) { + e->fp = RBP; + fputs("\tpushq %rbp\n\tmovq %rsp, %rbp\n", f); + } else + e->fp = RSP; + framesz(e); + if (e->fsz) + fprintf(f, "\tsubq $%"PRIu64", %%rsp\n", e->fsz); if (fn->vararg) { o = -176; for (r=amd64_sysv_rsave; r<&amd64_sysv_rsave[6]; r++, o+=8) @@ -595,15 +635,15 @@ amd64_emitfn(Fn *fn, FILE *f) for (r=amd64_sysv_rclob; r<&amd64_sysv_rclob[NCLR]; r++) if (fn->reg & BIT(*r)) { itmp.arg[0] = TMP(*r); - emitf("pushq %L0", &itmp, fn, f); - fs += 8; + emitf("pushq %L0", &itmp, e); + e->nclob++; } for (lbl=0, b=fn->start; b; b=b->link) { if (lbl || b->npred > 1) fprintf(f, "%sbb%d:\n", T.asloc, id0+b->id); for (i=b->ins; i!=&b->ins[b->nins]; i++) - emitins(*i, fn, f); + emitins(*i, e); lbl = 1; switch (b->jmp.type) { case Jhlt: @@ -614,17 +654,19 @@ amd64_emitfn(Fn *fn, FILE *f) fprintf(f, "\tmovq %%rbp, %%rsp\n" "\tsubq $%"PRIu64", %%rsp\n", - fs - ); + e->fsz + e->nclob * 8); for (r=&amd64_sysv_rclob[NCLR]; r>amd64_sysv_rclob;) if (fn->reg & BIT(*--r)) { itmp.arg[0] = TMP(*r); - emitf("popq %L0", &itmp, fn, f); + emitf("popq %L0", &itmp, e); } - fprintf(f, - "\tleave\n" - "\tret\n" - ); + if (e->fp == RBP) + fputs("\tleave\n", f); + else if (e->fsz) + fprintf(f, + "\taddq $%"PRIu64", %%rsp\n", + e->fsz); + fputs("\tret\n", f); break; case Jjmp: Jmp: diff --git a/amd64/isel.c b/amd64/isel.c index 2b92878..bd645ce 100644 --- a/amd64/isel.c +++ b/amd64/isel.c @@ -808,6 +808,7 @@ amd64_isel(Fn *fn) die("alloc too large"); fn->tmp[i->to.val].slot = fn->slot; fn->slot += sz; + fn->salign = 2 + al - Oalloc; *i = (Ins){.op = Onop}; } diff --git a/parse.c b/parse.c index caab452..e896679 100644 --- a/parse.c +++ b/parse.c @@ -694,6 +694,7 @@ parseline(PState ps) goto Ins; } if (op == Tcall) { + curf->leaf = 0; arg[0] = parseref(); parserefl(1); op = Ocall; @@ -910,6 +911,7 @@ parsefn(Lnk *lnk) curf->con[0].bits.i = 0xdeaddead; /* UNDEF */ curf->con[1].type = CBits; curf->lnk = *lnk; + curf->leaf = 1; blink = &curf->start; curf->retty = Kx; if (peek() != Tglo) -- 2.11.4.GIT