zeta/parse.c

   1 // Parsing using grammars.
   2
   3 #include "a.h"
   4 #include "type.h"
   5 #include "runtime.h"
   6 #include "grammar.h"
   7 #include "listimpl.h"
   8
   9 int FLAGS_parsemergedebug = 0;
  10
  11 static ParseValue *freepvs;
  12
  13 ParseValue*
  14 mkParseValue(Ast *ast)
  15 {
  16         ParseValue *pv;
  17
  18         if((pv = freepvs) != nil)
  19                 freepvs = (void*)pv->ast;
  20         else
  21                 pv = emallocnz(sizeof(ParseValue));
  22         pv->ref = 1;
  23         pv->ast = ast;
  24         pv->line.file = 0;
  25         pv->line.line = 0;
  26         return pv;
  27 }
  28
  29 void
  30 pvfree(CfgSym *sym, ParseValue *pv)
  31 {
  32         if(pv == nil || --pv->ref > 0)
  33                 return;
  34         if(pv->ast)
  35                 astfree(pv->ast);
  36         pv->ast = (void*)freepvs;
  37         freepvs = pv;
  38 }
  39
  40 void
  41 pvdup(CfgSym *sym, ParseValue **a, ParseValue *b)
  42 {
  43         if(*a)
  44                 (*a)->ref++;
  45 }
  46
  47 #if 0
  48 void
  49 pvtouch(ParseValue *pv, int pass)
  50 {
  51         if(pv == nil)
  52                 return;
  53
  54         assert(pv->ref > 0);
  55         if(pv->ast)
  56                 assert(pv->ast->ref > 0);
  57
  58         switch(pass){
  59         case 0:
  60                 pv->refcheck = 0;
  61                 if(pv->ast)
  62                         astrefcheck(pv->ast, 0);
  63                 break;
  64         case 1:
  65                 if(pv->refcheck++ == 0)
  66                         if(pv->ast)
  67                                 astrefcheck(pv->ast, 1);
  68                 break;
  69         case 2:
  70                 if(pv->refcheck != -1){
  71                         if(pv->ref < pv->refcheck)
  72                                 fprint(2, "%d < %d | PV %p\n", pv->ref, pv->refcheck, pv);
  73                         if(pv->ast)
  74                                 astrefcheck(pv->ast, 2);
  75                         pv->refcheck = -1;
  76                 }
  77                 break;
  78         }
  79 }
  80 #endif
  81
  82 //============================================================
  83 // Parsing action invoked during GLR parsing.
  84
  85 static Line lastline;
  86 Type *parseactiontype;
  87
  88 int
  89 parseaction(CfgGram *cfg, CfgRule *rule, ParseValue **outp, ParseValue **in)
  90 {
  91         int i, n;
  92         Ast **right, *a;
  93         ParseValue *out;
  94
  95         if(!issubtype(rule->tgram, parseactiontype)){
  96                 //fprint(2, "rejecting: %R [%T <: %T]\n", rule, rule->tgram, parseactiontype);
  97                 return -1;
  98         }
  99
 100         right = emalloc(rule->nright*sizeof right[0]);
 101         n = 0;
 102         for(i=0; i<rule->nright; i++){
 103                 a = in[i]->ast;
 104
 105                 // Only save the ASTs that aren't literal strings.
 106                 // The literal strings can be inferred from the rule.
 107                 // (They're the "concrete" syntax, kind of.)
 108                 if(nametostr(rule->right[i]->name)[0] != '"')
 109                         right[n++] = a;
 110         }
 111
 112         if(n != rule->nsavedright)
 113                 panic("parseaction - %R %d %d", rule, n, rule->nsavedright);
 114
 115         *outp = out = mkParseValue(mkastrule(rule, right));
 116         for(i=0; i<rule->nright; i++){
 117                 if(out->line.file == nil)
 118                         out->line = in[i]->line;
 119         }
 120         out->ast->line = out->line;
 121         if(rule->nright == 0)
 122                 out->line = lastline;
 123         lastline = out->line;
 124         return 1;
 125 }
 126
 127
 128 // ========================================================
 129 // Parsing of regular text (not patterns)
 130
 131 static void lexsharp(RgParse*, char**);
 132
 133 // GLR lexer callback.  Should get the next token and
 134 // return a list of RgToken* representing the various
 135 // ways to interpret that token.  Each RgToken is a
 136 // CfgSym* paired with its corresponding Ast*.
 137 // The state, in vstate, is a char** pointing at the
 138 // current input location (itself a char*).
 139 RgTokenL*
 140 lextext(RgParse *parse, CfgGram *cfg, void *v)
 141 {
 142         char **pp, *p, *op;
 143         Gram *gram;
 144         RgToken *t;
 145         CfgSymL *l;
 146         ParseValue *pv;
 147         int len, sol;
 148         char *s;
 149         CfgSym *sym;
 150         Name name;
 151         RgTokenL *tl;
 152
 153         pp = v;
 154         gram = cfg->ggram;
 155         sol = parse->col == 1;
 156
 157         // Loop looking for a non-ignorable token.
 158         do{
 159                 p = *pp;
 160                 // check EOF
 161                 if(p == nil || *p == 0){
 162                         *pp = nil;
 163                         return mkRgTokenL(rgnewtoken(parse, cfg->eofsym), nil);
 164                 }
 165
 166                 // process # line directives
 167                 if(sol)
 168                         lexsharp(parse, &p);
 169
 170                 // get next token (actually token set)
 171                 op = p;
 172                 l = rundfa(gram->dfa, p, &p);
 173                 *pp = p;
 174                 if(p == op || l == nil)
 175                         return nil;
 176
 177                 // scan lexed text to update line, column
 178                 sol = 0;
 179                 for(s=op; s<p; s++){
 180                         if(*s == '\n'){
 181                                 parse->line++;
 182                                 parse->col = 1;
 183                                 sol = 1;
 184                         }else{
 185                                 parse->col++;
 186                                 if(sol && *s != ' ' && *s != '\t')
 187                                         sol = 0;
 188                         }
 189                 }
 190         }while(gramignoring(gram, l->hd));
 191
 192         len = p - op;
 193         name = nameofn(op, len);
 194         tl = nil;
 195         for(; l; l=l->tl){
 196                 sym = l->hd;
 197                 t = rgnewtoken(parse, sym);
 198                 pv = mkParseValue(mkaststring(sym, name));
 199                 t->val = pv;
 200                 pv->line.file = parse->file;
 201                 pv->line.line = parse->line;
 202                 tl = mkRgTokenL(t, tl);
 203         }
 204         return revRgTokenL(tl);
 205 }
 206
 207 // Parse the cpp's "# ..." directives, assuming that the current
 208 // position is "^".  This will eat all sol whitespace and record
 209 // line number info (if present).  We could do some fancier
 210 // parsing of pragmas, but we won't.
 211 static void
 212 lexsharp(RgParse *parse, char **pp)
 213 {
 214         char *op, buf[512];
 215         int line, i;
 216
 217         assert(pp);
 218         assert(*pp);
 219         op = *pp;
 220         while(**pp && (**pp == ' ' || **pp == '\t'))
 221                 (*pp)++;
 222         if(**pp != '#')
 223                 goto done;
 224         (*pp)++;
 225         // Line number
 226         while(**pp && (**pp == ' ' || **pp == '\t'))
 227                 (*pp)++;
 228         line = 0;
 229         if(**pp < '0' || '9' < **pp)
 230                 goto eatline;
 231         while(**pp && '0' <= **pp && **pp <= '9'){
 232                 line = line*10 + (**pp - '0');
 233                 (*pp)++;
 234         }
 235         parse->line = line - 1;  // the \n will add 1
 236         // File name
 237         while(**pp && (**pp == ' ' || **pp == '\t'))
 238                 (*pp)++;
 239         if(**pp != '"')
 240                 goto eatline;
 241         (*pp)++;
 242         for(i=0; **pp && **pp != '"'; i++){
 243                 if(**pp == '\\'){
 244                         (*pp)++;
 245                         if(!**pp)
 246                                 break;
 247                 }
 248                 if(i >= (int)sizeof buf){
 249                         buf[sizeof(buf)-1] = '\0';
 250                         panic("file name too long near '%s'", buf);
 251                 }
 252                 buf[i] = **pp;
 253                 (*pp)++;
 254         }
 255         buf[i] = '\0';
 256         parse->file = nameof(buf);
 257
 258 eatline:
 259         while(**pp && **pp != '\n')
 260                 (*pp)++;
 261 done:
 262         parse->col += *pp - op;
 263 }
 264
 265 // Two different parses, a and b, have been found for
 266 // the same segment of text.  Choose which to use
 267 // and update av in place.  It's worse than that, because
 268 // the RNGLR parser can use an Ast before it has been
 269 // fully merged.  Rather than fix that (a lot of work),
 270 // we make glrmerge sneakily use astswap to make sure
 271 // that av->ast updates in place.  This is only okay because
 272 // all the ASTs manipulated were just created in the parser,
 273 // so we know that no one else is pointing at them other
 274 // than the values created during the parse.
 275 void
 276 glrmerge(CfgSym *sym, ParseValue *av, ParseValue *bv)
 277 {
 278         Ast *a, *b, *aa, *m;
 279         int ta, tb, debug;
 280         CfgPrec *aprec, *bprec;
 281
 282         a = av->ast;
 283         b = bv->ast;
 284
 285         ta = -1;
 286         tb = -2;
 287         debug = 0;
 288
 289         // Can get these when some productions can yield empty strings.
 290         // For example:
 291         //      grammar {
 292         //              S: A S "b";
 293         //              S: "x";
 294         //              A: ;
 295         //      }
 296         //      `S{xbbbbb};
 297         if(a->rule && a->rule == b->rule && a->rule->nright == 0)
 298                 goto A_wins;
 299
 300         aprec = a->rule ? a->rule->prec : nil;
 301         bprec = b->rule ? b->rule->prec : nil;
 302
 303         // Rule preference resolution.
 304         if(aprec && bprec && aprec->kind == CfgPrecPrefer && bprec->kind == aprec->kind){
 305                 int r = cfgcompareprec(aprec, bprec);
 306                 if(r != 0)
 307                         debug = 1;
 308                 if(r > 0)
 309                         goto A_wins;
 310                 if(r < 0)
 311                         goto B_wins;
 312         }
 313
 314         // Rule priority resolution is done in the action rule
 315         // (because the merge-node parent actually does the resolution).
 316         // Associativity resolution is also done in the action rule,
 317         // so that it applies even if there is no ambiguity.
 318
 319         // Ambiguous.
 320         if(FLAGS_parsemergedebug)
 321                 fprint(2,"---- Merge [%p.%d/%p] ----\n%R\n%R\n", aprec, aprec ? aprec->assoc : 0, bprec, a->rule, b->rule);
 322
 323         // First make a copy of a at a new location,
 324         // since the merged node is going to use a's
 325         // location, and we don't want it pointing at itself.
 326         aa = mkAst(a->op, a->sym);
 327         astswap(aa, a);
 328
 329         // Now make the merge node.
 330         m = mkastmerge(aa, b);
 331
 332         // Now move the merge node into a's location.
 333         astswap(a, m);
 334
 335         // Free the scraps.
 336         astfree(m);
 337         astfree(aa);
 338
 339         if(FLAGS_parsemergedebug)
 340                 fprint(2, "%#lA\n----\n", av->ast);
 341         return;
 342
 343 A_wins:
 344         if(FLAGS_parsemergedebug){
 345                 if(debug == 1)
 346                         fprint(2,"---- A wins (prefer) ----\n%R\n%R\n", a->rule, b->rule);
 347                 else if(debug == 2)
 348                         fprint(2,"---- A wins (assoc) ----\n[%d] %#A\n[%d] %#A\n", ta, a, tb, b);
 349                 else
 350                         fprint(2,"---- A wins ----\n%R\n%R\n", a->rule, b->rule);
 351         }
 352         return;
 353 B_wins:
 354         if(FLAGS_parsemergedebug){
 355                 if(debug == 1)
 356                         fprint(2,"---- B wins (prefer) ----\n%R\n%R\n", a->rule, b->rule);
 357                 else if(debug == 2)
 358                         fprint(2,"---- B wins (assoc) ----\n[%d] %#A\n[%d] %#A\n", ta, a, tb, b);
 359                 else
 360                         fprint(2,"---- B wins ----\n%R\n%R\n", a->rule, b->rule);
 361         }
 362         bv->ast = nil;  // now b is ours
 363         astswap(a, b);
 364         astfree(b);
 365         return;
 366 }
 367
 368
 369 Ast*
 370 parsetext(Type *tg, Name text)
 371 {
 372         Ast *ast;
 373         RgParse *p;
 374         char *t;
 375         ParseValue *pv;
 376         CfgSym *gsym;
 377
 378         if(text == nil)
 379                 text = N("");
 380
 381         gsym = tg->sym;
 382         p = rgnewparse(gsym->gram);
 383         p->file = N("<string>");
 384         p->line = 1;
 385         p->col = 1;
 386         p->merge = glrmerge;
 387         p->free = pvfree;
 388         p->dup = pvdup;
 389 //      p->touch = pvtouch;
 390         t = nametostr(text);
 391         parseactiontype = typegramsym(tg, nil, nil);
 392         pv = rgparse(gsym->gram, gsym, p, lextext, &t);
 393         parseactiontype = nil;
 394         if(pv == nil){
 395                 // The lexer might have seen some # lines and
 396                 // updated p->file and p->line.
 397                 fprint(2, "%s:%d: error parsing, near %.20s\n",
 398                         p->file ? p->file : N("???"), p->line, t);
 399                 return nil;
 400         }
 401         if(t != nil){
 402                 fprint(2, "%s:%d: error parsing, didn't reach eof, near %.20s\n",
 403                         p->file ? p->file : N("???"), p->line, t);
 404                 return nil;
 405         }
 406         ast = astleak(pv->ast);
 407 //      glrfreeparse(p);
 408
 409         // XXX validateast ?
 410
 411         return ast;
 412 }
 413