gnu/dist/groff/src/preproc/refer/label.y

   1 /*      $NetBSD: label.y,v 1.1.1.4 2006/02/06 18:14:37 wiz Exp $        */
   2
   3 /* -*- C++ -*-
   4    Copyright (C) 1989, 1990, 1991, 1992, 2000, 2004
   5    Free Software Foundation, Inc.
   6      Written by James Clark (jjc@jclark.com)
   7
   8 This file is part of groff.
   9
  10 groff is free software; you can redistribute it and/or modify it under
  11 the terms of the GNU General Public License as published by the Free
  12 Software Foundation; either version 2, or (at your option) any later
  13 version.
  14
  15 groff is distributed in the hope that it will be useful, but WITHOUT ANY
  16 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  17 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  18 for more details.
  19
  20 You should have received a copy of the GNU General Public License along
  21 with groff; see the file COPYING.  If not, write to the Free Software
  22 Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */
  23
  24 %{
  25
  26 #include "refer.h"
  27 #include "refid.h"
  28 #include "ref.h"
  29 #include "token.h"
  30
  31 int yylex();
  32 void yyerror(const char *);
  33 int yyparse();
  34
  35 static const char *format_serial(char c, int n);
  36
  37 struct label_info {
  38   int start;
  39   int length;
  40   int count;
  41   int total;
  42   label_info(const string &);
  43 };
  44
  45 label_info *lookup_label(const string &label);
  46
  47 struct expression {
  48   enum {
  49     // Does the tentative label depend on the reference?
  50     CONTAINS_VARIABLE = 01,
  51     CONTAINS_STAR = 02,
  52     CONTAINS_FORMAT = 04,
  53     CONTAINS_AT = 010
  54   };
  55   virtual ~expression() { }
  56   virtual void evaluate(int, const reference &, string &,
  57                         substring_position &) = 0;
  58   virtual unsigned analyze() { return 0; }
  59 };
  60
  61 class at_expr : public expression {
  62 public:
  63   at_expr() { }
  64   void evaluate(int, const reference &, string &, substring_position &);
  65   unsigned analyze() { return CONTAINS_VARIABLE|CONTAINS_AT; }
  66 };
  67
  68 class format_expr : public expression {
  69   char type;
  70   int width;
  71   int first_number;
  72 public:
  73   format_expr(char c, int w = 0, int f = 1)
  74     : type(c), width(w), first_number(f) { }
  75   void evaluate(int, const reference &, string &, substring_position &);
  76   unsigned analyze() { return CONTAINS_FORMAT; }
  77 };
  78
  79 class field_expr : public expression {
  80   int number;
  81   char name;
  82 public:
  83   field_expr(char nm, int num) : number(num), name(nm) { }
  84   void evaluate(int, const reference &, string &, substring_position &);
  85   unsigned analyze() { return CONTAINS_VARIABLE; }
  86 };
  87
  88 class literal_expr : public expression {
  89   string s;
  90 public:
  91   literal_expr(const char *ptr, int len) : s(ptr, len) { }
  92   void evaluate(int, const reference &, string &, substring_position &);
  93 };
  94
  95 class unary_expr : public expression {
  96 protected:
  97   expression *expr;
  98 public:
  99   unary_expr(expression *e) : expr(e) { }
 100   ~unary_expr() { delete expr; }
 101   void evaluate(int, const reference &, string &, substring_position &) = 0;
 102   unsigned analyze() { return expr ? expr->analyze() : 0; }
 103 };
 104
 105 // This caches the analysis of an expression.
 106
 107 class analyzed_expr : public unary_expr {
 108   unsigned flags;
 109 public:
 110   analyzed_expr(expression *);
 111   void evaluate(int, const reference &, string &, substring_position &);
 112   unsigned analyze() { return flags; }
 113 };
 114
 115 class star_expr : public unary_expr {
 116 public:
 117   star_expr(expression *e) : unary_expr(e) { }
 118   void evaluate(int, const reference &, string &, substring_position &);
 119   unsigned analyze() {
 120     return ((expr ? (expr->analyze() & ~CONTAINS_VARIABLE) : 0)
 121             | CONTAINS_STAR);
 122   }
 123 };
 124
 125 typedef void map_func(const char *, const char *, string &);
 126
 127 class map_expr : public unary_expr {
 128   map_func *func;
 129 public:
 130   map_expr(expression *e, map_func *f) : unary_expr(e), func(f) { }
 131   void evaluate(int, const reference &, string &, substring_position &);
 132 };
 133
 134 typedef const char *extractor_func(const char *, const char *, const char **);
 135
 136 class extractor_expr : public unary_expr {
 137   int part;
 138   extractor_func *func;
 139 public:
 140   enum { BEFORE = +1, MATCH = 0, AFTER = -1 };
 141   extractor_expr(expression *e, extractor_func *f, int pt)
 142     : unary_expr(e), part(pt), func(f) { }
 143   void evaluate(int, const reference &, string &, substring_position &);
 144 };
 145
 146 class truncate_expr : public unary_expr {
 147   int n;
 148 public:
 149   truncate_expr(expression *e, int i) : unary_expr(e), n(i) { }
 150   void evaluate(int, const reference &, string &, substring_position &);
 151 };
 152
 153 class separator_expr : public unary_expr {
 154 public:
 155   separator_expr(expression *e) : unary_expr(e) { }
 156   void evaluate(int, const reference &, string &, substring_position &);
 157 };
 158
 159 class binary_expr : public expression {
 160 protected:
 161   expression *expr1;
 162   expression *expr2;
 163 public:
 164   binary_expr(expression *e1, expression *e2) : expr1(e1), expr2(e2) { }
 165   ~binary_expr() { delete expr1; delete expr2; }
 166   void evaluate(int, const reference &, string &, substring_position &) = 0;
 167   unsigned analyze() {
 168     return (expr1 ? expr1->analyze() : 0) | (expr2 ? expr2->analyze() : 0);
 169   }
 170 };
 171
 172 class alternative_expr : public binary_expr {
 173 public:
 174   alternative_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { }
 175   void evaluate(int, const reference &, string &, substring_position &);
 176 };
 177
 178 class list_expr : public binary_expr {
 179 public:
 180   list_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { }
 181   void evaluate(int, const reference &, string &, substring_position &);
 182 };
 183
 184 class substitute_expr : public binary_expr {
 185 public:
 186   substitute_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { }
 187   void evaluate(int, const reference &, string &, substring_position &);
 188 };
 189
 190 class ternary_expr : public expression {
 191 protected:
 192   expression *expr1;
 193   expression *expr2;
 194   expression *expr3;
 195 public:
 196   ternary_expr(expression *e1, expression *e2, expression *e3)
 197     : expr1(e1), expr2(e2), expr3(e3) { }
 198   ~ternary_expr() { delete expr1; delete expr2; delete expr3; }
 199   void evaluate(int, const reference &, string &, substring_position &) = 0;
 200   unsigned analyze() {
 201     return ((expr1 ? expr1->analyze() : 0)
 202             | (expr2 ? expr2->analyze() : 0)
 203             | (expr3 ? expr3->analyze() : 0));
 204   }
 205 };
 206
 207 class conditional_expr : public ternary_expr {
 208 public:
 209   conditional_expr(expression *e1, expression *e2, expression *e3)
 210     : ternary_expr(e1, e2, e3) { }
 211   void evaluate(int, const reference &, string &, substring_position &);
 212 };
 213
 214 static expression *parsed_label = 0;
 215 static expression *parsed_date_label = 0;
 216 static expression *parsed_short_label = 0;
 217
 218 static expression *parse_result;
 219
 220 string literals;
 221
 222 %}
 223
 224 %union {
 225   int num;
 226   expression *expr;
 227   struct { int ndigits; int val; } dig;
 228   struct { int start; int len; } str;
 229 }
 230
 231 /* uppercase or lowercase letter */
 232 %token <num> TOKEN_LETTER
 233 /* literal characters */
 234 %token <str> TOKEN_LITERAL
 235 /* digit */
 236 %token <num> TOKEN_DIGIT
 237
 238 %type <expr> conditional
 239 %type <expr> alternative
 240 %type <expr> list
 241 %type <expr> string
 242 %type <expr> substitute
 243 %type <expr> optional_conditional
 244 %type <num> number
 245 %type <dig> digits
 246 %type <num> optional_number
 247 %type <num> flag
 248
 249 %%
 250
 251 expr:
 252         optional_conditional
 253                 { parse_result = ($1 ? new analyzed_expr($1) : 0); }
 254         ;
 255
 256 conditional:
 257         alternative
 258                 { $$ = $1; }
 259         | alternative '?' optional_conditional ':' conditional
 260                 { $$ = new conditional_expr($1, $3, $5); }
 261         ;
 262
 263 optional_conditional:
 264         /* empty */
 265                 { $$ = 0; }
 266         | conditional
 267                 { $$ = $1; }
 268         ;
 269
 270 alternative:
 271         list
 272                 { $$ = $1; }
 273         | alternative '|' list
 274                 { $$ = new alternative_expr($1, $3); }
 275         | alternative '&' list
 276                 { $$ = new conditional_expr($1, $3, 0); }
 277         ;
 278
 279 list:
 280         substitute
 281                 { $$ = $1; }
 282         | list substitute
 283                 { $$ = new list_expr($1, $2); }
 284         ;
 285
 286 substitute:
 287         string
 288                 { $$ = $1; }
 289         | substitute '~' string
 290                 { $$ = new substitute_expr($1, $3); }
 291         ;
 292
 293 string:
 294         '@'
 295                 { $$ = new at_expr; }
 296         | TOKEN_LITERAL
 297                 {
 298                   $$ = new literal_expr(literals.contents() + $1.start,
 299                                         $1.len);
 300                 }
 301         | TOKEN_LETTER
 302                 { $$ = new field_expr($1, 0); }
 303         | TOKEN_LETTER number
 304                 { $$ = new field_expr($1, $2 - 1); }
 305         | '%' TOKEN_LETTER
 306                 {
 307                   switch ($2) {
 308                   case 'I':
 309                   case 'i':
 310                   case 'A':
 311                   case 'a':
 312                     $$ = new format_expr($2);
 313                     break;
 314                   default:
 315                     command_error("unrecognized format `%1'", char($2));
 316                     $$ = new format_expr('a');
 317                     break;
 318                   }
 319                 }
 320
 321         | '%' digits
 322                 {
 323                   $$ = new format_expr('0', $2.ndigits, $2.val);
 324                 }
 325         | string '.' flag TOKEN_LETTER optional_number
 326                 {
 327                   switch ($4) {
 328                   case 'l':
 329                     $$ = new map_expr($1, lowercase);
 330                     break;
 331                   case 'u':
 332                     $$ = new map_expr($1, uppercase);
 333                     break;
 334                   case 'c':
 335                     $$ = new map_expr($1, capitalize);
 336                     break;
 337                   case 'r':
 338                     $$ = new map_expr($1, reverse_name);
 339                     break;
 340                   case 'a':
 341                     $$ = new map_expr($1, abbreviate_name);
 342                     break;
 343                   case 'y':
 344                     $$ = new extractor_expr($1, find_year, $3);
 345                     break;
 346                   case 'n':
 347                     $$ = new extractor_expr($1, find_last_name, $3);
 348                     break;
 349                   default:
 350                     $$ = $1;
 351                     command_error("unknown function `%1'", char($4));
 352                     break;
 353                   }
 354                 }
 355
 356         | string '+' number
 357                 { $$ = new truncate_expr($1, $3); }
 358         | string '-' number
 359                 { $$ = new truncate_expr($1, -$3); }
 360         | string '*'
 361                 { $$ = new star_expr($1); }
 362         | '(' optional_conditional ')'
 363                 { $$ = $2; }
 364         | '<' optional_conditional '>'
 365                 { $$ = new separator_expr($2); }
 366         ;
 367
 368 optional_number:
 369         /* empty */
 370                 { $$ = -1; }
 371         | number
 372                 { $$ = $1; }
 373         ;
 374
 375 number:
 376         TOKEN_DIGIT
 377                 { $$ = $1; }
 378         | number TOKEN_DIGIT
 379                 { $$ = $1*10 + $2; }
 380         ;
 381
 382 digits:
 383         TOKEN_DIGIT
 384                 { $$.ndigits = 1; $$.val = $1; }
 385         | digits TOKEN_DIGIT
 386                 { $$.ndigits = $1.ndigits + 1; $$.val = $1.val*10 + $2; }
 387         ;
 388
 389
 390 flag:
 391         /* empty */
 392                 { $$ = 0; }
 393         | '+'
 394                 { $$ = 1; }
 395         | '-'
 396                 { $$ = -1; }
 397         ;
 398
 399 %%
 400
 401 /* bison defines const to be empty unless __STDC__ is defined, which it
 402 isn't under cfront */
 403
 404 #ifdef const
 405 #undef const
 406 #endif
 407
 408 const char *spec_ptr;
 409 const char *spec_end;
 410 const char *spec_cur;
 411
 412 static char uppercase_array[] = {
 413   'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
 414   'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
 415   'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
 416   'Y', 'Z',
 417 };
 418
 419 static char lowercase_array[] = {
 420   'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
 421   'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p',
 422   'q', 'r', 's', 't', 'u', 'v', 'w', 'x',
 423   'y', 'z',
 424 };
 425
 426 int yylex()
 427 {
 428   while (spec_ptr < spec_end && csspace(*spec_ptr))
 429     spec_ptr++;
 430   spec_cur = spec_ptr;
 431   if (spec_ptr >= spec_end)
 432     return 0;
 433   unsigned char c = *spec_ptr++;
 434   if (csalpha(c)) {
 435     yylval.num = c;
 436     return TOKEN_LETTER;
 437   }
 438   if (csdigit(c)) {
 439     yylval.num = c - '0';
 440     return TOKEN_DIGIT;
 441   }
 442   if (c == '\'') {
 443     yylval.str.start = literals.length();
 444     for (; spec_ptr < spec_end; spec_ptr++) {
 445       if (*spec_ptr == '\'') {
 446         if (++spec_ptr < spec_end && *spec_ptr == '\'')
 447           literals += '\'';
 448         else {
 449           yylval.str.len = literals.length() - yylval.str.start;
 450           return TOKEN_LITERAL;
 451         }
 452       }
 453       else
 454         literals += *spec_ptr;
 455     }
 456     yylval.str.len = literals.length() - yylval.str.start;
 457     return TOKEN_LITERAL;
 458   }
 459   return c;
 460 }
 461
 462 int set_label_spec(const char *label_spec)
 463 {
 464   spec_cur = spec_ptr = label_spec;
 465   spec_end = strchr(label_spec, '\0');
 466   literals.clear();
 467   if (yyparse())
 468     return 0;
 469   delete parsed_label;
 470   parsed_label = parse_result;
 471   return 1;
 472 }
 473
 474 int set_date_label_spec(const char *label_spec)
 475 {
 476   spec_cur = spec_ptr = label_spec;
 477   spec_end = strchr(label_spec, '\0');
 478   literals.clear();
 479   if (yyparse())
 480     return 0;
 481   delete parsed_date_label;
 482   parsed_date_label = parse_result;
 483   return 1;
 484 }
 485
 486 int set_short_label_spec(const char *label_spec)
 487 {
 488   spec_cur = spec_ptr = label_spec;
 489   spec_end = strchr(label_spec, '\0');
 490   literals.clear();
 491   if (yyparse())
 492     return 0;
 493   delete parsed_short_label;
 494   parsed_short_label = parse_result;
 495   return 1;
 496 }
 497
 498 void yyerror(const char *message)
 499 {
 500   if (spec_cur < spec_end)
 501     command_error("label specification %1 before `%2'", message, spec_cur);
 502   else
 503     command_error("label specification %1 at end of string",
 504                   message, spec_cur);
 505 }
 506
 507 void at_expr::evaluate(int tentative, const reference &ref,
 508                        string &result, substring_position &)
 509 {
 510   if (tentative)
 511     ref.canonicalize_authors(result);
 512   else {
 513     const char *end, *start = ref.get_authors(&end);
 514     if (start)
 515       result.append(start, end - start);
 516   }
 517 }
 518
 519 void format_expr::evaluate(int tentative, const reference &ref,
 520                            string &result, substring_position &)
 521 {
 522   if (tentative)
 523     return;
 524   const label_info *lp = ref.get_label_ptr();
 525   int num = lp == 0 ? ref.get_number() : lp->count;
 526   if (type != '0')
 527     result += format_serial(type, num + 1);
 528   else {
 529     const char *ptr = i_to_a(num + first_number);
 530     int pad = width - strlen(ptr);
 531     while (--pad >= 0)
 532       result += '0';
 533     result += ptr;
 534   }
 535 }
 536
 537 static const char *format_serial(char c, int n)
 538 {
 539   assert(n > 0);
 540   static char buf[128]; // more than enough.
 541   switch (c) {
 542   case 'i':
 543   case 'I':
 544     {
 545       char *p = buf;
 546       // troff uses z and w to represent 10000 and 5000 in Roman
 547       // numerals; I can find no historical basis for this usage
 548       const char *s = c == 'i' ? "zwmdclxvi" : "ZWMDCLXVI";
 549       if (n >= 40000)
 550         return i_to_a(n);
 551       while (n >= 10000) {
 552         *p++ = s[0];
 553         n -= 10000;
 554       }
 555       for (int i = 1000; i > 0; i /= 10, s += 2) {
 556         int m = n/i;
 557         n -= m*i;
 558         switch (m) {
 559         case 3:
 560           *p++ = s[2];
 561           /* falls through */
 562         case 2:
 563           *p++ = s[2];
 564           /* falls through */
 565         case 1:
 566           *p++ = s[2];
 567           break;
 568         case 4:
 569           *p++ = s[2];
 570           *p++ = s[1];
 571           break;
 572         case 8:
 573           *p++ = s[1];
 574           *p++ = s[2];
 575           *p++ = s[2];
 576           *p++ = s[2];
 577           break;
 578         case 7:
 579           *p++ = s[1];
 580           *p++ = s[2];
 581           *p++ = s[2];
 582           break;
 583         case 6:
 584           *p++ = s[1];
 585           *p++ = s[2];
 586           break;
 587         case 5:
 588           *p++ = s[1];
 589           break;
 590         case 9:
 591           *p++ = s[2];
 592           *p++ = s[0];
 593         }
 594       }
 595       *p = 0;
 596       break;
 597     }
 598   case 'a':
 599   case 'A':
 600     {
 601       char *p = buf;
 602       // this is derived from troff/reg.c
 603       while (n > 0) {
 604         int d = n % 26;
 605         if (d == 0)
 606           d = 26;
 607         n -= d;
 608         n /= 26;
 609         *p++ = c == 'a' ? lowercase_array[d - 1] :
 610                                uppercase_array[d - 1];
 611       }
 612       *p-- = 0;
 613       // Reverse it.
 614       char *q = buf;
 615       while (q < p) {
 616         char temp = *q;
 617         *q = *p;
 618         *p = temp;
 619         --p;
 620         ++q;
 621       }
 622       break;
 623     }
 624   default:
 625     assert(0);
 626   }
 627   return buf;
 628 }
 629
 630 void field_expr::evaluate(int, const reference &ref,
 631                           string &result, substring_position &)
 632 {
 633   const char *end;
 634   const char *start = ref.get_field(name, &end);
 635   if (start) {
 636     start = nth_field(number, start, &end);
 637     if (start)
 638       result.append(start, end - start);
 639   }
 640 }
 641
 642 void literal_expr::evaluate(int, const reference &,
 643                             string &result, substring_position &)
 644 {
 645   result += s;
 646 }
 647
 648 analyzed_expr::analyzed_expr(expression *e)
 649 : unary_expr(e), flags(e ? e->analyze() : 0)
 650 {
 651 }
 652
 653 void analyzed_expr::evaluate(int tentative, const reference &ref,
 654                              string &result, substring_position &pos)
 655 {
 656   if (expr)
 657     expr->evaluate(tentative, ref, result, pos);
 658 }
 659
 660 void star_expr::evaluate(int tentative, const reference &ref,
 661                          string &result, substring_position &pos)
 662 {
 663   const label_info *lp = ref.get_label_ptr();
 664   if (!tentative
 665       && (lp == 0 || lp->total > 1)
 666       && expr)
 667     expr->evaluate(tentative, ref, result, pos);
 668 }
 669
 670 void separator_expr::evaluate(int tentative, const reference &ref,
 671                               string &result, substring_position &pos)
 672 {
 673   int start_length = result.length();
 674   int is_first = pos.start < 0;
 675   if (expr)
 676     expr->evaluate(tentative, ref, result, pos);
 677   if (is_first) {
 678     pos.start = start_length;
 679     pos.length = result.length() - start_length;
 680   }
 681 }
 682
 683 void map_expr::evaluate(int tentative, const reference &ref,
 684                         string &result, substring_position &)
 685 {
 686   if (expr) {
 687     string temp;
 688     substring_position temp_pos;
 689     expr->evaluate(tentative, ref, temp, temp_pos);
 690     (*func)(temp.contents(), temp.contents() + temp.length(), result);
 691   }
 692 }
 693
 694 void extractor_expr::evaluate(int tentative, const reference &ref,
 695                               string &result, substring_position &)
 696 {
 697   if (expr) {
 698     string temp;
 699     substring_position temp_pos;
 700     expr->evaluate(tentative, ref, temp, temp_pos);
 701     const char *end, *start = (*func)(temp.contents(),
 702                                       temp.contents() + temp.length(),
 703                                       &end);
 704     switch (part) {
 705     case BEFORE:
 706       if (start)
 707         result.append(temp.contents(), start - temp.contents());
 708       else
 709         result += temp;
 710       break;
 711     case MATCH:
 712       if (start)
 713         result.append(start, end - start);
 714       break;
 715     case AFTER:
 716       if (start)
 717         result.append(end, temp.contents() + temp.length() - end);
 718       break;
 719     default:
 720       assert(0);
 721     }
 722   }
 723 }
 724
 725 static void first_part(int len, const char *ptr, const char *end,
 726                           string &result)
 727 {
 728   for (;;) {
 729     const char *token_start = ptr;
 730     if (!get_token(&ptr, end))
 731       break;
 732     const token_info *ti = lookup_token(token_start, ptr);
 733     int counts = ti->sortify_non_empty(token_start, ptr);
 734     if (counts && --len < 0)
 735       break;
 736     if (counts || ti->is_accent())
 737       result.append(token_start, ptr - token_start);
 738   }
 739 }
 740
 741 static void last_part(int len, const char *ptr, const char *end,
 742                       string &result)
 743 {
 744   const char *start = ptr;
 745   int count = 0;
 746   for (;;) {
 747     const char *token_start = ptr;
 748     if (!get_token(&ptr, end))
 749       break;
 750     const token_info *ti = lookup_token(token_start, ptr);
 751     if (ti->sortify_non_empty(token_start, ptr))
 752       count++;
 753   }
 754   ptr = start;
 755   int skip = count - len;
 756   if (skip > 0) {
 757     for (;;) {
 758       const char *token_start = ptr;
 759       if (!get_token(&ptr, end))
 760         assert(0);
 761       const token_info *ti = lookup_token(token_start, ptr);
 762       if (ti->sortify_non_empty(token_start, ptr) && --skip < 0) {
 763         ptr = token_start;
 764         break;
 765       }
 766     }
 767   }
 768   first_part(len, ptr, end, result);
 769 }
 770
 771 void truncate_expr::evaluate(int tentative, const reference &ref,
 772                              string &result, substring_position &)
 773 {
 774   if (expr) {
 775     string temp;
 776     substring_position temp_pos;
 777     expr->evaluate(tentative, ref, temp, temp_pos);
 778     const char *start = temp.contents();
 779     const char *end = start + temp.length();
 780     if (n > 0)
 781       first_part(n, start, end, result);
 782     else if (n < 0)
 783       last_part(-n, start, end, result);
 784   }
 785 }
 786
 787 void alternative_expr::evaluate(int tentative, const reference &ref,
 788                                 string &result, substring_position &pos)
 789 {
 790   int start_length = result.length();
 791   if (expr1)
 792     expr1->evaluate(tentative, ref, result, pos);
 793   if (result.length() == start_length && expr2)
 794     expr2->evaluate(tentative, ref, result, pos);
 795 }
 796
 797 void list_expr::evaluate(int tentative, const reference &ref,
 798                          string &result, substring_position &pos)
 799 {
 800   if (expr1)
 801     expr1->evaluate(tentative, ref, result, pos);
 802   if (expr2)
 803     expr2->evaluate(tentative, ref, result, pos);
 804 }
 805
 806 void substitute_expr::evaluate(int tentative, const reference &ref,
 807                                string &result, substring_position &pos)
 808 {
 809   int start_length = result.length();
 810   if (expr1)
 811     expr1->evaluate(tentative, ref, result, pos);
 812   if (result.length() > start_length && result[result.length() - 1] == '-') {
 813     // ought to see if pos covers the -
 814     result.set_length(result.length() - 1);
 815     if (expr2)
 816       expr2->evaluate(tentative, ref, result, pos);
 817   }
 818 }
 819
 820 void conditional_expr::evaluate(int tentative, const reference &ref,
 821                                 string &result, substring_position &pos)
 822 {
 823   string temp;
 824   substring_position temp_pos;
 825   if (expr1)
 826     expr1->evaluate(tentative, ref, temp, temp_pos);
 827   if (temp.length() > 0) {
 828     if (expr2)
 829       expr2->evaluate(tentative, ref, result, pos);
 830   }
 831   else {
 832     if (expr3)
 833       expr3->evaluate(tentative, ref, result, pos);
 834   }
 835 }
 836
 837 void reference::pre_compute_label()
 838 {
 839   if (parsed_label != 0
 840       && (parsed_label->analyze() & expression::CONTAINS_VARIABLE)) {
 841     label.clear();
 842     substring_position temp_pos;
 843     parsed_label->evaluate(1, *this, label, temp_pos);
 844     label_ptr = lookup_label(label);
 845   }
 846 }
 847
 848 void reference::compute_label()
 849 {
 850   label.clear();
 851   if (parsed_label)
 852     parsed_label->evaluate(0, *this, label, separator_pos);
 853   if (short_label_flag && parsed_short_label)
 854     parsed_short_label->evaluate(0, *this, short_label, short_separator_pos);
 855   if (date_as_label) {
 856     string new_date;
 857     if (parsed_date_label) {
 858       substring_position temp_pos;
 859       parsed_date_label->evaluate(0, *this, new_date, temp_pos);
 860     }
 861     set_date(new_date);
 862   }
 863   if (label_ptr)
 864     label_ptr->count += 1;
 865 }
 866
 867 void reference::immediate_compute_label()
 868 {
 869   if (label_ptr)
 870     label_ptr->total = 2;       // force use of disambiguator
 871   compute_label();
 872 }
 873
 874 int reference::merge_labels(reference **v, int n, label_type type,
 875                             string &result)
 876 {
 877   if (abbreviate_label_ranges)
 878     return merge_labels_by_number(v, n, type, result);
 879   else
 880     return merge_labels_by_parts(v, n, type, result);
 881 }
 882
 883 int reference::merge_labels_by_number(reference **v, int n, label_type type,
 884                                       string &result)
 885 {
 886   if (n <= 1)
 887     return 0;
 888   int num = get_number();
 889   // Only merge three or more labels.
 890   if (v[0]->get_number() != num + 1
 891       || v[1]->get_number() != num + 2)
 892     return 0;
 893   int i;
 894   for (i = 2; i < n; i++)
 895     if (v[i]->get_number() != num + i + 1)
 896       break;
 897   result = get_label(type);
 898   result += label_range_indicator;
 899   result += v[i - 1]->get_label(type);
 900   return i;
 901 }
 902
 903 const substring_position &reference::get_separator_pos(label_type type) const
 904 {
 905   if (type == SHORT_LABEL && short_label_flag)
 906     return short_separator_pos;
 907   else
 908     return separator_pos;
 909 }
 910
 911 const string &reference::get_label(label_type type) const
 912 {
 913   if (type == SHORT_LABEL && short_label_flag)
 914     return short_label;
 915   else
 916     return label;
 917 }
 918
 919 int reference::merge_labels_by_parts(reference **v, int n, label_type type,
 920                                      string &result)
 921 {
 922   if (n <= 0)
 923     return 0;
 924   const string &lb = get_label(type);
 925   const substring_position &sp = get_separator_pos(type);
 926   if (sp.start < 0
 927       || sp.start != v[0]->get_separator_pos(type).start
 928       || memcmp(lb.contents(), v[0]->get_label(type).contents(),
 929                 sp.start) != 0)
 930     return 0;
 931   result = lb;
 932   int i = 0;
 933   do {
 934     result += separate_label_second_parts;
 935     const substring_position &s = v[i]->get_separator_pos(type);
 936     int sep_end_pos = s.start + s.length;
 937     result.append(v[i]->get_label(type).contents() + sep_end_pos,
 938                   v[i]->get_label(type).length() - sep_end_pos);
 939   } while (++i < n
 940            && sp.start == v[i]->get_separator_pos(type).start
 941            && memcmp(lb.contents(), v[i]->get_label(type).contents(),
 942                      sp.start) == 0);
 943   return i;
 944 }
 945
 946 string label_pool;
 947
 948 label_info::label_info(const string &s)
 949 : start(label_pool.length()), length(s.length()), count(0), total(1)
 950 {
 951   label_pool += s;
 952 }
 953
 954 static label_info **label_table = 0;
 955 static int label_table_size = 0;
 956 static int label_table_used = 0;
 957
 958 label_info *lookup_label(const string &label)
 959 {
 960   if (label_table == 0) {
 961     label_table = new label_info *[17];
 962     label_table_size = 17;
 963     for (int i = 0; i < 17; i++)
 964       label_table[i] = 0;
 965   }
 966   unsigned h = hash_string(label.contents(), label.length()) % label_table_size;
 967   label_info **ptr;
 968   for (ptr = label_table + h;
 969        *ptr != 0;
 970        (ptr == label_table)
 971        ? (ptr = label_table + label_table_size - 1)
 972        : ptr--)
 973     if ((*ptr)->length == label.length()
 974         && memcmp(label_pool.contents() + (*ptr)->start, label.contents(),
 975                   label.length()) == 0) {
 976       (*ptr)->total += 1;
 977       return *ptr;
 978     }
 979   label_info *result = *ptr = new label_info(label);
 980   if (++label_table_used * 2 > label_table_size) {
 981     // Rehash the table.
 982     label_info **old_table = label_table;
 983     int old_size = label_table_size;
 984     label_table_size = next_size(label_table_size);
 985     label_table = new label_info *[label_table_size];
 986     int i;
 987     for (i = 0; i < label_table_size; i++)
 988       label_table[i] = 0;
 989     for (i = 0; i < old_size; i++)
 990       if (old_table[i]) {
 991         h = hash_string(label_pool.contents() + old_table[i]->start,
 992                         old_table[i]->length);
 993         label_info **p;
 994         for (p = label_table + (h % label_table_size);
 995              *p != 0;
 996              (p == label_table)
 997              ? (p = label_table + label_table_size - 1)
 998              : --p)
 999             ;
1000         *p = old_table[i];
1001         }
1002     a_delete old_table;
1003   }
1004   return result;
1005 }
1006
1007 void clear_labels()
1008 {
1009   for (int i = 0; i < label_table_size; i++) {
1010     delete label_table[i];
1011     label_table[i] = 0;
1012   }
1013   label_table_used = 0;
1014   label_pool.clear();
1015 }
1016
1017 static void consider_authors(reference **start, reference **end, int i);
1018
1019 void compute_labels(reference **v, int n)
1020 {
1021   if (parsed_label
1022       && (parsed_label->analyze() & expression::CONTAINS_AT)
1023       && sort_fields.length() >= 2
1024       && sort_fields[0] == 'A'
1025       && sort_fields[1] == '+')
1026     consider_authors(v, v + n, 0);
1027   for (int i = 0; i < n; i++)
1028     v[i]->compute_label();
1029 }
1030
1031
1032 /* A reference with a list of authors <A0,A1,...,AN> _needs_ author i
1033 where 0 <= i <= N if there exists a reference with a list of authors
1034 <B0,B1,...,BM> such that <A0,A1,...,AN> != <B0,B1,...,BM> and M >= i
1035 and Aj = Bj for 0 <= j < i. In this case if we can't say ``A0,
1036 A1,...,A(i-1) et al'' because this would match both <A0,A1,...,AN> and
1037 <B0,B1,...,BM>.  If a reference needs author i we only have to call
1038 need_author(j) for some j >= i such that the reference also needs
1039 author j. */
1040
1041 /* This function handles 2 tasks:
1042 determine which authors are needed (cannot be elided with et al.);
1043 determine which authors can have only last names in the labels.
1044
1045 References >= start and < end have the same first i author names.
1046 Also they're sorted by A+. */
1047
1048 static void consider_authors(reference **start, reference **end, int i)
1049 {
1050   if (start >= end)
1051     return;
1052   reference **p = start;
1053   if (i >= (*p)->get_nauthors()) {
1054     for (++p; p < end && i >= (*p)->get_nauthors(); p++)
1055       ;
1056     if (p < end && i > 0) {
1057       // If we have an author list <A B C> and an author list <A B C D>,
1058       // then both lists need C.
1059       for (reference **q = start; q < end; q++)
1060         (*q)->need_author(i - 1);
1061     }
1062     start = p;
1063   }
1064   while (p < end) {
1065     reference **last_name_start = p;
1066     reference **name_start = p;
1067     for (++p;
1068          p < end && i < (*p)->get_nauthors()
1069          && same_author_last_name(**last_name_start, **p, i);
1070          p++) {
1071       if (!same_author_name(**name_start, **p, i)) {
1072         consider_authors(name_start, p, i + 1);
1073         name_start = p;
1074       }
1075     }
1076     consider_authors(name_start, p, i + 1);
1077     if (last_name_start == name_start) {
1078       for (reference **q = last_name_start; q < p; q++)
1079         (*q)->set_last_name_unambiguous(i);
1080     }
1081     // If we have an author list <A B C D> and <A B C E>, then the lists
1082     // need author D and E respectively.
1083     if (name_start > start || p < end) {
1084       for (reference **q = last_name_start; q < p; q++)
1085         (*q)->need_author(i);
1086     }
1087   }
1088 }
1089
1090 int same_author_last_name(const reference &r1, const reference &r2, int n)
1091 {
1092   const char *ae1;
1093   const char *as1 = r1.get_sort_field(0, n, 0, &ae1);
1094   const char *ae2;
1095   const char *as2 = r2.get_sort_field(0, n, 0, &ae2);
1096   if (!as1 && !as2) return 1;   // they are the same
1097   if (!as1 || !as2) return 0;
1098   return ae1 - as1 == ae2 - as2 && memcmp(as1, as2, ae1 - as1) == 0;
1099 }
1100
1101 int same_author_name(const reference &r1, const reference &r2, int n)
1102 {
1103   const char *ae1;
1104   const char *as1 = r1.get_sort_field(0, n, -1, &ae1);
1105   const char *ae2;
1106   const char *as2 = r2.get_sort_field(0, n, -1, &ae2);
1107   if (!as1 && !as2) return 1;   // they are the same
1108   if (!as1 || !as2) return 0;
1109   return ae1 - as1 == ae2 - as2 && memcmp(as1, as2, ae1 - as1) == 0;
1110 }
1111
1112
1113 void int_set::set(int i)
1114 {
1115   assert(i >= 0);
1116   int bytei = i >> 3;
1117   if (bytei >= v.length()) {
1118     int old_length = v.length();
1119     v.set_length(bytei + 1);
1120     for (int j = old_length; j <= bytei; j++)
1121       v[j] = 0;
1122   }
1123   v[bytei] |= 1 << (i & 7);
1124 }
1125
1126 int int_set::get(int i) const
1127 {
1128   assert(i >= 0);
1129   int bytei = i >> 3;
1130   return bytei >= v.length() ? 0 : (v[bytei] & (1 << (i & 7))) != 0;
1131 }
1132
1133 void reference::set_last_name_unambiguous(int i)
1134 {
1135   last_name_unambiguous.set(i);
1136 }
1137
1138 void reference::need_author(int n)
1139 {
1140   if (n > last_needed_author)
1141     last_needed_author = n;
1142 }
1143
1144 const char *reference::get_authors(const char **end) const
1145 {
1146   if (!computed_authors) {
1147     ((reference *)this)->computed_authors = 1;
1148     string &result = ((reference *)this)->authors;
1149     int na = get_nauthors();
1150     result.clear();
1151     for (int i = 0; i < na; i++) {
1152       if (last_name_unambiguous.get(i)) {
1153         const char *e, *start = get_author_last_name(i, &e);
1154         assert(start != 0);
1155         result.append(start, e - start);
1156       }
1157       else {
1158         const char *e, *start = get_author(i, &e);
1159         assert(start != 0);
1160         result.append(start, e - start);
1161       }
1162       if (i == last_needed_author
1163           && et_al.length() > 0
1164           && et_al_min_elide > 0
1165           && last_needed_author + et_al_min_elide < na
1166           && na >= et_al_min_total) {
1167         result += et_al;
1168         break;
1169       }
1170       if (i < na - 1) {
1171         if (na == 2)
1172           result += join_authors_exactly_two;
1173         else if (i < na - 2)
1174           result += join_authors_default;
1175         else
1176           result += join_authors_last_two;
1177       }
1178     }
1179   }
1180   const char *start = authors.contents();
1181   *end = start + authors.length();
1182   return start;
1183 }
1184
1185 int reference::get_nauthors() const
1186 {
1187   if (nauthors < 0) {
1188     const char *dummy;
1189     int na;
1190     for (na = 0; get_author(na, &dummy) != 0; na++)
1191       ;
1192     ((reference *)this)->nauthors = na;
1193   }
1194   return nauthors;
1195 }