gnu/dist/gettext/gettext-tools/src/x-lisp.c

   1 /* xgettext Lisp backend.
   2    Copyright (C) 2001-2003, 2005 Free Software Foundation, Inc.
   3
   4    This file was written by Bruno Haible <haible@clisp.cons.org>, 2001.
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with this program; if not, write to the Free Software Foundation,
  18    Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  19
  20 #ifdef HAVE_CONFIG_H
  21 # include "config.h"
  22 #endif
  23
  24 #include <errno.h>
  25 #include <stdbool.h>
  26 #include <stdio.h>
  27 #include <stdlib.h>
  28 #include <string.h>
  29
  30 #include "message.h"
  31 #include "xgettext.h"
  32 #include "x-lisp.h"
  33 #include "error.h"
  34 #include "xalloc.h"
  35 #include "exit.h"
  36 #include "hash.h"
  37 #include "gettext.h"
  38
  39 #define _(s) gettext(s)
  40
  41
  42 /* The Common Lisp syntax is described in the Common Lisp HyperSpec, chapter 2.
  43    Since we are interested only in strings and in forms similar to
  44         (gettext msgid ...)
  45    or   (ngettext msgid msgid_plural ...)
  46    we make the following simplifications:
  47
  48    - Assume the keywords and strings are in an ASCII compatible encoding.
  49      This means we can read the input file one byte at a time, instead of
  50      one character at a time.  No need to worry about multibyte characters:
  51      If they occur as part of identifiers, they most probably act as
  52      constituent characters, and the byte based approach will do the same.
  53
  54    - Assume the read table is the standard Common Lisp read table.
  55      Non-standard read tables are mostly used to read data, not programs.
  56
  57    - Assume the read table case is :UPCASE, and *READ-BASE* is 10.
  58
  59    - Don't interpret #n= and #n#, they usually don't appear in programs.
  60
  61    - Don't interpret #+, #-, they are unlikely to appear in a gettext form.
  62
  63    The remaining syntax rules are:
  64
  65    - The syntax code assigned to each character, and how tokens are built
  66      up from characters (single escape, multiple escape etc.).
  67
  68    - Comment syntax: ';' and '#| ... |#'.
  69
  70    - String syntax: "..." with single escapes.
  71
  72    - Read macros and dispatch macro character '#'.  Needed to be able to
  73      tell which is the n-th argument of a function call.
  74
  75  */
  76
  77
  78 /* ========================= Lexer customization.  ========================= */
  79
  80 /* 'readtable_case' is the case conversion that is applied to non-escaped
  81     parts of symbol tokens.  In Common Lisp: (readtable-case *readtable*).  */
  82
  83 enum rtcase
  84 {
  85   case_upcase,
  86   case_downcase,
  87   case_preserve,
  88   case_invert
  89 };
  90
  91 static enum rtcase readtable_case = case_upcase;
  92
  93 /* 'read_base' is the assumed radix of integers and rational numbers.
  94    In Common Lisp: *read-base*.  */
  95 static int read_base = 10;
  96
  97 /* 'read_preserve_whitespace' specifies whether a whitespace character
  98    that terminates a token must be pushed back on the input stream.
  99    We set it to true, because the special newline side effect in read_object()
 100    requires that read_object() sees every newline not inside a token.  */
 101 static bool read_preserve_whitespace = true;
 102
 103
 104 /* ====================== Keyword set customization.  ====================== */
 105
 106 /* If true extract all strings.  */
 107 static bool extract_all = false;
 108
 109 static hash_table keywords;
 110 static bool default_keywords = true;
 111
 112
 113 void
 114 x_lisp_extract_all ()
 115 {
 116   extract_all = true;
 117 }
 118
 119
 120 void
 121 x_lisp_keyword (const char *name)
 122 {
 123   if (name == NULL)
 124     default_keywords = false;
 125   else
 126     {
 127       const char *end;
 128       int argnum1;
 129       int argnum2;
 130       const char *colon;
 131       size_t len;
 132       char *symname;
 133       size_t i;
 134
 135       if (keywords.table == NULL)
 136         init_hash (&keywords, 100);
 137
 138       split_keywordspec (name, &end, &argnum1, &argnum2);
 139
 140       /* The characters between name and end should form a valid Lisp symbol.
 141          Extract the symbol name part.  */
 142       colon = strchr (name, ':');
 143       if (colon != NULL && colon < end)
 144         {
 145           name = colon + 1;
 146           if (name < end && *name == ':')
 147             name++;
 148           colon = strchr (name, ':');
 149           if (colon != NULL && colon < end)
 150             return;
 151         }
 152
 153       /* Uppercase it.  */
 154       len = end - name;
 155       symname = (char *) xmalloc (len);
 156       for (i = 0; i < len; i++)
 157         symname[i] =
 158           (name[i] >= 'a' && name[i] <= 'z' ? name[i] - 'a' + 'A' : name[i]);
 159
 160       if (argnum1 == 0)
 161         argnum1 = 1;
 162       insert_entry (&keywords, symname, len,
 163                     (void *) (long) (argnum1 + (argnum2 << 10)));
 164     }
 165 }
 166
 167 /* Finish initializing the keywords hash table.
 168    Called after argument processing, before each file is processed.  */
 169 static void
 170 init_keywords ()
 171 {
 172   if (default_keywords)
 173     {
 174       x_lisp_keyword ("gettext");       /* I18N:GETTEXT */
 175       x_lisp_keyword ("ngettext:1,2");  /* I18N:NGETTEXT */
 176       x_lisp_keyword ("gettext-noop");
 177       default_keywords = false;
 178     }
 179 }
 180
 181 void
 182 init_flag_table_lisp ()
 183 {
 184   xgettext_record_flag ("gettext:1:pass-lisp-format");
 185   xgettext_record_flag ("ngettext:1:pass-lisp-format");
 186   xgettext_record_flag ("ngettext:2:pass-lisp-format");
 187   xgettext_record_flag ("gettext-noop:1:pass-lisp-format");
 188   xgettext_record_flag ("format:2:lisp-format");
 189 }
 190
 191
 192 /* ======================== Reading of characters.  ======================== */
 193
 194 /* Real filename, used in error messages about the input file.  */
 195 static const char *real_file_name;
 196
 197 /* Logical filename and line number, used to label the extracted messages.  */
 198 static char *logical_file_name;
 199 static int line_number;
 200
 201 /* The input file stream.  */
 202 static FILE *fp;
 203
 204
 205 /* Fetch the next character from the input file.  */
 206 static int
 207 do_getc ()
 208 {
 209   int c = getc (fp);
 210
 211   if (c == EOF)
 212     {
 213       if (ferror (fp))
 214         error (EXIT_FAILURE, errno, _("\
 215 error while reading \"%s\""), real_file_name);
 216     }
 217   else if (c == '\n')
 218    line_number++;
 219
 220   return c;
 221 }
 222
 223 /* Put back the last fetched character, not EOF.  */
 224 static void
 225 do_ungetc (int c)
 226 {
 227   if (c == '\n')
 228     line_number--;
 229   ungetc (c, fp);
 230 }
 231
 232
 233 /* ========= Reading of tokens.  See CLHS 2.2 "Reader Algorithm".  ========= */
 234
 235
 236 /* Syntax code.  See CLHS 2.1.4 "Character Syntax Types".  */
 237
 238 enum syntax_code
 239 {
 240   syntax_illegal,       /* non-printable, except whitespace     */
 241   syntax_single_esc,    /* '\' (single escape)                  */
 242   syntax_multi_esc,     /* '|' (multiple escape)                */
 243   syntax_constituent,   /* everything else (constituent)        */
 244   syntax_whitespace,    /* TAB,LF,FF,CR,' ' (whitespace)        */
 245   syntax_eof,           /* EOF                                  */
 246   syntax_t_macro,       /* '()'"' (terminating macro)           */
 247   syntax_nt_macro       /* '#' (non-terminating macro)          */
 248 };
 249
 250 /* Returns the syntax code of a character.  */
 251 static enum syntax_code
 252 syntax_code_of (unsigned char c)
 253 {
 254   switch (c)
 255     {
 256     case '\\':
 257       return syntax_single_esc;
 258     case '|':
 259       return syntax_multi_esc;
 260     case '\t': case '\n': case '\f': case '\r': case ' ':
 261       return syntax_whitespace;
 262     case '(': case ')': case '\'': case '"': case ',': case ';': case '`':
 263       return syntax_t_macro;
 264     case '#':
 265       return syntax_nt_macro;
 266     default:
 267       if (c < ' ' && c != '\b')
 268         return syntax_illegal;
 269       else
 270         return syntax_constituent;
 271     }
 272 }
 273
 274 struct char_syntax
 275 {
 276   int ch;                       /* character */
 277   enum syntax_code scode;       /* syntax code */
 278 };
 279
 280 /* Returns the next character and its syntax code.  */
 281 static void
 282 read_char_syntax (struct char_syntax *p)
 283 {
 284   int c = do_getc ();
 285
 286   p->ch = c;
 287   p->scode = (c == EOF ? syntax_eof : syntax_code_of (c));
 288 }
 289
 290 /* Every character in a token has an attribute assigned.  The attributes
 291    help during interpretation of the token.  See
 292    CLHS 2.3 "Interpretation of Tokens" for the possible interpretations,
 293    and CLHS 2.1.4.2 "Constituent Traits".  */
 294
 295 enum attribute
 296 {
 297   a_illg,       /* invalid constituent */
 298   a_pack_m,     /* ':' package marker */
 299   a_alpha,      /* normal alphabetic */
 300   a_escaped,    /* alphabetic but not subject to case conversion */
 301   a_ratio,      /* '/' */
 302   a_dot,        /* '.' */
 303   a_sign,       /* '+-' */
 304   a_extens,     /* '_^' extension characters */
 305   a_digit,      /* '0123456789' */
 306   a_letterdigit,/* 'A'-'Z','a'-'z' below base, except 'esfdlESFDL' */
 307   a_expodigit,  /* 'esfdlESFDL' below base */
 308   a_letter,     /* 'A'-'Z','a'-'z', except 'esfdlESFDL' */
 309   a_expo        /* 'esfdlESFDL' */
 310 };
 311
 312 #define is_letter_attribute(a) ((a) >= a_letter)
 313 #define is_number_attribute(a) ((a) >= a_ratio)
 314
 315 /* Returns the attribute of a character, assuming base 10.  */
 316 static enum attribute
 317 attribute_of (unsigned char c)
 318 {
 319   switch (c)
 320     {
 321     case ':':
 322       return a_pack_m;
 323     case '/':
 324       return a_ratio;
 325     case '.':
 326       return a_dot;
 327     case '+': case '-':
 328       return a_sign;
 329     case '_': case '^':
 330       return a_extens;
 331     case '0': case '1': case '2': case '3': case '4':
 332     case '5': case '6': case '7': case '8': case '9':
 333       return a_digit;
 334     case 'a': case 'b': case 'c': case 'g': case 'h': case 'i': case 'j':
 335     case 'k': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
 336     case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z':
 337     case 'A': case 'B': case 'C': case 'G': case 'H': case 'I': case 'J':
 338     case 'K': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
 339     case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z':
 340       return a_letter;
 341     case 'e': case 's': case 'd': case 'f': case 'l':
 342     case 'E': case 'S': case 'D': case 'F': case 'L':
 343       return a_expo;
 344     default:
 345       /* Treat everything as valid.  Never return a_illg.  */
 346       return a_alpha;
 347     }
 348 }
 349
 350 struct token_char
 351 {
 352   unsigned char ch;             /* character */
 353   unsigned char attribute;      /* attribute */
 354 };
 355
 356 /* A token consists of a sequence of characters with associated attribute.  */
 357 struct token
 358 {
 359   int allocated;                /* number of allocated 'token_char's */
 360   int charcount;                /* number of used 'token_char's */
 361   struct token_char *chars;     /* the token's constituents */
 362   bool with_escape;             /* whether single-escape or multiple escape occurs */
 363 };
 364
 365 /* Initialize a 'struct token'.  */
 366 static inline void
 367 init_token (struct token *tp)
 368 {
 369   tp->allocated = 10;
 370   tp->chars =
 371     (struct token_char *) xmalloc (tp->allocated * sizeof (struct token_char));
 372   tp->charcount = 0;
 373 }
 374
 375 /* Free the memory pointed to by a 'struct token'.  */
 376 static inline void
 377 free_token (struct token *tp)
 378 {
 379   free (tp->chars);
 380 }
 381
 382 /* Ensure there is enough room in the token for one more character.  */
 383 static inline void
 384 grow_token (struct token *tp)
 385 {
 386   if (tp->charcount == tp->allocated)
 387     {
 388       tp->allocated *= 2;
 389       tp->chars = (struct token_char *) xrealloc (tp->chars, tp->allocated * sizeof (struct token_char));
 390     }
 391 }
 392
 393 /* Read the next token.  If 'first' is given, it points to the first
 394    character, which has already been read.
 395    The algorithm follows CLHS 2.2 "Reader Algorithm".  */
 396 static void
 397 read_token (struct token *tp, const struct char_syntax *first)
 398 {
 399   bool multiple_escape_flag;
 400   struct char_syntax curr;
 401
 402   init_token (tp);
 403   tp->with_escape = false;
 404
 405   multiple_escape_flag = false;
 406   if (first)
 407     curr = *first;
 408   else
 409     read_char_syntax (&curr);
 410
 411   for (;; read_char_syntax (&curr))
 412     {
 413       switch (curr.scode)
 414         {
 415         case syntax_illegal:
 416           /* Invalid input.  Be tolerant, no error message.  */
 417           do_ungetc (curr.ch);
 418           return;
 419
 420         case syntax_single_esc:
 421           tp->with_escape = true;
 422           read_char_syntax (&curr);
 423           if (curr.scode == syntax_eof)
 424             /* Invalid input.  Be tolerant, no error message.  */
 425             return;
 426           grow_token (tp);
 427           tp->chars[tp->charcount].ch = curr.ch;
 428           tp->chars[tp->charcount].attribute = a_escaped;
 429           tp->charcount++;
 430           break;
 431
 432         case syntax_multi_esc:
 433           multiple_escape_flag = !multiple_escape_flag;
 434           tp->with_escape = true;
 435           break;
 436
 437         case syntax_constituent:
 438         case syntax_nt_macro:
 439           grow_token (tp);
 440           if (multiple_escape_flag)
 441             {
 442               tp->chars[tp->charcount].ch = curr.ch;
 443               tp->chars[tp->charcount].attribute = a_escaped;
 444               tp->charcount++;
 445             }
 446           else
 447             {
 448               tp->chars[tp->charcount].ch = curr.ch;
 449               tp->chars[tp->charcount].attribute = attribute_of (curr.ch);
 450               tp->charcount++;
 451             }
 452           break;
 453
 454         case syntax_whitespace:
 455         case syntax_t_macro:
 456           if (multiple_escape_flag)
 457             {
 458               grow_token (tp);
 459               tp->chars[tp->charcount].ch = curr.ch;
 460               tp->chars[tp->charcount].attribute = a_escaped;
 461               tp->charcount++;
 462             }
 463           else
 464             {
 465               if (curr.scode != syntax_whitespace || read_preserve_whitespace)
 466                 do_ungetc (curr.ch);
 467               return;
 468             }
 469           break;
 470
 471         case syntax_eof:
 472           if (multiple_escape_flag)
 473             /* Invalid input.  Be tolerant, no error message.  */
 474             ;
 475           return;
 476         }
 477     }
 478 }
 479
 480 /* A potential number is a token which
 481    1. consists only of digits, '+','-','/','^','_','.' and number markers.
 482       The base for digits is context dependent, but always 10 if a dot '.'
 483       occurs. A number marker is a non-digit letter which is not adjacent
 484       to a non-digit letter.
 485    2. has at least one digit.
 486    3. starts with a digit, '+','-','.','^' or '_'.
 487    4. does not end with '+' or '-'.
 488    See CLHS 2.3.1.1 "Potential Numbers as Tokens".
 489  */
 490
 491 static inline bool
 492 has_a_dot (const struct token *tp)
 493 {
 494   int n = tp->charcount;
 495   int i;
 496
 497   for (i = 0; i < n; i++)
 498     if (tp->chars[i].attribute == a_dot)
 499       return true;
 500   return false;
 501 }
 502
 503 static inline bool
 504 all_a_number (const struct token *tp)
 505 {
 506   int n = tp->charcount;
 507   int i;
 508
 509   for (i = 0; i < n; i++)
 510     if (!is_number_attribute (tp->chars[i].attribute))
 511       return false;
 512   return true;
 513 }
 514
 515 static inline void
 516 a_letter_to_digit (const struct token *tp, int base)
 517 {
 518   int n = tp->charcount;
 519   int i;
 520
 521   for (i = 0; i < n; i++)
 522     if (is_letter_attribute (tp->chars[i].attribute))
 523       {
 524         int c = tp->chars[i].ch;
 525
 526         if (c >= 'a')
 527           c -= 'a' - 'A';
 528         if (c - 'A' + 10 < base)
 529           tp->chars[i].attribute -= 2; /* a_letter -> a_letterdigit,
 530                                           a_expo -> a_expodigit */
 531       }
 532 }
 533
 534 static inline bool
 535 has_a_digit (const struct token *tp)
 536 {
 537   int n = tp->charcount;
 538   int i;
 539
 540   for (i = 0; i < n; i++)
 541     if (tp->chars[i].attribute == a_digit
 542         || tp->chars[i].attribute == a_letterdigit
 543         || tp->chars[i].attribute == a_expodigit)
 544       return true;
 545   return false;
 546 }
 547
 548 static inline bool
 549 has_adjacent_letters (const struct token *tp)
 550 {
 551   int n = tp->charcount;
 552   int i;
 553
 554   for (i = 1; i < n; i++)
 555     if (is_letter_attribute (tp->chars[i-1].attribute)
 556         && is_letter_attribute (tp->chars[i].attribute))
 557       return true;
 558   return false;
 559 }
 560
 561 static bool
 562 is_potential_number (const struct token *tp, int *basep)
 563 {
 564   /* CLHS 2.3.1.1.1:
 565      "A potential number cannot contain any escape characters."  */
 566   if (tp->with_escape)
 567     return false;
 568
 569   if (has_a_dot (tp))
 570     *basep = 10;
 571
 572   if (!all_a_number (tp))
 573     return false;
 574
 575   a_letter_to_digit (tp, *basep);
 576
 577   if (!has_a_digit (tp))
 578     return false;
 579
 580   if (has_adjacent_letters (tp))
 581     return false;
 582
 583   if (!(tp->chars[0].attribute >= a_dot
 584         && tp->chars[0].attribute <= a_expodigit))
 585     return false;
 586
 587   if (tp->chars[tp->charcount - 1].attribute == a_sign)
 588     return false;
 589
 590   return true;
 591 }
 592
 593 /* A number is one of integer, ratio, float.  Each has a particular syntax.
 594    See CLHS 2.3.1 "Numbers as Tokens".
 595    But note a mistake: The exponent rule should read:
 596        exponent ::= exponent-marker [sign] {decimal-digit}+
 597    (see 22.1.3.1.3 "Printing Floats").  */
 598
 599 enum number_type
 600 {
 601   n_none,
 602   n_integer,
 603   n_ratio,
 604   n_float
 605 };
 606
 607 static enum number_type
 608 is_number (const struct token *tp, int *basep)
 609 {
 610   struct token_char *ptr_limit;
 611   struct token_char *ptr1;
 612
 613   if (!is_potential_number (tp, basep))
 614     return n_none;
 615
 616   /* is_potential_number guarantees
 617      - all attributes are >= a_ratio,
 618      - there is at least one a_digit or a_letterdigit or a_expodigit, and
 619      - if there is an a_dot, then *basep = 10.  */
 620
 621   ptr1 = &tp->chars[0];
 622   ptr_limit = &tp->chars[tp->charcount];
 623
 624   if (ptr1->attribute == a_sign)
 625     ptr1++;
 626
 627   /* Test for syntax
 628    * { a_sign | }
 629    * { a_digit < base }+ { a_ratio { a_digit < base }+ | }
 630    */
 631   {
 632     bool seen_a_ratio = false;
 633     bool seen_a_digit = false;  /* seen a digit in last digit block? */
 634     struct token_char *ptr;
 635
 636     for (ptr = ptr1;; ptr++)
 637       {
 638         if (ptr >= ptr_limit)
 639           {
 640             if (!seen_a_digit)
 641               break;
 642             if (seen_a_ratio)
 643               return n_ratio;
 644             else
 645               return n_integer;
 646           }
 647         if (ptr->attribute == a_digit
 648             || ptr->attribute == a_letterdigit
 649             || ptr->attribute == a_expodigit)
 650           {
 651             int c = ptr->ch;
 652
 653             c = (c < 'A' ? c - '0' : c < 'a' ? c - 'A' + 10 : c - 'a' + 10);
 654             if (c >= *basep)
 655               break;
 656             seen_a_digit = true;
 657           }
 658         else if (ptr->attribute == a_ratio)
 659           {
 660             if (seen_a_ratio || !seen_a_digit)
 661               break;
 662             seen_a_ratio = true;
 663             seen_a_digit = false;
 664           }
 665         else
 666           break;
 667       }
 668   }
 669
 670   /* Test for syntax
 671    * { a_sign | }
 672    * { a_digit }* { a_dot { a_digit }* | }
 673    * { a_expo { a_sign | } { a_digit }+ | }
 674    *
 675    * If there is an exponent part, there must be digits before the dot or
 676    * after the dot. The result is a float.
 677    * If there is no exponen:
 678    *   If there is no dot, it would an integer in base 10, but is has already
 679    *   been verified to not be an integer in the current base.
 680    *   If there is a dot:
 681    *     If there are digits after the dot, it's a float.
 682    *     Otherwise, if there are digits before the dot, it's an integer.
 683    */
 684   *basep = 10;
 685   {
 686     bool seen_a_dot = false;
 687     bool seen_a_dot_with_leading_digits = false;
 688     bool seen_a_digit = false;  /* seen a digit in last digit block? */
 689     struct token_char *ptr;
 690
 691     for (ptr = ptr1;; ptr++)
 692       {
 693         if (ptr >= ptr_limit)
 694           {
 695             /* no exponent */
 696             if (!seen_a_dot)
 697               return n_none;
 698             if (seen_a_digit)
 699               return n_float;
 700             if (seen_a_dot_with_leading_digits)
 701               return n_integer;
 702             else
 703               return n_none;
 704           }
 705         if (ptr->attribute == a_digit)
 706           {
 707             seen_a_digit = true;
 708           }
 709         else if (ptr->attribute == a_dot)
 710           {
 711             if (seen_a_dot)
 712               return n_none;
 713             seen_a_dot = true;
 714             if (seen_a_digit)
 715               seen_a_dot_with_leading_digits = true;
 716             seen_a_digit = false;
 717           }
 718         else if (ptr->attribute == a_expo || ptr->attribute == a_expodigit)
 719           break;
 720         else
 721           return n_none;
 722       }
 723     ptr++;
 724     if (!seen_a_dot_with_leading_digits || !seen_a_digit)
 725       return n_none;
 726     if (ptr >= ptr_limit)
 727       return n_none;
 728     if (ptr->attribute == a_sign)
 729       ptr++;
 730     seen_a_digit = false;
 731     for (;; ptr++)
 732       {
 733         if (ptr >= ptr_limit)
 734           break;
 735         if (ptr->attribute != a_digit)
 736           return n_none;
 737         seen_a_digit = true;
 738       }
 739     if (!seen_a_digit)
 740       return n_none;
 741     return n_float;
 742   }
 743 }
 744
 745 /* A token representing a symbol must be case converted.
 746    For portability, we convert only ASCII characters here.  */
 747
 748 static void
 749 upcase_token (struct token *tp)
 750 {
 751   int n = tp->charcount;
 752   int i;
 753
 754   for (i = 0; i < n; i++)
 755     if (tp->chars[i].attribute != a_escaped)
 756       {
 757         unsigned char c = tp->chars[i].ch;
 758         if (c >= 'a' && c <= 'z')
 759           tp->chars[i].ch = c - 'a' + 'A';
 760       }
 761 }
 762
 763 static void
 764 downcase_token (struct token *tp)
 765 {
 766   int n = tp->charcount;
 767   int i;
 768
 769   for (i = 0; i < n; i++)
 770     if (tp->chars[i].attribute != a_escaped)
 771       {
 772         unsigned char c = tp->chars[i].ch;
 773         if (c >= 'A' && c <= 'Z')
 774           tp->chars[i].ch = c - 'A' + 'a';
 775       }
 776 }
 777
 778 static void
 779 case_convert_token (struct token *tp)
 780 {
 781   int n = tp->charcount;
 782   int i;
 783
 784   switch (readtable_case)
 785     {
 786     case case_upcase:
 787       upcase_token (tp);
 788       break;
 789
 790     case case_downcase:
 791       downcase_token (tp);
 792       break;
 793
 794     case case_preserve:
 795       break;
 796
 797     case case_invert:
 798       {
 799         bool seen_uppercase = false;
 800         bool seen_lowercase = false;
 801         for (i = 0; i < n; i++)
 802           if (tp->chars[i].attribute != a_escaped)
 803             {
 804               unsigned char c = tp->chars[i].ch;
 805               if (c >= 'a' && c <= 'z')
 806                 seen_lowercase = true;
 807               if (c >= 'A' && c <= 'Z')
 808                 seen_uppercase = true;
 809             }
 810         if (seen_uppercase)
 811           {
 812             if (!seen_lowercase)
 813               downcase_token (tp);
 814           }
 815         else
 816           {
 817             if (seen_lowercase)
 818               upcase_token (tp);
 819           }
 820       }
 821       break;
 822     }
 823 }
 824
 825
 826 /* ========================= Accumulating comments ========================= */
 827
 828
 829 static char *buffer;
 830 static size_t bufmax;
 831 static size_t buflen;
 832
 833 static inline void
 834 comment_start ()
 835 {
 836   buflen = 0;
 837 }
 838
 839 static inline void
 840 comment_add (int c)
 841 {
 842   if (buflen >= bufmax)
 843     {
 844       bufmax = 2 * bufmax + 10;
 845       buffer = xrealloc (buffer, bufmax);
 846     }
 847   buffer[buflen++] = c;
 848 }
 849
 850 static inline void
 851 comment_line_end (size_t chars_to_remove)
 852 {
 853   buflen -= chars_to_remove;
 854   while (buflen >= 1
 855          && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
 856     --buflen;
 857   if (chars_to_remove == 0 && buflen >= bufmax)
 858     {
 859       bufmax = 2 * bufmax + 10;
 860       buffer = xrealloc (buffer, bufmax);
 861     }
 862   buffer[buflen] = '\0';
 863   xgettext_comment_add (buffer);
 864 }
 865
 866
 867 /* These are for tracking whether comments count as immediately before
 868    keyword.  */
 869 static int last_comment_line;
 870 static int last_non_comment_line;
 871
 872
 873 /* ========================= Accumulating messages ========================= */
 874
 875
 876 static message_list_ty *mlp;
 877
 878
 879 /* ============== Reading of objects.  See CLHS 2 "Syntax".  ============== */
 880
 881
 882 /* We are only interested in symbols (e.g. GETTEXT or NGETTEXT) and strings.
 883    Other objects need not to be represented precisely.  */
 884 enum object_type
 885 {
 886   t_symbol,     /* symbol */
 887   t_string,     /* string */
 888   t_other,      /* other kind of real object */
 889   t_dot,        /* '.' pseudo object */
 890   t_close,      /* ')' pseudo object */
 891   t_eof         /* EOF marker */
 892 };
 893
 894 struct object
 895 {
 896   enum object_type type;
 897   struct token *token;          /* for t_symbol and t_string */
 898   int line_number_at_start;     /* for t_string */
 899 };
 900
 901 /* Free the memory pointed to by a 'struct object'.  */
 902 static inline void
 903 free_object (struct object *op)
 904 {
 905   if (op->type == t_symbol || op->type == t_string)
 906     {
 907       free_token (op->token);
 908       free (op->token);
 909     }
 910 }
 911
 912 /* Convert a t_symbol/t_string token to a char*.  */
 913 static char *
 914 string_of_object (const struct object *op)
 915 {
 916   char *str;
 917   const struct token_char *p;
 918   char *q;
 919   int n;
 920
 921   if (!(op->type == t_symbol || op->type == t_string))
 922     abort ();
 923   n = op->token->charcount;
 924   str = (char *) xmalloc (n + 1);
 925   q = str;
 926   for (p = op->token->chars; n > 0; p++, n--)
 927     *q++ = p->ch;
 928   *q = '\0';
 929   return str;
 930 }
 931
 932 /* Context lookup table.  */
 933 static flag_context_list_table_ty *flag_context_list_table;
 934
 935 /* Read the next object.  */
 936 static void
 937 read_object (struct object *op, flag_context_ty outer_context)
 938 {
 939   for (;;)
 940     {
 941       struct char_syntax curr;
 942
 943       read_char_syntax (&curr);
 944
 945       switch (curr.scode)
 946         {
 947         case syntax_eof:
 948           op->type = t_eof;
 949           return;
 950
 951         case syntax_whitespace:
 952           if (curr.ch == '\n')
 953             /* Comments assumed to be grouped with a message must immediately
 954                precede it, with no non-whitespace token on a line between
 955                both.  */
 956             if (last_non_comment_line > last_comment_line)
 957               xgettext_comment_reset ();
 958           continue;
 959
 960         case syntax_illegal:
 961           op->type = t_other;
 962           return;
 963
 964         case syntax_single_esc:
 965         case syntax_multi_esc:
 966         case syntax_constituent:
 967           /* Start reading a token.  */
 968           op->token = (struct token *) xmalloc (sizeof (struct token));
 969           read_token (op->token, &curr);
 970           last_non_comment_line = line_number;
 971
 972           /* Interpret the token.  */
 973
 974           /* Dots.  */
 975           if (!op->token->with_escape
 976               && op->token->charcount == 1
 977               && op->token->chars[0].attribute == a_dot)
 978             {
 979               free_token (op->token);
 980               free (op->token);
 981               op->type = t_dot;
 982               return;
 983             }
 984           /* Tokens consisting entirely of dots are illegal, but be tolerant
 985              here.  */
 986
 987           /* Number.  */
 988           {
 989             int base = read_base;
 990
 991             if (is_number (op->token, &base) != n_none)
 992               {
 993                 free_token (op->token);
 994                 free (op->token);
 995                 op->type = t_other;
 996                 return;
 997               }
 998           }
 999
1000           /* We interpret all other tokens as symbols (including 'reserved
1001              tokens', i.e. potential numbers which are not numbers).  */
1002           case_convert_token (op->token);
1003           op->type = t_symbol;
1004           return;
1005
1006         case syntax_t_macro:
1007         case syntax_nt_macro:
1008           /* Read a macro.  */
1009           switch (curr.ch)
1010             {
1011             case '(':
1012               {
1013                 int arg = 0;            /* Current argument number.  */
1014                 flag_context_list_iterator_ty context_iter;
1015                 int argnum1 = 0;        /* First string position.  */
1016                 int argnum2 = 0;        /* Plural string position.  */
1017                 message_ty *plural_mp = NULL;   /* Remember the msgid.  */
1018
1019                 for (;; arg++)
1020                   {
1021                     struct object inner;
1022                     flag_context_ty inner_context;
1023
1024                     if (arg == 0)
1025                       inner_context = null_context;
1026                     else
1027                       inner_context =
1028                         inherited_context (outer_context,
1029                                            flag_context_list_iterator_advance (
1030                                              &context_iter));
1031
1032                     read_object (&inner, inner_context);
1033
1034                     /* Recognize end of list.  */
1035                     if (inner.type == t_close)
1036                       {
1037                         op->type = t_other;
1038                         /* Don't bother converting "()" to "NIL".  */
1039                         last_non_comment_line = line_number;
1040                         return;
1041                       }
1042
1043                     /* Dots are not allowed in every position.
1044                        But be tolerant.  */
1045
1046                     /* EOF inside list is illegal.
1047                        But be tolerant.  */
1048                     if (inner.type == t_eof)
1049                       break;
1050
1051                     if (arg == 0)
1052                       {
1053                         /* This is the function position.  */
1054                         if (inner.type == t_symbol)
1055                           {
1056                             char *symbol_name = string_of_object (&inner);
1057                             int i;
1058                             int prefix_len;
1059                             void *keyword_value;
1060
1061                             /* Omit any package name.  */
1062                             i = inner.token->charcount;
1063                             while (i > 0
1064                                    && inner.token->chars[i-1].attribute != a_pack_m)
1065                               i--;
1066                             prefix_len = i;
1067
1068                             if (find_entry (&keywords,
1069                                             symbol_name + prefix_len,
1070                                             strlen (symbol_name + prefix_len),
1071                                             &keyword_value)
1072                                 == 0)
1073                               {
1074                                 argnum1 = (int) (long) keyword_value & ((1 << 10) - 1);
1075                                 argnum2 = (int) (long) keyword_value >> 10;
1076                               }
1077
1078                             context_iter =
1079                               flag_context_list_iterator (
1080                                 flag_context_list_table_lookup (
1081                                   flag_context_list_table,
1082                                   symbol_name, strlen (symbol_name)));
1083
1084                             free (symbol_name);
1085                           }
1086                         else
1087                           context_iter = null_context_list_iterator;
1088                       }
1089                     else
1090                       {
1091                         /* These are the argument positions.
1092                            Extract a string if we have reached the right
1093                            argument position.  */
1094                         if (arg == argnum1)
1095                           {
1096                             if (inner.type == t_string)
1097                               {
1098                                 lex_pos_ty pos;
1099                                 message_ty *mp;
1100
1101                                 pos.file_name = logical_file_name;
1102                                 pos.line_number = inner.line_number_at_start;
1103                                 mp = remember_a_message (mlp, string_of_object (&inner),
1104                                                          inner_context, &pos);
1105                                 if (argnum2 > 0)
1106                                   plural_mp = mp;
1107                               }
1108                           }
1109                         else if (arg == argnum2)
1110                           {
1111                             if (inner.type == t_string && plural_mp != NULL)
1112                               {
1113                                 lex_pos_ty pos;
1114
1115                                 pos.file_name = logical_file_name;
1116                                 pos.line_number = inner.line_number_at_start;
1117                                 remember_a_message_plural (plural_mp, string_of_object (&inner),
1118                                                            inner_context, &pos);
1119                               }
1120                           }
1121                       }
1122
1123                     free_object (&inner);
1124                   }
1125               }
1126               op->type = t_other;
1127               last_non_comment_line = line_number;
1128               return;
1129
1130             case ')':
1131               /* Tell the caller about the end of list.
1132                  Unmatched closing parenthesis is illegal.
1133                  But be tolerant.  */
1134               op->type = t_close;
1135               last_non_comment_line = line_number;
1136               return;
1137
1138             case ',':
1139               {
1140                 int c = do_getc ();
1141                 /* The ,@ handling inside lists is wrong anyway, because
1142                    ,@form expands to an unknown number of elements.  */
1143                 if (c != EOF && c != '@' && c != '.')
1144                   do_ungetc (c);
1145               }
1146               /*FALLTHROUGH*/
1147             case '\'':
1148             case '`':
1149               {
1150                 struct object inner;
1151
1152                 read_object (&inner, null_context);
1153
1154                 /* Dots and EOF are not allowed here.  But be tolerant.  */
1155
1156                 free_object (&inner);
1157
1158                 op->type = t_other;
1159                 last_non_comment_line = line_number;
1160                 return;
1161               }
1162
1163             case ';':
1164               {
1165                 bool all_semicolons = true;
1166
1167                 last_comment_line = line_number;
1168                 comment_start ();
1169                 for (;;)
1170                   {
1171                     int c = do_getc ();
1172                     if (c == EOF || c == '\n')
1173                       break;
1174                     if (c != ';')
1175                       all_semicolons = false;
1176                     if (!all_semicolons)
1177                       {
1178                         /* We skip all leading white space, but not EOLs.  */
1179                         if (!(buflen == 0 && (c == ' ' || c == '\t')))
1180                           comment_add (c);
1181                       }
1182                   }
1183                 comment_line_end (0);
1184                 continue;
1185               }
1186
1187             case '"':
1188               {
1189                 op->token = (struct token *) xmalloc (sizeof (struct token));
1190                 init_token (op->token);
1191                 op->line_number_at_start = line_number;
1192                 for (;;)
1193                   {
1194                     int c = do_getc ();
1195                     if (c == EOF)
1196                       /* Invalid input.  Be tolerant, no error message.  */
1197                       break;
1198                     if (c == '"')
1199                       break;
1200                     if (c == '\\') /* syntax_single_esc */
1201                       {
1202                         c = do_getc ();
1203                         if (c == EOF)
1204                           /* Invalid input.  Be tolerant, no error message.  */
1205                           break;
1206                       }
1207                     grow_token (op->token);
1208                     op->token->chars[op->token->charcount++].ch = c;
1209                   }
1210                 op->type = t_string;
1211
1212                 if (extract_all)
1213                   {
1214                     lex_pos_ty pos;
1215
1216                     pos.file_name = logical_file_name;
1217                     pos.line_number = op->line_number_at_start;
1218                     remember_a_message (mlp, string_of_object (op),
1219                                         null_context, &pos);
1220                   }
1221                 last_non_comment_line = line_number;
1222                 return;
1223               }
1224
1225             case '#':
1226               /* Dispatch macro handling.  */
1227               {
1228                 int c;
1229
1230                 for (;;)
1231                   {
1232                     c = do_getc ();
1233                     if (c == EOF)
1234                       /* Invalid input.  Be tolerant, no error message.  */
1235                       {
1236                         op->type = t_other;
1237                         return;
1238                       }
1239                     if (!(c >= '0' && c <= '9'))
1240                       break;
1241                   }
1242
1243                 switch (c)
1244                   {
1245                   case '(':
1246                   case '"':
1247                     do_ungetc (c);
1248                     /*FALLTHROUGH*/
1249                   case '\'':
1250                   case ':':
1251                   case '.':
1252                   case ',':
1253                   case 'A': case 'a':
1254                   case 'C': case 'c':
1255                   case 'P': case 'p':
1256                   case 'S': case 's':
1257                     {
1258                       struct object inner;
1259                       read_object (&inner, null_context);
1260                       /* Dots and EOF are not allowed here.
1261                          But be tolerant.  */
1262                       free_object (&inner);
1263                       op->type = t_other;
1264                       last_non_comment_line = line_number;
1265                       return;
1266                     }
1267
1268                   case '|':
1269                     {
1270                       int depth = 0;
1271                       int c;
1272
1273                       comment_start ();
1274                       c = do_getc ();
1275                       for (;;)
1276                         {
1277                           if (c == EOF)
1278                             break;
1279                           if (c == '|')
1280                             {
1281                               c = do_getc ();
1282                               if (c == EOF)
1283                                 break;
1284                               if (c == '#')
1285                                 {
1286                                   if (depth == 0)
1287                                     {
1288                                       comment_line_end (0);
1289                                       break;
1290                                     }
1291                                   depth--;
1292                                   comment_add ('|');
1293                                   comment_add ('#');
1294                                   c = do_getc ();
1295                                 }
1296                               else
1297                                 comment_add ('|');
1298                             }
1299                           else if (c == '#')
1300                             {
1301                               c = do_getc ();
1302                               if (c == EOF)
1303                                 break;
1304                               comment_add ('#');
1305                               if (c == '|')
1306                                 {
1307                                   depth++;
1308                                   comment_add ('|');
1309                                   c = do_getc ();
1310                                 }
1311                             }
1312                           else
1313                             {
1314                               /* We skip all leading white space.  */
1315                               if (!(buflen == 0 && (c == ' ' || c == '\t')))
1316                                 comment_add (c);
1317                               if (c == '\n')
1318                                 {
1319                                   comment_line_end (1);
1320                                   comment_start ();
1321                                 }
1322                               c = do_getc ();
1323                             }
1324                         }
1325                       if (c == EOF)
1326                         {
1327                           /* EOF not allowed here.  But be tolerant.  */
1328                           op->type = t_eof;
1329                           return;
1330                         }
1331                       last_comment_line = line_number;
1332                       continue;
1333                     }
1334
1335                   case '\\':
1336                     {
1337                       struct token token;
1338                       struct char_syntax first;
1339                       first.ch = '\\';
1340                       first.scode = syntax_single_esc;
1341                       read_token (&token, &first);
1342                       free_token (&token);
1343                       op->type = t_other;
1344                       last_non_comment_line = line_number;
1345                       return;
1346                     }
1347
1348                   case 'B': case 'b':
1349                   case 'O': case 'o':
1350                   case 'X': case 'x':
1351                   case 'R': case 'r':
1352                   case '*':
1353                     {
1354                       struct token token;
1355                       read_token (&token, NULL);
1356                       free_token (&token);
1357                       op->type = t_other;
1358                       last_non_comment_line = line_number;
1359                       return;
1360                     }
1361
1362                   case '=':
1363                     /* Ignore read labels.  */
1364                     continue;
1365
1366                   case '#':
1367                     /* Don't bother looking up the corresponding object.  */
1368                     op->type = t_other;
1369                     last_non_comment_line = line_number;
1370                     return;
1371
1372                   case '+':
1373                   case '-':
1374                     /* Simply assume every feature expression is true.  */
1375                     {
1376                       struct object inner;
1377                       read_object (&inner, null_context);
1378                       /* Dots and EOF are not allowed here.
1379                          But be tolerant.  */
1380                       free_object (&inner);
1381                       continue;
1382                     }
1383
1384                   default:
1385                     op->type = t_other;
1386                     last_non_comment_line = line_number;
1387                     return;
1388                   }
1389                 /*NOTREACHED*/
1390                 abort ();
1391               }
1392
1393             default:
1394               /*NOTREACHED*/
1395               abort ();
1396             }
1397
1398         default:
1399           /*NOTREACHED*/
1400           abort ();
1401         }
1402     }
1403 }
1404
1405
1406 void
1407 extract_lisp (FILE *f,
1408               const char *real_filename, const char *logical_filename,
1409               flag_context_list_table_ty *flag_table,
1410               msgdomain_list_ty *mdlp)
1411 {
1412   mlp = mdlp->item[0]->messages;
1413
1414   fp = f;
1415   real_file_name = real_filename;
1416   logical_file_name = xstrdup (logical_filename);
1417   line_number = 1;
1418
1419   last_comment_line = -1;
1420   last_non_comment_line = -1;
1421
1422   flag_context_list_table = flag_table;
1423
1424   init_keywords ();
1425
1426   /* Eat tokens until eof is seen.  When read_object returns
1427      due to an unbalanced closing parenthesis, just restart it.  */
1428   do
1429     {
1430       struct object toplevel_object;
1431
1432       read_object (&toplevel_object, null_context);
1433
1434       if (toplevel_object.type == t_eof)
1435         break;
1436
1437       free_object (&toplevel_object);
1438     }
1439   while (!feof (fp));
1440
1441   /* Close scanner.  */
1442   fp = NULL;
1443   real_file_name = NULL;
1444   logical_file_name = NULL;
1445   line_number = 0;
1446 }