gnu/dist/gettext/gettext-tools/src/x-scheme.c

   1 /* xgettext Scheme backend.
   2    Copyright (C) 2004-2005 Free Software Foundation, Inc.
   3
   4    This file was written by Bruno Haible <bruno@clisp.org>, 2004-2005.
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with this program; if not, write to the Free Software Foundation,
  18    Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  19
  20 #ifdef HAVE_CONFIG_H
  21 # include "config.h"
  22 #endif
  23
  24 #include <errno.h>
  25 #include <stdbool.h>
  26 #include <stdio.h>
  27 #include <stdlib.h>
  28 #include <string.h>
  29
  30 #include "message.h"
  31 #include "xgettext.h"
  32 #include "x-scheme.h"
  33 #include "error.h"
  34 #include "xalloc.h"
  35 #include "exit.h"
  36 #include "hash.h"
  37 #include "gettext.h"
  38
  39 #define _(s) gettext(s)
  40
  41
  42 /* The Scheme syntax is described in R5RS.  It is implemented in
  43    guile-1.6.4/libguile/read.c.
  44    Since we are interested only in strings and in forms similar to
  45         (gettext msgid ...)
  46    or   (ngettext msgid msgid_plural ...)
  47    we make the following simplifications:
  48
  49    - Assume the keywords and strings are in an ASCII compatible encoding.
  50      This means we can read the input file one byte at a time, instead of
  51      one character at a time.  No need to worry about multibyte characters:
  52      If they occur as part of identifiers, they most probably act as
  53      constituent characters, and the byte based approach will do the same.
  54
  55    - Assume the read-hash-procedures is in the default state.
  56      Non-standard reader extensions are mostly used to read data, not programs.
  57
  58    The remaining syntax rules are:
  59
  60    - The syntax code assigned to each character, and how tokens are built
  61      up from characters (single escape, multiple escape etc.).
  62
  63    - Comment syntax: ';' and '#! ... \n!#\n'.
  64
  65    - String syntax: "..." with single escapes.
  66
  67    - Read macros and dispatch macro character '#'.  Needed to be able to
  68      tell which is the n-th argument of a function call.
  69
  70  */
  71
  72
  73 /* ====================== Keyword set customization.  ====================== */
  74
  75 /* If true extract all strings.  */
  76 static bool extract_all = false;
  77
  78 static hash_table keywords;
  79 static bool default_keywords = true;
  80
  81
  82 void
  83 x_scheme_extract_all ()
  84 {
  85   extract_all = true;
  86 }
  87
  88
  89 void
  90 x_scheme_keyword (const char *name)
  91 {
  92   if (name == NULL)
  93     default_keywords = false;
  94   else
  95     {
  96       const char *end;
  97       int argnum1;
  98       int argnum2;
  99       const char *colon;
 100
 101       if (keywords.table == NULL)
 102         init_hash (&keywords, 100);
 103
 104       split_keywordspec (name, &end, &argnum1, &argnum2);
 105
 106       /* The characters between name and end should form a valid Lisp symbol.
 107          Extract the symbol name part.  */
 108       colon = strchr (name, ':');
 109       if (colon != NULL && colon < end)
 110         {
 111           name = colon + 1;
 112           if (name < end && *name == ':')
 113             name++;
 114           colon = strchr (name, ':');
 115           if (colon != NULL && colon < end)
 116             return;
 117         }
 118
 119       if (argnum1 == 0)
 120         argnum1 = 1;
 121       insert_entry (&keywords, name, end - name,
 122                     (void *) (long) (argnum1 + (argnum2 << 10)));
 123     }
 124 }
 125
 126 /* Finish initializing the keywords hash table.
 127    Called after argument processing, before each file is processed.  */
 128 static void
 129 init_keywords ()
 130 {
 131   if (default_keywords)
 132     {
 133       x_scheme_keyword ("gettext");             /* libguile/i18n.c */
 134       x_scheme_keyword ("ngettext:1,2");        /* libguile/i18n.c */
 135       x_scheme_keyword ("gettext-noop");
 136       default_keywords = false;
 137     }
 138 }
 139
 140 void
 141 init_flag_table_scheme ()
 142 {
 143   xgettext_record_flag ("gettext:1:pass-scheme-format");
 144   xgettext_record_flag ("ngettext:1:pass-scheme-format");
 145   xgettext_record_flag ("ngettext:2:pass-scheme-format");
 146   xgettext_record_flag ("gettext-noop:1:pass-scheme-format");
 147   xgettext_record_flag ("format:2:scheme-format");
 148 }
 149
 150
 151 /* ======================== Reading of characters.  ======================== */
 152
 153 /* Real filename, used in error messages about the input file.  */
 154 static const char *real_file_name;
 155
 156 /* Logical filename and line number, used to label the extracted messages.  */
 157 static char *logical_file_name;
 158 static int line_number;
 159
 160 /* The input file stream.  */
 161 static FILE *fp;
 162
 163
 164 /* Fetch the next character from the input file.  */
 165 static int
 166 do_getc ()
 167 {
 168   int c = getc (fp);
 169
 170   if (c == EOF)
 171     {
 172       if (ferror (fp))
 173         error (EXIT_FAILURE, errno, _("\
 174 error while reading \"%s\""), real_file_name);
 175     }
 176   else if (c == '\n')
 177    line_number++;
 178
 179   return c;
 180 }
 181
 182 /* Put back the last fetched character, not EOF.  */
 183 static void
 184 do_ungetc (int c)
 185 {
 186   if (c == '\n')
 187     line_number--;
 188   ungetc (c, fp);
 189 }
 190
 191
 192 /* ========================== Reading of tokens.  ========================== */
 193
 194
 195 /* A token consists of a sequence of characters.  */
 196 struct token
 197 {
 198   int allocated;                /* number of allocated 'token_char's */
 199   int charcount;                /* number of used 'token_char's */
 200   char *chars;                  /* the token's constituents */
 201 };
 202
 203 /* Initialize a 'struct token'.  */
 204 static inline void
 205 init_token (struct token *tp)
 206 {
 207   tp->allocated = 10;
 208   tp->chars = (char *) xmalloc (tp->allocated * sizeof (char));
 209   tp->charcount = 0;
 210 }
 211
 212 /* Free the memory pointed to by a 'struct token'.  */
 213 static inline void
 214 free_token (struct token *tp)
 215 {
 216   free (tp->chars);
 217 }
 218
 219 /* Ensure there is enough room in the token for one more character.  */
 220 static inline void
 221 grow_token (struct token *tp)
 222 {
 223   if (tp->charcount == tp->allocated)
 224     {
 225       tp->allocated *= 2;
 226       tp->chars = (char *) xrealloc (tp->chars, tp->allocated * sizeof (char));
 227     }
 228 }
 229
 230 /* Read the next token.  'first' is the first character, which has already
 231    been read.  */
 232 static void
 233 read_token (struct token *tp, int first)
 234 {
 235   init_token (tp);
 236
 237   grow_token (tp);
 238   tp->chars[tp->charcount++] = first;
 239
 240   for (;;)
 241     {
 242       int c = do_getc ();
 243
 244       if (c == EOF)
 245         break;
 246       if (c == ' ' || c == '\r' || c == '\f' || c == '\t' || c == '\n'
 247           || c == '"' || c == '(' || c == ')' || c == ';')
 248         {
 249           do_ungetc (c);
 250           break;
 251         }
 252       grow_token (tp);
 253       tp->chars[tp->charcount++] = c;
 254     }
 255 }
 256
 257 /* Tests if a token represents an integer.
 258    Taken from guile-1.6.4/libguile/numbers.c:scm_istr2int().  */
 259 static inline bool
 260 is_integer_syntax (const char *str, int len, int radix)
 261 {
 262   const char *p = str;
 263   const char *p_end = str + len;
 264
 265   /* The accepted syntax is
 266        ['+'|'-'] DIGIT+
 267      where DIGIT is a hexadecimal digit whose value is below radix.  */
 268
 269   if (p == p_end)
 270     return false;
 271   if (*p == '+' || *p == '-')
 272     {
 273       p++;
 274       if (p == p_end)
 275         return false;
 276     }
 277   do
 278     {
 279       int c = *p++;
 280
 281       if (c >= '0' && c <= '9')
 282         c = c - '0';
 283       else if (c >= 'A' && c <= 'F')
 284         c = c - 'A' + 10;
 285       else if (c >= 'a' && c <= 'f')
 286         c = c - 'a' + 10;
 287       else
 288         return false;
 289       if (c >= radix)
 290         return false;
 291     }
 292   while (p < p_end);
 293   return true;
 294 }
 295
 296 /* Tests if a token represents a rational, floating-point or complex number.
 297    If unconstrained is false, only real numbers are accepted; otherwise,
 298    complex numbers are accepted as well.
 299    Taken from guile-1.6.4/libguile/numbers.c:scm_istr2flo().  */
 300 static inline bool
 301 is_other_number_syntax (const char *str, int len, int radix, bool unconstrained)
 302 {
 303   const char *p = str;
 304   const char *p_end = str + len;
 305   bool seen_sign;
 306   bool seen_digits;
 307
 308   /* The accepted syntaxes are:
 309      for a floating-point number:
 310        ['+'|'-'] DIGIT+ [EXPONENT]
 311        ['+'|'-'] DIGIT* '.' DIGIT+ [EXPONENT]
 312        where EXPONENT ::= ['d'|'e'|'f'|'l'|'s'] DIGIT+
 313        (Dot and exponent are allowed only if radix is 10.)
 314      for a rational number:
 315        ['+'|'-'] DIGIT+ '/' DIGIT+
 316      for a complex number:
 317        REAL-NUMBER {'+'|'-'} REAL-NUMBER-WITHOUT-SIGN 'i'
 318        REAL-NUMBER {'+'|'-'} 'i'
 319        {'+'|'-'} REAL-NUMBER-WITHOUT-SIGN 'i'
 320        {'+'|'-'} 'i'
 321        REAL-NUMBER '@' REAL-NUMBER
 322    */
 323   if (p == p_end)
 324     return false;
 325   /* Parse leading sign.  */
 326   seen_sign = false;
 327   if (*p == '+' || *p == '-')
 328     {
 329       p++;
 330       if (p == p_end)
 331         return false;
 332       seen_sign = true;
 333       /* Recognize complex number syntax: {'+'|'-'} 'i'  */
 334       if (unconstrained && (*p == 'I' || *p == 'i') && p + 1 == p_end)
 335         return true;
 336     }
 337   /* Parse digits before dot or exponent or slash.  */
 338   seen_digits = false;
 339   do
 340     {
 341       int c = *p;
 342
 343       if (c >= '0' && c <= '9')
 344         c = c - '0';
 345       else if (c >= 'A' && c <= 'F')
 346         {
 347           if (c >= 'D' && radix == 10) /* exponent? */
 348             break;
 349           c = c - 'A' + 10;
 350         }
 351       else if (c >= 'a' && c <= 'f')
 352         {
 353           if (c >= 'd' && radix == 10) /* exponent? */
 354             break;
 355           c = c - 'a' + 10;
 356         }
 357       else
 358         break;
 359       if (c >= radix)
 360         return false;
 361       seen_digits = true;
 362       p++;
 363     }
 364   while (p < p_end);
 365   /* If p == p_end, we know that seen_digits = true, and the number is an
 366      integer without exponent.  */
 367   if (p < p_end)
 368     {
 369       /* If we have no digits so far, we need a decimal point later.  */
 370       if (!seen_digits && !(*p == '.' && radix == 10))
 371         return false;
 372       /* Trailing '#' signs are equivalent to zeroes.  */
 373       while (p < p_end && *p == '#')
 374         p++;
 375       if (p < p_end)
 376         {
 377           if (*p == '/')
 378             {
 379               /* Parse digits after the slash.  */
 380               bool all_zeroes = true;
 381               p++;
 382               for (; p < p_end; p++)
 383                 {
 384                   int c = *p;
 385
 386                   if (c >= '0' && c <= '9')
 387                     c = c - '0';
 388                   else if (c >= 'A' && c <= 'F')
 389                     c = c - 'A' + 10;
 390                   else if (c >= 'a' && c <= 'f')
 391                     c = c - 'a' + 10;
 392                   else
 393                     break;
 394                   if (c >= radix)
 395                     return false;
 396                   if (c != 0)
 397                     all_zeroes = false;
 398                 }
 399               /* A zero denominator is not allowed.  */
 400               if (all_zeroes)
 401                 return false;
 402               /* Trailing '#' signs are equivalent to zeroes.  */
 403               while (p < p_end && *p == '#')
 404                 p++;
 405             }
 406           else
 407             {
 408               if (*p == '.')
 409                 {
 410                   /* Decimal point notation.  */
 411                   if (radix != 10)
 412                     return false;
 413                   /* Parse digits after the decimal point.  */
 414                   p++;
 415                   for (; p < p_end; p++)
 416                     {
 417                       int c = *p;
 418
 419                       if (c >= '0' && c <= '9')
 420                         seen_digits = true;
 421                       else
 422                         break;
 423                     }
 424                   /* Digits are required before or after the decimal point.  */
 425                   if (!seen_digits)
 426                     return false;
 427                   /* Trailing '#' signs are equivalent to zeroes.  */
 428                   while (p < p_end && *p == '#')
 429                     p++;
 430                 }
 431               if (p < p_end)
 432                 {
 433                   /* Parse exponent.  */
 434                   switch (*p)
 435                     {
 436                     case 'D': case 'd':
 437                     case 'E': case 'e':
 438                     case 'F': case 'f':
 439                     case 'L': case 'l':
 440                     case 'S': case 's':
 441                       if (radix != 10)
 442                         return false;
 443                       p++;
 444                       if (p == p_end)
 445                         return false;
 446                       if (*p == '+' || *p == '-')
 447                         {
 448                           p++;
 449                           if (p == p_end)
 450                             return false;
 451                         }
 452                       if (!(*p >= '0' && *p <= '9'))
 453                         return false;
 454                       for (;;)
 455                         {
 456                           p++;
 457                           if (p == p_end)
 458                             break;
 459                           if (!(*p >= '0' && *p <= '9'))
 460                             break;
 461                         }
 462                       break;
 463                     default:
 464                       break;
 465                     }
 466                 }
 467             }
 468         }
 469     }
 470   if (p == p_end)
 471     return true;
 472   /* Recognize complex number syntax.  */
 473   if (unconstrained)
 474     {
 475       /* Recognize the syntax  {'+'|'-'} REAL-NUMBER-WITHOUT-SIGN 'i'  */
 476       if (seen_sign && (*p == 'I' || *p == 'i') && p + 1 == p_end)
 477         return true;
 478       /* Recognize the syntaxes
 479            REAL-NUMBER {'+'|'-'} REAL-NUMBER-WITHOUT-SIGN 'i'
 480            REAL-NUMBER {'+'|'-'} 'i'
 481        */
 482       if (*p == '+' || *p == '-')
 483         return (p_end[-1] == 'I' || p_end[-1] == 'i')
 484                 && (p + 1 == p_end - 1
 485                     || is_other_number_syntax (p, p_end - 1 - p, radix, false));
 486       /* Recognize the syntax  REAL-NUMBER '@' REAL-NUMBER  */
 487       if (*p == '@')
 488         {
 489           p++;
 490           return is_other_number_syntax (p, p_end - p, radix, false);
 491         }
 492     }
 493   return false;
 494 }
 495
 496 /* Tests if a token represents a number.
 497    Taken from guile-1.6.4/libguile/numbers.c:scm_istring2number().  */
 498 static bool
 499 is_number (const struct token *tp)
 500 {
 501   const char *str = tp->chars;
 502   int len = tp->charcount;
 503   int radix = 10;
 504   enum { unknown, exact, inexact } exactness = unknown;
 505   bool seen_radix_prefix = false;
 506   bool seen_exactness_prefix = false;
 507
 508   if (len == 1)
 509     if (*str == '+' || *str == '-')
 510       return false;
 511   while (len >= 2 && *str == '#')
 512     {
 513       switch (str[1])
 514         {
 515         case 'B': case 'b':
 516           if (seen_radix_prefix)
 517             return false;
 518           radix = 2;
 519           seen_radix_prefix = true;
 520           break;
 521         case 'O': case 'o':
 522           if (seen_radix_prefix)
 523             return false;
 524           radix = 8;
 525           seen_radix_prefix = true;
 526           break;
 527         case 'D': case 'd':
 528           if (seen_radix_prefix)
 529             return false;
 530           radix = 10;
 531           seen_radix_prefix = true;
 532           break;
 533         case 'X': case 'x':
 534           if (seen_radix_prefix)
 535             return false;
 536           radix = 16;
 537           seen_radix_prefix = true;
 538           break;
 539         case 'E': case 'e':
 540           if (seen_exactness_prefix)
 541             return false;
 542           exactness = exact;
 543           seen_exactness_prefix = true;
 544           break;
 545         case 'I': case 'i':
 546           if (seen_exactness_prefix)
 547             return false;
 548           exactness = inexact;
 549           seen_exactness_prefix = true;
 550           break;
 551         default:
 552           return false;
 553         }
 554       str += 2;
 555       len -= 2;
 556     }
 557   if (exactness != inexact)
 558     {
 559       /* Try to parse an integer.  */
 560       if (is_integer_syntax (str, len, 10))
 561         return true;
 562       /* FIXME: Other Scheme implementations support exact rational numbers
 563          or exact complex numbers.  */
 564     }
 565   if (exactness != exact)
 566     {
 567       /* Try to parse a rational, floating-point or complex number.  */
 568       if (is_other_number_syntax (str, len, 10, true))
 569         return true;
 570     }
 571   return false;
 572 }
 573
 574
 575 /* ========================= Accumulating comments ========================= */
 576
 577
 578 static char *buffer;
 579 static size_t bufmax;
 580 static size_t buflen;
 581
 582 static inline void
 583 comment_start ()
 584 {
 585   buflen = 0;
 586 }
 587
 588 static inline void
 589 comment_add (int c)
 590 {
 591   if (buflen >= bufmax)
 592     {
 593       bufmax = 2 * bufmax + 10;
 594       buffer = xrealloc (buffer, bufmax);
 595     }
 596   buffer[buflen++] = c;
 597 }
 598
 599 static inline void
 600 comment_line_end (size_t chars_to_remove)
 601 {
 602   buflen -= chars_to_remove;
 603   while (buflen >= 1
 604          && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
 605     --buflen;
 606   if (chars_to_remove == 0 && buflen >= bufmax)
 607     {
 608       bufmax = 2 * bufmax + 10;
 609       buffer = xrealloc (buffer, bufmax);
 610     }
 611   buffer[buflen] = '\0';
 612   xgettext_comment_add (buffer);
 613 }
 614
 615
 616 /* These are for tracking whether comments count as immediately before
 617    keyword.  */
 618 static int last_comment_line;
 619 static int last_non_comment_line;
 620
 621
 622 /* ========================= Accumulating messages ========================= */
 623
 624
 625 static message_list_ty *mlp;
 626
 627
 628 /* ========================== Reading of objects.  ========================= */
 629
 630
 631 /* We are only interested in symbols (e.g. gettext or ngettext) and strings.
 632    Other objects need not to be represented precisely.  */
 633 enum object_type
 634 {
 635   t_symbol,     /* symbol */
 636   t_string,     /* string */
 637   t_other,      /* other kind of real object */
 638   t_dot,        /* '.' pseudo object */
 639   t_close,      /* ')' pseudo object */
 640   t_eof         /* EOF marker */
 641 };
 642
 643 struct object
 644 {
 645   enum object_type type;
 646   struct token *token;          /* for t_symbol and t_string */
 647   int line_number_at_start;     /* for t_string */
 648 };
 649
 650 /* Free the memory pointed to by a 'struct object'.  */
 651 static inline void
 652 free_object (struct object *op)
 653 {
 654   if (op->type == t_symbol || op->type == t_string)
 655     {
 656       free_token (op->token);
 657       free (op->token);
 658     }
 659 }
 660
 661 /* Convert a t_symbol/t_string token to a char*.  */
 662 static char *
 663 string_of_object (const struct object *op)
 664 {
 665   char *str;
 666   int n;
 667
 668   if (!(op->type == t_symbol || op->type == t_string))
 669     abort ();
 670   n = op->token->charcount;
 671   str = (char *) xmalloc (n + 1);
 672   memcpy (str, op->token->chars, n);
 673   str[n] = '\0';
 674   return str;
 675 }
 676
 677 /* Context lookup table.  */
 678 static flag_context_list_table_ty *flag_context_list_table;
 679
 680 /* Read the next object.  */
 681 static void
 682 read_object (struct object *op, flag_context_ty outer_context)
 683 {
 684   for (;;)
 685     {
 686       int c = do_getc ();
 687
 688       switch (c)
 689         {
 690         case EOF:
 691           op->type = t_eof;
 692           return;
 693
 694         case ' ': case '\r': case '\f': case '\t':
 695           continue;
 696
 697         case '\n':
 698           /* Comments assumed to be grouped with a message must immediately
 699              precede it, with no non-whitespace token on a line between
 700              both.  */
 701           if (last_non_comment_line > last_comment_line)
 702             xgettext_comment_reset ();
 703           continue;
 704
 705         case ';':
 706           {
 707             bool all_semicolons = true;
 708
 709             last_comment_line = line_number;
 710             comment_start ();
 711             for (;;)
 712               {
 713                 c = do_getc ();
 714                 if (c == EOF || c == '\n')
 715                   break;
 716                 if (c != ';')
 717                   all_semicolons = false;
 718                 if (!all_semicolons)
 719                   {
 720                     /* We skip all leading white space, but not EOLs.  */
 721                     if (!(buflen == 0 && (c == ' ' || c == '\t')))
 722                       comment_add (c);
 723                   }
 724               }
 725             comment_line_end (0);
 726             continue;
 727           }
 728
 729         case '(':
 730           {
 731              int arg = 0;               /* Current argument number.  */
 732              flag_context_list_iterator_ty context_iter;
 733              int argnum1 = 0;   /* First string position.  */
 734              int argnum2 = 0;   /* Plural string position.  */
 735              message_ty *plural_mp = NULL;      /* Remember the msgid.  */
 736
 737              for (;; arg++)
 738                {
 739                 struct object inner;
 740                 flag_context_ty inner_context;
 741
 742                 if (arg == 0)
 743                   inner_context = null_context;
 744                 else
 745                   inner_context =
 746                     inherited_context (outer_context,
 747                                        flag_context_list_iterator_advance (
 748                                          &context_iter));
 749
 750                 read_object (&inner, inner_context);
 751
 752                 /* Recognize end of list.  */
 753                 if (inner.type == t_close)
 754                   {
 755                     op->type = t_other;
 756                     last_non_comment_line = line_number;
 757                     return;
 758                   }
 759
 760                 /* Dots are not allowed in every position.
 761                    But be tolerant.  */
 762
 763                 /* EOF inside list is illegal.
 764                    But be tolerant.  */
 765                 if (inner.type == t_eof)
 766                   break;
 767
 768                 if (arg == 0)
 769                   {
 770                     /* This is the function position.  */
 771                     if (inner.type == t_symbol)
 772                       {
 773                         char *symbol_name = string_of_object (&inner);
 774                         void *keyword_value;
 775
 776                         if (find_entry (&keywords,
 777                                         symbol_name, strlen (symbol_name),
 778                                         &keyword_value)
 779                             == 0)
 780                           {
 781                             argnum1 = (int) (long) keyword_value & ((1 << 10) - 1);
 782                             argnum2 = (int) (long) keyword_value >> 10;
 783                           }
 784
 785                         context_iter =
 786                           flag_context_list_iterator (
 787                             flag_context_list_table_lookup (
 788                               flag_context_list_table,
 789                               symbol_name, strlen (symbol_name)));
 790
 791                         free (symbol_name);
 792                       }
 793                     else
 794                       context_iter = null_context_list_iterator;
 795                   }
 796                 else
 797                   {
 798                     /* These are the argument positions.
 799                        Extract a string if we have reached the right
 800                        argument position.  */
 801                     if (arg == argnum1)
 802                       {
 803                         if (inner.type == t_string)
 804                           {
 805                             lex_pos_ty pos;
 806                             message_ty *mp;
 807
 808                             pos.file_name = logical_file_name;
 809                             pos.line_number = inner.line_number_at_start;
 810                             mp = remember_a_message (mlp, string_of_object (&inner),
 811                                                      inner_context, &pos);
 812                             if (argnum2 > 0)
 813                               plural_mp = mp;
 814                           }
 815                       }
 816                     else if (arg == argnum2)
 817                       {
 818                         if (inner.type == t_string && plural_mp != NULL)
 819                           {
 820                             lex_pos_ty pos;
 821
 822                             pos.file_name = logical_file_name;
 823                             pos.line_number = inner.line_number_at_start;
 824                             remember_a_message_plural (plural_mp, string_of_object (&inner),
 825                                                        inner_context, &pos);
 826                           }
 827                       }
 828                   }
 829
 830                 free_object (&inner);
 831               }
 832           }
 833           op->type = t_other;
 834           last_non_comment_line = line_number;
 835           return;
 836
 837         case ')':
 838           /* Tell the caller about the end of list.
 839              Unmatched closing parenthesis is illegal.
 840              But be tolerant.  */
 841           op->type = t_close;
 842           last_non_comment_line = line_number;
 843           return;
 844
 845         case ',':
 846           {
 847             int c = do_getc ();
 848             /* The ,@ handling inside lists is wrong anyway, because
 849                ,@form expands to an unknown number of elements.  */
 850             if (c != EOF && c != '@')
 851               do_ungetc (c);
 852           }
 853           /*FALLTHROUGH*/
 854         case '\'':
 855         case '`':
 856           {
 857             struct object inner;
 858
 859             read_object (&inner, null_context);
 860
 861             /* Dots and EOF are not allowed here.  But be tolerant.  */
 862
 863             free_object (&inner);
 864
 865             op->type = t_other;
 866             last_non_comment_line = line_number;
 867             return;
 868           }
 869
 870         case '#':
 871           /* Dispatch macro handling.  */
 872           {
 873             c = do_getc ();
 874             if (c == EOF)
 875               /* Invalid input.  Be tolerant, no error message.  */
 876               {
 877                 op->type = t_other;
 878                 return;
 879               }
 880
 881             switch (c)
 882               {
 883               case '(': /* Vector */
 884                 do_ungetc (c);
 885                 {
 886                   struct object inner;
 887                   read_object (&inner, null_context);
 888                   /* Dots and EOF are not allowed here.
 889                      But be tolerant.  */
 890                   free_object (&inner);
 891                   op->type = t_other;
 892                   last_non_comment_line = line_number;
 893                   return;
 894                 }
 895
 896               case 'T': case 't': /* Boolean true */
 897               case 'F': case 'f': /* Boolean false */
 898                 op->type = t_other;
 899                 last_non_comment_line = line_number;
 900                 return;
 901
 902               case 'B': case 'b':
 903               case 'O': case 'o':
 904               case 'D': case 'd':
 905               case 'X': case 'x':
 906               case 'E': case 'e':
 907               case 'I': case 'i':
 908                 {
 909                   struct token token;
 910                   do_ungetc (c);
 911                   read_token (&token, '#');
 912                   if (is_number (&token))
 913                     {
 914                       /* A number.  */
 915                       free_token (&token);
 916                       op->type = t_other;
 917                       last_non_comment_line = line_number;
 918                       return;
 919                     }
 920                   else
 921                     {
 922                       if (token.charcount == 2
 923                           && (token.chars[1] == 'e' || token.chars[1] == 'i'))
 924                         {
 925                           c = do_getc ();
 926                           if (c != EOF)
 927                             do_ungetc (c);
 928                           if (c == '(')
 929                             /* Homogenous vector syntax, see arrays.scm.  */
 930                             case 'a':   /* Vectors of char */
 931                             case 'c':   /* Vectors of complex */
 932                           /*case 'e':*/ /* Vectors of long */
 933                             case 'h':   /* Vectors of short */
 934                           /*case 'i':*/ /* Vectors of double-float */
 935                             case 'l':   /* Vectors of long long */
 936                             case 's':   /* Vectors of single-float */
 937                             case 'u':   /* Vectors of unsigned long */
 938                             case 'y':   /* Vectors of byte */
 939                               {
 940                                 struct object inner;
 941                                 read_object (&inner, null_context);
 942                                 /* Dots and EOF are not allowed here.
 943                                    But be tolerant.  */
 944                                 free_token (&token);
 945                                 free_object (&inner);
 946                                 op->type = t_other;
 947                                 last_non_comment_line = line_number;
 948                                 return;
 949                               }
 950                         }
 951                       /* Unknown # object.  But be tolerant.  */
 952                       free_token (&token);
 953                       op->type = t_other;
 954                       last_non_comment_line = line_number;
 955                       return;
 956                     }
 957                 }
 958
 959               case '!':
 960                 /* Block comment '#! ... \n!#\n'.  We don't extract it
 961                    because it's only used to introduce scripts on Unix.  */
 962                 {
 963                   int last1 = 0;
 964                   int last2 = 0;
 965                   int last3 = 0;
 966
 967                   for (;;)
 968                     {
 969                       c = do_getc ();
 970                       if (c == EOF)
 971                         /* EOF is not allowed here.  But be tolerant.  */
 972                         break;
 973                       if (last3 == '\n' && last2 == '!' && last1 == '#'
 974                           && c == '\n')
 975                         break;
 976                       last3 = last2;
 977                       last2 = last1;
 978                       last1 = c;
 979                     }
 980                   continue;
 981                 }
 982
 983               case '*':
 984                 /* Bit vector.  */
 985                 {
 986                   struct token token;
 987                   read_token (&token, c);
 988                   /* The token should consists only of '0' and '1', except
 989                      for the initial '*'.  But be tolerant.  */
 990                   free_token (&token);
 991                   op->type = t_other;
 992                   last_non_comment_line = line_number;
 993                   return;
 994                 }
 995
 996               case '{':
 997                 /* Symbol with multiple escapes: #{...}#  */
 998                 {
 999                   op->token = (struct token *) xmalloc (sizeof (struct token));
1000
1001                   init_token (op->token);
1002
1003                   for (;;)
1004                     {
1005                       c = do_getc ();
1006
1007                       if (c == EOF)
1008                         break;
1009                       if (c == '\\')
1010                         {
1011                           c = do_getc ();
1012                           if (c == EOF)
1013                             break;
1014                         }
1015                       else if (c == '}')
1016                         {
1017                           c = do_getc ();
1018                           if (c == '#')
1019                             break;
1020                           if (c != EOF)
1021                             do_ungetc (c);
1022                           c = '}';
1023                         }
1024                       grow_token (op->token);
1025                       op->token->chars[op->token->charcount++] = c;
1026                     }
1027
1028                   op->type = t_symbol;
1029                   last_non_comment_line = line_number;
1030                   return;
1031                 }
1032
1033               case '\\':
1034                 /* Character.  */
1035                 {
1036                   struct token token;
1037                   c = do_getc ();
1038                   if (c != EOF)
1039                     {
1040                       read_token (&token, c);
1041                       free_token (&token);
1042                     }
1043                   op->type = t_other;
1044                   last_non_comment_line = line_number;
1045                   return;
1046                 }
1047
1048               case ':': /* Keyword.  */
1049               case '&': /* Deprecated keyword, installed in optargs.scm.  */
1050                 {
1051                   struct token token;
1052                   read_token (&token, '-');
1053                   free_token (&token);
1054                   op->type = t_other;
1055                   last_non_comment_line = line_number;
1056                   return;
1057                 }
1058
1059               /* The following are installed through read-hash-extend.  */
1060
1061               /* arrays.scm */
1062               case '0': case '1': case '2': case '3': case '4':
1063               case '5': case '6': case '7': case '8': case '9':
1064                 /* Multidimensional array syntax: #nx(...) where
1065                      n ::= DIGIT+
1066                      x ::= {'a'|'b'|'c'|'e'|'i'|'s'|'u'}
1067                  */
1068                 do
1069                   c = do_getc ();
1070                 while (c >= '0' && c <= '9');
1071                 /* c should be one of {'a'|'b'|'c'|'e'|'i'|'s'|'u'}.
1072                    But be tolerant.  */
1073                 /*FALLTHROUGH*/
1074               case '\'': /* boot-9.scm */
1075               case '.': /* boot-9.scm */
1076               case ',': /* srfi-10.scm */
1077                 {
1078                   struct object inner;
1079                   read_object (&inner, null_context);
1080                   /* Dots and EOF are not allowed here.
1081                      But be tolerant.  */
1082                   free_object (&inner);
1083                   op->type = t_other;
1084                   last_non_comment_line = line_number;
1085                   return;
1086                 }
1087
1088               default:
1089                 /* Unknown.  */
1090                 op->type = t_other;
1091                 last_non_comment_line = line_number;
1092                 return;
1093               }
1094             /*NOTREACHED*/
1095             abort ();
1096           }
1097
1098         case '"':
1099           {
1100             op->token = (struct token *) xmalloc (sizeof (struct token));
1101             init_token (op->token);
1102             op->line_number_at_start = line_number;
1103             for (;;)
1104               {
1105                 int c = do_getc ();
1106                 if (c == EOF)
1107                   /* Invalid input.  Be tolerant, no error message.  */
1108                   break;
1109                 if (c == '"')
1110                   break;
1111                 if (c == '\\')
1112                   {
1113                     c = do_getc ();
1114                     if (c == EOF)
1115                       /* Invalid input.  Be tolerant, no error message.  */
1116                       break;
1117                     switch (c)
1118                       {
1119                       case '\n':
1120                         continue;
1121                       case '0':
1122                         c = '\0';
1123                         break;
1124                       case 'a':
1125                         c = '\a';
1126                         break;
1127                       case 'f':
1128                         c = '\f';
1129                         break;
1130                       case 'n':
1131                         c = '\n';
1132                         break;
1133                       case 'r':
1134                         c = '\r';
1135                         break;
1136                       case 't':
1137                         c = '\t';
1138                         break;
1139                       case 'v':
1140                         c = '\v';
1141                         break;
1142                       default:
1143                         break;
1144                       }
1145                   }
1146                 grow_token (op->token);
1147                 op->token->chars[op->token->charcount++] = c;
1148               }
1149             op->type = t_string;
1150
1151             if (extract_all)
1152               {
1153                 lex_pos_ty pos;
1154
1155                 pos.file_name = logical_file_name;
1156                 pos.line_number = op->line_number_at_start;
1157                 remember_a_message (mlp, string_of_object (op),
1158                                     null_context, &pos);
1159               }
1160             last_non_comment_line = line_number;
1161             return;
1162           }
1163
1164         case '0': case '1': case '2': case '3': case '4':
1165         case '5': case '6': case '7': case '8': case '9':
1166         case '+': case '-': case '.':
1167           /* Read a number or symbol token.  */
1168           op->token = (struct token *) xmalloc (sizeof (struct token));
1169           read_token (op->token, c);
1170           if (op->token->charcount == 1 && op->token->chars[0] == '.')
1171             {
1172               free_token (op->token);
1173               free (op->token);
1174               op->type = t_dot;
1175             }
1176           else if (is_number (op->token))
1177             {
1178               /* A number.  */
1179               free_token (op->token);
1180               free (op->token);
1181               op->type = t_other;
1182             }
1183           else
1184             {
1185               /* A symbol.  */
1186               op->type = t_symbol;
1187             }
1188           last_non_comment_line = line_number;
1189           return;
1190
1191         case ':':
1192         default:
1193           /* Read a symbol token.  */
1194           op->token = (struct token *) xmalloc (sizeof (struct token));
1195           read_token (op->token, c);
1196           op->type = t_symbol;
1197           last_non_comment_line = line_number;
1198           return;
1199         }
1200     }
1201 }
1202
1203
1204 void
1205 extract_scheme (FILE *f,
1206                 const char *real_filename, const char *logical_filename,
1207                 flag_context_list_table_ty *flag_table,
1208                 msgdomain_list_ty *mdlp)
1209 {
1210   mlp = mdlp->item[0]->messages;
1211
1212   fp = f;
1213   real_file_name = real_filename;
1214   logical_file_name = xstrdup (logical_filename);
1215   line_number = 1;
1216
1217   last_comment_line = -1;
1218   last_non_comment_line = -1;
1219
1220   flag_context_list_table = flag_table;
1221
1222   init_keywords ();
1223
1224   /* Eat tokens until eof is seen.  When read_object returns
1225      due to an unbalanced closing parenthesis, just restart it.  */
1226   do
1227     {
1228       struct object toplevel_object;
1229
1230       read_object (&toplevel_object, null_context);
1231
1232       if (toplevel_object.type == t_eof)
1233         break;
1234
1235       free_object (&toplevel_object);
1236     }
1237   while (!feof (fp));
1238
1239   /* Close scanner.  */
1240   fp = NULL;
1241   real_file_name = NULL;
1242   logical_file_name = NULL;
1243   line_number = 0;
1244 }