gnu/dist/gettext/gettext-tools/src/x-elisp.c

   1 /* xgettext Emacs Lisp backend.
   2    Copyright (C) 2001-2003 Free Software Foundation, Inc.
   3
   4    This file was written by Bruno Haible <haible@clisp.cons.org>, 2001-2002.
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with this program; if not, write to the Free Software Foundation,
  18    Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  19
  20 #ifdef HAVE_CONFIG_H
  21 # include "config.h"
  22 #endif
  23
  24 #include <errno.h>
  25 #include <stdbool.h>
  26 #include <stdio.h>
  27 #include <stdlib.h>
  28 #include <string.h>
  29
  30 #include "message.h"
  31 #include "xgettext.h"
  32 #include "x-elisp.h"
  33 #include "error.h"
  34 #include "xalloc.h"
  35 #include "exit.h"
  36 #include "hash.h"
  37 #include "c-ctype.h"
  38 #include "gettext.h"
  39
  40 #define _(s) gettext(s)
  41
  42
  43 /* Summary of Emacs Lisp syntax:
  44    - ';' starts a comment until end of line.
  45    - '#@nn' starts a comment of nn bytes.
  46    - Integers are constituted of an optional prefix (#b, #B for binary,
  47      #o, #O for octal, #x, #X for hexadecimal, #nnr, #nnR for any radix),
  48      an optional sign (+ or -), the digits, and an optional trailing dot.
  49    - Characters are written as '?' followed by the character, possibly
  50      with an escape sequence, for examples '?a', '?\n', '?\177'.
  51    - Strings are delimited by double quotes. Backslash introduces an escape
  52      sequence. The following are understood: '\n', '\r', '\f', '\t', '\a',
  53      '\\', '\^C', '\012' (octal), '\x12' (hexadecimal).
  54    - Symbols: can contain meta-characters if preceded by backslash.
  55    - Uninterned symbols: written as #:SYMBOL.
  56    - () delimit lists.
  57    - [] delimit vectors.
  58    The reader is implemented in emacs-21.1/src/lread.c.  */
  59
  60
  61 /* ====================== Keyword set customization.  ====================== */
  62
  63 /* If true extract all strings.  */
  64 static bool extract_all = false;
  65
  66 static hash_table keywords;
  67 static bool default_keywords = true;
  68
  69
  70 void
  71 x_elisp_extract_all ()
  72 {
  73   extract_all = true;
  74 }
  75
  76
  77 void
  78 x_elisp_keyword (const char *name)
  79 {
  80   if (name == NULL)
  81     default_keywords = false;
  82   else
  83     {
  84       const char *end;
  85       int argnum1;
  86       int argnum2;
  87       const char *colon;
  88
  89       if (keywords.table == NULL)
  90         init_hash (&keywords, 100);
  91
  92       split_keywordspec (name, &end, &argnum1, &argnum2);
  93
  94       /* The characters between name and end should form a valid Lisp
  95          symbol.  */
  96       colon = strchr (name, ':');
  97       if (colon == NULL || colon >= end)
  98         {
  99           if (argnum1 == 0)
 100             argnum1 = 1;
 101           insert_entry (&keywords, name, end - name,
 102                         (void *) (long) (argnum1 + (argnum2 << 10)));
 103         }
 104     }
 105 }
 106
 107 /* Finish initializing the keywords hash table.
 108    Called after argument processing, before each file is processed.  */
 109 static void
 110 init_keywords ()
 111 {
 112   if (default_keywords)
 113     {
 114       x_elisp_keyword ("_");
 115       default_keywords = false;
 116     }
 117 }
 118
 119 void
 120 init_flag_table_elisp ()
 121 {
 122   xgettext_record_flag ("_:1:pass-elisp-format");
 123   xgettext_record_flag ("format:1:elisp-format");
 124 }
 125
 126
 127 /* ======================== Reading of characters.  ======================== */
 128
 129 /* Real filename, used in error messages about the input file.  */
 130 static const char *real_file_name;
 131
 132 /* Logical filename and line number, used to label the extracted messages.  */
 133 static char *logical_file_name;
 134 static int line_number;
 135
 136 /* The input file stream.  */
 137 static FILE *fp;
 138
 139
 140 /* Fetch the next character from the input file.  */
 141 static int
 142 do_getc ()
 143 {
 144   int c = getc (fp);
 145
 146   if (c == EOF)
 147     {
 148       if (ferror (fp))
 149         error (EXIT_FAILURE, errno, _("\
 150 error while reading \"%s\""), real_file_name);
 151     }
 152   else if (c == '\n')
 153    line_number++;
 154
 155   return c;
 156 }
 157
 158 /* Put back the last fetched character, not EOF.  */
 159 static void
 160 do_ungetc (int c)
 161 {
 162   if (c == '\n')
 163     line_number--;
 164   ungetc (c, fp);
 165 }
 166
 167
 168 /* ========================== Reading of tokens.  ========================== */
 169
 170
 171 /* A token consists of a sequence of characters.  */
 172 struct token
 173 {
 174   int allocated;                /* number of allocated 'token_char's */
 175   int charcount;                /* number of used 'token_char's */
 176   char *chars;                  /* the token's constituents */
 177 };
 178
 179 /* Initialize a 'struct token'.  */
 180 static inline void
 181 init_token (struct token *tp)
 182 {
 183   tp->allocated = 10;
 184   tp->chars = (char *) xmalloc (tp->allocated * sizeof (char));
 185   tp->charcount = 0;
 186 }
 187
 188 /* Free the memory pointed to by a 'struct token'.  */
 189 static inline void
 190 free_token (struct token *tp)
 191 {
 192   free (tp->chars);
 193 }
 194
 195 /* Ensure there is enough room in the token for one more character.  */
 196 static inline void
 197 grow_token (struct token *tp)
 198 {
 199   if (tp->charcount == tp->allocated)
 200     {
 201       tp->allocated *= 2;
 202       tp->chars = (char *) xrealloc (tp->chars, tp->allocated * sizeof (char));
 203     }
 204 }
 205
 206 /* Test whether a token has integer syntax.  */
 207 static inline bool
 208 is_integer (const char *p)
 209 {
 210   /* NB: Yes, '+.' and '-.' both designate the integer 0.  */
 211   const char *p_start = p;
 212
 213   if (*p == '+' || *p == '-')
 214     p++;
 215   if (*p == '\0')
 216     return false;
 217   while (*p >= '0' && *p <= '9')
 218     p++;
 219   if (p > p_start && *p == '.')
 220     p++;
 221   return (*p == '\0');
 222 }
 223
 224 /* Test whether a token has float syntax.  */
 225 static inline bool
 226 is_float (const char *p)
 227 {
 228   enum { LEAD_INT = 1, DOT_CHAR = 2, TRAIL_INT = 4, E_CHAR = 8, EXP_INT = 16 };
 229   int state;
 230
 231   state = 0;
 232   if (*p == '+' || *p == '-')
 233     p++;
 234   if (*p >= '0' && *p <= '9')
 235     {
 236       state |= LEAD_INT;
 237       do
 238         p++;
 239       while (*p >= '0' && *p <= '9');
 240     }
 241   if (*p == '.')
 242     {
 243       state |= DOT_CHAR;
 244       p++;
 245     }
 246   if (*p >= '0' && *p <= '9')
 247     {
 248       state |= TRAIL_INT;
 249       do
 250         p++;
 251       while (*p >= '0' && *p <= '9');
 252     }
 253   if (*p == 'e' || *p == 'E')
 254     {
 255       state |= E_CHAR;
 256       p++;
 257       if (*p == '+' || *p == '-')
 258         p++;
 259       if (*p >= '0' && *p <= '9')
 260         {
 261           state |= EXP_INT;
 262           do
 263             p++;
 264           while (*p >= '0' && *p <= '9');
 265         }
 266       else if (p[-1] == '+'
 267                && ((p[0] == 'I' && p[1] == 'N' && p[2] == 'F')
 268                    || (p[0] == 'N' && p[1] == 'a' && p[2] == 'N')))
 269         {
 270           state |= EXP_INT;
 271           p += 3;
 272         }
 273     }
 274   return (*p == '\0')
 275          && (state == (LEAD_INT | DOT_CHAR | TRAIL_INT)
 276              || state == (DOT_CHAR | TRAIL_INT)
 277              || state == (LEAD_INT | E_CHAR | EXP_INT)
 278              || state == (LEAD_INT | DOT_CHAR | TRAIL_INT | E_CHAR | EXP_INT)
 279              || state == (DOT_CHAR | TRAIL_INT | E_CHAR | EXP_INT));
 280 }
 281
 282 /* Read the next token.  'first' is the first character, which has already
 283    been read.  Returns true for a symbol, false for a number.  */
 284 static bool
 285 read_token (struct token *tp, int first)
 286 {
 287   int c;
 288   bool quoted = false;
 289
 290   init_token (tp);
 291
 292   c = first;
 293
 294   for (;; c = do_getc ())
 295     {
 296       if (c == EOF)
 297         break;
 298       if (c <= ' ') /* FIXME: Assumes ASCII compatible encoding */
 299         break;
 300       if (c == '\"' || c == '\'' || c == ';' || c == '(' || c == ')'
 301           || c == '[' || c == ']' || c == '#')
 302         break;
 303       if (c == '\\')
 304         {
 305           quoted = true;
 306           c = do_getc ();
 307           if (c == EOF)
 308             /* Invalid, but be tolerant.  */
 309             break;
 310         }
 311       grow_token (tp);
 312       tp->chars[tp->charcount++] = c;
 313     }
 314   if (c != EOF)
 315     do_ungetc (c);
 316
 317   if (quoted)
 318     return true; /* symbol */
 319
 320   /* Add a NUL byte at the end, for is_integer and is_float.  */
 321   grow_token (tp);
 322   tp->chars[tp->charcount] = '\0';
 323
 324   if (is_integer (tp->chars) || is_float (tp->chars))
 325     return false; /* number */
 326   else
 327     return true; /* symbol */
 328 }
 329
 330
 331 /* ========================= Accumulating comments ========================= */
 332
 333
 334 static char *buffer;
 335 static size_t bufmax;
 336 static size_t buflen;
 337
 338 static inline void
 339 comment_start ()
 340 {
 341   buflen = 0;
 342 }
 343
 344 static inline void
 345 comment_add (int c)
 346 {
 347   if (buflen >= bufmax)
 348     {
 349       bufmax = 2 * bufmax + 10;
 350       buffer = xrealloc (buffer, bufmax);
 351     }
 352   buffer[buflen++] = c;
 353 }
 354
 355 static inline void
 356 comment_line_end (size_t chars_to_remove)
 357 {
 358   buflen -= chars_to_remove;
 359   while (buflen >= 1
 360          && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
 361     --buflen;
 362   if (chars_to_remove == 0 && buflen >= bufmax)
 363     {
 364       bufmax = 2 * bufmax + 10;
 365       buffer = xrealloc (buffer, bufmax);
 366     }
 367   buffer[buflen] = '\0';
 368   xgettext_comment_add (buffer);
 369 }
 370
 371
 372 /* These are for tracking whether comments count as immediately before
 373    keyword.  */
 374 static int last_comment_line;
 375 static int last_non_comment_line;
 376
 377
 378 /* ========================= Accumulating messages ========================= */
 379
 380
 381 static message_list_ty *mlp;
 382
 383
 384 /* ============== Reading of objects.  See CLHS 2 "Syntax".  ============== */
 385
 386
 387 /* We are only interested in symbols (e.g. GETTEXT or NGETTEXT) and strings.
 388    Other objects need not to be represented precisely.  */
 389 enum object_type
 390 {
 391   t_symbol,     /* symbol */
 392   t_string,     /* string */
 393   t_other,      /* other kind of real object */
 394   t_dot,        /* '.' pseudo object */
 395   t_listclose,  /* ')' pseudo object */
 396   t_vectorclose,/* ']' pseudo object */
 397   t_eof         /* EOF marker */
 398 };
 399
 400 struct object
 401 {
 402   enum object_type type;
 403   struct token *token;          /* for t_symbol and t_string */
 404   int line_number_at_start;     /* for t_string */
 405 };
 406
 407 /* Free the memory pointed to by a 'struct object'.  */
 408 static inline void
 409 free_object (struct object *op)
 410 {
 411   if (op->type == t_symbol || op->type == t_string)
 412     {
 413       free_token (op->token);
 414       free (op->token);
 415     }
 416 }
 417
 418 /* Convert a t_symbol/t_string token to a char*.  */
 419 static char *
 420 string_of_object (const struct object *op)
 421 {
 422   char *str;
 423   int n;
 424
 425   if (!(op->type == t_symbol || op->type == t_string))
 426     abort ();
 427   n = op->token->charcount;
 428   str = (char *) xmalloc (n + 1);
 429   memcpy (str, op->token->chars, n);
 430   str[n] = '\0';
 431   return str;
 432 }
 433
 434 /* Context lookup table.  */
 435 static flag_context_list_table_ty *flag_context_list_table;
 436
 437 /* Returns the character represented by an escape sequence.  */
 438 #define IGNORABLE_ESCAPE (EOF - 1)
 439 static int
 440 do_getc_escaped (int c, bool in_string)
 441 {
 442   switch (c)
 443     {
 444     case 'a':
 445       return '\a';
 446     case 'b':
 447       return '\b';
 448     case 'd':
 449       return 0x7F;
 450     case 'e':
 451       return 0x1B;
 452     case 'f':
 453       return '\f';
 454     case 'n':
 455       return '\n';
 456     case 'r':
 457       return '\r';
 458     case 't':
 459       return '\t';
 460     case 'v':
 461       return '\v';
 462
 463     case '\n':
 464       return IGNORABLE_ESCAPE;
 465
 466     case ' ':
 467       return (in_string ? IGNORABLE_ESCAPE : ' ');
 468
 469     case 'M': /* meta */
 470       c = do_getc ();
 471       if (c == EOF)
 472         return EOF;
 473       if (c != '-')
 474         /* Invalid input.  But be tolerant.  */
 475         return c;
 476       c = do_getc ();
 477       if (c == EOF)
 478         return EOF;
 479       if (c == '\\')
 480         {
 481           c = do_getc ();
 482           if (c == EOF)
 483             return EOF;
 484           c = do_getc_escaped (c, false);
 485         }
 486       return c | 0x80;
 487
 488     case 'S': /* shift */
 489       c = do_getc ();
 490       if (c == EOF)
 491         return EOF;
 492       if (c != '-')
 493         /* Invalid input.  But be tolerant.  */
 494         return c;
 495       c = do_getc ();
 496       if (c == EOF)
 497         return EOF;
 498       if (c == '\\')
 499         {
 500           c = do_getc ();
 501           if (c == EOF)
 502             return EOF;
 503           c = do_getc_escaped (c, false);
 504         }
 505       return (c >= 'a' && c <= 'z' ? c - 'a' + 'A' : c);
 506
 507     case 'H': /* hyper */
 508     case 'A': /* alt */
 509     case 's': /* super */
 510       c = do_getc ();
 511       if (c == EOF)
 512         return EOF;
 513       if (c != '-')
 514         /* Invalid input.  But be tolerant.  */
 515         return c;
 516       c = do_getc ();
 517       if (c == EOF)
 518         return EOF;
 519       if (c == '\\')
 520         {
 521           c = do_getc ();
 522           if (c == EOF)
 523             return EOF;
 524           c = do_getc_escaped (c, false);
 525         }
 526       return c;
 527
 528     case 'C': /* ctrl */
 529       c = do_getc ();
 530       if (c == EOF)
 531         return EOF;
 532       if (c != '-')
 533         /* Invalid input.  But be tolerant.  */
 534         return c;
 535       /*FALLTHROUGH*/
 536     case '^':
 537       c = do_getc ();
 538       if (c == EOF)
 539         return EOF;
 540       if (c == '\\')
 541         {
 542           c = do_getc ();
 543           if (c == EOF)
 544             return EOF;
 545           c = do_getc_escaped (c, false);
 546         }
 547       if (c == '?')
 548         return 0x7F;
 549       if ((c & 0x5F) >= 0x41 && (c & 0x5F) <= 0x5A)
 550         return c & 0x9F;
 551       if ((c & 0x7F) >= 0x40 && (c & 0x7F) <= 0x5F)
 552         return c & 0x9F;
 553 #if 0 /* We cannot handle NUL bytes in strings.  */
 554       if (c == ' ')
 555         return 0x00;
 556 #endif
 557       return c;
 558
 559     case '0': case '1': case '2': case '3': case '4':
 560     case '5': case '6': case '7':
 561       /* An octal escape, as in ANSI C.  */
 562       {
 563         int n = c - '0';
 564
 565         c = do_getc ();
 566         if (c != EOF)
 567           {
 568             if (c >= '0' && c <= '7')
 569               {
 570                 n = (n << 3) + (c - '0');
 571                 c = do_getc ();
 572                 if (c != EOF)
 573                   {
 574                     if (c >= '0' && c <= '7')
 575                       n = (n << 3) + (c - '0');
 576                     else
 577                       do_ungetc (c);
 578                   }
 579               }
 580             else
 581               do_ungetc (c);
 582           }
 583         return (unsigned char) n;
 584       }
 585
 586     case 'x':
 587       /* A hexadecimal escape, as in ANSI C.  */
 588       {
 589         int n = 0;
 590
 591         for (;;)
 592           {
 593             c = do_getc ();
 594             if (c == EOF)
 595               break;
 596             else if (c >= '0' && c <= '9')
 597               n = (n << 4) + (c - '0');
 598             else if (c >= 'A' && c <= 'F')
 599               n = (n << 4) + (c - 'A' + 10);
 600             else if (c >= 'a' && c <= 'f')
 601               n = (n << 4) + (c - 'a' + 10);
 602             else
 603               {
 604                 do_ungetc (c);
 605                 break;
 606               }
 607           }
 608         return (unsigned char) n;
 609       }
 610
 611     default:
 612       /* Ignore Emacs multibyte character stuff.  All the strings we are
 613          interested in are ASCII strings.  */
 614       return c;
 615     }
 616 }
 617
 618 /* Read the next object.
 619    'first_in_list' and 'new_backquote_flag' are used for reading old
 620    backquote syntax and new backquote syntax.  */
 621 static void
 622 read_object (struct object *op, bool first_in_list, bool new_backquote_flag,
 623              flag_context_ty outer_context)
 624 {
 625   for (;;)
 626     {
 627       int c;
 628
 629       c = do_getc ();
 630
 631       switch (c)
 632         {
 633         case EOF:
 634           op->type = t_eof;
 635           return;
 636
 637         case '\n':
 638           /* Comments assumed to be grouped with a message must immediately
 639              precede it, with no non-whitespace token on a line between
 640              both.  */
 641           if (last_non_comment_line > last_comment_line)
 642             xgettext_comment_reset ();
 643           continue;
 644
 645         case '(':
 646           {
 647             int arg = 0;                /* Current argument number.  */
 648             flag_context_list_iterator_ty context_iter;
 649             int argnum1 = 0;            /* First string position.  */
 650             int argnum2 = 0;            /* Plural string position.  */
 651             message_ty *plural_mp = NULL;       /* Remember the msgid.  */
 652
 653             for (;; arg++)
 654               {
 655                 struct object inner;
 656                 flag_context_ty inner_context;
 657
 658                 if (arg == 0)
 659                   inner_context = null_context;
 660                 else
 661                   inner_context =
 662                     inherited_context (outer_context,
 663                                        flag_context_list_iterator_advance (
 664                                          &context_iter));
 665
 666                 read_object (&inner, arg == 0, new_backquote_flag,
 667                              inner_context);
 668
 669                 /* Recognize end of list.  */
 670                 if (inner.type == t_listclose)
 671                   {
 672                     op->type = t_other;
 673                     /* Don't bother converting "()" to "NIL".  */
 674                     last_non_comment_line = line_number;
 675                     return;
 676                   }
 677
 678                 /* Dots are not allowed in every position. ']' is not allowed.
 679                    But be tolerant.  */
 680
 681                 /* EOF inside list is illegal.  But be tolerant.  */
 682                 if (inner.type == t_eof)
 683                   break;
 684
 685                 if (arg == 0)
 686                   {
 687                     /* This is the function position.  */
 688                     if (inner.type == t_symbol)
 689                       {
 690                         char *symbol_name = string_of_object (&inner);
 691                         void *keyword_value;
 692
 693                         if (find_entry (&keywords,
 694                                         symbol_name, strlen (symbol_name),
 695                                         &keyword_value)
 696                             == 0)
 697                           {
 698                             argnum1 = (int) (long) keyword_value & ((1 << 10) - 1);
 699                             argnum2 = (int) (long) keyword_value >> 10;
 700                           }
 701
 702                         context_iter =
 703                           flag_context_list_iterator (
 704                             flag_context_list_table_lookup (
 705                               flag_context_list_table,
 706                               symbol_name, strlen (symbol_name)));
 707
 708                         free (symbol_name);
 709                       }
 710                     else
 711                       context_iter = null_context_list_iterator;
 712                   }
 713                 else
 714                   {
 715                     /* These are the argument positions.
 716                        Extract a string if we have reached the right
 717                        argument position.  */
 718                     if (arg == argnum1)
 719                       {
 720                         if (inner.type == t_string)
 721                           {
 722                             lex_pos_ty pos;
 723                             message_ty *mp;
 724
 725                             pos.file_name = logical_file_name;
 726                             pos.line_number = inner.line_number_at_start;
 727                             mp = remember_a_message (mlp, string_of_object (&inner),
 728                                                      inner_context, &pos);
 729                             if (argnum2 > 0)
 730                               plural_mp = mp;
 731                           }
 732                       }
 733                     else if (arg == argnum2)
 734                       {
 735                         if (inner.type == t_string && plural_mp != NULL)
 736                           {
 737                             lex_pos_ty pos;
 738
 739                             pos.file_name = logical_file_name;
 740                             pos.line_number = inner.line_number_at_start;
 741                             remember_a_message_plural (plural_mp, string_of_object (&inner),
 742                                                        inner_context, &pos);
 743                           }
 744                       }
 745                   }
 746
 747                 free_object (&inner);
 748               }
 749           }
 750           op->type = t_other;
 751           last_non_comment_line = line_number;
 752           return;
 753
 754         case ')':
 755           /* Tell the caller about the end of list.
 756              Unmatched closing parenthesis is illegal.  But be tolerant.  */
 757           op->type = t_listclose;
 758           last_non_comment_line = line_number;
 759           return;
 760
 761         case '[':
 762           {
 763             for (;;)
 764               {
 765                 struct object inner;
 766
 767                 read_object (&inner, false, new_backquote_flag, null_context);
 768
 769                 /* Recognize end of vector.  */
 770                 if (inner.type == t_vectorclose)
 771                   {
 772                     op->type = t_other;
 773                     last_non_comment_line = line_number;
 774                     return;
 775                   }
 776
 777                 /* Dots and ')' are not allowed.  But be tolerant.  */
 778
 779                 /* EOF inside vector is illegal.  But be tolerant.  */
 780                 if (inner.type == t_eof)
 781                   break;
 782
 783                 free_object (&inner);
 784               }
 785           }
 786           op->type = t_other;
 787           last_non_comment_line = line_number;
 788           return;
 789
 790         case ']':
 791           /* Tell the caller about the end of vector.
 792              Unmatched closing bracket is illegal.  But be tolerant.  */
 793           op->type = t_vectorclose;
 794           last_non_comment_line = line_number;
 795           return;
 796
 797         case '\'':
 798           {
 799             struct object inner;
 800
 801             read_object (&inner, false, new_backquote_flag, null_context);
 802
 803             /* Dots and EOF are not allowed here.  But be tolerant.  */
 804
 805             free_object (&inner);
 806
 807             op->type = t_other;
 808             last_non_comment_line = line_number;
 809             return;
 810           }
 811
 812         case '`':
 813           if (first_in_list)
 814             goto default_label;
 815           {
 816             struct object inner;
 817
 818             read_object (&inner, false, true, null_context);
 819
 820             /* Dots and EOF are not allowed here.  But be tolerant.  */
 821
 822             free_object (&inner);
 823
 824             op->type = t_other;
 825             last_non_comment_line = line_number;
 826             return;
 827           }
 828
 829         case ',':
 830           if (!new_backquote_flag)
 831             goto default_label;
 832           {
 833             int c = do_getc ();
 834             /* The ,@ handling inside lists is wrong anyway, because
 835                ,@form expands to an unknown number of elements.  */
 836             if (c != EOF && c != '@' && c != '.')
 837               do_ungetc (c);
 838           }
 839           {
 840             struct object inner;
 841
 842             read_object (&inner, false, false, null_context);
 843
 844             /* Dots and EOF are not allowed here.  But be tolerant.  */
 845
 846             free_object (&inner);
 847
 848             op->type = t_other;
 849             last_non_comment_line = line_number;
 850             return;
 851           }
 852
 853         case ';':
 854           {
 855             bool all_semicolons = true;
 856
 857             last_comment_line = line_number;
 858             comment_start ();
 859             for (;;)
 860               {
 861                 int c = do_getc ();
 862                 if (c == EOF || c == '\n')
 863                   break;
 864                 if (c != ';')
 865                   all_semicolons = false;
 866                 if (!all_semicolons)
 867                   {
 868                     /* We skip all leading white space, but not EOLs.  */
 869                     if (!(buflen == 0 && (c == ' ' || c == '\t')))
 870                       comment_add (c);
 871                   }
 872               }
 873             comment_line_end (0);
 874             continue;
 875           }
 876
 877         case '"':
 878           {
 879             op->token = (struct token *) xmalloc (sizeof (struct token));
 880             init_token (op->token);
 881             op->line_number_at_start = line_number;
 882             for (;;)
 883               {
 884                 int c = do_getc ();
 885                 if (c == EOF)
 886                   /* Invalid input.  Be tolerant, no error message.  */
 887                   break;
 888                 if (c == '"')
 889                   break;
 890                 if (c == '\\')
 891                   {
 892                     c = do_getc ();
 893                     if (c == EOF)
 894                       /* Invalid input.  Be tolerant, no error message.  */
 895                       break;
 896                     c = do_getc_escaped (c, true);
 897                     if (c == EOF)
 898                       /* Invalid input.  Be tolerant, no error message.  */
 899                       break;
 900                     if (c == IGNORABLE_ESCAPE)
 901                       /* Ignore escaped newline and escaped space.  */
 902                       ;
 903                     else
 904                       {
 905                         grow_token (op->token);
 906                         op->token->chars[op->token->charcount++] = c;
 907                       }
 908                   }
 909                 else
 910                   {
 911                     grow_token (op->token);
 912                     op->token->chars[op->token->charcount++] = c;
 913                   }
 914               }
 915             op->type = t_string;
 916
 917             if (extract_all)
 918               {
 919                 lex_pos_ty pos;
 920
 921                 pos.file_name = logical_file_name;
 922                 pos.line_number = op->line_number_at_start;
 923                 remember_a_message (mlp, string_of_object (op),
 924                                     null_context, &pos);
 925               }
 926             last_non_comment_line = line_number;
 927             return;
 928           }
 929
 930         case '?':
 931           c = do_getc ();
 932           if (c == EOF)
 933             /* Invalid input.  Be tolerant, no error message.  */
 934             ;
 935           else if (c == '\\')
 936             {
 937               c = do_getc ();
 938               if (c == EOF)
 939                 /* Invalid input.  Be tolerant, no error message.  */
 940                 ;
 941               else
 942                 {
 943                   c = do_getc_escaped (c, false);
 944                   if (c == EOF)
 945                     /* Invalid input.  Be tolerant, no error message.  */
 946                     ;
 947                 }
 948             }
 949           /* Impossible to deal with Emacs multibyte character stuff here.  */
 950           op->type = t_other;
 951           last_non_comment_line = line_number;
 952           return;
 953
 954         case '#':
 955           /* Dispatch macro handling.  */
 956           c = do_getc ();
 957           if (c == EOF)
 958             /* Invalid input.  Be tolerant, no error message.  */
 959             {
 960               op->type = t_other;
 961               return;
 962             }
 963
 964           switch (c)
 965             {
 966             case '^':
 967               c = do_getc ();
 968               if (c == '^')
 969                 c = do_getc ();
 970               if (c == '[')
 971                 {
 972                   /* Read a char table, same syntax as a vector.  */
 973                   for (;;)
 974                     {
 975                       struct object inner;
 976
 977                       read_object (&inner, false, new_backquote_flag,
 978                                    null_context);
 979
 980                       /* Recognize end of vector.  */
 981                       if (inner.type == t_vectorclose)
 982                         {
 983                           op->type = t_other;
 984                           last_non_comment_line = line_number;
 985                           return;
 986                         }
 987
 988                       /* Dots and ')' are not allowed.  But be tolerant.  */
 989
 990                       /* EOF inside vector is illegal.  But be tolerant.  */
 991                       if (inner.type == t_eof)
 992                         break;
 993
 994                       free_object (&inner);
 995                     }
 996                   op->type = t_other;
 997                   last_non_comment_line = line_number;
 998                   return;
 999                 }
1000               else
1001                 /* Invalid input.  Be tolerant, no error message.  */
1002                 {
1003                   op->type = t_other;
1004                   if (c != EOF)
1005                     last_non_comment_line = line_number;
1006                   return;
1007                 }
1008
1009             case '&':
1010               /* Read a bit vector.  */
1011               {
1012                 struct object length;
1013                 read_object (&length, first_in_list, new_backquote_flag,
1014                              null_context);
1015                 /* Dots and EOF are not allowed here.
1016                    But be tolerant.  */
1017                 free_object (&length);
1018               }
1019               c = do_getc ();
1020               if (c == '"')
1021                 {
1022                   struct object string;
1023                   read_object (&string, first_in_list, new_backquote_flag,
1024                                null_context);
1025                   free_object (&string);
1026                 }
1027               else
1028                 /* Invalid input.  Be tolerant, no error message.  */
1029                 do_ungetc (c);
1030               op->type = t_other;
1031               last_non_comment_line = line_number;
1032               return;
1033
1034             case '[':
1035               /* Read a compiled function, same syntax as a vector.  */
1036             case '(':
1037               /* Read a string with properties, same syntax as a list.  */
1038               {
1039                 struct object inner;
1040                 do_ungetc (c);
1041                 read_object (&inner, false, new_backquote_flag, null_context);
1042                 /* Dots and EOF are not allowed here.
1043                    But be tolerant.  */
1044                 free_object (&inner);
1045                 op->type = t_other;
1046                 last_non_comment_line = line_number;
1047                 return;
1048               }
1049
1050             case '@':
1051               /* Read a comment consisting of a given number of bytes.  */
1052               {
1053                 unsigned int nskip = 0;
1054
1055                 for (;;)
1056                   {
1057                     c = do_getc ();
1058                     if (!(c >= '0' && c <= '9'))
1059                       break;
1060                     nskip = 10 * nskip + (c - '0');
1061                   }
1062                 if (c != EOF)
1063                   {
1064                     do_ungetc (c);
1065                     for (; nskip > 0; nskip--)
1066                       if (do_getc () == EOF)
1067                         break;
1068                   }
1069                 continue;
1070               }
1071
1072             case '$':
1073               op->type = t_other;
1074               last_non_comment_line = line_number;
1075               return;
1076
1077             case '\'':
1078             case ':':
1079             case 'S': case 's': /* XEmacs only */
1080               {
1081                 struct object inner;
1082                 read_object (&inner, false, new_backquote_flag, null_context);
1083                 /* Dots and EOF are not allowed here.
1084                    But be tolerant.  */
1085                 free_object (&inner);
1086                 op->type = t_other;
1087                 last_non_comment_line = line_number;
1088                 return;
1089               }
1090
1091             case '0': case '1': case '2': case '3': case '4':
1092             case '5': case '6': case '7': case '8': case '9':
1093               /* Read Common Lisp style #n# or #n=.  */
1094               for (;;)
1095                 {
1096                   c = do_getc ();
1097                   if (!(c >= '0' && c <= '9'))
1098                     break;
1099                 }
1100               if (c == EOF)
1101                 /* Invalid input.  Be tolerant, no error message.  */
1102                 {
1103                   op->type = t_other;
1104                   return;
1105                 }
1106               if (c == '=')
1107                 {
1108                   read_object (op, false, new_backquote_flag, outer_context);
1109                   last_non_comment_line = line_number;
1110                   return;
1111                 }
1112               if (c == '#')
1113                 {
1114                   op->type = t_other;
1115                   last_non_comment_line = line_number;
1116                   return;
1117                 }
1118               if (c == 'R' || c == 'r')
1119                 {
1120                   /* Read an integer.  */
1121                   c = do_getc ();
1122                   if (c == '+' || c == '-')
1123                     c = do_getc ();
1124                   for (; c != EOF; c = do_getc ())
1125                     if (!c_isalnum (c))
1126                       {
1127                         do_ungetc (c);
1128                         break;
1129                       }
1130                   op->type = t_other;
1131                   last_non_comment_line = line_number;
1132                   return;
1133                 }
1134               /* Invalid input.  Be tolerant, no error message.  */
1135               op->type = t_other;
1136               last_non_comment_line = line_number;
1137               return;
1138
1139             case 'X': case 'x':
1140             case 'O': case 'o':
1141             case 'B': case 'b':
1142               {
1143                 /* Read an integer.  */
1144                 c = do_getc ();
1145                 if (c == '+' || c == '-')
1146                   c = do_getc ();
1147                 for (; c != EOF; c = do_getc ())
1148                   if (!c_isalnum (c))
1149                     {
1150                       do_ungetc (c);
1151                       break;
1152                     }
1153                 op->type = t_other;
1154                 last_non_comment_line = line_number;
1155                 return;
1156               }
1157
1158             case '*': /* XEmacs only */
1159               {
1160                 /* Read a bit-vector.  */
1161                 do
1162                   c = do_getc ();
1163                 while (c == '0' || c == '1');
1164                 if (c != EOF)
1165                   do_ungetc (c);
1166                 op->type = t_other;
1167                 last_non_comment_line = line_number;
1168                 return;
1169               }
1170
1171             case '+': /* XEmacs only */
1172             case '-': /* XEmacs only */
1173               /* Simply assume every feature expression is true.  */
1174               {
1175                 struct object inner;
1176                 read_object (&inner, false, new_backquote_flag, null_context);
1177                 /* Dots and EOF are not allowed here.
1178                    But be tolerant.  */
1179                 free_object (&inner);
1180                 continue;
1181               }
1182
1183             default:
1184               /* Invalid input.  Be tolerant, no error message.  */
1185               op->type = t_other;
1186               last_non_comment_line = line_number;
1187               return;
1188             }
1189
1190           /*NOTREACHED*/
1191           abort ();
1192
1193         case '.':
1194           c = do_getc ();
1195           if (c != EOF)
1196             {
1197               do_ungetc (c);
1198               if (c <= ' ' /* FIXME: Assumes ASCII compatible encoding */
1199                   || strchr ("\"'`,(", c) != NULL)
1200                 {
1201                   op->type = t_dot;
1202                   last_non_comment_line = line_number;
1203                   return;
1204                 }
1205             }
1206           c = '.';
1207           /*FALLTHROUGH*/
1208         default:
1209         default_label:
1210           if (c <= ' ') /* FIXME: Assumes ASCII compatible encoding */
1211             continue;
1212           /* Read a token.  */
1213           {
1214             bool symbol;
1215
1216             op->token = (struct token *) xmalloc (sizeof (struct token));
1217             symbol = read_token (op->token, c);
1218             if (symbol)
1219               {
1220                 op->type = t_symbol;
1221                 last_non_comment_line = line_number;
1222                 return;
1223               }
1224             else
1225               {
1226                 free_token (op->token);
1227                 free (op->token);
1228                 op->type = t_other;
1229                 last_non_comment_line = line_number;
1230                 return;
1231               }
1232           }
1233         }
1234     }
1235 }
1236
1237
1238 void
1239 extract_elisp (FILE *f,
1240                const char *real_filename, const char *logical_filename,
1241                flag_context_list_table_ty *flag_table,
1242                msgdomain_list_ty *mdlp)
1243 {
1244   mlp = mdlp->item[0]->messages;
1245
1246   fp = f;
1247   real_file_name = real_filename;
1248   logical_file_name = xstrdup (logical_filename);
1249   line_number = 1;
1250
1251   last_comment_line = -1;
1252   last_non_comment_line = -1;
1253
1254   flag_context_list_table = flag_table;
1255
1256   init_keywords ();
1257
1258   /* Eat tokens until eof is seen.  When read_object returns
1259      due to an unbalanced closing parenthesis, just restart it.  */
1260   do
1261     {
1262       struct object toplevel_object;
1263
1264       read_object (&toplevel_object, false, false, null_context);
1265
1266       if (toplevel_object.type == t_eof)
1267         break;
1268
1269       free_object (&toplevel_object);
1270     }
1271   while (!feof (fp));
1272
1273   /* Close scanner.  */
1274   fp = NULL;
1275   real_file_name = NULL;
1276   logical_file_name = NULL;
1277   line_number = 0;
1278 }