gnu/dist/gettext/gettext-tools/src/x-c.c

   1 /* xgettext C/C++/ObjectiveC backend.
   2    Copyright (C) 1995-1998, 2000-2004 Free Software Foundation, Inc.
   3
   4    This file was written by Peter Miller <millerp@canb.auug.org.au>
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with this program; if not, write to the Free Software Foundation,
  18    Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  19
  20 #ifdef HAVE_CONFIG_H
  21 # include "config.h"
  22 #endif
  23
  24 #include <errno.h>
  25 #include <stdbool.h>
  26 #include <stdio.h>
  27 #include <stdlib.h>
  28 #include <string.h>
  29
  30 #include "message.h"
  31 #include "xgettext.h"
  32 #include "x-c.h"
  33 #include "error.h"
  34 #include "error-progname.h"
  35 #include "xalloc.h"
  36 #include "exit.h"
  37 #include "hash.h"
  38 #include "gettext.h"
  39
  40 #define _(s) gettext(s)
  41
  42 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
  43
  44
  45 /* The ANSI C standard defines several phases of translation:
  46
  47    1. Terminate line by \n, regardless of the external representation
  48       of a text line.  Stdio does this for us.
  49
  50    2. Convert trigraphs to their single character equivalents.
  51
  52    3. Concatenate each line ending in backslash (\) with the following
  53       line.
  54
  55    4. Replace each comment with a space character.
  56
   57    5. Parse each resulting logical line as preprocessing tokens and
   58       white space.
  59
  60    6. Recognize and carry out directives (it also expands macros on
  61       non-directive lines, which we do not do here).
  62
  63    7. Replaces escape sequences within character strings with their
  64       single character equivalents (we do this in step 5, because we
  65       don't have to worry about the #include argument).
  66
  67    8. Concatenates adjacent string literals to form single string
  68       literals (because we don't expand macros, there are a few things
  69       we will miss).
  70
  71    9. Converts the remaining preprocessing tokens to C tokens and
  72       discards any white space from the translation unit.
  73
  74    This lexer implements the above, and presents the scanner (in
  75    xgettext.c) with a stream of C tokens.  The comments are
  76    accumulated in a buffer, and given to xgettext when asked for.  */
  77
  78
  79 /* ========================= Lexer customization.  ========================= */
  80
  81 static bool trigraphs = false;
  82
  83 void
  84 x_c_trigraphs ()
  85 {
  86   trigraphs = true;
  87 }
  88
  89
  90 /* ====================== Keyword set customization.  ====================== */
  91
  92 /* If true extract all strings.  */
  93 static bool extract_all = false;
  94
  95 static hash_table c_keywords;
  96 static hash_table objc_keywords;
  97 static bool default_keywords = true;
  98
  99
 100 void
 101 x_c_extract_all ()
 102 {
 103   extract_all = true;
 104 }
 105
 106
 107 static void
 108 add_keyword (const char *name, hash_table *keywords)
 109 {
 110   if (name == NULL)
 111     default_keywords = false;
 112   else
 113     {
 114       const char *end;
 115       int argnum1;
 116       int argnum2;
 117       const char *colon;
 118
 119       if (keywords->table == NULL)
 120         init_hash (keywords, 100);
 121
 122       split_keywordspec (name, &end, &argnum1, &argnum2);
 123
 124       /* The characters between name and end should form a valid C identifier.
 125          A colon means an invalid parse in split_keywordspec().  */
 126       colon = strchr (name, ':');
 127       if (colon == NULL || colon >= end)
 128         {
 129           if (argnum1 == 0)
 130             argnum1 = 1;
 131           insert_entry (keywords, name, end - name,
 132                         (void *) (long) (argnum1 + (argnum2 << 10)));
 133         }
 134     }
 135 }
 136
 137 void
 138 x_c_keyword (const char *name)
 139 {
 140   add_keyword (name, &c_keywords);
 141 }
 142
 143 void
 144 x_objc_keyword (const char *name)
 145 {
 146   add_keyword (name, &objc_keywords);
 147 }
 148
 149 /* Finish initializing the keywords hash tables.
 150    Called after argument processing, before each file is processed.  */
 151 static void
 152 init_keywords ()
 153 {
 154   if (default_keywords)
 155     {
 156       x_c_keyword ("gettext");
 157       x_c_keyword ("dgettext:2");
 158       x_c_keyword ("dcgettext:2");
 159       x_c_keyword ("ngettext:1,2");
 160       x_c_keyword ("dngettext:2,3");
 161       x_c_keyword ("dcngettext:2,3");
 162       x_c_keyword ("gettext_noop");
 163
 164       x_objc_keyword ("gettext");
 165       x_objc_keyword ("dgettext:2");
 166       x_objc_keyword ("dcgettext:2");
 167       x_objc_keyword ("ngettext:1,2");
 168       x_objc_keyword ("dngettext:2,3");
 169       x_objc_keyword ("dcngettext:2,3");
 170       x_objc_keyword ("gettext_noop");
 171       x_objc_keyword ("NSLocalizedString");       /* similar to gettext */
 172       x_objc_keyword ("_");                       /* similar to gettext */
 173       x_objc_keyword ("NSLocalizedStaticString"); /* similar to gettext_noop */
 174       x_objc_keyword ("__");                      /* similar to gettext_noop */
 175
 176       default_keywords = false;
 177     }
 178 }
 179
 180 void
 181 init_flag_table_c ()
 182 {
 183   xgettext_record_flag ("gettext:1:pass-c-format");
 184   xgettext_record_flag ("dgettext:2:pass-c-format");
 185   xgettext_record_flag ("dcgettext:2:pass-c-format");
 186   xgettext_record_flag ("ngettext:1:pass-c-format");
 187   xgettext_record_flag ("ngettext:2:pass-c-format");
 188   xgettext_record_flag ("dngettext:2:pass-c-format");
 189   xgettext_record_flag ("dngettext:3:pass-c-format");
 190   xgettext_record_flag ("dcngettext:2:pass-c-format");
 191   xgettext_record_flag ("dcngettext:3:pass-c-format");
 192   xgettext_record_flag ("gettext_noop:1:pass-c-format");
 193   /* <stdio.h> */
 194   xgettext_record_flag ("fprintf:2:c-format");
 195   xgettext_record_flag ("vfprintf:2:c-format");
 196   xgettext_record_flag ("printf:1:c-format");
 197   xgettext_record_flag ("vprintf:1:c-format");
 198   xgettext_record_flag ("sprintf:2:c-format");
 199   xgettext_record_flag ("vsprintf:2:c-format");
 200   xgettext_record_flag ("snprintf:3:c-format");
 201   xgettext_record_flag ("vsnprintf:3:c-format");
 202 #if 0 /* These functions are not standard.  */
 203   /* <stdio.h> */
 204   xgettext_record_flag ("asprintf:2:c-format");
 205   xgettext_record_flag ("vasprintf:2:c-format");
 206   xgettext_record_flag ("dprintf:2:c-format");
 207   xgettext_record_flag ("vdprintf:2:c-format");
 208   xgettext_record_flag ("obstack_printf:2:c-format");
 209   xgettext_record_flag ("obstack_vprintf:2:c-format");
 210   /* <error.h> */
 211   xgettext_record_flag ("error:3:c-format");
 212   xgettext_record_flag ("error_at_line:5:c-format");
 213   /* <argp.h> */
 214   xgettext_record_flag ("argp_error:2:c-format");
 215   xgettext_record_flag ("argp_failure:2:c-format");
 216 #endif
 217 }
 218
 219 void
 220 init_flag_table_objc ()
 221 {
 222   /* Since the settings done in init_flag_table_c() also have an effect for
 223      the ObjectiveC parser, we don't have to repeat them here.  */
 224   xgettext_record_flag ("gettext:1:pass-objc-format");
 225   xgettext_record_flag ("dgettext:2:pass-objc-format");
 226   xgettext_record_flag ("dcgettext:2:pass-objc-format");
 227   xgettext_record_flag ("ngettext:1:pass-objc-format");
 228   xgettext_record_flag ("ngettext:2:pass-objc-format");
 229   xgettext_record_flag ("dngettext:2:pass-objc-format");
 230   xgettext_record_flag ("dngettext:3:pass-objc-format");
 231   xgettext_record_flag ("dcngettext:2:pass-objc-format");
 232   xgettext_record_flag ("dcngettext:3:pass-objc-format");
 233   xgettext_record_flag ("gettext_noop:1:pass-objc-format");
 234   xgettext_record_flag ("NSLocalizedString:1:pass-c-format");
 235   xgettext_record_flag ("NSLocalizedString:1:pass-objc-format");
 236   xgettext_record_flag ("_:1:pass-c-format");
 237   xgettext_record_flag ("_:1:pass-objc-format");
 238   xgettext_record_flag ("stringWithFormat::1:objc-format");
 239   xgettext_record_flag ("initWithFormat::1:objc-format");
 240   xgettext_record_flag ("stringByAppendingFormat::1:objc-format");
 241   xgettext_record_flag ("localizedStringWithFormat::1:objc-format");
 242   xgettext_record_flag ("appendFormat::1:objc-format");
 243 }
 244
 245 void
 246 init_flag_table_gcc_internal ()
 247 {
 248   xgettext_record_flag ("gettext:1:pass-gcc-internal-format");
 249   xgettext_record_flag ("dgettext:2:pass-gcc-internal-format");
 250   xgettext_record_flag ("dcgettext:2:pass-gcc-internal-format");
 251   xgettext_record_flag ("ngettext:1:pass-gcc-internal-format");
 252   xgettext_record_flag ("ngettext:2:pass-gcc-internal-format");
 253   xgettext_record_flag ("dngettext:2:pass-gcc-internal-format");
 254   xgettext_record_flag ("dngettext:3:pass-gcc-internal-format");
 255   xgettext_record_flag ("dcngettext:2:pass-gcc-internal-format");
 256   xgettext_record_flag ("dcngettext:3:pass-gcc-internal-format");
 257   xgettext_record_flag ("gettext_noop:1:pass-gcc-internal-format");
 258 #if 0 /* This should better be done inside GCC.  */
 259   /* grepping for ATTRIBUTE_PRINTF in gcc-3.3/gcc/?*.h */
 260   /* c-format.c */
 261   xgettext_record_flag ("status_warning:2:gcc-internal-format");
 262   /* c-tree.h */
 263   xgettext_record_flag ("pedwarn_c99:1:pass-gcc-internal-format");
 264   /* collect2.h */
 265   //xgettext_record_flag ("error:1:c-format"); // 3 different versions
 266   xgettext_record_flag ("notice:1:c-format");
 267   //xgettext_record_flag ("fatal:1:c-format"); // 2 different versions
 268   xgettext_record_flag ("fatal_perror:1:c-format");
 269   /* cpplib.h */
 270   xgettext_record_flag ("cpp_error:3:c-format");
 271   xgettext_record_flag ("cpp_error_with_line:5:c-format");
 272   /* diagnostic.h */
 273   xgettext_record_flag ("diagnostic_set_info:2:pass-gcc-internal-format");
 274   xgettext_record_flag ("output_printf:2:gcc-internal-format");
 275   xgettext_record_flag ("output_verbatim:2:pass-gcc-internal-format");
 276   xgettext_record_flag ("verbatim:1:gcc-internal-format");
 277   xgettext_record_flag ("inform:1:pass-gcc-internal-format");
 278   /* gcc.h */
 279   //xgettext_record_flag ("fatal:1:c-format"); // 2 different versions
 280   //xgettext_record_flag ("error:1:c-format"); // 3 different versions
 281   /* genattrtab.h */
 282   xgettext_record_flag ("attr_printf:2:pass-c-format");
 283   /* gengtype.h */
 284   xgettext_record_flag ("error_at_line:2:pass-c-format");
 285   xgettext_record_flag ("xvasprintf:2:pass-c-format");
 286   xgettext_record_flag ("xasprintf:1:pass-c-format");
 287   xgettext_record_flag ("oprintf:2:pass-c-format");
 288   /* gensupport.h */
 289   xgettext_record_flag ("message_with_line:2:pass-c-format");
 290   /* output.h */
 291   xgettext_record_flag ("output_operand_lossage:1:c-format");
 292   /* ra.h */
 293    xgettext_record_flag ("ra_debug_msg:2:pass-c-format");
 294   /* toplev.h */
 295   xgettext_record_flag ("fnotice:2:c-format");
 296   xgettext_record_flag ("fatal_io_error:2:gcc-internal-format");
 297   xgettext_record_flag ("error_for_asm:2:pass-gcc-internal-format");
 298   xgettext_record_flag ("warning_for_asm:2:pass-gcc-internal-format");
 299   xgettext_record_flag ("error_with_file_and_line:3:pass-gcc-internal-format");
 300   xgettext_record_flag ("error_with_decl:2:pass-gcc-internal-format");
 301   xgettext_record_flag ("pedwarn:1:gcc-internal-format");
 302   xgettext_record_flag ("pedwarn_with_file_and_line:3:gcc-internal-format");
 303   xgettext_record_flag ("pedwarn_with_decl:2:gcc-internal-format");
 304   xgettext_record_flag ("sorry:1:gcc-internal-format");
 305   xgettext_record_flag ("error:1:pass-gcc-internal-format");
 306   xgettext_record_flag ("fatal_error:1:pass-gcc-internal-format");
 307   xgettext_record_flag ("internal_error:1:pass-gcc-internal-format");
 308   xgettext_record_flag ("warning:1:pass-gcc-internal-format");
 309   xgettext_record_flag ("warning_with_file_and_line:3:pass-gcc-internal-format");
 310   xgettext_record_flag ("warning_with_decl:2:pass-gcc-internal-format");
 311   /* f/com.h */
 312   xgettext_record_flag ("ffecom_get_invented_identifier:1:pass-c-format");
 313   /* f/sts.h */
 314   xgettext_record_flag ("ffests_printf:2:pass-c-format");
 315   /* java/java-tree.h */
 316   xgettext_record_flag ("parse_error_context:2:pass-c-format");
 317 #endif
 318 }
 319
 320
 321 /* ======================== Reading of characters.  ======================== */
 322
 323 /* Real filename, used in error messages about the input file.  */
 324 static const char *real_file_name;
 325
 326 /* Logical filename and line number, used to label the extracted messages.  */
 327 static char *logical_file_name;
 328 static int line_number;
 329
 330 /* The input file stream.  */
 331 static FILE *fp;
 332
 333
 334 /* 0. Terminate line by \n, regardless whether the external representation of
 335    a line terminator is LF (Unix), CR (Mac) or CR/LF (DOS/Windows).
 336    It is debatable whether supporting CR/LF line terminators in C sources
 337    on Unix is ISO C or POSIX compliant, but since GCC 3.3 now supports it
 338    unconditionally, it must be OK.
 339    The so-called "text mode" in stdio on DOS/Windows translates CR/LF to \n
 340    automatically, but here we also need this conversion on Unix.  As a side
 341    effect, on DOS/Windows we also parse CR/CR/LF into a single \n, but this
 342    is not a problem.  */
 343
 344
 345 static int
 346 phase0_getc ()
 347 {
 348   int c;
 349
 350   c = getc (fp);
 351   if (c == EOF)
 352     {
 353       if (ferror (fp))
 354         error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
 355                real_file_name);
 356       return EOF;
 357     }
 358
 359   if (c == '\r')
 360     {
 361       int c1 = getc (fp);
 362
 363       if (c1 != EOF && c1 != '\n')
 364         ungetc (c1, fp);
 365
 366       /* Seen line terminator CR or CR/LF.  */
 367       return '\n';
 368     }
 369
 370   return c;
 371 }
 372
 373
 374 /* Supports only one pushback character, and not '\n'.  */
 375 static inline void
 376 phase0_ungetc (int c)
 377 {
 378   if (c != EOF)
 379     ungetc (c, fp);
 380 }
 381
 382
 383 /* 1. line_number handling.  Combine backslash-newline to nothing.  */
 384
 385 static unsigned char phase1_pushback[2];
 386 static int phase1_pushback_length;
 387
 388
 389 static int
 390 phase1_getc ()
 391 {
 392   int c;
 393
 394   if (phase1_pushback_length)
 395     {
 396       c = phase1_pushback[--phase1_pushback_length];
 397       if (c == '\n')
 398         ++line_number;
 399       return c;
 400     }
 401   for (;;)
 402     {
 403       c = phase0_getc ();
 404       switch (c)
 405         {
 406         case '\n':
 407           ++line_number;
 408           return '\n';
 409
 410         case '\\':
 411           c = phase0_getc ();
 412           if (c != '\n')
 413             {
 414               phase0_ungetc (c);
 415               return '\\';
 416             }
 417           ++line_number;
 418           break;
 419
 420         default:
 421           return c;
 422         }
 423     }
 424 }
 425
 426
 427 /* Supports 2 characters of pushback.  */
 428 static void
 429 phase1_ungetc (int c)
 430 {
 431   switch (c)
 432     {
 433     case EOF:
 434       break;
 435
 436     case '\n':
 437       --line_number;
 438       /* FALLTHROUGH */
 439
 440     default:
 441       if (phase1_pushback_length == SIZEOF (phase1_pushback))
 442         abort ();
 443       phase1_pushback[phase1_pushback_length++] = c;
 444       break;
 445     }
 446 }
 447
 448
 449 /* 2. Convert trigraphs to their single character equivalents.  Most
 450    sane human beings vomit copiously at the mention of trigraphs, which
 451    is why they are an option.  */
 452
 453 static unsigned char phase2_pushback[1];
 454 static int phase2_pushback_length;
 455
 456
 457 static int
 458 phase2_getc ()
 459 {
 460   int c;
 461
 462   if (phase2_pushback_length)
 463     return phase2_pushback[--phase2_pushback_length];
 464   if (!trigraphs)
 465     return phase1_getc ();
 466
 467   c = phase1_getc ();
 468   if (c != '?')
 469     return c;
 470   c = phase1_getc ();
 471   if (c != '?')
 472     {
 473       phase1_ungetc (c);
 474       return '?';
 475     }
 476   c = phase1_getc ();
 477   switch (c)
 478     {
 479     case '(':
 480       return '[';
 481     case '/':
 482       return '\\';
 483     case ')':
 484       return ']';
 485     case '\'':
 486       return '^';
 487     case '<':
 488       return '{';
 489     case '!':
 490       return '|';
 491     case '>':
 492       return '}';
 493     case '-':
 494       return '~';
 495     case '#':
 496       return '=';
 497     }
 498   phase1_ungetc (c);
 499   phase1_ungetc ('?');
 500   return '?';
 501 }
 502
 503
 504 /* Supports only one pushback character.  */
 505 static void
 506 phase2_ungetc (int c)
 507 {
 508   if (c != EOF)
 509     {
 510       if (phase2_pushback_length == SIZEOF (phase2_pushback))
 511         abort ();
 512       phase2_pushback[phase2_pushback_length++] = c;
 513     }
 514 }
 515
 516
 517 /* 3. Concatenate each line ending in backslash (\) with the following
 518    line.  Basically, all you need to do is elide "\\\n" sequences from
 519    the input.  */
 520
 521 static unsigned char phase3_pushback[2];
 522 static int phase3_pushback_length;
 523
 524
 525 static int
 526 phase3_getc ()
 527 {
 528   if (phase3_pushback_length)
 529     return phase3_pushback[--phase3_pushback_length];
 530   for (;;)
 531     {
 532       int c = phase2_getc ();
 533       if (c != '\\')
 534         return c;
 535       c = phase2_getc ();
 536       if (c != '\n')
 537         {
 538           phase2_ungetc (c);
 539           return '\\';
 540         }
 541     }
 542 }
 543
 544
 545 /* Supports 2 characters of pushback.  */
 546 static void
 547 phase3_ungetc (int c)
 548 {
 549   if (c != EOF)
 550     {
 551       if (phase3_pushback_length == SIZEOF (phase3_pushback))
 552         abort ();
 553       phase3_pushback[phase3_pushback_length++] = c;
 554     }
 555 }
 556
 557
 558 /* Accumulating comments.  */
 559
 560 static char *buffer;
 561 static size_t bufmax;
 562 static size_t buflen;
 563
 564 static inline void
 565 comment_start ()
 566 {
 567   buflen = 0;
 568 }
 569
 570 static inline void
 571 comment_add (int c)
 572 {
 573   if (buflen >= bufmax)
 574     {
 575       bufmax = 2 * bufmax + 10;
 576       buffer = xrealloc (buffer, bufmax);
 577     }
 578   buffer[buflen++] = c;
 579 }
 580
 581 static inline void
 582 comment_line_end (size_t chars_to_remove)
 583 {
 584   buflen -= chars_to_remove;
 585   while (buflen >= 1
 586          && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
 587     --buflen;
 588   if (chars_to_remove == 0 && buflen >= bufmax)
 589     {
 590       bufmax = 2 * bufmax + 10;
 591       buffer = xrealloc (buffer, bufmax);
 592     }
 593   buffer[buflen] = '\0';
 594   savable_comment_add (buffer);
 595 }
 596
 597
 598 /* These are for tracking whether comments count as immediately before
 599    keyword.  */
 600 static int last_comment_line;
 601 static int last_non_comment_line;
 602 static int newline_count;
 603
 604
 605 /* 4. Replace each comment that is not inside a character constant or
 606    string literal with a space character.  We need to remember the
 607    comment for later, because it may be attached to a keyword string.
 608    We also optionally understand C++ comments.  */
 609
 610 static int
 611 phase4_getc ()
 612 {
 613   int c;
 614   bool last_was_star;
 615
 616   c = phase3_getc ();
 617   if (c != '/')
 618     return c;
 619   c = phase3_getc ();
 620   switch (c)
 621     {
 622     default:
 623       phase3_ungetc (c);
 624       return '/';
 625
 626     case '*':
 627       /* C comment.  */
 628       comment_start ();
 629       last_was_star = false;
 630       for (;;)
 631         {
 632           c = phase3_getc ();
 633           if (c == EOF)
 634             break;
 635           /* We skip all leading white space, but not EOLs.  */
 636           if (!(buflen == 0 && (c == ' ' || c == '\t')))
 637             comment_add (c);
 638           switch (c)
 639             {
 640             case '\n':
 641               comment_line_end (1);
 642               comment_start ();
 643               last_was_star = false;
 644               continue;
 645
 646             case '*':
 647               last_was_star = true;
 648               continue;
 649
 650             case '/':
 651               if (last_was_star)
 652                 {
 653                   comment_line_end (2);
 654                   break;
 655                 }
 656               /* FALLTHROUGH */
 657
 658             default:
 659               last_was_star = false;
 660               continue;
 661             }
 662           break;
 663         }
 664       last_comment_line = newline_count;
 665       return ' ';
 666
 667     case '/':
 668       /* C++ or ISO C 99 comment.  */
 669       comment_start ();
 670       for (;;)
 671         {
 672           c = phase3_getc ();
 673           if (c == '\n' || c == EOF)
 674             break;
 675           /* We skip all leading white space, but not EOLs.  */
 676           if (!(buflen == 0 && (c == ' ' || c == '\t')))
 677             comment_add (c);
 678         }
 679       comment_line_end (0);
 680       last_comment_line = newline_count;
 681       return '\n';
 682     }
 683 }
 684
 685
 686 /* Supports only one pushback character.  */
 687 static void
 688 phase4_ungetc (int c)
 689 {
 690   phase3_ungetc (c);
 691 }
 692
 693
 694 /* ========================== Reading of tokens.  ========================== */
 695
 696
 697 /* True if ObjectiveC extensions are recognized.  */
 698 static bool objc_extensions;
 699
 700 enum token_type_ty
 701 {
 702   token_type_character_constant,        /* 'x' */
 703   token_type_eof,
 704   token_type_eoln,
 705   token_type_hash,                      /* # */
 706   token_type_lparen,                    /* ( */
 707   token_type_rparen,                    /* ) */
 708   token_type_comma,                     /* , */
 709   token_type_colon,                     /* : */
 710   token_type_name,                      /* abc */
 711   token_type_number,                    /* 2.7 */
 712   token_type_string_literal,            /* "abc" */
 713   token_type_symbol,                    /* < > = etc. */
 714   token_type_objc_special,              /* @ */
 715   token_type_white_space
 716 };
 717 typedef enum token_type_ty token_type_ty;
 718
 719 typedef struct token_ty token_ty;
 720 struct token_ty
 721 {
 722   token_type_ty type;
 723   char *string;         /* for token_type_name, token_type_string_literal */
 724   refcounted_string_list_ty *comment;   /* for token_type_string_literal,
 725                                            token_type_objc_special */
 726   long number;
 727   int line_number;
 728 };
 729
 730
 731 /* 7. Replace escape sequences within character strings with their
 732    single character equivalents.  This is called from phase 5, because
 733    we don't have to worry about the #include argument.  There are
 734    pathological cases which could bite us (like the DOS directory
 735    separator), but just pretend it can't happen.  */
 736
 737 #define P7_QUOTES (1000 + '"')
 738 #define P7_QUOTE (1000 + '\'')
 739 #define P7_NEWLINE (1000 + '\n')
 740
 741 static int
 742 phase7_getc ()
 743 {
 744   int c, n, j;
 745
 746   /* Use phase 3, because phase 4 elides comments.  */
 747   c = phase3_getc ();
 748
 749   /* Return a magic newline indicator, so that we can distinguish
 750      between the user requesting a newline in the string (e.g. using
 751      "\n" or "\012") from the user failing to terminate the string or
 752      character constant.  The ANSI C standard says: 3.1.3.4 Character
 753      Constants contain ``any character except single quote, backslash or
 754      newline; or an escape sequence'' and 3.1.4 String Literals contain
 755      ``any character except double quote, backslash or newline; or an
 756      escape sequence''.
 757
 758      Most compilers give a fatal error in this case, however gcc is
 759      stupidly silent, even though this is a very common typo.  OK, so
 760      gcc --pedantic will tell me, but that gripes about too much other
 761      stuff.  Could I have a ``gcc -Wnewline-in-string'' option, or
 762      better yet a ``gcc -fno-newline-in-string'' option, please?  Gcc is
 763      also inconsistent between string literals and character constants:
 764      you may not embed newlines in character constants; try it, you get
 765      a useful diagnostic.  --PMiller  */
 766   if (c == '\n')
 767     return P7_NEWLINE;
 768
 769   if (c == '"')
 770     return P7_QUOTES;
 771   if (c == '\'')
 772     return P7_QUOTE;
 773   if (c != '\\')
 774     return c;
 775   c = phase3_getc ();
 776   switch (c)
 777     {
 778     default:
 779       /* Unknown escape sequences really should be an error, but just
 780          ignore them, and let the real compiler complain.  */
 781       phase3_ungetc (c);
 782       return '\\';
 783
 784     case '"':
 785     case '\'':
 786     case '?':
 787     case '\\':
 788       return c;
 789
 790     case 'a':
 791       return '\a';
 792     case 'b':
 793       return '\b';
 794
 795       /* The \e escape is preculiar to gcc, and assumes an ASCII
 796          character set (or superset).  We don't provide support for it
 797          here.  */
 798
 799     case 'f':
 800       return '\f';
 801     case 'n':
 802       return '\n';
 803     case 'r':
 804       return '\r';
 805     case 't':
 806       return '\t';
 807     case 'v':
 808       return '\v';
 809
 810     case 'x':
 811       c = phase3_getc ();
 812       switch (c)
 813         {
 814         default:
 815           phase3_ungetc (c);
 816           phase3_ungetc ('x');
 817           return '\\';
 818
 819         case '0': case '1': case '2': case '3': case '4':
 820         case '5': case '6': case '7': case '8': case '9':
 821         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
 822         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
 823           break;
 824         }
 825       n = 0;
 826       for (;;)
 827         {
 828           switch (c)
 829             {
 830             default:
 831               phase3_ungetc (c);
 832               return n;
 833
 834             case '0': case '1': case '2': case '3': case '4':
 835             case '5': case '6': case '7': case '8': case '9':
 836               n = n * 16 + c - '0';
 837               break;
 838
 839             case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
 840               n = n * 16 + 10 + c - 'A';
 841               break;
 842
 843             case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
 844               n = n * 16 + 10 + c - 'a';
 845               break;
 846             }
 847           c = phase3_getc ();
 848         }
 849       return n;
 850
 851     case '0': case '1': case '2': case '3':
 852     case '4': case '5': case '6': case '7':
 853       n = 0;
 854       for (j = 0; j < 3; ++j)
 855         {
 856           n = n * 8 + c - '0';
 857           c = phase3_getc ();
 858           switch (c)
 859             {
 860             default:
 861               break;
 862
 863             case '0': case '1': case '2': case '3':
 864             case '4': case '5': case '6': case '7':
 865               continue;
 866             }
 867           break;
 868         }
 869       phase3_ungetc (c);
 870       return n;
 871     }
 872 }
 873
 874
 875 static void
 876 phase7_ungetc (int c)
 877 {
 878   phase3_ungetc (c);
 879 }
 880
 881
 882 /* Free the memory pointed to by a 'struct token_ty'.  */
 883 static inline void
 884 free_token (token_ty *tp)
 885 {
 886   if (tp->type == token_type_name || tp->type == token_type_string_literal)
 887     free (tp->string);
 888   if (tp->type == token_type_string_literal
 889       || tp->type == token_type_objc_special)
 890     drop_reference (tp->comment);
 891 }
 892
 893
 894 /* 5. Parse each resulting logical line as preprocessing tokens and
 895    white space.  Preprocessing tokens and C tokens don't always match.  */
 896
 897 static token_ty phase5_pushback[1];
 898 static int phase5_pushback_length;
 899
 900
 901 static void
 902 phase5_get (token_ty *tp)
 903 {
 904   static char *buffer;
 905   static int bufmax;
 906   int bufpos;
 907   int c;
 908
 909   if (phase5_pushback_length)
 910     {
 911       *tp = phase5_pushback[--phase5_pushback_length];
 912       return;
 913     }
 914   tp->string = NULL;
 915   tp->number = 0;
 916   tp->line_number = line_number;
 917   c = phase4_getc ();
 918   switch (c)
 919     {
 920     case EOF:
 921       tp->type = token_type_eof;
 922       return;
 923
 924     case '\n':
 925       tp->type = token_type_eoln;
 926       return;
 927
 928     case ' ':
 929     case '\f':
 930     case '\t':
 931       for (;;)
 932         {
 933           c = phase4_getc ();
 934           switch (c)
 935             {
 936             case ' ':
 937             case '\f':
 938             case '\t':
 939               continue;
 940
 941             default:
 942               phase4_ungetc (c);
 943               break;
 944             }
 945           break;
 946         }
 947       tp->type = token_type_white_space;
 948       return;
 949
 950     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
 951     case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
 952     case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
 953     case 'V': case 'W': case 'X': case 'Y': case 'Z':
 954     case '_':
 955     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
 956     case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
 957     case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
 958     case 'v': case 'w': case 'x': case 'y': case 'z':
 959       bufpos = 0;
 960       for (;;)
 961         {
 962           if (bufpos >= bufmax)
 963             {
 964               bufmax = 2 * bufmax + 10;
 965               buffer = xrealloc (buffer, bufmax);
 966             }
 967           buffer[bufpos++] = c;
 968           c = phase4_getc ();
 969           switch (c)
 970             {
 971             case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
 972             case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
 973             case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
 974             case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
 975             case 'Y': case 'Z':
 976             case '_':
 977             case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
 978             case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
 979             case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
 980             case 's': case 't': case 'u': case 'v': case 'w': case 'x':
 981             case 'y': case 'z':
 982             case '0': case '1': case '2': case '3': case '4':
 983             case '5': case '6': case '7': case '8': case '9':
 984               continue;
 985
 986             default:
 987               phase4_ungetc (c);
 988               break;
 989             }
 990           break;
 991         }
 992       if (bufpos >= bufmax)
 993         {
 994           bufmax = 2 * bufmax + 10;
 995           buffer = xrealloc (buffer, bufmax);
 996         }
 997       buffer[bufpos] = 0;
 998       tp->string = xstrdup (buffer);
 999       tp->type = token_type_name;
1000       return;
1001
1002     case '.':
1003       c = phase4_getc ();
1004       phase4_ungetc (c);
1005       switch (c)
1006         {
1007         default:
1008           tp->type = token_type_symbol;
1009           return;
1010
1011         case '0': case '1': case '2': case '3': case '4':
1012         case '5': case '6': case '7': case '8': case '9':
1013           c = '.';
1014           break;
1015         }
1016       /* FALLTHROUGH */
1017
1018     case '0': case '1': case '2': case '3': case '4':
1019     case '5': case '6': case '7': case '8': case '9':
1020       /* The preprocessing number token is more "generous" than the C
1021          number tokens.  This is mostly due to token pasting (another
1022          thing we can ignore here).  */
1023       bufpos = 0;
1024       for (;;)
1025         {
1026           if (bufpos >= bufmax)
1027             {
1028               bufmax = 2 * bufmax + 10;
1029               buffer = xrealloc (buffer, bufmax);
1030             }
1031           buffer[bufpos++] = c;
1032           c = phase4_getc ();
1033           switch (c)
1034             {
1035             case 'e':
1036             case 'E':
1037               if (bufpos >= bufmax)
1038                 {
1039                   bufmax = 2 * bufmax + 10;
1040                   buffer = xrealloc (buffer, bufmax);
1041                 }
1042               buffer[bufpos++] = c;
1043               c = phase4_getc ();
1044               if (c != '+' || c != '-')
1045                 {
1046                   phase4_ungetc (c);
1047                   break;
1048                 }
1049               continue;
1050
1051             case 'A': case 'B': case 'C': case 'D':           case 'F':
1052             case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1053             case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1054             case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1055             case 'Y': case 'Z':
1056             case 'a': case 'b': case 'c': case 'd':           case 'f':
1057             case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1058             case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1059             case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1060             case 'y': case 'z':
1061             case '0': case '1': case '2': case '3': case '4':
1062             case '5': case '6': case '7': case '8': case '9':
1063             case '.':
1064               continue;
1065
1066             default:
1067               phase4_ungetc (c);
1068               break;
1069             }
1070           break;
1071         }
1072       if (bufpos >= bufmax)
1073         {
1074           bufmax = 2 * bufmax + 10;
1075           buffer = xrealloc (buffer, bufmax);
1076         }
1077       buffer[bufpos] = 0;
1078       tp->type = token_type_number;
1079       tp->number = atol (buffer);
1080       return;
1081
1082     case '\'':
1083       /* We could worry about the 'L' before wide character constants,
1084          but ignoring it has no effect unless one of the keywords is
1085          "L".  Just pretend it won't happen.  Also, we don't need to
1086          remember the character constant.  */
1087       for (;;)
1088         {
1089           c = phase7_getc ();
1090           if (c == P7_NEWLINE)
1091             {
1092               error_with_progname = false;
1093               error (0, 0, _("%s:%d: warning: unterminated character constant"),
1094                      logical_file_name, line_number - 1);
1095               error_with_progname = true;
1096               phase7_ungetc ('\n');
1097               break;
1098             }
1099           if (c == EOF || c == P7_QUOTE)
1100             break;
1101         }
1102       tp->type = token_type_character_constant;
1103       return;
1104
1105     case '"':
1106       /* We could worry about the 'L' before wide string constants,
1107          but since gettext's argument is not a wide character string,
1108          let the compiler complain about the argument not matching the
1109          prototype.  Just pretend it won't happen.  */
1110       bufpos = 0;
1111       for (;;)
1112         {
1113           c = phase7_getc ();
1114           if (c == P7_NEWLINE)
1115             {
1116               error_with_progname = false;
1117               error (0, 0, _("%s:%d: warning: unterminated string literal"),
1118                      logical_file_name, line_number - 1);
1119               error_with_progname = true;
1120               phase7_ungetc ('\n');
1121               break;
1122             }
1123           if (c == EOF || c == P7_QUOTES)
1124             break;
1125           if (c == P7_QUOTE)
1126             c = '\'';
1127           if (bufpos >= bufmax)
1128             {
1129               bufmax = 2 * bufmax + 10;
1130               buffer = xrealloc (buffer, bufmax);
1131             }
1132           buffer[bufpos++] = c;
1133         }
1134       if (bufpos >= bufmax)
1135         {
1136           bufmax = 2 * bufmax + 10;
1137           buffer = xrealloc (buffer, bufmax);
1138         }
1139       buffer[bufpos] = 0;
1140       tp->type = token_type_string_literal;
1141       tp->string = xstrdup (buffer);
1142       tp->comment = add_reference (savable_comment);
1143       return;
1144
1145     case '(':
1146       tp->type = token_type_lparen;
1147       return;
1148
1149     case ')':
1150       tp->type = token_type_rparen;
1151       return;
1152
1153     case ',':
1154       tp->type = token_type_comma;
1155       return;
1156
1157     case '#':
1158       tp->type = token_type_hash;
1159       return;
1160
1161     case ':':
1162       tp->type = token_type_colon;
1163       return;
1164
1165     case '@':
1166       if (objc_extensions)
1167         {
1168           tp->type = token_type_objc_special;
1169           tp->comment = add_reference (savable_comment);
1170           return;
1171         }
1172       /* FALLTHROUGH */
1173
1174     default:
1175       /* We could carefully recognize each of the 2 and 3 character
1176         operators, but it is not necessary, as we only need to recognize
1177         gettext invocations.  Don't bother.  */
1178       tp->type = token_type_symbol;
1179       return;
1180     }
1181 }
1182
1183
1184 /* Supports only one pushback token.  */
1185 static void
1186 phase5_unget (token_ty *tp)
1187 {
1188   if (tp->type != token_type_eof)
1189     {
1190       if (phase5_pushback_length == SIZEOF (phase5_pushback))
1191         abort ();
1192       phase5_pushback[phase5_pushback_length++] = *tp;
1193     }
1194 }
1195
1196
1197 /* X. Recognize a leading # symbol.  Leave leading hash as a hash, but
1198    turn hash in the middle of a line into a plain symbol token.  This
1199    makes the phase 6 easier.  */
1200
1201 static void
1202 phaseX_get (token_ty *tp)
1203 {
1204   static bool middle;   /* false at the beginning of a line, true otherwise.  */
1205
1206   phase5_get (tp);
1207
1208   if (tp->type == token_type_eoln || tp->type == token_type_eof)
1209     middle = false;
1210   else
1211     {
1212       if (middle)
1213         {
1214           /* Turn hash in the middle of a line into a plain symbol token.  */
1215           if (tp->type == token_type_hash)
1216             tp->type = token_type_symbol;
1217         }
1218       else
1219         {
1220           /* When we see leading whitespace followed by a hash sign,
1221              discard the leading white space token.  The hash is all
1222              phase 6 is interested in.  */
1223           if (tp->type == token_type_white_space)
1224             {
1225               token_ty next;
1226
1227               phase5_get (&next);
1228               if (next.type == token_type_hash)
1229                 *tp = next;
1230               else
1231                 phase5_unget (&next);
1232             }
1233           middle = true;
1234         }
1235     }
1236 }
1237
1238
1239 /* 6. Recognize and carry out directives (it also expands macros on
1240    non-directive lines, which we do not do here).  The only directive
1241    we care about are the #line and #define directive.  We throw all the
1242    others away.  */
1243
1244 static token_ty phase6_pushback[2];
1245 static int phase6_pushback_length;
1246
1247
1248 static void
1249 phase6_get (token_ty *tp)
1250 {
1251   static token_ty *buf;
1252   static int bufmax;
1253   int bufpos;
1254   int j;
1255
1256   if (phase6_pushback_length)
1257     {
1258       *tp = phase6_pushback[--phase6_pushback_length];
1259       return;
1260     }
1261   for (;;)
1262     {
1263       /* Get the next token.  If it is not a '#' at the beginning of a
1264          line (ignoring whitespace), return immediately.  */
1265       phaseX_get (tp);
1266       if (tp->type != token_type_hash)
1267         return;
1268
1269       /* Accumulate the rest of the directive in a buffer, until the
1270          "define" keyword is seen or until end of line.  */
1271       bufpos = 0;
1272       for (;;)
1273         {
1274           phaseX_get (tp);
1275           if (tp->type == token_type_eoln || tp->type == token_type_eof)
1276             break;
1277
1278           /* Before the "define" keyword and inside other directives
1279              white space is irrelevant.  So just throw it away.  */
1280           if (tp->type != token_type_white_space)
1281             {
1282               /* If it is a #define directive, return immediately,
1283                  thus treating the body of the #define directive like
1284                  normal input.  */
1285               if (bufpos == 0
1286                   && tp->type == token_type_name
1287                   && strcmp (tp->string, "define") == 0)
1288                 return;
1289
1290               /* Accumulate.  */
1291               if (bufpos >= bufmax)
1292                 {
1293                   bufmax = 2 * bufmax + 10;
1294                   buf = xrealloc (buf, bufmax * sizeof (buf[0]));
1295                 }
1296               buf[bufpos++] = *tp;
1297             }
1298         }
1299
1300       /* If it is a #line directive, with no macros to expand, act on
1301          it.  Ignore all other directives.  */
1302       if (bufpos >= 3 && buf[0].type == token_type_name
1303           && strcmp (buf[0].string, "line") == 0
1304           && buf[1].type == token_type_number
1305           && buf[2].type == token_type_string_literal)
1306         {
1307           logical_file_name = xstrdup (buf[2].string);
1308           line_number = buf[1].number;
1309         }
1310       if (bufpos >= 2 && buf[0].type == token_type_number
1311           && buf[1].type == token_type_string_literal)
1312         {
1313           logical_file_name = xstrdup (buf[1].string);
1314           line_number = buf[0].number;
1315         }
1316
1317       /* Release the storage held by the directive.  */
1318       for (j = 0; j < bufpos; ++j)
1319         free_token (&buf[j]);
1320
1321       /* We must reset the selected comments.  */
1322       savable_comment_reset ();
1323     }
1324 }
1325
1326
1327 /* Supports 2 tokens of pushback.  */
1328 static void
1329 phase6_unget (token_ty *tp)
1330 {
1331   if (tp->type != token_type_eof)
1332     {
1333       if (phase6_pushback_length == SIZEOF (phase6_pushback))
1334         abort ();
1335       phase6_pushback[phase6_pushback_length++] = *tp;
1336     }
1337 }
1338
1339
1340 /* 8a. Convert ISO C 99 section 7.8.1 format string directives to string
1341    literal placeholders.  */
1342
/* Test whether NAME is the name of an ISO C 99 section 7.8.1 format
   string directive macro.  NAME must be a NUL terminated string.
   Accepted syntax:
     P R I { d | i | o | u | x | X }
     { { | LEAST | FAST } { 8 | 16 | 32 | 64 } | MAX | PTR }  */
static bool
is_inttypes_macro (const char *name)
{
  const char *rest;

  /* The common "PRI" prefix.  */
  if (strncmp (name, "PRI", 3) != 0)
    return false;

  /* The conversion letter.  The explicit test for NUL is needed because
     strchr would otherwise match the terminator of the letter list.  */
  if (name[3] == '\0' || strchr ("diouxX", name[3]) == NULL)
    return false;
  rest = name + 4;

  /* The width independent variants.  */
  if (strcmp (rest, "MAX") == 0 || strcmp (rest, "PTR") == 0)
    return true;

  /* An optional LEAST or FAST qualifier...  */
  if (strncmp (rest, "LEAST", 5) == 0)
    rest += 5;
  else if (strncmp (rest, "FAST", 4) == 0)
    rest += 4;

  /* ... followed by one of the four supported bit widths.  */
  return (strcmp (rest, "8") == 0
          || strcmp (rest, "16") == 0
          || strcmp (rest, "32") == 0
          || strcmp (rest, "64") == 0);
}
1381
1382 static void
1383 phase8a_get (token_ty *tp)
1384 {
1385   phase6_get (tp);
1386   if (tp->type == token_type_name && is_inttypes_macro (tp->string))
1387     {
1388       /* Turn PRIdXXX into "<PRIdXXX>".  */
1389       size_t len = strlen (tp->string);
1390       char *new_string = (char *) xmalloc (len + 3);
1391       new_string[0] = '<';
1392       memcpy (new_string + 1, tp->string, len);
1393       new_string[len + 1] = '>';
1394       new_string[len + 2] = '\0';
1395       free (tp->string);
1396       tp->string = new_string;
1397       tp->comment = add_reference (savable_comment);
1398       tp->type = token_type_string_literal;
1399     }
1400 }
1401
1402 /* Supports 2 tokens of pushback.  */
1403 static inline void
1404 phase8a_unget (token_ty *tp)
1405 {
1406   phase6_unget (tp);
1407 }
1408
1409
1410 /* 8b. Drop whitespace.  */
1411 static void
1412 phase8b_get (token_ty *tp)
1413 {
1414   for (;;)
1415     {
1416       phase8a_get (tp);
1417
1418       if (tp->type == token_type_white_space)
1419         continue;
1420       if (tp->type == token_type_eoln)
1421         {
1422           /* We have to track the last occurrence of a string.  One
1423              mode of xgettext allows to group an extracted message
1424              with a comment for documentation.  The rule which states
1425              which comment is assumed to be grouped with the message
1426              says it should immediately precede it.  Our
1427              interpretation: between the last line of the comment and
1428              the line in which the keyword is found must be no line
1429              with non-white space tokens.  */
1430           ++newline_count;
1431           if (last_non_comment_line > last_comment_line)
1432             savable_comment_reset ();
1433           continue;
1434         }
1435       break;
1436     }
1437 }
1438
1439 /* Supports 2 tokens of pushback.  */
1440 static inline void
1441 phase8b_unget (token_ty *tp)
1442 {
1443   phase8a_unget (tp);
1444 }
1445
1446
1447 /* 8c. In ObjectiveC mode, drop '@' before a literal string.  We need to
1448    do this before performing concatenation of adjacent string literals.  */
1449 static void
1450 phase8c_get (token_ty *tp)
1451 {
1452   token_ty tmp;
1453
1454   phase8b_get (tp);
1455   if (tp->type != token_type_objc_special)
1456     return;
1457   phase8b_get (&tmp);
1458   if (tmp.type != token_type_string_literal)
1459     {
1460       phase8b_unget (&tmp);
1461       return;
1462     }
1463   /* Drop the '@' token and return immediately the following string.  */
1464   drop_reference (tmp.comment);
1465   tmp.comment = tp->comment;
1466   *tp = tmp;
1467 }
1468
1469 /* Supports only one pushback token.  */
1470 static inline void
1471 phase8c_unget (token_ty *tp)
1472 {
1473   phase8b_unget (tp);
1474 }
1475
1476
1477 /* 8. Concatenate adjacent string literals to form single string
1478    literals (because we don't expand macros, there are a few things we
1479    will miss).  */
1480
1481 static void
1482 phase8_get (token_ty *tp)
1483 {
1484   phase8c_get (tp);
1485   if (tp->type != token_type_string_literal)
1486     return;
1487   for (;;)
1488     {
1489       token_ty tmp;
1490       size_t len;
1491
1492       phase8c_get (&tmp);
1493       if (tmp.type != token_type_string_literal)
1494         {
1495           phase8c_unget (&tmp);
1496           return;
1497         }
1498       len = strlen (tp->string);
1499       tp->string = xrealloc (tp->string, len + strlen (tmp.string) + 1);
1500       strcpy (tp->string + len, tmp.string);
1501       free (tmp.string);
1502     }
1503 }
1504
1505
1506 /* ===================== Reading of high-level tokens.  ==================== */
1507
1508
1509 enum xgettext_token_type_ty
1510 {
1511   xgettext_token_type_eof,
1512   xgettext_token_type_keyword,
1513   xgettext_token_type_symbol,
1514   xgettext_token_type_lparen,
1515   xgettext_token_type_rparen,
1516   xgettext_token_type_comma,
1517   xgettext_token_type_colon,
1518   xgettext_token_type_string_literal,
1519   xgettext_token_type_other
1520 };
1521 typedef enum xgettext_token_type_ty xgettext_token_type_ty;
1522
1523 typedef struct xgettext_token_ty xgettext_token_ty;
1524 struct xgettext_token_ty
1525 {
1526   xgettext_token_type_ty type;
1527
1528   /* These fields are used only for xgettext_token_type_keyword.  */
1529   int argnum1;
1530   int argnum2;
1531
1532   /* This field is used only for xgettext_token_type_string_literal,
1533      xgettext_token_type_keyword, xgettext_token_type_symbol.  */
1534   char *string;
1535
1536   /* This field is used only for xgettext_token_type_string_literal.  */
1537   refcounted_string_list_ty *comment;
1538
1539   /* These fields are only for
1540        xgettext_token_type_keyword,
1541        xgettext_token_type_string_literal.  */
1542   lex_pos_ty pos;
1543 };
1544
1545
1546 /* 9. Convert the remaining preprocessing tokens to C tokens and
1547    discards any white space from the translation unit.  */
1548
1549 static void
1550 x_c_lex (xgettext_token_ty *tp)
1551 {
1552   for (;;)
1553     {
1554       token_ty token;
1555       void *keyword_value;
1556
1557       phase8_get (&token);
1558       switch (token.type)
1559         {
1560         case token_type_eof:
1561           tp->type = xgettext_token_type_eof;
1562           return;
1563
1564         case token_type_name:
1565           last_non_comment_line = newline_count;
1566
1567           if (find_entry (objc_extensions ? &objc_keywords : &c_keywords,
1568                           token.string, strlen (token.string), &keyword_value)
1569               == 0)
1570             {
1571               tp->type = xgettext_token_type_keyword;
1572               tp->argnum1 = (int) (long) keyword_value & ((1 << 10) - 1);
1573               tp->argnum2 = (int) (long) keyword_value >> 10;
1574               tp->pos.file_name = logical_file_name;
1575               tp->pos.line_number = token.line_number;
1576             }
1577           else
1578             tp->type = xgettext_token_type_symbol;
1579           tp->string = token.string;
1580           return;
1581
1582         case token_type_lparen:
1583           last_non_comment_line = newline_count;
1584
1585           tp->type = xgettext_token_type_lparen;
1586           return;
1587
1588         case token_type_rparen:
1589           last_non_comment_line = newline_count;
1590
1591           tp->type = xgettext_token_type_rparen;
1592           return;
1593
1594         case token_type_comma:
1595           last_non_comment_line = newline_count;
1596
1597           tp->type = xgettext_token_type_comma;
1598           return;
1599
1600         case token_type_colon:
1601           last_non_comment_line = newline_count;
1602
1603           tp->type = xgettext_token_type_colon;
1604           return;
1605
1606         case token_type_string_literal:
1607           last_non_comment_line = newline_count;
1608
1609           tp->type = xgettext_token_type_string_literal;
1610           tp->string = token.string;
1611           tp->comment = token.comment;
1612           tp->pos.file_name = logical_file_name;
1613           tp->pos.line_number = token.line_number;
1614           return;
1615
1616         case token_type_objc_special:
1617           drop_reference (token.comment);
1618           /* FALLTHROUGH */
1619
1620         default:
1621           last_non_comment_line = newline_count;
1622
1623           tp->type = xgettext_token_type_other;
1624           return;
1625         }
1626     }
1627 }
1628
1629
1630 /* ========================= Extracting strings.  ========================== */
1631
1632
1633 /* Context lookup table.  */
1634 static flag_context_list_table_ty *flag_context_list_table;
1635
1636
1637 /* The file is broken into tokens.  Scan the token stream, looking for
1638    a keyword, followed by a left paren, followed by a string.  When we
1639    see this sequence, we have something to remember.  We assume we are
1640    looking at a valid C or C++ program, and leave the complaints about
1641    the grammar to the compiler.
1642
1643      Normal handling: Look for
1644        keyword ( ... msgid ... )
1645      Plural handling: Look for
1646        keyword ( ... msgid ... msgid_plural ... )
1647
1648    We use recursion because the arguments before msgid or between msgid
1649    and msgid_plural can contain subexpressions of the same form.  */
1650
1651
1652 /* Extract messages until the next balanced closing parenthesis.
1653    Extracted messages are added to MLP.
1654    When a specific argument shall be extracted, COMMAS_TO_SKIP >= 0 and,
1655    if also a plural argument shall be extracted, PLURAL_COMMAS > 0,
1656    otherwise PLURAL_COMMAS = 0.
1657    When no specific argument shall be extracted, COMMAS_TO_SKIP < 0.
1658    Return true upon eof, false upon closing parenthesis.  */
1659 static bool
1660 extract_parenthesized (message_list_ty *mlp,
1661                        flag_context_ty outer_context,
1662                        flag_context_list_iterator_ty context_iter,
1663                        int commas_to_skip, int plural_commas)
1664 {
1665   /* Remember the message containing the msgid, for msgid_plural.  */
1666   message_ty *plural_mp = NULL;
1667
1668   /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
1669   int state;
1670   /* Parameters of the keyword just seen.  Defined only in state 1.  */
1671   int next_commas_to_skip = -1;
1672   int next_plural_commas = 0;
1673   /* Context iterator that will be used if the next token is a '('.  */
1674   flag_context_list_iterator_ty next_context_iter =
1675     passthrough_context_list_iterator;
1676   /* Context iterator that will be used if the next token is a ':'.
1677      (Objective C selector syntax.)  */
1678   flag_context_list_iterator_ty selectorcall_context_iter =
1679     passthrough_context_list_iterator;
1680   /* Current context.  */
1681   flag_context_ty inner_context =
1682     inherited_context (outer_context,
1683                        flag_context_list_iterator_advance (&context_iter));
1684
1685   /* Start state is 0.  */
1686   state = 0;
1687
1688   for (;;)
1689     {
1690       xgettext_token_ty token;
1691
1692       x_c_lex (&token);
1693       switch (token.type)
1694         {
1695         case xgettext_token_type_keyword:
1696           next_commas_to_skip = token.argnum1 - 1;
1697           next_plural_commas = (token.argnum2 > token.argnum1
1698                                 ? token.argnum2 - token.argnum1 : 0);
1699           state = 1;
1700           goto keyword_or_symbol;
1701
1702         case xgettext_token_type_symbol:
1703           state = 0;
1704         keyword_or_symbol:
1705           next_context_iter =
1706             flag_context_list_iterator (
1707               flag_context_list_table_lookup (
1708                 flag_context_list_table,
1709                 token.string, strlen (token.string)));
1710           if (objc_extensions)
1711             {
1712               size_t token_string_len = strlen (token.string);
1713               token.string = xrealloc (token.string, token_string_len + 2);
1714               token.string[token_string_len] = ':';
1715               token.string[token_string_len + 1] = '\0';
1716               selectorcall_context_iter =
1717                 flag_context_list_iterator (
1718                   flag_context_list_table_lookup (
1719                     flag_context_list_table,
1720                     token.string, token_string_len + 1));
1721             }
1722           free (token.string);
1723           continue;
1724
1725         case xgettext_token_type_lparen:
1726           if (extract_parenthesized (mlp, inner_context, next_context_iter,
1727                                      state ? next_commas_to_skip : -1,
1728                                      state ? next_plural_commas : 0))
1729             return true;
1730           next_context_iter = null_context_list_iterator;
1731           selectorcall_context_iter = null_context_list_iterator;
1732           state = 0;
1733           continue;
1734
1735         case xgettext_token_type_rparen:
1736           return false;
1737
1738         case xgettext_token_type_comma:
1739           if (commas_to_skip >= 0)
1740             {
1741               if (commas_to_skip > 0)
1742                 commas_to_skip--;
1743               else
1744                 if (plural_mp != NULL && plural_commas > 0)
1745                   {
1746                     commas_to_skip = plural_commas - 1;
1747                     plural_commas = 0;
1748                   }
1749                 else
1750                   commas_to_skip = -1;
1751             }
1752           inner_context =
1753             inherited_context (outer_context,
1754                                flag_context_list_iterator_advance (
1755                                  &context_iter));
1756           next_context_iter = passthrough_context_list_iterator;
1757           selectorcall_context_iter = passthrough_context_list_iterator;
1758           state = 0;
1759           continue;
1760
1761         case xgettext_token_type_colon:
1762           if (objc_extensions)
1763             {
1764               context_iter = selectorcall_context_iter;
1765               inner_context =
1766                 inherited_context (inner_context,
1767                                    flag_context_list_iterator_advance (
1768                                      &context_iter));
1769               next_context_iter = passthrough_context_list_iterator;
1770               selectorcall_context_iter = passthrough_context_list_iterator;
1771             }
1772           else
1773             {
1774               next_context_iter = null_context_list_iterator;
1775               selectorcall_context_iter = null_context_list_iterator;
1776             }
1777           state = 0;
1778           continue;
1779
1780         case xgettext_token_type_string_literal:
1781           if (extract_all)
1782             {
1783               savable_comment_to_xgettext_comment (token.comment);
1784               remember_a_message (mlp, token.string, inner_context, &token.pos);
1785               savable_comment_reset ();
1786             }
1787           else
1788             {
1789               if (commas_to_skip == 0)
1790                 {
1791                   if (plural_mp == NULL)
1792                     {
1793                       /* Seen an msgid.  */
1794                       message_ty *mp;
1795
1796                       savable_comment_to_xgettext_comment (token.comment);
1797                       mp = remember_a_message (mlp, token.string,
1798                                                inner_context, &token.pos);
1799                       savable_comment_reset ();
1800                       if (plural_commas > 0)
1801                         plural_mp = mp;
1802                     }
1803                   else
1804                     {
1805                       /* Seen an msgid_plural.  */
1806                       remember_a_message_plural (plural_mp, token.string,
1807                                                  inner_context, &token.pos);
1808                       plural_mp = NULL;
1809                     }
1810                 }
1811               else
1812                 free (token.string);
1813             }
1814           drop_reference (token.comment);
1815           next_context_iter = null_context_list_iterator;
1816           selectorcall_context_iter = null_context_list_iterator;
1817           state = 0;
1818           continue;
1819
1820         case xgettext_token_type_other:
1821           next_context_iter = null_context_list_iterator;
1822           selectorcall_context_iter = null_context_list_iterator;
1823           state = 0;
1824           continue;
1825
1826         case xgettext_token_type_eof:
1827           return true;
1828
1829         default:
1830           abort ();
1831         }
1832     }
1833 }
1834
1835
1836 static void
1837 extract_whole_file (FILE *f,
1838                     const char *real_filename, const char *logical_filename,
1839                     flag_context_list_table_ty *flag_table,
1840                     msgdomain_list_ty *mdlp)
1841 {
1842   message_list_ty *mlp = mdlp->item[0]->messages;
1843
1844   fp = f;
1845   real_file_name = real_filename;
1846   logical_file_name = xstrdup (logical_filename);
1847   line_number = 1;
1848
1849   newline_count = 0;
1850   last_comment_line = -1;
1851   last_non_comment_line = -1;
1852
1853   flag_context_list_table = flag_table;
1854
1855   init_keywords ();
1856
1857   /* Eat tokens until eof is seen.  When extract_parenthesized returns
1858      due to an unbalanced closing parenthesis, just restart it.  */
1859   while (!extract_parenthesized (mlp, null_context, null_context_list_iterator,
1860                                  -1, 0))
1861     ;
1862
1863   /* Close scanner.  */
1864   fp = NULL;
1865   real_file_name = NULL;
1866   logical_file_name = NULL;
1867   line_number = 0;
1868 }
1869
1870
1871 void
1872 extract_c (FILE *f,
1873            const char *real_filename, const char *logical_filename,
1874            flag_context_list_table_ty *flag_table,
1875            msgdomain_list_ty *mdlp)
1876 {
1877   objc_extensions = false;
1878   extract_whole_file (f, real_filename, logical_filename, flag_table, mdlp);
1879 }
1880
1881 void
1882 extract_objc (FILE *f,
1883               const char *real_filename, const char *logical_filename,
1884               flag_context_list_table_ty *flag_table,
1885               msgdomain_list_ty *mdlp)
1886 {
1887   objc_extensions = true;
1888   extract_whole_file (f, real_filename, logical_filename, flag_table, mdlp);
1889 }