gnu/dist/gettext/gettext-tools/src/x-perl.c

   1 /* xgettext Perl backend.
   2    Copyright (C) 2002-2004 Free Software Foundation, Inc.
   3
   4    This file was written by Guido Flohr <guido@imperia.net>, 2002-2003.
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with this program; if not, write to the Free Software Foundation,
  18    Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  19
  20 #ifdef HAVE_CONFIG_H
  21 # include "config.h"
  22 #endif
  23
  24 #include <errno.h>
  25 #include <stdbool.h>
  26 #include <stdio.h>
  27 #include <stdlib.h>
  28 #include <string.h>
  29
  30 #include "message.h"
  31 #include "xgettext.h"
  32 #include "x-perl.h"
  33 #include "error.h"
  34 #include "error-progname.h"
  35 #include "xalloc.h"
  36 #include "exit.h"
  37 #include "po-charset.h"
  38 #include "ucs4-utf8.h"
  39 #include "uniname.h"
  40 #include "getline.h"
  41 #include "gettext.h"
  42
  43 #define _(s) gettext(s)
  44
  45 /* The Perl syntax is defined in perlsyn.pod.  Try the command
  46    "man perlsyn" or "perldoc perlsyn".
  47    Also, the syntax after the 'sub' keyword is specified in perlsub.pod.
  48    Try the command "man perlsub" or "perldoc perlsub".  */
  49
  50 #define DEBUG_PERL 0
  51
  52
  53 /* ====================== Keyword set customization.  ====================== */
  54
  55 /* If true extract all strings.  */
  56 static bool extract_all = false;
  57
  58 static hash_table keywords;
  59 static bool default_keywords = true;
  60
  61
  62 void
  63 x_perl_extract_all ()
  64 {
  65   extract_all = true;
  66 }
  67
  68
  69 void
  70 x_perl_keyword (const char *name)
  71 {
  72   if (name == NULL)
  73     default_keywords = false;
  74   else
  75     {
  76       const char *end;
  77       int argnum1;
  78       int argnum2;
  79       const char *colon;
  80
  81       if (keywords.table == NULL)
  82         init_hash (&keywords, 100);
  83
  84       split_keywordspec (name, &end, &argnum1, &argnum2);
  85
  86       /* The characters between name and end should form a valid C identifier.
  87          A colon means an invalid parse in split_keywordspec().  */
  88       colon = strchr (name, ':');
  89       if (colon == NULL || colon >= end)
  90         {
  91           if (argnum1 == 0)
  92             argnum1 = 1;
  93           insert_entry (&keywords, name, end - name,
  94                         (void *) (long) (argnum1 + (argnum2 << 10)));
  95         }
  96     }
  97 }
  98
  99 /* Finish initializing the keywords hash table.
 100    Called after argument processing, before each file is processed.  */
 101 static void
 102 init_keywords ()
 103 {
 104   if (default_keywords)
 105     {
 106       x_perl_keyword ("gettext");
 107       x_perl_keyword ("%gettext");
 108       x_perl_keyword ("$gettext");
 109       x_perl_keyword ("dgettext:2");
 110       x_perl_keyword ("dcgettext:2");
 111       x_perl_keyword ("ngettext:1,2");
 112       x_perl_keyword ("dngettext:2,3");
 113       x_perl_keyword ("dcngettext:2,3");
 114       x_perl_keyword ("gettext_noop");
 115 #if 0
 116       x_perl_keyword ("__");
 117       x_perl_keyword ("$__");
 118       x_perl_keyword ("%__");
 119       x_perl_keyword ("__x");
 120       x_perl_keyword ("__n:1,2");
 121       x_perl_keyword ("__nx:1,2");
 122       x_perl_keyword ("__xn:1,2");
 123       x_perl_keyword ("N__");
 124 #endif
 125       default_keywords = false;
 126     }
 127 }
 128
 129 void
 130 init_flag_table_perl ()
 131 {
 132   xgettext_record_flag ("gettext:1:pass-perl-format");
 133   xgettext_record_flag ("gettext:1:pass-perl-brace-format");
 134   xgettext_record_flag ("%gettext:1:pass-perl-format");
 135   xgettext_record_flag ("%gettext:1:pass-perl-brace-format");
 136   xgettext_record_flag ("$gettext:1:pass-perl-format");
 137   xgettext_record_flag ("$gettext:1:pass-perl-brace-format");
 138   xgettext_record_flag ("dgettext:2:pass-perl-format");
 139   xgettext_record_flag ("dgettext:2:pass-perl-brace-format");
 140   xgettext_record_flag ("dcgettext:2:pass-perl-format");
 141   xgettext_record_flag ("dcgettext:2:pass-perl-brace-format");
 142   xgettext_record_flag ("ngettext:1:pass-perl-format");
 143   xgettext_record_flag ("ngettext:2:pass-perl-format");
 144   xgettext_record_flag ("ngettext:1:pass-perl-brace-format");
 145   xgettext_record_flag ("ngettext:2:pass-perl-brace-format");
 146   xgettext_record_flag ("dngettext:2:pass-perl-format");
 147   xgettext_record_flag ("dngettext:3:pass-perl-format");
 148   xgettext_record_flag ("dngettext:2:pass-perl-brace-format");
 149   xgettext_record_flag ("dngettext:3:pass-perl-brace-format");
 150   xgettext_record_flag ("dcngettext:2:pass-perl-format");
 151   xgettext_record_flag ("dcngettext:3:pass-perl-format");
 152   xgettext_record_flag ("dcngettext:2:pass-perl-brace-format");
 153   xgettext_record_flag ("dcngettext:3:pass-perl-brace-format");
 154   xgettext_record_flag ("gettext_noop:1:pass-perl-format");
 155   xgettext_record_flag ("gettext_noop:1:pass-perl-brace-format");
 156   xgettext_record_flag ("printf:1:perl-format"); /* argument 1 or 2 ?? */
 157   xgettext_record_flag ("sprintf:1:perl-format");
 158 #if 0
 159   xgettext_record_flag ("__:1:pass-perl-format");
 160   xgettext_record_flag ("__:1:pass-perl-brace-format");
 161   xgettext_record_flag ("%__:1:pass-perl-format");
 162   xgettext_record_flag ("%__:1:pass-perl-brace-format");
 163   xgettext_record_flag ("$__:1:pass-perl-format");
 164   xgettext_record_flag ("$__:1:pass-perl-brace-format");
 165   xgettext_record_flag ("__x:1:perl-brace-format");
 166   xgettext_record_flag ("__n:1:pass-perl-format");
 167   xgettext_record_flag ("__n:2:pass-perl-format");
 168   xgettext_record_flag ("__n:1:pass-perl-brace-format");
 169   xgettext_record_flag ("__n:2:pass-perl-brace-format");
 170   xgettext_record_flag ("__nx:1:perl-brace-format");
 171   xgettext_record_flag ("__nx:2:perl-brace-format");
 172   xgettext_record_flag ("__xn:1:perl-brace-format");
 173   xgettext_record_flag ("__xn:2:perl-brace-format");
 174   xgettext_record_flag ("N__:1:pass-perl-format");
 175   xgettext_record_flag ("N__:1:pass-perl-brace-format");
 176 #endif
 177 }
 178
 179
 180 /* ======================== Reading of characters.  ======================== */
 181
 182 /* Real filename, used in error messages about the input file.  */
 183 static const char *real_file_name;
 184
 185 /* Logical filename and line number, used to label the extracted messages.  */
 186 static char *logical_file_name;
 187 static int line_number;
 188
 189 /* The input file stream.  */
 190 static FILE *fp;
 191
 192 /* The current line buffer.  */
 193 static char *linebuf;
 194
 195 /* The size of the current line.  */
 196 static int linesize;
 197
 198 /* The position in the current line.  */
 199 static int linepos;
 200
 201 /* The size of the input buffer.  */
 202 static size_t linebuf_size;
 203
 204 /* Number of lines eaten for here documents.  */
 205 static int here_eaten;
 206
 207 /* Paranoia: EOF marker for __END__ or __DATA__.  */
 208 static bool end_of_file;
 209
 210
 211 /* 1. line_number handling.  */
 212
 213 /* Returns the next character from the input stream or EOF.  */
 214 static int
 215 phase1_getc ()
 216 {
 217   line_number += here_eaten;
 218   here_eaten = 0;
 219
 220   if (end_of_file)
 221     return EOF;
 222
 223   if (linepos >= linesize)
 224     {
 225       linesize = getline (&linebuf, &linebuf_size, fp);
 226
 227       if (linesize < 0)
 228         {
 229           if (ferror (fp))
 230             error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
 231                    real_file_name);
 232           end_of_file = true;
 233           return EOF;
 234         }
 235
 236       linepos = 0;
 237       ++line_number;
 238
 239       /* Undosify.  This is important for catching the end of <<EOF and
 240          <<'EOF'.  We could rely on stdio doing this for us but you
 241          it is not uncommon to to come across Perl scripts with CRLF
 242          newline conventions on systems that do not follow this
 243          convention.  */
 244       if (linesize >= 2 && linebuf[linesize - 1] == '\n'
 245           && linebuf[linesize - 2] == '\r')
 246         {
 247           linebuf[linesize - 2] = '\n';
 248           linebuf[linesize - 1] = '\0';
 249           --linesize;
 250         }
 251     }
 252
 253   return linebuf[linepos++];
 254 }
 255
 256 /* Supports only one pushback character.  */
 257 static void
 258 phase1_ungetc (int c)
 259 {
 260   if (c != EOF)
 261     {
 262       if (linepos == 0)
 263         /* Attempt to ungetc across line boundary.  Shouldn't happen.
 264            No two phase1_ungetc calls are permitted in a row.  */
 265         abort ();
 266
 267       --linepos;
 268     }
 269 }
 270
 271 /* Read a here document and return its contents.
 272    The delimiter is an UTF-8 encoded string; the resulting string is UTF-8
 273    encoded as well.  */
 274
 275 static char *
 276 get_here_document (const char *delimiter)
 277 {
 278   /* Accumulator for the entire here document, including a NUL byte
 279      at the end.  */
 280   static char *buffer;
 281   static size_t bufmax = 0;
 282   size_t bufpos = 0;
 283   /* Current line being appended.  */
 284   static char *my_linebuf = NULL;
 285   static size_t my_linebuf_size = 0;
 286
 287   /* Allocate the initial buffer.  Later on, bufmax > 0.  */
 288   if (bufmax == 0)
 289     {
 290       buffer = xrealloc (NULL, 1);
 291       buffer[0] = '\0';
 292       bufmax = 1;
 293     }
 294
 295   for (;;)
 296     {
 297       int read_bytes = getline (&my_linebuf, &my_linebuf_size, fp);
 298       char *my_line_utf8;
 299       bool chomp;
 300
 301       if (read_bytes < 0)
 302         {
 303           if (ferror (fp))
 304             {
 305               error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
 306                      real_file_name);
 307             }
 308           else
 309             {
 310               error_with_progname = false;
 311               error (EXIT_SUCCESS, 0, _("\
 312 %s:%d: can't find string terminator \"%s\" anywhere before EOF"),
 313                      real_file_name, line_number, delimiter);
 314               error_with_progname = true;
 315
 316               break;
 317             }
 318         }
 319
 320       ++here_eaten;
 321
 322       /* Convert to UTF-8.  */
 323       my_line_utf8 =
 324         from_current_source_encoding (my_linebuf, logical_file_name,
 325                                       line_number + here_eaten);
 326       if (my_line_utf8 != my_linebuf)
 327         {
 328           if (strlen (my_line_utf8) >= my_linebuf_size)
 329             {
 330               my_linebuf_size = strlen (my_line_utf8) + 1;
 331               my_linebuf = xrealloc (my_linebuf, my_linebuf_size);
 332             }
 333           strcpy (my_linebuf, my_line_utf8);
 334           free (my_line_utf8);
 335         }
 336
 337       /* Undosify.  This is important for catching the end of <<EOF and
 338          <<'EOF'.  We could rely on stdio doing this for us but you
 339          it is not uncommon to to come across Perl scripts with CRLF
 340          newline conventions on systems that do not follow this
 341          convention.  */
 342       if (read_bytes >= 2 && my_linebuf[read_bytes - 1] == '\n'
 343           && my_linebuf[read_bytes - 2] == '\r')
 344         {
 345           my_linebuf[read_bytes - 2] = '\n';
 346           my_linebuf[read_bytes - 1] = '\0';
 347           --read_bytes;
 348         }
 349
 350       /* Temporarily remove the trailing newline from my_linebuf.  */
 351       chomp = false;
 352       if (read_bytes >= 1 && my_linebuf[read_bytes - 1] == '\n')
 353         {
 354           chomp = true;
 355           my_linebuf[read_bytes - 1] = '\0';
 356         }
 357
 358       /* See whether this line terminates the here document.  */
 359       if (strcmp (my_linebuf, delimiter) == 0)
 360         break;
 361
 362       /* Add back the trailing newline to my_linebuf.  */
 363       if (chomp)
 364         my_linebuf[read_bytes - 1] = '\n';
 365
 366       /* Ensure room for read_bytes + 1 bytes.  */
 367       if (bufpos + read_bytes >= bufmax)
 368         {
 369           do
 370             bufmax = 2 * bufmax + 10;
 371           while (bufpos + read_bytes >= bufmax);
 372           buffer = xrealloc (buffer, bufmax);
 373         }
 374       /* Append this line to the accumulator.  */
 375       strcpy (buffer + bufpos, my_linebuf);
 376       bufpos += read_bytes;
 377     }
 378
 379   /* Done accumulating the here document.  */
 380   return xstrdup (buffer);
 381 }
 382
 383 /* Skips pod sections.  */
 384 static void
 385 skip_pod ()
 386 {
 387   line_number += here_eaten;
 388   here_eaten = 0;
 389   linepos = 0;
 390
 391   for (;;)
 392     {
 393       linesize = getline (&linebuf, &linebuf_size, fp);
 394
 395       if (linesize < 0)
 396         {
 397           if (ferror (fp))
 398             error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
 399                    real_file_name);
 400           return;
 401         }
 402
 403       ++line_number;
 404
 405       if (strncmp ("=cut", linebuf, 4) == 0)
 406         {
 407           /* Force reading of a new line on next call to phase1_getc().  */
 408           linepos = linesize;
 409           return;
 410         }
 411     }
 412 }
 413
 414
 415 /* These are for tracking whether comments count as immediately before
 416    keyword.  */
 417 static int last_comment_line;
 418 static int last_non_comment_line;
 419
 420
 421 /* 2. Replace each comment that is not inside a string literal or regular
 422    expression with a newline character.  We need to remember the comment
 423    for later, because it may be attached to a keyword string.  */
 424
 425 static int
 426 phase2_getc ()
 427 {
 428   static char *buffer;
 429   static size_t bufmax;
 430   size_t buflen;
 431   int lineno;
 432   int c;
 433   char *utf8_string;
 434
 435   c = phase1_getc ();
 436   if (c == '#')
 437     {
 438       buflen = 0;
 439       lineno = line_number;
 440       /* Skip leading whitespace.  */
 441       for (;;)
 442         {
 443           c = phase1_getc ();
 444           if (c == EOF)
 445             break;
 446           if (c != ' ' && c != '\t' && c != '\r' && c != '\f')
 447             {
 448               phase1_ungetc (c);
 449               break;
 450             }
 451         }
 452       /* Accumulate the comment.  */
 453       for (;;)
 454         {
 455           c = phase1_getc ();
 456           if (c == '\n' || c == EOF)
 457             break;
 458           if (buflen >= bufmax)
 459             {
 460               bufmax = 2 * bufmax + 10;
 461               buffer = xrealloc (buffer, bufmax);
 462             }
 463           buffer[buflen++] = c;
 464         }
 465       if (buflen >= bufmax)
 466         {
 467           bufmax = 2 * bufmax + 10;
 468           buffer = xrealloc (buffer, bufmax);
 469         }
 470       buffer[buflen] = '\0';
 471       /* Convert it to UTF-8.  */
 472       utf8_string =
 473         from_current_source_encoding (buffer, logical_file_name, lineno);
 474       /* Save it until we encounter the corresponding string.  */
 475       xgettext_current_source_encoding = po_charset_utf8;
 476       xgettext_comment_add (utf8_string);
 477       xgettext_current_source_encoding = xgettext_global_source_encoding;
 478       last_comment_line = lineno;
 479     }
 480   return c;
 481 }
 482
 483 /* Supports only one pushback character.  */
 484 static void
 485 phase2_ungetc (int c)
 486 {
 487   if (c != EOF)
 488     phase1_ungetc (c);
 489 }
 490
 491 /* Whitespace recognition.  */
 492
 493 #define case_whitespace \
 494   case ' ': case '\t': case '\r': case '\n': case '\f'
 495
 496 static inline bool
 497 is_whitespace (int c)
 498 {
 499   return (c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\f');
 500 }
 501
 502
 503 /* ========================== Reading of tokens.  ========================== */
 504
 505
 506 enum token_type_ty
 507 {
 508   token_type_eof,
 509   token_type_lparen,            /* ( */
 510   token_type_rparen,            /* ) */
 511   token_type_comma,             /* , */
 512   token_type_fat_comma,         /* => */
 513   token_type_dereference,       /* , */
 514   token_type_semicolon,         /* ; */
 515   token_type_lbrace,            /* { */
 516   token_type_rbrace,            /* } */
 517   token_type_lbracket,          /* [ */
 518   token_type_rbracket,          /* ] */
 519   token_type_string,            /* quote-like */
 520   token_type_named_op,          /* if, unless, while, ... */
 521   token_type_variable,          /* $... */
 522   token_type_symbol,            /* symbol, number */
 523   token_type_regex_op,          /* s, tr, y, m.  */
 524   token_type_dot,               /* . */
 525   token_type_other,             /* regexp, misc. operator */
 526   /* The following are not really token types, but variants used by
 527      the parser.  */
 528   token_type_keyword_symbol     /* keyword symbol */
 529 };
 530 typedef enum token_type_ty token_type_ty;
 531
 532 /* Subtypes for strings, important for interpolation.  */
 533 enum string_type_ty
 534 {
 535   string_type_verbatim,     /* "<<'EOF'", "m'...'", "s'...''...'",
 536                                "tr/.../.../", "y/.../.../".  */
 537   string_type_q,            /* "'..'", "q/.../".  */
 538   string_type_qq,           /* '"..."', "`...`", "qq/.../", "qx/.../",
 539                                "<file*glob>".  */
 540   string_type_qr            /* Not supported.  */
 541 };
 542
 543 /* Subtypes for symbols, important for dollar interpretation.  */
 544 enum symbol_type_ty
 545 {
 546   symbol_type_none,         /* Nothing special.  */
 547   symbol_type_sub,          /* 'sub'.  */
 548   symbol_type_function      /* Function name after 'sub'.  */
 549 };
 550
 551 typedef struct token_ty token_ty;
 552 struct token_ty
 553 {
 554   token_type_ty type;
 555   int sub_type;                 /* for token_type_string, token_type_symbol */
 556   char *string;                 /* for:                 in encoding:
 557                                    token_type_named_op  ASCII
 558                                    token_type_string    UTF-8
 559                                    token_type_symbol    ASCII
 560                                    token_type_variable  global_source_encoding
 561                                  */
 562   int line_number;
 563 };
 564
 565 #if DEBUG_PERL
 566 static const char *
 567 token2string (const token_ty *token)
 568 {
 569   switch (token->type)
 570     {
 571     case token_type_eof:
 572       return "token_type_eof";
 573     case token_type_lparen:
 574       return "token_type_lparen";
 575     case token_type_rparen:
 576       return "token_type_rparen";
 577     case token_type_comma:
 578       return "token_type_comma";
 579     case token_type_fat_comma:
 580       return "token_type_fat_comma";
 581     case token_type_dereference:
 582       return "token_type_dereference";
 583     case token_type_semicolon:
 584       return "token_type_semicolon";
 585     case token_type_lbrace:
 586       return "token_type_lbrace";
 587     case token_type_rbrace:
 588       return "token_type_rbrace";
 589     case token_type_lbracket:
 590       return "token_type_lbracket";
 591     case token_type_rbracket:
 592       return "token_type_rbracket";
 593     case token_type_string:
 594       return "token_type_string";
 595     case token_type_named_op:
 596       return "token_type_named_op";
 597     case token_type_variable:
 598       return "token_type_variable";
 599     case token_type_symbol:
 600       return "token_type_symbol";
 601     case token_type_regex_op:
 602       return "token_type_regex_op";
 603     case token_type_dot:
 604       return "token_type_dot";
 605     case token_type_other:
 606       return "token_type_other";
 607     default:
 608       return "unknown";
 609     }
 610 }
 611 #endif
 612
 613 /* Free the memory pointed to by a 'struct token_ty'.  */
 614 static inline void
 615 free_token (token_ty *tp)
 616 {
 617   switch (tp->type)
 618     {
 619     case token_type_named_op:
 620     case token_type_string:
 621     case token_type_symbol:
 622     case token_type_variable:
 623       free (tp->string);
 624       break;
 625     default:
 626       break;
 627     }
 628   free (tp);
 629 }
 630
 631 /* Pass 1 of extracting quotes: Find the end of the string, regardless
 632    of the semantics of the construct.  Return the complete string,
 633    including the starting and the trailing delimiter, with backslashes
 634    removed where appropriate.  */
 635 static char *
 636 extract_quotelike_pass1 (int delim)
 637 {
 638   /* This function is called recursively.  No way to allocate stuff
 639      statically.  Also alloca() is inappropriate due to limited stack
 640      size on some platforms.  So we use malloc().  */
 641   int bufmax = 10;
 642   char *buffer = (char *) xmalloc (bufmax);
 643   int bufpos = 0;
 644   bool nested = true;
 645   int counter_delim;
 646
 647   buffer[bufpos++] = delim;
 648
 649   /* Find the closing delimiter.  */
 650   switch (delim)
 651     {
 652     case '(':
 653       counter_delim = ')';
 654       break;
 655     case '{':
 656       counter_delim = '}';
 657       break;
 658     case '[':
 659       counter_delim = ']';
 660       break;
 661     case '<':
 662       counter_delim = '>';
 663       break;
 664     default: /* "..." or '...' or |...| etc. */
 665       nested = false;
 666       counter_delim = delim;
 667       break;
 668     }
 669
 670   for (;;)
 671     {
 672       int c = phase1_getc ();
 673
 674       /* This round can produce 1 or 2 bytes.  Ensure room for 2 bytes.  */
 675       if (bufpos + 2 > bufmax)
 676         {
 677           bufmax = 2 * bufmax + 10;
 678           buffer = xrealloc (buffer, bufmax);
 679         }
 680
 681       if (c == counter_delim || c == EOF)
 682         {
 683           buffer[bufpos++] = counter_delim; /* will be stripped off later */
 684           buffer[bufpos++] = '\0';
 685 #if DEBUG_PERL
 686           fprintf (stderr, "PASS1: %s\n", buffer);
 687 #endif
 688           return buffer;
 689         }
 690
 691       if (nested && c == delim)
 692         {
 693           char *inner = extract_quotelike_pass1 (delim);
 694           size_t len = strlen (inner);
 695
 696           /* Ensure room for len + 1 bytes.  */
 697           if (bufpos + len >= bufmax)
 698             {
 699               do
 700                 bufmax = 2 * bufmax + 10;
 701               while (bufpos + len >= bufmax);
 702               buffer = xrealloc (buffer, bufmax);
 703             }
 704           strcpy (buffer + bufpos, inner);
 705           free (inner);
 706           bufpos += len;
 707         }
 708       else if (c == '\\')
 709         {
 710           c = phase1_getc ();
 711           if (c == '\\')
 712             {
 713               buffer[bufpos++] = '\\';
 714               buffer[bufpos++] = '\\';
 715             }
 716           else if (c == delim || c == counter_delim)
 717             {
 718               /* This is pass2 in Perl.  */
 719               buffer[bufpos++] = c;
 720             }
 721           else
 722             {
 723               buffer[bufpos++] = '\\';
 724               phase1_ungetc (c);
 725             }
 726         }
 727       else
 728         {
 729           buffer[bufpos++] = c;
 730         }
 731     }
 732 }
 733
 734 /* Like extract_quotelike_pass1, but return the complete string in UTF-8
 735    encoding.  */
 736 static char *
 737 extract_quotelike_pass1_utf8 (int delim)
 738 {
 739   char *string = extract_quotelike_pass1 (delim);
 740   char *utf8_string =
 741     from_current_source_encoding (string, logical_file_name, line_number);
 742   if (utf8_string != string)
 743     free (string);
 744   return utf8_string;
 745 }
 746
 747
 748 /* ========= Reading of tokens and commands.  Extracting strings.  ========= */
 749
 750
 751 /* There is an ambiguity about '/': It can start a division operator ('/' or
 752    '/=') or it can start a regular expression.  The distinction is important
 753    because inside regular expressions, '#' loses its special meaning.
 754    The distinction is possible depending on the parsing state: After a
 755    variable or simple expression, it's a division operator; at the beginning
 756    of an expression, it's a regexp.  */
 757 static bool prefer_division_over_regexp;
 758
 759 /* Context lookup table.  */
 760 static flag_context_list_table_ty *flag_context_list_table;
 761
 762
 763 /* Forward declaration of local functions.  */
 764 static void interpolate_keywords (message_list_ty *mlp, const char *string,
 765                                   int lineno);
 766 static token_ty *x_perl_lex (message_list_ty *mlp);
 767 static void x_perl_unlex (token_ty *tp);
 768 static bool extract_balanced (message_list_ty *mlp, int state,
 769                               token_type_ty delim,
 770                               flag_context_ty outer_context,
 771                               flag_context_list_iterator_ty context_iter,
 772                               int arg_sg, int arg_pl);
 773
 774
 775 /* Extract an unsigned hexadecimal number from STRING, considering at
 776    most LEN bytes and place the result in *RESULT.  Returns a pointer
 777    to the first character past the hexadecimal number.  */
 778 static const char *
 779 extract_hex (const char *string, size_t len, unsigned int *result)
 780 {
 781   size_t i;
 782
 783   *result = 0;
 784
 785   for (i = 0; i < len; i++)
 786     {
 787       char c = string[i];
 788       int number;
 789
 790       if (c >= 'A' && c <= 'F')
 791         number = c - 'A' + 10;
 792       else if (c >= 'a' && c <= 'f')
 793         number = c - 'a' + 10;
 794       else if (c >= '0' && c <= '9')
 795         number = c - '0';
 796       else
 797         break;
 798
 799       *result <<= 4;
 800       *result |= number;
 801     }
 802
 803   return string + i;
 804 }
 805
 806 /* Extract an unsigned octal number from STRING, considering at
 807    most LEN bytes and place the result in *RESULT.  Returns a pointer
 808    to the first character past the octal number.  */
 809 static const char *
 810 extract_oct (const char *string, size_t len, unsigned int *result)
 811 {
 812   size_t i;
 813
 814   *result = 0;
 815
 816   for (i = 0; i < len; i++)
 817     {
 818       char c = string[i];
 819       int number;
 820
 821       if (c >= '0' && c <= '7')
 822         number = c - '0';
 823       else
 824         break;
 825
 826       *result <<= 3;
 827       *result |= number;
 828     }
 829
 830   return string + i;
 831 }
 832
 833 /* Extract the various quotelike constructs except for <<EOF.  See the
 834    section "Gory details of parsing quoted constructs" in perlop.pod.
 835    Return the resulting token in *tp; tp->type == token_type_string.  */
 836 static void
 837 extract_quotelike (token_ty *tp, int delim)
 838 {
 839   char *string = extract_quotelike_pass1_utf8 (delim);
 840   size_t len = strlen (string);
 841
 842   tp->type = token_type_string;
 843   /* Take the string without the delimiters at the start and at the end.  */
 844   if (!(len >= 2))
 845     abort ();
 846   string[len - 1] = '\0';
 847   tp->string = xstrdup (string + 1);
 848   free (string);
 849 }
 850
 851 /* Extract the quotelike constructs with double delimiters, like
 852    s/[SEARCH]/[REPLACE]/.  This function does not eat up trailing
 853    modifiers (left to the caller).
 854    Return the resulting token in *tp; tp->type == token_type_regex_op.  */
 855 static void
 856 extract_triple_quotelike (message_list_ty *mlp, token_ty *tp, int delim,
 857                           bool interpolate)
 858 {
 859   char *string;
 860
 861   tp->type = token_type_regex_op;
 862
 863   string = extract_quotelike_pass1_utf8 (delim);
 864   if (interpolate)
 865     interpolate_keywords (mlp, string, line_number);
 866   free (string);
 867
 868   if (delim == '(' || delim == '<' || delim == '{' || delim == '[')
 869     {
 870       /* The delimiter for the second string can be different, e.g.
 871          s{SEARCH}{REPLACE} or s{SEARCH}/REPLACE/.  See "man perlrequick".  */
 872       delim = phase1_getc ();
 873       while (is_whitespace (delim))
 874         {
 875           /* The hash-sign is not a valid delimiter after whitespace, ergo
 876              use phase2_getc() and not phase1_getc() now.  */
 877           delim = phase2_getc ();
 878         }
 879     }
 880   string = extract_quotelike_pass1_utf8 (delim);
 881   if (interpolate)
 882     interpolate_keywords (mlp, string, line_number);
 883   free (string);
 884 }
 885
 886 /* Perform pass 3 of quotelike extraction (interpolation).
 887    *tp is a token of type token_type_string.
 888    This function replaces tp->string.  */
 889 /* FIXME: Currently may writes null-bytes into the string.  */
 890 static void
 891 extract_quotelike_pass3 (token_ty *tp, int error_level)
 892 {
 893   static char *buffer;
 894   static int bufmax = 0;
 895   int bufpos = 0;
 896   const char *crs;
 897   bool uppercase;
 898   bool lowercase;
 899   bool quotemeta;
 900
 901 #if DEBUG_PERL
 902   switch (tp->sub_type)
 903     {
 904     case string_type_verbatim:
 905       fprintf (stderr, "Interpolating string_type_verbatim:\n");
 906       break;
 907     case string_type_q:
 908       fprintf (stderr, "Interpolating string_type_q:\n");
 909       break;
 910     case string_type_qq:
 911       fprintf (stderr, "Interpolating string_type_qq:\n");
 912       break;
 913     case string_type_qr:
 914       fprintf (stderr, "Interpolating string_type_qr:\n");
 915       break;
 916     }
 917   fprintf (stderr, "%s\n", tp->string);
 918   if (tp->sub_type == string_type_verbatim)
 919     fprintf (stderr, "---> %s\n", tp->string);
 920 #endif
 921
 922   if (tp->sub_type == string_type_verbatim)
 923     return;
 924
 925   /* Loop over tp->string, accumulating the expansion in buffer.  */
 926   crs = tp->string;
 927   uppercase = false;
 928   lowercase = false;
 929   quotemeta = false;
 930   while (*crs)
 931     {
 932       bool backslashed;
 933
 934       /* Ensure room for 7 bytes, 6 (multi-)bytes plus a leading backslash
 935          if \Q modifier is present.  */
 936       if (bufpos + 7 > bufmax)
 937         {
 938           bufmax = 2 * bufmax + 10;
 939           buffer = xrealloc (buffer, bufmax);
 940         }
 941
 942       if (tp->sub_type == string_type_q)
 943         {
 944           switch (*crs)
 945             {
 946             case '\\':
 947               if (crs[1] == '\\')
 948                 {
 949                   crs += 2;
 950                   buffer[bufpos++] = '\\';
 951                   break;
 952                 }
 953               /* FALLTHROUGH */
 954             default:
 955               buffer[bufpos++] = *crs++;
 956               break;
 957             }
 958           continue;
 959         }
 960
 961       /* We only get here for double-quoted strings or regular expressions.
 962          Unescape escape sequences.  */
 963       if (*crs == '\\')
 964         {
 965           switch (crs[1])
 966             {
 967             case 't':
 968               crs += 2;
 969               buffer[bufpos++] = '\t';
 970               continue;
 971             case 'n':
 972               crs += 2;
 973               buffer[bufpos++] = '\n';
 974               continue;
 975             case 'r':
 976               crs += 2;
 977               buffer[bufpos++] = '\r';
 978               continue;
 979             case 'f':
 980               crs += 2;
 981               buffer[bufpos++] = '\f';
 982               continue;
 983             case 'b':
 984               crs += 2;
 985               buffer[bufpos++] = '\b';
 986               continue;
 987             case 'a':
 988               crs += 2;
 989               buffer[bufpos++] = '\a';
 990               continue;
 991             case 'e':
 992               crs += 2;
 993               buffer[bufpos++] = 0x1b;
 994               continue;
 995             case '0': case '1': case '2': case '3':
 996             case '4': case '5': case '6': case '7':
 997               {
 998                 unsigned int oct_number;
 999                 int length;
1000
1001                 crs = extract_oct (crs + 1, 3, &oct_number);
1002
1003                 /* FIXME: If one of the variables UPPERCASE or LOWERCASE is
1004                    true, the character should be converted to its uppercase
1005                    resp. lowercase equivalent.  I don't know if the necessary
1006                    facilities are already included in gettext.  For US-Ascii
1007                    the conversion can be already be done, however.  */
1008                 if (uppercase && oct_number >= 'a' && oct_number <= 'z')
1009                   {
1010                     oct_number = oct_number - 'a' + 'A';
1011                   }
1012                 else if (lowercase && oct_number >= 'A' && oct_number <= 'Z')
1013                   {
1014                     oct_number = oct_number - 'A' + 'a';
1015                   }
1016
1017
1018                 /* Yes, octal escape sequences in the range 0x100..0x1ff are
1019                    valid.  */
1020                 length = u8_uctomb ((unsigned char *) (buffer + bufpos),
1021                                     oct_number, 2);
1022                 if (length > 0)
1023                   bufpos += length;
1024               }
1025               continue;
1026             case 'x':
1027               {
1028                 unsigned int hex_number = 0;
1029                 int length;
1030
1031                 crs += 2;
1032                 if (*crs == '{')
1033                   {
1034                     const char *end = strchr (crs, '}');
1035                     if (end == NULL)
1036                       {
1037                         error_with_progname = false;
1038                         error (error_level, 0, _("\
1039 %s:%d: missing right brace on \\x{HEXNUMBER}"), real_file_name, line_number);
1040                         error_with_progname = true;
1041                         ++crs;
1042                         continue;
1043                       }
1044                     else
1045                       {
1046                         ++crs;
1047                         (void) extract_hex (crs, end - crs, &hex_number);
1048                         crs = end + 1;
1049                       }
1050                   }
1051                 else
1052                   {
1053                     crs = extract_hex (crs, 2, &hex_number);
1054                   }
1055
1056                 /* FIXME: If one of the variables UPPERCASE or LOWERCASE is
1057                    true, the character should be converted to its uppercase
1058                    resp. lowercase equivalent.  I don't know if the necessary
1059                    facilities are already included in gettext.  For US-Ascii
1060                    the conversion can be already be done, however.  */
1061                 if (uppercase && hex_number >= 'a' && hex_number <= 'z')
1062                   {
1063                     hex_number = hex_number - 'a' + 'A';
1064                   }
1065                 else if (lowercase && hex_number >= 'A' && hex_number <= 'Z')
1066                   {
1067                     hex_number = hex_number - 'A' + 'a';
1068                   }
1069
1070                 length = u8_uctomb ((unsigned char *) (buffer + bufpos),
1071                                     hex_number, 6);
1072
1073                 if (length > 0)
1074                   bufpos += length;
1075               }
1076               continue;
1077             case 'c':
1078               /* Perl's notion of control characters.  */
1079               crs += 2;
1080               if (*crs)
1081                 {
1082                   int the_char = (unsigned char) *crs;
1083                   if (the_char >= 'a' || the_char <= 'z')
1084                     the_char = the_char - 'a' + 'A';
1085                   buffer[bufpos++] = the_char ^ 0x40;
1086                 }
1087               continue;
1088             case 'N':
1089               crs += 2;
1090               if (*crs == '{')
1091                 {
1092                   const char *end = strchr (crs + 1, '}');
1093                   if (end != NULL)
1094                     {
1095                       char *name;
1096                       unsigned int unicode;
1097
1098                       name = (char *) xmalloc (end - (crs + 1) + 1);
1099                       memcpy (name, crs + 1, end - (crs + 1));
1100                       name[end - (crs + 1)] = '\0';
1101
1102                       unicode = unicode_name_character (name);
1103                       if (unicode != UNINAME_INVALID)
1104                         {
1105                           /* FIXME: Convert to upper/lowercase if the
1106                              corresponding flag is set to true.  */
1107                           int length =
1108                             u8_uctomb ((unsigned char *) (buffer + bufpos),
1109                                        unicode, 6);
1110                           if (length > 0)
1111                             bufpos += length;
1112                         }
1113
1114                       free (name);
1115
1116                       crs = end + 1;
1117                     }
1118                 }
1119               continue;
1120             }
1121         }
1122
1123       /* No escape sequence, go on.  */
1124       if (*crs == '\\')
1125         {
1126           ++crs;
1127           switch (*crs)
1128             {
1129             case 'E':
1130               uppercase = false;
1131               lowercase = false;
1132               quotemeta = false;
1133               ++crs;
1134               continue;
1135             case 'L':
1136               uppercase = false;
1137               lowercase = true;
1138               ++crs;
1139               continue;
1140             case 'U':
1141               uppercase = true;
1142               lowercase = false;
1143               ++crs;
1144               continue;
1145             case 'Q':
1146               quotemeta = true;
1147               ++crs;
1148               continue;
1149             case 'l':
1150               ++crs;
1151               if (*crs >= 'A' && *crs <= 'Z')
1152                 {
1153                   buffer[bufpos++] = *crs - 'A' + 'a';
1154                 }
1155               else if ((unsigned char) *crs >= 0x80)
1156                 {
1157                   error_with_progname = false;
1158                   error (error_level, 0, _("\
1159 %s:%d: invalid interpolation (\"\\l\") of 8bit character \"%c\""),
1160                          real_file_name, line_number, *crs);
1161                   error_with_progname = true;
1162                 }
1163               else
1164                 {
1165                   buffer[bufpos++] = *crs;
1166                 }
1167               ++crs;
1168               continue;
1169             case 'u':
1170               ++crs;
1171               if (*crs >= 'a' && *crs <= 'z')
1172                 {
1173                   buffer[bufpos++] = *crs - 'a' + 'A';
1174                 }
1175               else if ((unsigned char) *crs >= 0x80)
1176                 {
1177                   error_with_progname = false;
1178                   error (error_level, 0, _("\
1179 %s:%d: invalid interpolation (\"\\u\") of 8bit character \"%c\""),
1180                          real_file_name, line_number, *crs);
1181                   error_with_progname = true;
1182                 }
1183               else
1184                 {
1185                   buffer[bufpos++] = *crs;
1186                 }
1187               ++crs;
1188               continue;
1189             case '\\':
1190               buffer[bufpos++] = *crs;
1191               ++crs;
1192               continue;
1193             default:
1194               backslashed = true;
1195               break;
1196             }
1197         }
1198       else
1199         backslashed = false;
1200
1201       if (quotemeta
1202           && !((*crs >= 'A' && *crs <= 'Z') || (*crs >= 'A' && *crs <= 'z')
1203                || (*crs >= '0' && *crs <= '9') || *crs == '_'))
1204         {
1205           buffer[bufpos++] = '\\';
1206           backslashed = true;
1207         }
1208
1209       if (!backslashed && !extract_all && (*crs == '$' || *crs == '@'))
1210         {
1211           error_with_progname = false;
1212           error (error_level, 0, _("\
1213 %s:%d: invalid variable interpolation at \"%c\""),
1214                  real_file_name, line_number, *crs);
1215           error_with_progname = true;
1216           ++crs;
1217         }
1218       else if (lowercase)
1219         {
1220           if (*crs >= 'A' && *crs <= 'Z')
1221             buffer[bufpos++] = *crs - 'A' + 'a';
1222           else if ((unsigned char) *crs >= 0x80)
1223             {
1224               error_with_progname = false;
1225               error (error_level, 0, _("\
1226 %s:%d: invalid interpolation (\"\\L\") of 8bit character \"%c\""),
1227                      real_file_name, line_number, *crs);
1228               error_with_progname = true;
1229               buffer[bufpos++] = *crs;
1230             }
1231           else
1232             buffer[bufpos++] = *crs;
1233           ++crs;
1234         }
1235       else if (uppercase)
1236         {
1237           if (*crs >= 'a' && *crs <= 'z')
1238             buffer[bufpos++] = *crs - 'a' + 'A';
1239           else if ((unsigned char) *crs >= 0x80)
1240             {
1241               error_with_progname = false;
1242               error (error_level, 0, _("\
1243 %s:%d: invalid interpolation (\"\\U\") of 8bit character \"%c\""),
1244                      real_file_name, line_number, *crs);
1245               error_with_progname = true;
1246               buffer[bufpos++] = *crs;
1247             }
1248           else
1249             buffer[bufpos++] = *crs;
1250           ++crs;
1251         }
1252       else
1253         {
1254           buffer[bufpos++] = *crs++;
1255         }
1256     }
1257
1258   /* Ensure room for 1 more byte.  */
1259   if (bufpos >= bufmax)
1260     {
1261       bufmax = 2 * bufmax + 10;
1262       buffer = xrealloc (buffer, bufmax);
1263     }
1264
1265   buffer[bufpos++] = '\0';
1266
1267 #if DEBUG_PERL
1268   fprintf (stderr, "---> %s\n", buffer);
1269 #endif
1270
1271   /* Replace tp->string.  */
1272   free (tp->string);
1273   tp->string = xstrdup (buffer);
1274 }
1275
1276 /* Parse a variable.  This is done in several steps:
1277      1) Consume all leading occurencies of '$', '@', '%', and '*'.
1278      2) Determine the name of the variable from the following input.
1279      3) Parse possible following hash keys or array indexes.
1280  */
1281 static void
1282 extract_variable (message_list_ty *mlp, token_ty *tp, int first)
1283 {
1284   static char *buffer;
1285   static int bufmax = 0;
1286   int bufpos = 0;
1287   int c = first;
1288   size_t varbody_length = 0;
1289   bool maybe_hash_deref = false;
1290   bool maybe_hash_value = false;
1291
1292   tp->type = token_type_variable;
1293
1294 #if DEBUG_PERL
1295   fprintf (stderr, "%s:%d: extracting variable type '%c'\n",
1296            real_file_name, line_number, first);
1297 #endif
1298
1299   /*
1300    * 1) Consume dollars and so on (not euros ...).  Unconditionally
1301    *    accepting the hash sign (#) will maybe lead to inaccurate
1302    *    results.  FIXME!
1303    */
1304   while (c == '$' || c == '*' || c == '#' || c == '@' || c == '%')
1305     {
1306       if (bufpos >= bufmax)
1307         {
1308           bufmax = 2 * bufmax + 10;
1309           buffer = xrealloc (buffer, bufmax);
1310         }
1311       buffer[bufpos++] = c;
1312       c = phase1_getc ();
1313     }
1314
1315   if (c == EOF)
1316     {
1317       tp->type = token_type_eof;
1318       return;
1319     }
1320
1321   /* Hash references are treated in a special way, when looking for
1322      our keywords.  */
1323   if (buffer[0] == '$')
1324     {
1325       if (bufpos == 1)
1326         maybe_hash_value = true;
1327       else if (bufpos == 2 && buffer[1] == '$')
1328         {
1329           if (!(c == '{'
1330                 || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
1331                 || (c >= '0' && c <= '9')
1332                 || c == '_' || c == ':' || c == '\'' || c >= 0x80))
1333             {
1334               /* Special variable $$ for pid.  */
1335               if (bufpos >= bufmax)
1336                 {
1337                   bufmax = 2 * bufmax + 10;
1338                   buffer = xrealloc (buffer, bufmax);
1339                 }
1340               buffer[bufpos++] = '\0';
1341               tp->string = xstrdup (buffer);
1342 #if DEBUG_PERL
1343               fprintf (stderr, "%s:%d: is PID ($$)\n",
1344                        real_file_name, line_number);
1345 #endif
1346
1347               phase1_ungetc (c);
1348               return;
1349             }
1350
1351           maybe_hash_deref = true;
1352           bufpos = 1;
1353         }
1354     }
1355
1356   /*
1357    * 2) Get the name of the variable.  The first character is practically
1358    *    arbitrary.  Punctuation and numbers automagically put a variable
1359    *    in the global namespace but that subtle difference is not interesting
1360    *    for us.
1361    */
1362   if (bufpos >= bufmax)
1363     {
1364       bufmax = 2 * bufmax + 10;
1365       buffer = xrealloc (buffer, bufmax);
1366     }
1367   if (c == '{')
1368     {
1369       /* Yuck, we cannot accept ${gettext} as a keyword...  Except for
1370        * debugging purposes it is also harmless, that we suppress the
1371        * real name of the variable.
1372        */
1373 #if DEBUG_PERL
1374       fprintf (stderr, "%s:%d: braced {variable_name}\n",
1375                real_file_name, line_number);
1376 #endif
1377
1378       if (extract_balanced (mlp, 0, token_type_rbrace,
1379                             null_context, null_context_list_iterator, -1, -1))
1380         return;
1381       buffer[bufpos++] = c;
1382     }
1383   else
1384     {
1385       while ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
1386              || (c >= '0' && c <= '9')
1387              || c == '_' || c == ':' || c == '\'' || c >= 0x80)
1388         {
1389           ++varbody_length;
1390           if (bufpos >= bufmax)
1391             {
1392               bufmax = 2 * bufmax + 10;
1393               buffer = xrealloc (buffer, bufmax);
1394             }
1395           buffer[bufpos++] = c;
1396           c = phase1_getc ();
1397         }
1398       phase1_ungetc (c);
1399     }
1400
1401   /* Probably some strange Perl variable like $`.  */
1402   if (varbody_length == 0)
1403     {
1404       c = phase1_getc ();
1405       if (c == EOF || is_whitespace (c))
1406         phase1_ungetc (c);  /* Loser.  */
1407       else
1408         {
1409           if (bufpos >= bufmax)
1410             {
1411               bufmax = 2 * bufmax + 10;
1412               buffer = xrealloc (buffer, bufmax);
1413             }
1414           buffer[bufpos++] = c;
1415         }
1416     }
1417
1418   if (bufpos >= bufmax)
1419     {
1420       bufmax = 2 * bufmax + 10;
1421       buffer = xrealloc (buffer, bufmax);
1422     }
1423   buffer[bufpos++] = '\0';
1424
1425   tp->string = xstrdup (buffer);
1426
1427 #if DEBUG_PERL
1428   fprintf (stderr, "%s:%d: complete variable name: %s\n",
1429            real_file_name, line_number, tp->string);
1430 #endif
1431
1432   prefer_division_over_regexp = true;
1433
1434   /*
1435    * 3) If the following looks strange to you, this is valid Perl syntax:
1436    *
1437    *      $var = $$hashref    # We can place a
1438    *                          # comment here and then ...
1439    *             {key_into_hashref};
1440    *
1441    *    POD sections are not allowed but we leave complaints about
1442    *    that to the compiler/interpreter.
1443    */
1444   /* We only extract strings from the first hash key (if present).  */
1445
1446   if (maybe_hash_deref || maybe_hash_value)
1447     {
1448       bool is_dereference = false;
1449       int c;
1450
1451       do
1452         c = phase2_getc ();
1453       while (is_whitespace (c));
1454
1455       if (c == '-')
1456         {
1457           int c2 = phase1_getc ();
1458
1459           if (c2 == '>')
1460             {
1461               is_dereference = true;
1462
1463               do
1464                 c = phase2_getc ();
1465               while (is_whitespace (c));
1466             }
1467           else if (c2 != '\n')
1468             {
1469               /* Discarding the newline is harmless here.  The only
1470                  special character recognized after a minus is greater-than
1471                  for dereference.  However, the sequence "-\n>" that we
1472                  treat incorrectly here, is a syntax error.  */
1473               phase1_ungetc (c2);
1474             }
1475         }
1476
1477       if (maybe_hash_value && is_dereference)
1478         {
1479 #if DEBUG_PERL
1480           fprintf (stderr, "%s:%d: first keys preceded by \"->\"\n",
1481                    real_file_name, line_number);
1482 #endif
1483         }
1484       else if (maybe_hash_value)
1485         {
1486           /* Fake it into a hash.  */
1487           tp->string[0] = '%';
1488         }
1489
1490       /* Do NOT change that into else if (see above).  */
1491       if ((maybe_hash_value || maybe_hash_deref) && c == '{')
1492         {
1493           void *keyword_value;
1494
1495 #if DEBUG_PERL
1496           fprintf (stderr, "%s:%d: first keys preceded by '{'\n",
1497                    real_file_name, line_number);
1498 #endif
1499
1500           if (find_entry (&keywords, tp->string, strlen (tp->string),
1501                           &keyword_value) == 0)
1502             {
1503               /* Extract a possible string from the key.  Before proceeding
1504                  we check whether the open curly is followed by a symbol and
1505                  then by a right curly.  */
1506               flag_context_list_iterator_ty context_iter =
1507                 flag_context_list_iterator (
1508                   flag_context_list_table_lookup (
1509                     flag_context_list_table,
1510                     tp->string, strlen (tp->string)));
1511               token_ty *t1 = x_perl_lex (mlp);
1512
1513 #if DEBUG_PERL
1514               fprintf (stderr, "%s:%d: extracting string key\n",
1515                        real_file_name, line_number);
1516 #endif
1517
1518               if (t1->type == token_type_symbol
1519                   || t1->type == token_type_named_op)
1520                 {
1521                   token_ty *t2 = x_perl_lex (mlp);
1522                   if (t2->type == token_type_rbrace)
1523                     {
1524                       flag_context_ty context;
1525                       lex_pos_ty pos;
1526
1527                       context =
1528                         inherited_context (null_context,
1529                                            flag_context_list_iterator_advance (
1530                                              &context_iter));
1531
1532                       pos.line_number = line_number;
1533                       pos.file_name = logical_file_name;
1534
1535                       xgettext_current_source_encoding = po_charset_utf8;
1536                       remember_a_message (mlp, xstrdup (t1->string), context, &pos);
1537                       xgettext_current_source_encoding = xgettext_global_source_encoding;
1538                       free_token (t2);
1539                       free_token (t1);
1540                     }
1541                   else
1542                     {
1543                       x_perl_unlex (t2);
1544                     }
1545                 }
1546               else
1547                 {
1548                   x_perl_unlex (t1);
1549                   if (extract_balanced (mlp, 1, token_type_rbrace,
1550                                         null_context, context_iter, 1, -1))
1551                     return;
1552                 }
1553             }
1554           else
1555             {
1556               phase2_ungetc (c);
1557             }
1558         }
1559       else
1560         {
1561           phase2_ungetc (c);
1562         }
1563     }
1564
1565   /* Now consume "->", "[...]", and "{...}".  */
1566   for (;;)
1567     {
1568       int c = phase2_getc ();
1569       int c2;
1570
1571       switch (c)
1572         {
1573         case '{':
1574 #if DEBUG_PERL
1575           fprintf (stderr, "%s:%d: extracting balanced '{' after varname\n",
1576                    real_file_name, line_number);
1577 #endif
1578           extract_balanced (mlp, 0, token_type_rbrace,
1579                             null_context, null_context_list_iterator, -1, -1);
1580           break;
1581
1582         case '[':
1583 #if DEBUG_PERL
1584           fprintf (stderr, "%s:%d: extracting balanced '[' after varname\n",
1585                    real_file_name, line_number);
1586 #endif
1587           extract_balanced (mlp, 0, token_type_rbracket,
1588                             null_context, null_context_list_iterator, -1, -1);
1589           break;
1590
1591         case '-':
1592           c2 = phase1_getc ();
1593           if (c2 == '>')
1594             {
1595 #if DEBUG_PERL
1596               fprintf (stderr, "%s:%d: another \"->\" after varname\n",
1597                        real_file_name, line_number);
1598 #endif
1599               break;
1600             }
1601           else if (c2 != '\n')
1602             {
1603               /* Discarding the newline is harmless here.  The only
1604                  special character recognized after a minus is greater-than
1605                  for dereference.  However, the sequence "-\n>" that we
1606                  treat incorrectly here, is a syntax error.  */
1607               phase1_ungetc (c2);
1608             }
1609           /* FALLTHROUGH */
1610
1611         default:
1612 #if DEBUG_PERL
1613           fprintf (stderr, "%s:%d: variable finished\n",
1614                    real_file_name, line_number);
1615 #endif
1616           phase2_ungetc (c);
1617           return;
1618         }
1619     }
1620 }
1621
1622 /* Actually a simplified version of extract_variable().  It searches for
1623    variables inside a double-quoted string that may interpolate to
1624    some keyword hash (reference).  The string is UTF-8 encoded.  */
1625 static void
1626 interpolate_keywords (message_list_ty *mlp, const char *string, int lineno)
1627 {
1628   static char *buffer;
1629   static int bufmax = 0;
1630   int bufpos = 0;
1631   flag_context_ty context;
1632   int c;
1633   bool maybe_hash_deref = false;
1634   enum parser_state
1635     {
1636       initial,
1637       one_dollar,
1638       two_dollars,
1639       identifier,
1640       minus,
1641       wait_lbrace,
1642       wait_quote,
1643       dquote,
1644       squote,
1645       barekey,
1646       wait_rbrace
1647     } state;
1648   token_ty token;
1649
1650   lex_pos_ty pos;
1651
1652   /* States are:
1653    *
1654    * initial:      initial
1655    * one_dollar:   dollar sign seen in state INITIAL
1656    * two_dollars:  another dollar-sign has been seen in state ONE_DOLLAR
1657    * identifier:   a valid identifier character has been seen in state
1658    *               ONE_DOLLAR or TWO_DOLLARS
1659    * minus:        a minus-sign has been seen in state IDENTIFIER
1660    * wait_lbrace:  a greater-than has been seen in state MINUS
1661    * wait_quote:   a left brace has been seen in state IDENTIFIER or in
1662    *               state WAIT_LBRACE
1663    * dquote:       a double-quote has been seen in state WAIT_QUOTE
1664    * squote:       a single-quote has been seen in state WAIT_QUOTE
1665    * barekey:      an bareword character has been seen in state WAIT_QUOTE
1666    * wait_rbrace:  closing quote has been seen in state DQUOTE or SQUOTE
1667    *
1668    * In the states initial...identifier the context is null_context; in the
1669    * states minus...wait_rbrace the context is the one suitable for the first
1670    * argument of the last seen identifier.
1671    */
1672   state = initial;
1673   context = null_context;
1674
1675   token.type = token_type_string;
1676   token.sub_type = string_type_qq;
1677   token.line_number = line_number;
1678   pos.file_name = logical_file_name;
1679   pos.line_number = lineno;
1680
1681   while ((c = (unsigned char) *string++) != '\0')
1682     {
1683       void *keyword_value;
1684
1685       if (state == initial)
1686         bufpos = 0;
1687
1688       if (c == '\n')
1689         lineno++;
1690
1691       if (bufpos + 1 >= bufmax)
1692         {
1693           bufmax = 2 * bufmax + 10;
1694           buffer = xrealloc (buffer, bufmax);
1695         }
1696
1697       switch (state)
1698         {
1699         case initial:
1700           switch (c)
1701             {
1702             case '\\':
1703               c = (unsigned char) *string++;
1704               if (c == '\0')
1705                 return;
1706               break;
1707             case '$':
1708               buffer[bufpos++] = '$';
1709               maybe_hash_deref = false;
1710               state = one_dollar;
1711               break;
1712             default:
1713               break;
1714             }
1715           break;
1716         case one_dollar:
1717           switch (c)
1718             {
1719             case '$':
1720               /*
1721                * This is enough to make us believe later that we dereference
1722                * a hash reference.
1723                */
1724               maybe_hash_deref = true;
1725               state = two_dollars;
1726               break;
1727             default:
1728               if (c == '_' || c == ':' || c == '\'' || c >= 0x80
1729                   || (c >= 'A' && c <= 'Z')
1730                   || (c >= 'a' && c <= 'z')
1731                   || (c >= '0' && c <= '9'))
1732                 {
1733                   buffer[bufpos++] = c;
1734                   state = identifier;
1735                 }
1736               else
1737                 state = initial;
1738               break;
1739             }
1740           break;
1741         case two_dollars:
1742           if (c == '_' || c == ':' || c == '\'' || c >= 0x80
1743               || (c >= 'A' && c <= 'Z')
1744               || (c >= 'a' && c <= 'z')
1745               || (c >= '0' && c <= '9'))
1746             {
1747               buffer[bufpos++] = c;
1748               state = identifier;
1749             }
1750           else
1751             state = initial;
1752           break;
1753         case identifier:
1754           switch (c)
1755             {
1756             case '-':
1757               if (find_entry (&keywords, buffer, bufpos, &keyword_value) == 0)
1758                 {
1759                   flag_context_list_iterator_ty context_iter =
1760                     flag_context_list_iterator (
1761                       flag_context_list_table_lookup (
1762                         flag_context_list_table,
1763                         buffer, bufpos));
1764                   context =
1765                     inherited_context (null_context,
1766                                        flag_context_list_iterator_advance (
1767                                          &context_iter));
1768                   state = minus;
1769                 }
1770               else
1771                 state = initial;
1772               break;
1773             case '{':
1774               if (!maybe_hash_deref)
1775                 buffer[0] = '%';
1776               if (find_entry (&keywords, buffer, bufpos, &keyword_value) == 0)
1777                 {
1778                   flag_context_list_iterator_ty context_iter =
1779                     flag_context_list_iterator (
1780                       flag_context_list_table_lookup (
1781                         flag_context_list_table,
1782                         buffer, bufpos));
1783                   context =
1784                     inherited_context (null_context,
1785                                        flag_context_list_iterator_advance (
1786                                          &context_iter));
1787                   state = wait_quote;
1788                 }
1789               else
1790                 state = initial;
1791               break;
1792             default:
1793               if (c == '_' || c == ':' || c == '\'' || c >= 0x80
1794                   || (c >= 'A' && c <= 'Z')
1795                   || (c >= 'a' && c <= 'z')
1796                   || (c >= '0' && c <= '9'))
1797                 {
1798                   buffer[bufpos++] = c;
1799                 }
1800               else
1801                 state = initial;
1802               break;
1803             }
1804           break;
1805         case minus:
1806           switch (c)
1807             {
1808             case '>':
1809               state = wait_lbrace;
1810               break;
1811             default:
1812               context = null_context;
1813               state = initial;
1814               break;
1815             }
1816           break;
1817         case wait_lbrace:
1818           switch (c)
1819             {
1820             case '{':
1821               state = wait_quote;
1822               break;
1823             default:
1824               context = null_context;
1825               state = initial;
1826               break;
1827             }
1828           break;
1829         case wait_quote:
1830           switch (c)
1831             {
1832             case_whitespace:
1833               break;
1834             case '\'':
1835               pos.line_number = lineno;
1836               bufpos = 0;
1837               state = squote;
1838               break;
1839             case '"':
1840               pos.line_number = lineno;
1841               bufpos = 0;
1842               state = dquote;
1843               break;
1844             default:
1845               if (c == '_' || (c >= '0' && c <= '9') || c >= 0x80
1846                   || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
1847                 {
1848                   pos.line_number = lineno;
1849                   bufpos = 0;
1850                   buffer[bufpos++] = c;
1851                   state = barekey;
1852                 }
1853               else
1854                 {
1855                   context = null_context;
1856                   state = initial;
1857                 }
1858               break;
1859             }
1860           break;
1861         case dquote:
1862           switch (c)
1863             {
1864             case '"':
1865               /* The resulting string has to be interpolated twice.  */
1866               buffer[bufpos] = '\0';
1867               token.string = xstrdup (buffer);
1868               extract_quotelike_pass3 (&token, EXIT_FAILURE);
1869               /* The string can only shrink with interpolation (because
1870                  we ignore \Q).  */
1871               if (!(strlen (token.string) <= bufpos))
1872                 abort ();
1873               strcpy (buffer, token.string);
1874               free (token.string);
1875               state = wait_rbrace;
1876               break;
1877             case '\\':
1878               if (string[0] == '\"')
1879                 {
1880                   buffer[bufpos++] = string++[0];
1881                 }
1882               else if (string[0])
1883                 {
1884                   buffer[bufpos++] = '\\';
1885                   buffer[bufpos++] = string++[0];
1886                 }
1887               else
1888                 {
1889                   context = null_context;
1890                   state = initial;
1891                 }
1892               break;
1893             default:
1894               buffer[bufpos++] = c;
1895               break;
1896             }
1897           break;
1898         case squote:
1899           switch (c)
1900             {
1901             case '\'':
1902               state = wait_rbrace;
1903               break;
1904             case '\\':
1905               if (string[0] == '\'')
1906                 {
1907                   buffer[bufpos++] = string++[0];
1908                 }
1909               else if (string[0])
1910                 {
1911                   buffer[bufpos++] = '\\';
1912                   buffer[bufpos++] = string++[0];
1913                 }
1914               else
1915                 {
1916                   context = null_context;
1917                   state = initial;
1918                 }
1919               break;
1920             default:
1921               buffer[bufpos++] = c;
1922               break;
1923             }
1924           break;
1925         case barekey:
1926           if (c == '_' || (c >= '0' && c <= '9') || c >= 0x80
1927               || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
1928             {
1929               buffer[bufpos++] = c;
1930               break;
1931             }
1932           else if (is_whitespace (c))
1933             {
1934               state = wait_rbrace;
1935               break;
1936             }
1937           else if (c != '}')
1938             {
1939               context = null_context;
1940               state = initial;
1941               break;
1942             }
1943           /* Must be right brace.  */
1944           /* FALLTHROUGH */
1945         case wait_rbrace:
1946           switch (c)
1947             {
1948             case_whitespace:
1949               break;
1950             case '}':
1951               buffer[bufpos] = '\0';
1952               token.string = xstrdup (buffer);
1953               extract_quotelike_pass3 (&token, EXIT_FAILURE);
1954               xgettext_current_source_encoding = po_charset_utf8;
1955               remember_a_message (mlp, token.string, context, &pos);
1956               xgettext_current_source_encoding = xgettext_global_source_encoding;
1957               /* FALLTHROUGH */
1958             default:
1959               context = null_context;
1960               state = initial;
1961               break;
1962             }
1963           break;
1964         }
1965     }
1966 }
1967
1968 /* The last token seen in the token stream.  This is important for the
1969    interpretation of '?' and '/'.  */
1970 static token_type_ty last_token;
1971
1972 /* Combine characters into tokens.  Discard whitespace.  */
1973
1974 static void
1975 x_perl_prelex (message_list_ty *mlp, token_ty *tp)
1976 {
1977   static char *buffer;
1978   static int bufmax;
1979   int bufpos;
1980   int c;
1981
1982   for (;;)
1983     {
1984       c = phase2_getc ();
1985       tp->line_number = line_number;
1986
1987       switch (c)
1988         {
1989         case EOF:
1990           tp->type = token_type_eof;
1991           return;
1992
1993         case '\n':
1994           if (last_non_comment_line > last_comment_line)
1995             xgettext_comment_reset ();
1996           /* FALLTHROUGH */
1997         case '\t':
1998         case ' ':
1999           /* Ignore whitespace.  */
2000           continue;
2001
2002         case '%':
2003         case '@':
2004         case '*':
2005         case '$':
2006           if (!extract_all)
2007             {
2008               extract_variable (mlp, tp, c);
2009               prefer_division_over_regexp = true;
2010               return;
2011             }
2012           break;
2013         }
2014
2015       last_non_comment_line = tp->line_number;
2016
2017       switch (c)
2018         {
2019         case '.':
2020           {
2021             int c2 = phase1_getc ();
2022             phase1_ungetc (c2);
2023             if (c2 == '.')
2024               {
2025                 tp->type = token_type_other;
2026                 prefer_division_over_regexp = false;
2027                 return;
2028               }
2029             else if (c2 >= '0' && c2 <= '9')
2030               {
2031                 prefer_division_over_regexp = false;
2032               }
2033             else
2034               {
2035                 tp->type = token_type_dot;
2036                 prefer_division_over_regexp = true;
2037                 return;
2038               }
2039           }
2040           /* FALLTHROUGH */
2041         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2042         case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
2043         case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
2044         case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
2045         case 'Y': case 'Z':
2046         case '_':
2047         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2048         case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2049         case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2050         case 's': case 't': case 'u': case 'v': case 'w': case 'x':
2051         case 'y': case 'z':
2052         case '0': case '1': case '2': case '3': case '4':
2053         case '5': case '6': case '7': case '8': case '9':
2054           /* Symbol, or part of a number.  */
2055           prefer_division_over_regexp = true;
2056           bufpos = 0;
2057           for (;;)
2058             {
2059               if (bufpos >= bufmax)
2060                 {
2061                   bufmax = 2 * bufmax + 10;
2062                   buffer = xrealloc (buffer, bufmax);
2063                 }
2064               buffer[bufpos++] = c;
2065               c = phase1_getc ();
2066               switch (c)
2067                 {
2068                 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2069                 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
2070                 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
2071                 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
2072                 case 'Y': case 'Z':
2073                 case '_':
2074                 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2075                 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2076                 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2077                 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
2078                 case 'y': case 'z':
2079                 case '0': case '1': case '2': case '3': case '4':
2080                 case '5': case '6': case '7': case '8': case '9':
2081                   continue;
2082
2083                 default:
2084                   phase1_ungetc (c);
2085                   break;
2086                 }
2087               break;
2088             }
2089           if (bufpos >= bufmax)
2090             {
2091               bufmax = 2 * bufmax + 10;
2092               buffer = xrealloc (buffer, bufmax);
2093             }
2094           buffer[bufpos] = '\0';
2095
2096           if (strcmp (buffer, "__END__") == 0
2097               || strcmp (buffer, "__DATA__") == 0)
2098             {
2099               end_of_file = true;
2100               tp->type = token_type_eof;
2101               return;
2102             }
2103           else if (strcmp (buffer, "and") == 0
2104                    || strcmp (buffer, "cmp") == 0
2105                    || strcmp (buffer, "eq") == 0
2106                    || strcmp (buffer, "if") == 0
2107                    || strcmp (buffer, "ge") == 0
2108                    || strcmp (buffer, "gt") == 0
2109                    || strcmp (buffer, "le") == 0
2110                    || strcmp (buffer, "lt") == 0
2111                    || strcmp (buffer, "ne") == 0
2112                    || strcmp (buffer, "not") == 0
2113                    || strcmp (buffer, "or") == 0
2114                    || strcmp (buffer, "unless") == 0
2115                    || strcmp (buffer, "while") == 0
2116                    || strcmp (buffer, "xor") == 0)
2117             {
2118               tp->type = token_type_named_op;
2119               tp->string = xstrdup (buffer);
2120               prefer_division_over_regexp = false;
2121               return;
2122             }
2123           else if (strcmp (buffer, "s") == 0
2124                  || strcmp (buffer, "y") == 0
2125                  || strcmp (buffer, "tr") == 0)
2126             {
2127               int delim = phase1_getc ();
2128
2129               while (is_whitespace (delim))
2130                 delim = phase2_getc ();
2131
2132               if (delim == EOF)
2133                 {
2134                   tp->type = token_type_eof;
2135                   return;
2136                 }
2137               if ((delim >= '0' && delim <= '9')
2138                   || (delim >= 'A' && delim <= 'Z')
2139                   || (delim >= 'a' && delim <= 'z'))
2140                 {
2141                   /* False positive.  */
2142                   phase2_ungetc (delim);
2143                   tp->type = token_type_symbol;
2144                   tp->sub_type = symbol_type_none;
2145                   tp->string = xstrdup (buffer);
2146                   prefer_division_over_regexp = true;
2147                   return;
2148                 }
2149               extract_triple_quotelike (mlp, tp, delim,
2150                                         buffer[0] == 's' && delim != '\'');
2151
2152               /* Eat the following modifiers.  */
2153               do
2154                 c = phase1_getc ();
2155               while (c >= 'a' && c <= 'z');
2156               phase1_ungetc (c);
2157               return;
2158             }
2159           else if (strcmp (buffer, "m") == 0)
2160             {
2161               int delim = phase1_getc ();
2162
2163               while (is_whitespace (delim))
2164                 delim = phase2_getc ();
2165
2166               if (delim == EOF)
2167                 {
2168                   tp->type = token_type_eof;
2169                   return;
2170                 }
2171               if ((delim >= '0' && delim <= '9')
2172                   || (delim >= 'A' && delim <= 'Z')
2173                   || (delim >= 'a' && delim <= 'z'))
2174                 {
2175                   /* False positive.  */
2176                   phase2_ungetc (delim);
2177                   tp->type = token_type_symbol;
2178                   tp->sub_type = symbol_type_none;
2179                   tp->string = xstrdup (buffer);
2180                   prefer_division_over_regexp = true;
2181                   return;
2182                 }
2183               extract_quotelike (tp, delim);
2184               if (delim != '\'')
2185                 interpolate_keywords (mlp, tp->string, line_number);
2186               free (tp->string);
2187               tp->type = token_type_regex_op;
2188               prefer_division_over_regexp = true;
2189
2190               /* Eat the following modifiers.  */
2191               do
2192                 c = phase1_getc ();
2193               while (c >= 'a' && c <= 'z');
2194               phase1_ungetc (c);
2195               return;
2196             }
2197           else if (strcmp (buffer, "qq") == 0
2198                    || strcmp (buffer, "q") == 0
2199                    || strcmp (buffer, "qx") == 0
2200                    || strcmp (buffer, "qw") == 0
2201                    || strcmp (buffer, "qr") == 0)
2202             {
2203               /* The qw (...) construct is not really a string but we
2204                  can treat in the same manner and then pretend it is
2205                  a symbol.  Rationale: Saying "qw (foo bar)" is the
2206                  same as "my @list = ('foo', 'bar'); @list;".  */
2207
2208               int delim = phase1_getc ();
2209
2210               while (is_whitespace (delim))
2211                 delim = phase2_getc ();
2212
2213               if (delim == EOF)
2214                 {
2215                   tp->type = token_type_eof;
2216                   return;
2217                 }
2218               prefer_division_over_regexp = true;
2219
2220               if ((delim >= '0' && delim <= '9')
2221                   || (delim >= 'A' && delim <= 'Z')
2222                   || (delim >= 'a' && delim <= 'z'))
2223                 {
2224                   /* False positive.  */
2225                   phase2_ungetc (delim);
2226                   tp->type = token_type_symbol;
2227                   tp->sub_type = symbol_type_none;
2228                   tp->string = xstrdup (buffer);
2229                   prefer_division_over_regexp = true;
2230                   return;
2231                 }
2232
2233               extract_quotelike (tp, delim);
2234
2235               switch (buffer[1])
2236                 {
2237                 case 'q':
2238                 case 'x':
2239                   tp->type = token_type_string;
2240                   tp->sub_type = string_type_qq;
2241                   interpolate_keywords (mlp, tp->string, line_number);
2242                   break;
2243                 case 'r':
2244                   tp->type = token_type_regex_op;
2245                   break;
2246                 case 'w':
2247                   tp->type = token_type_symbol;
2248                   tp->sub_type = symbol_type_none;
2249                   break;
2250                 case '\0':
2251                   tp->type = token_type_string;
2252                   tp->sub_type = string_type_q;
2253                   break;
2254                 default:
2255                   abort ();
2256                 }
2257               return;
2258             }
2259           else if (strcmp (buffer, "grep") == 0
2260                    || strcmp (buffer, "split") == 0)
2261             {
2262               prefer_division_over_regexp = false;
2263             }
2264           tp->type = token_type_symbol;
2265           tp->sub_type = (strcmp (buffer, "sub") == 0
2266                           ? symbol_type_sub
2267                           : symbol_type_none);
2268           tp->string = xstrdup (buffer);
2269           return;
2270
2271         case '"':
2272           prefer_division_over_regexp = true;
2273           extract_quotelike (tp, c);
2274           tp->sub_type = string_type_qq;
2275           interpolate_keywords (mlp, tp->string, line_number);
2276           return;
2277
2278         case '`':
2279           prefer_division_over_regexp = true;
2280           extract_quotelike (tp, c);
2281           tp->sub_type = string_type_qq;
2282           interpolate_keywords (mlp, tp->string, line_number);
2283           return;
2284
2285         case '\'':
2286           prefer_division_over_regexp = true;
2287           extract_quotelike (tp, c);
2288           tp->sub_type = string_type_q;
2289           return;
2290
2291         case '(':
2292           c = phase2_getc ();
2293           if (c == ')')
2294             /* Ignore empty list.  */
2295             continue;
2296           else
2297             phase2_ungetc (c);
2298           tp->type = token_type_lparen;
2299           prefer_division_over_regexp = false;
2300           return;
2301
2302         case ')':
2303           tp->type = token_type_rparen;
2304           prefer_division_over_regexp = true;
2305           return;
2306
2307         case '{':
2308           tp->type = token_type_lbrace;
2309           prefer_division_over_regexp = false;
2310           return;
2311
2312         case '}':
2313           tp->type = token_type_rbrace;
2314           prefer_division_over_regexp = false;
2315           return;
2316
2317         case '[':
2318           tp->type = token_type_lbracket;
2319           prefer_division_over_regexp = false;
2320           return;
2321
2322         case ']':
2323           tp->type = token_type_rbracket;
2324           prefer_division_over_regexp = false;
2325           return;
2326
2327         case ';':
2328           tp->type = token_type_semicolon;
2329           prefer_division_over_regexp = false;
2330           return;
2331
2332         case ',':
2333           tp->type = token_type_comma;
2334           prefer_division_over_regexp = false;
2335           return;
2336
2337         case '=':
2338           /* Check for fat comma.  */
2339           c = phase1_getc ();
2340           if (c == '>')
2341             {
2342               tp->type = token_type_fat_comma;
2343               return;
2344             }
2345           else if (linepos == 2
2346                    && (last_token == token_type_semicolon
2347                        || last_token == token_type_rbrace)
2348                    && ((c >= 'A' && c <='Z')
2349                        || (c >= 'a' && c <= 'z')))
2350             {
2351 #if DEBUG_PERL
2352               fprintf (stderr, "%s:%d: start pod section\n",
2353                        real_file_name, line_number);
2354 #endif
2355               skip_pod ();
2356 #if DEBUG_PERL
2357               fprintf (stderr, "%s:%d: end pod section\n",
2358                        real_file_name, line_number);
2359 #endif
2360               continue;
2361             }
2362           phase1_ungetc (c);
2363           tp->type = token_type_other;
2364           prefer_division_over_regexp = false;
2365           return;
2366
2367         case '<':
2368           /* Check for <<EOF and friends.  */
2369           prefer_division_over_regexp = false;
2370           c = phase1_getc ();
2371           if (c == '<')
2372             {
2373               c = phase1_getc ();
2374               if (c == '\'')
2375                 {
2376                   char *string;
2377                   extract_quotelike (tp, c);
2378                   string = get_here_document (tp->string);
2379                   free (tp->string);
2380                   tp->string = string;
2381                   tp->type = token_type_string;
2382                   tp->sub_type = string_type_verbatim;
2383                   tp->line_number = line_number + 1;
2384                   return;
2385                 }
2386               else if (c == '"')
2387                 {
2388                   char *string;
2389                   extract_quotelike (tp, c);
2390                   string = get_here_document (tp->string);
2391                   free (tp->string);
2392                   tp->string = string;
2393                   tp->type = token_type_string;
2394                   tp->sub_type = string_type_qq;
2395                   tp->line_number = line_number + 1;
2396                   interpolate_keywords (mlp, tp->string, line_number + 1);
2397                   return;
2398                 }
2399               else if ((c >= 'A' && c <= 'Z')
2400                        || (c >= 'a' && c <= 'z')
2401                        || c == '_')
2402                 {
2403                   bufpos = 0;
2404                   while ((c >= 'A' && c <= 'Z')
2405                          || (c >= 'a' && c <= 'z')
2406                          || (c >= '0' && c <= '9')
2407                          || c == '_' || c >= 0x80)
2408                     {
2409                       if (bufpos >= bufmax)
2410                         {
2411                           bufmax = 2 * bufmax + 10;
2412                           buffer = xrealloc (buffer, bufmax);
2413                         }
2414                       buffer[bufpos++] = c;
2415                       c = phase1_getc ();
2416                     }
2417                   if (c == EOF)
2418                     {
2419                       tp->type = token_type_eof;
2420                       return;
2421                     }
2422                   else
2423                     {
2424                       char *string;
2425                       phase1_ungetc (c);
2426                       if (bufpos >= bufmax)
2427                         {
2428                           bufmax = 2 * bufmax + 10;
2429                           buffer = xrealloc (buffer, bufmax);
2430                         }
2431                       buffer[bufpos++] = '\0';
2432                       string = get_here_document (buffer);
2433                       tp->string = string;
2434                       tp->type = token_type_string;
2435                       tp->sub_type = string_type_qq;
2436                       tp->line_number = line_number + 1;
2437                       interpolate_keywords (mlp, tp->string, line_number + 1);
2438                       return;
2439                     }
2440                 }
2441               else
2442                 {
2443                   tp->type = token_type_other;
2444                   return;
2445                 }
2446             }
2447           else
2448             {
2449               phase1_ungetc (c);
2450               tp->type = token_type_other;
2451             }
2452           return;  /* End of case '>'.  */
2453
2454         case '-':
2455           /* Check for dereferencing operator.  */
2456           c = phase1_getc ();
2457           if (c == '>')
2458             {
2459               tp->type = token_type_dereference;
2460               return;
2461             }
2462           else if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
2463             {
2464               /* One of the -X (filetest) functions.  We play safe
2465                  and accept all alphabetical characters here.  */
2466               tp->type = token_type_other;
2467               return;
2468             }
2469           phase1_ungetc (c);
2470           tp->type = token_type_other;
2471           prefer_division_over_regexp = false;
2472           return;
2473
2474         case '/':
2475         case '?':
2476           if (!prefer_division_over_regexp)
2477             {
2478               extract_quotelike (tp, c);
2479               interpolate_keywords (mlp, tp->string, line_number);
2480               free (tp->string);
2481               tp->type = token_type_other;
2482               prefer_division_over_regexp = true;
2483               /* Eat the following modifiers.  */
2484               do
2485                 c = phase1_getc ();
2486               while (c >= 'a' && c <= 'z');
2487               phase1_ungetc (c);
2488               return;
2489             }
2490           /* FALLTHROUGH */
2491
2492         default:
2493           /* We could carefully recognize each of the 2 and 3 character
2494              operators, but it is not necessary, as we only need to recognize
2495              gettext invocations.  Don't bother.  */
2496           tp->type = token_type_other;
2497           prefer_division_over_regexp = false;
2498           return;
2499         }
2500     }
2501 }
2502
2503
2504 /* A token stack used as a lookahead buffer.  */
2505
2506 typedef struct token_stack_ty token_stack_ty;
2507 struct token_stack_ty
2508 {
2509   token_ty **items;
2510   size_t nitems;
2511   size_t nitems_max;
2512 };
2513
2514 static struct token_stack_ty token_stack;
2515
2516 #if DEBUG_PERL
2517 /* Dumps all resources allocated by stack STACK.  */
2518 static int
2519 token_stack_dump (token_stack_ty *stack)
2520 {
2521   size_t i;
2522
2523   fprintf (stderr, "BEGIN STACK DUMP\n");
2524   for (i = 0; i < stack->nitems; i++)
2525     {
2526       token_ty *token = stack->items[i];
2527       fprintf (stderr, "  [%s]\n", token2string (token));
2528       switch (token->type)
2529         {
2530         case token_type_named_op:
2531         case token_type_string:
2532         case token_type_symbol:
2533         case token_type_variable:
2534           fprintf (stderr, "    string: %s\n", token->string);
2535           break;
2536         }
2537     }
2538   fprintf (stderr, "END STACK DUMP\n");
2539   return 0;
2540 }
2541 #endif
2542
2543 /* Pushes the token TOKEN onto the stack STACK.  */
2544 static inline void
2545 token_stack_push (token_stack_ty *stack, token_ty *token)
2546 {
2547   if (stack->nitems >= stack->nitems_max)
2548     {
2549       size_t nbytes;
2550
2551       stack->nitems_max = 2 * stack->nitems_max + 4;
2552       nbytes = stack->nitems_max * sizeof (token_ty *);
2553       stack->items = xrealloc (stack->items, nbytes);
2554     }
2555   stack->items[stack->nitems++] = token;
2556 }
2557
2558 /* Pops the most recently pushed token from the stack STACK and returns it.
2559    Returns NULL if the stack is empty.  */
2560 static inline token_ty *
2561 token_stack_pop (token_stack_ty *stack)
2562 {
2563   if (stack->nitems > 0)
2564     return stack->items[--(stack->nitems)];
2565   else
2566     return NULL;
2567 }
2568
2569 /* Return the top of the stack without removing it from the stack, or
2570    NULL if the stack is empty.  */
2571 static inline token_ty *
2572 token_stack_peek (const token_stack_ty *stack)
2573 {
2574   if (stack->nitems > 0)
2575     return stack->items[stack->nitems - 1];
2576   else
2577     return NULL;
2578 }
2579
2580 /* Frees all resources allocated by stack STACK.  */
2581 static inline void
2582 token_stack_free (token_stack_ty *stack)
2583 {
2584   size_t i;
2585
2586   for (i = 0; i < stack->nitems; i++)
2587     free_token (stack->items[i]);
2588   free (stack->items);
2589 }
2590
2591
2592 static token_ty *
2593 x_perl_lex (message_list_ty *mlp)
2594 {
2595 #if DEBUG_PERL
2596   int dummy = token_stack_dump (&token_stack);
2597 #endif
2598   token_ty *tp = token_stack_pop (&token_stack);
2599
2600   if (!tp)
2601     {
2602       tp = (token_ty *) xmalloc (sizeof (token_ty));
2603       x_perl_prelex (mlp, tp);
2604 #if DEBUG_PERL
2605       fprintf (stderr, "%s:%d: x_perl_prelex returned %s\n",
2606                real_file_name, line_number, token2string (tp));
2607 #endif
2608     }
2609 #if DEBUG_PERL
2610   else
2611     {
2612       fprintf (stderr, "%s:%d: %s recycled from stack\n",
2613                real_file_name, line_number, token2string (tp));
2614     }
2615 #endif
2616
2617   /* A symbol followed by a fat comma is really a single-quoted string.
2618      Function definitions or forward declarations also need a special
2619      handling because the dollars and at signs inside the parentheses
2620      must not be interpreted as the beginning of a variable ')'.  */
2621   if (tp->type == token_type_symbol || tp->type == token_type_named_op)
2622     {
2623       token_ty *next = token_stack_peek (&token_stack);
2624
2625       if (!next)
2626         {
2627 #if DEBUG_PERL
2628           fprintf (stderr, "%s:%d: pre-fetching next token\n",
2629                    real_file_name, line_number);
2630 #endif
2631           next = x_perl_lex (mlp);
2632           x_perl_unlex (next);
2633 #if DEBUG_PERL
2634           fprintf (stderr, "%s:%d: unshifted next token\n",
2635                    real_file_name, line_number);
2636 #endif
2637         }
2638
2639 #if DEBUG_PERL
2640       fprintf (stderr, "%s:%d: next token is %s\n",
2641                real_file_name, line_number, token2string (next));
2642 #endif
2643
2644       if (next->type == token_type_fat_comma)
2645         {
2646           tp->type = token_type_string;
2647           tp->sub_type = string_type_q;
2648 #if DEBUG_PERL
2649           fprintf (stderr,
2650                    "%s:%d: token %s mutated to token_type_string\n",
2651                    real_file_name, line_number, token2string (tp));
2652 #endif
2653         }
2654       else if (tp->type == token_type_symbol && tp->sub_type == symbol_type_sub
2655                && next->type == token_type_symbol)
2656         {
2657           /* Start of a function declaration or definition.  Mark this
2658              symbol as a function name, so that we can later eat up
2659              possible prototype information.  */
2660 #if DEBUG_PERL
2661           fprintf (stderr, "%s:%d: subroutine declaration/definition '%s'\n",
2662                    real_file_name, line_number, next->string);
2663 #endif
2664           next->sub_type = symbol_type_function;
2665         }
2666       else if (tp->type == token_type_symbol
2667                && (tp->sub_type == symbol_type_sub
2668                    || tp->sub_type == symbol_type_function)
2669                && next->type == token_type_lparen)
2670         {
2671           /* For simplicity we simply consume everything up to the
2672              closing parenthesis.  Actually only a limited set of
2673              characters is allowed inside parentheses but we leave
2674              complaints to the interpreter and are prepared for
2675              future extensions to the Perl syntax.  */
2676           int c;
2677
2678 #if DEBUG_PERL
2679           fprintf (stderr, "%s:%d: consuming prototype information\n",
2680                    real_file_name, line_number);
2681 #endif
2682
2683           do
2684             {
2685               c = phase1_getc ();
2686 #if DEBUG_PERL
2687               fprintf (stderr, "  consuming character '%c'\n", c);
2688 #endif
2689             }
2690           while (c != EOF && c != ')');
2691           phase1_ungetc (c);
2692         }
2693     }
2694
2695   return tp;
2696 }
2697
2698 static void
2699 x_perl_unlex (token_ty *tp)
2700 {
2701   token_stack_push (&token_stack, tp);
2702 }
2703
2704
2705 /* ========================= Extracting strings.  ========================== */
2706
2707 /* Assuming TP is a string token, this function accumulates all subsequent
2708    . string2 . string3 ... to the string.  (String concatenation.)  */
2709
2710 static char *
2711 collect_message (message_list_ty *mlp, token_ty *tp, int error_level)
2712 {
2713   char *string;
2714   size_t len;
2715
2716   extract_quotelike_pass3 (tp, error_level);
2717   string = xstrdup (tp->string);
2718   len = strlen (tp->string) + 1;
2719
2720   for (;;)
2721     {
2722       int c;
2723
2724       do
2725         c = phase2_getc ();
2726       while (is_whitespace (c));
2727
2728       if (c != '.')
2729         {
2730           phase2_ungetc (c);
2731           return string;
2732         }
2733
2734       do
2735         c = phase2_getc ();
2736       while (is_whitespace (c));
2737
2738       phase2_ungetc (c);
2739
2740       if (c == '"' || c == '\'' || c == '`'
2741           || (!prefer_division_over_regexp && (c == '/' || c == '?'))
2742           || c == 'q')
2743         {
2744           token_ty *qstring = x_perl_lex (mlp);
2745           if (qstring->type != token_type_string)
2746             {
2747               /* assert (qstring->type == token_type_symbol) */
2748               x_perl_unlex (qstring);
2749               return string;
2750             }
2751
2752           extract_quotelike_pass3 (qstring, error_level);
2753           len += strlen (qstring->string);
2754           string = xrealloc (string, len);
2755           strcat (string, qstring->string);
2756           free_token (qstring);
2757         }
2758     }
2759 }
2760
2761 /* The file is broken into tokens.  Scan the token stream, looking for
2762    a keyword, followed by a left paren, followed by a string.  When we
2763    see this sequence, we have something to remember.  We assume we are
2764    looking at a valid C or C++ program, and leave the complaints about
2765    the grammar to the compiler.
2766
2767      Normal handling: Look for
2768        keyword ( ... msgid ... )
2769      Plural handling: Look for
2770        keyword ( ... msgid ... msgid_plural ... )
2771
2772    We use recursion because the arguments before msgid or between msgid
2773    and msgid_plural can contain subexpressions of the same form.  */
2774
2775 /* Extract messages until the next balanced closing parenthesis.
2776    Extracted messages are added to MLP.
2777
2778    When specific arguments shall be extracted, ARG_SG and ARG_PL are
2779    set to the corresponding argument number or -1 if not applicable.
2780
2781    Returns true for EOF, false otherwise.
2782
2783    States are:
2784
2785    0 - initial state
2786    1 - keyword has been seen
2787    2 - extractable string has been seen
2788    3 - a dot operator after an extractable string has been seen
2789
2790    States 2 and 3 are "fragile", the parser will remain in state 2
2791    as long as only opening parentheses are seen, a transition to
2792    state 3 is done on appearance of a dot operator, all other tokens
2793    will cause the parser to fall back to state 1 or 0, eventually
2794    with an error message about invalid intermixing of constant and
2795    non-constant strings.
2796
2797    Likewise, state 3 is fragile.  The parser will remain in state 3
2798    as long as only closing parentheses are seen, a transition to state
2799    2 is done on appearance of another (literal!) string, all other
2800    tokens will cause a warning.  */
2801
2802 static bool
2803 extract_balanced (message_list_ty *mlp, int state, token_type_ty delim,
2804                   flag_context_ty outer_context,
2805                   flag_context_list_iterator_ty context_iter,
2806                   int arg_sg, int arg_pl)
2807 {
2808   /* Remember the message containing the msgid, for msgid_plural.  */
2809   message_ty *plural_mp = NULL;
2810
2811   /* The current argument for a possibly extracted keyword.  Counting
2812      starts with 1.  */
2813   int arg_count = 1;
2814
2815   /* Number of left parentheses seen.  */
2816   int paren_seen = 0;
2817
2818   /* Whether to implicitly assume the next tokens are arguments even without
2819      a '('.  */
2820   bool next_is_argument = false;
2821
2822   /* Context iterator that will be used if the next token is a '('.  */
2823   flag_context_list_iterator_ty next_context_iter =
2824     passthrough_context_list_iterator;
2825   /* Current context.  */
2826   flag_context_ty inner_context =
2827     inherited_context (outer_context,
2828                        flag_context_list_iterator_advance (&context_iter));
2829
2830 #if DEBUG_PERL
2831   static int nesting_level = 0;
2832
2833   ++nesting_level;
2834 #endif
2835
2836   last_token = token_type_semicolon;  /* Safe assumption.  */
2837   prefer_division_over_regexp = false;
2838
2839   for (;;)
2840     {
2841       int my_last_token = last_token;
2842       /* The current token.  */
2843       token_ty *tp;
2844
2845       tp = x_perl_lex (mlp);
2846
2847       last_token = tp->type;
2848
2849       if (delim == tp->type)
2850         {
2851 #if DEBUG_PERL
2852           fprintf (stderr, "%s:%d: extract_balanced finished (%d)\n",
2853                    logical_file_name, tp->line_number, --nesting_level);
2854 #endif
2855           free_token (tp);
2856           return false;
2857         }
2858
2859       if (next_is_argument && tp->type != token_type_lparen)
2860         {
2861           /* An argument list starts, even though there is no '('.  */
2862           context_iter = next_context_iter;
2863           outer_context = inner_context;
2864           inner_context =
2865             inherited_context (outer_context,
2866                                flag_context_list_iterator_advance (
2867                                  &context_iter));
2868         }
2869
2870       switch (tp->type)
2871         {
2872         case token_type_symbol:
2873 #if DEBUG_PERL
2874           fprintf (stderr, "%s:%d: type symbol (%d) \"%s\"\n",
2875                    logical_file_name, tp->line_number, nesting_level,
2876                    tp->string);
2877 #endif
2878
2879           {
2880             void *keyword_value;
2881
2882             if (find_entry (&keywords, tp->string, strlen (tp->string),
2883                             &keyword_value) == 0)
2884               {
2885                 last_token = token_type_keyword_symbol;
2886
2887                 arg_sg = (int) (long) keyword_value & ((1 << 10) - 1);
2888                 arg_pl = (int) (long) keyword_value >> 10;
2889                 arg_count = 1;
2890
2891                 state = 2;
2892               }
2893           }
2894           next_is_argument = true;
2895           next_context_iter =
2896             flag_context_list_iterator (
2897               flag_context_list_table_lookup (
2898                 flag_context_list_table,
2899                 tp->string, strlen (tp->string)));
2900           break;
2901
2902         case token_type_variable:
2903 #if DEBUG_PERL
2904           fprintf (stderr, "%s:%d: type variable (%d) \"%s\"\n",
2905                    logical_file_name, tp->line_number, nesting_level, tp->string);
2906 #endif
2907           prefer_division_over_regexp = true;
2908           next_is_argument = false;
2909           next_context_iter = null_context_list_iterator;
2910           break;
2911
2912         case token_type_lparen:
2913 #if DEBUG_PERL
2914           fprintf (stderr, "%s:%d: type left parentheses (%d)\n",
2915                    logical_file_name, tp->line_number, nesting_level);
2916 #endif
2917           ++paren_seen;
2918
2919           if (extract_balanced (mlp, state, token_type_rparen,
2920                                 inner_context, next_context_iter,
2921                                 arg_sg - arg_count + 1, arg_pl - arg_count + 1))
2922             {
2923               free_token (tp);
2924               return true;
2925             }
2926           if (my_last_token == token_type_keyword_symbol)
2927             arg_sg = arg_pl = -1;
2928           next_is_argument = false;
2929           next_context_iter = null_context_list_iterator;
2930           break;
2931
2932         case token_type_rparen:
2933 #if DEBUG_PERL
2934           fprintf (stderr, "%s:%d: type right parentheses(%d)\n",
2935                    logical_file_name, tp->line_number, nesting_level);
2936 #endif
2937           --paren_seen;
2938           next_is_argument = false;
2939           next_context_iter = null_context_list_iterator;
2940           break;
2941
2942         case token_type_comma:
2943         case token_type_fat_comma:
2944 #if DEBUG_PERL
2945           fprintf (stderr, "%s:%d: type comma (%d)\n",
2946                    logical_file_name, tp->line_number, nesting_level);
2947 #endif
2948           ++arg_count;
2949           if (arg_count > arg_sg && arg_count > arg_pl)
2950             {
2951               /* We have missed the argument.  */
2952               arg_sg = arg_pl = -1;
2953               arg_count = 0;
2954             }
2955 #if DEBUG_PERL
2956           fprintf (stderr, "%s:%d: arg_count: %d, arg_sg: %d, arg_pl: %d\n",
2957                    real_file_name, tp->line_number,
2958                    arg_count, arg_sg, arg_pl);
2959 #endif
2960           inner_context =
2961             inherited_context (outer_context,
2962                                flag_context_list_iterator_advance (
2963                                  &context_iter));
2964           next_is_argument = false;
2965           next_context_iter = passthrough_context_list_iterator;
2966           break;
2967
2968         case token_type_string:
2969 #if DEBUG_PERL
2970           fprintf (stderr, "%s:%d: type string (%d): \"%s\"\n",
2971                    logical_file_name, tp->line_number, nesting_level,
2972                    tp->string);
2973 #endif
2974
2975           if (extract_all)
2976             {
2977               lex_pos_ty pos;
2978               char *string;
2979
2980               pos.file_name = logical_file_name;
2981               pos.line_number = tp->line_number;
2982               string = collect_message (mlp, tp, EXIT_SUCCESS);
2983               xgettext_current_source_encoding = po_charset_utf8;
2984               remember_a_message (mlp, string, inner_context, &pos);
2985               xgettext_current_source_encoding = xgettext_global_source_encoding;
2986             }
2987           else if (state)
2988             {
2989               lex_pos_ty pos;
2990               char *string;
2991
2992               pos.file_name = logical_file_name;
2993               pos.line_number = tp->line_number;
2994
2995               if (arg_count == arg_sg)
2996                 {
2997                   string = collect_message (mlp, tp, EXIT_FAILURE);
2998                   xgettext_current_source_encoding = po_charset_utf8;
2999                   plural_mp = remember_a_message (mlp, string, inner_context, &pos);
3000                   xgettext_current_source_encoding = xgettext_global_source_encoding;
3001                   arg_sg = -1;
3002                 }
3003               else if (arg_count == arg_pl)
3004                 {
3005                   if (plural_mp == NULL)
3006                     error (EXIT_FAILURE, 0, _("\
3007 %s:%d: fatal: plural message seen before singular message\n"),
3008                            real_file_name, tp->line_number);
3009
3010                   string = collect_message (mlp, tp, EXIT_FAILURE);
3011                   xgettext_current_source_encoding = po_charset_utf8;
3012                   remember_a_message_plural (plural_mp, string, inner_context, &pos);
3013                   xgettext_current_source_encoding = xgettext_global_source_encoding;
3014                   arg_pl = -1;
3015                 }
3016             }
3017
3018           if (arg_sg == -1 && arg_pl == -1)
3019             {
3020               state = 0;
3021               plural_mp = NULL;
3022             }
3023
3024           next_is_argument = false;
3025           next_context_iter = null_context_list_iterator;
3026           break;
3027
3028         case token_type_eof:
3029 #if DEBUG_PERL
3030           fprintf (stderr, "%s:%d: type EOF (%d)\n",
3031                    logical_file_name, tp->line_number, nesting_level);
3032 #endif
3033           free_token (tp);
3034           return true;
3035
3036         case token_type_lbrace:
3037 #if DEBUG_PERL
3038           fprintf (stderr, "%s:%d: type lbrace (%d)\n",
3039                    logical_file_name, tp->line_number, nesting_level);
3040 #endif
3041           if (extract_balanced (mlp, 0, token_type_rbrace,
3042                                 null_context, null_context_list_iterator,
3043                                 -1, -1))
3044             {
3045               free_token (tp);
3046               return true;
3047             }
3048           next_is_argument = false;
3049           next_context_iter = null_context_list_iterator;
3050           break;
3051
3052         case token_type_rbrace:
3053 #if DEBUG_PERL
3054           fprintf (stderr, "%s:%d: type rbrace (%d)\n",
3055                    logical_file_name, tp->line_number, nesting_level);
3056 #endif
3057           next_is_argument = false;
3058           next_context_iter = null_context_list_iterator;
3059           state = 0;
3060           break;
3061
3062         case token_type_lbracket:
3063 #if DEBUG_PERL
3064           fprintf (stderr, "%s:%d: type lbracket (%d)\n",
3065                    logical_file_name, tp->line_number, nesting_level);
3066 #endif
3067           if (extract_balanced (mlp, 0, token_type_rbracket,
3068                                 null_context, null_context_list_iterator,
3069                                 -1, -1))
3070             {
3071               free_token (tp);
3072               return true;
3073             }
3074           next_is_argument = false;
3075           next_context_iter = null_context_list_iterator;
3076           break;
3077
3078         case token_type_rbracket:
3079 #if DEBUG_PERL
3080           fprintf (stderr, "%s:%d: type rbracket (%d)\n",
3081                    logical_file_name, tp->line_number, nesting_level);
3082 #endif
3083           next_is_argument = false;
3084           next_context_iter = null_context_list_iterator;
3085           state = 0;
3086           break;
3087
3088         case token_type_semicolon:
3089 #if DEBUG_PERL
3090           fprintf (stderr, "%s:%d: type semicolon (%d)\n",
3091                    logical_file_name, tp->line_number, nesting_level);
3092 #endif
3093           state = 0;
3094
3095           /* The ultimate sign.  */
3096           arg_sg = arg_pl = -1;
3097
3098           /* FIXME: Instead of resetting outer_context here, it may be better
3099              to recurse in the next_is_argument handling above, waiting for
3100              the next semicolon or other statement terminator.  */
3101           outer_context = null_context;
3102           context_iter = null_context_list_iterator;
3103           next_is_argument = false;
3104           next_context_iter = passthrough_context_list_iterator;
3105           inner_context =
3106             inherited_context (outer_context,
3107                                flag_context_list_iterator_advance (
3108                                  &context_iter));
3109           break;
3110
3111         case token_type_dereference:
3112 #if DEBUG_PERL
3113           fprintf (stderr, "%s:%d: type dereference (%d)\n",
3114                    logical_file_name, tp->line_number, nesting_level);
3115 #endif
3116           next_is_argument = false;
3117           next_context_iter = null_context_list_iterator;
3118           break;
3119
3120         case token_type_dot:
3121 #if DEBUG_PERL
3122           fprintf (stderr, "%s:%d: type dot (%d)\n",
3123                    logical_file_name, tp->line_number, nesting_level);
3124 #endif
3125           next_is_argument = false;
3126           next_context_iter = null_context_list_iterator;
3127           state = 0;
3128           break;
3129
3130         case token_type_named_op:
3131 #if DEBUG_PERL
3132           fprintf (stderr, "%s:%d: type named operator (%d): %s\n",
3133                    logical_file_name, tp->line_number, nesting_level,
3134                    tp->string);
3135 #endif
3136           next_is_argument = false;
3137           next_context_iter = null_context_list_iterator;
3138           state = 0;
3139           break;
3140
3141         case token_type_regex_op:
3142 #if DEBUG_PERL
3143           fprintf (stderr, "%s:%d: type regex operator (%d)\n",
3144                    logical_file_name, tp->line_number, nesting_level);
3145 #endif
3146           next_is_argument = false;
3147           next_context_iter = null_context_list_iterator;
3148           break;
3149
3150         case token_type_other:
3151 #if DEBUG_PERL
3152           fprintf (stderr, "%s:%d: type other (%d)\n",
3153                    logical_file_name, tp->line_number, nesting_level);
3154 #endif
3155           next_is_argument = false;
3156           next_context_iter = null_context_list_iterator;
3157           state = 0;
3158           break;
3159
3160         default:
3161           fprintf (stderr, "%s:%d: unknown token type %d\n",
3162                    real_file_name, tp->line_number, tp->type);
3163           abort ();
3164         }
3165
3166       free_token (tp);
3167     }
3168 }
3169
3170 void
3171 extract_perl (FILE *f, const char *real_filename, const char *logical_filename,
3172               flag_context_list_table_ty *flag_table,
3173               msgdomain_list_ty *mdlp)
3174 {
3175   message_list_ty *mlp = mdlp->item[0]->messages;
3176
3177   fp = f;
3178   real_file_name = real_filename;
3179   logical_file_name = xstrdup (logical_filename);
3180   line_number = 0;
3181
3182   last_comment_line = -1;
3183   last_non_comment_line = -1;
3184
3185   flag_context_list_table = flag_table;
3186
3187   init_keywords ();
3188
3189   token_stack.items = NULL;
3190   token_stack.nitems = 0;
3191   token_stack.nitems_max = 0;
3192   linesize = 0;
3193   linepos = 0;
3194   here_eaten = 0;
3195   end_of_file = false;
3196
3197   /* Eat tokens until eof is seen.  When extract_balanced returns
3198      due to an unbalanced closing brace, just restart it.  */
3199   while (!extract_balanced (mlp, 0, token_type_rbrace,
3200                             null_context, null_context_list_iterator,
3201                             -1, -1))
3202     ;
3203
3204   fp = NULL;
3205   real_file_name = NULL;
3206   free (logical_file_name);
3207   logical_file_name = NULL;
3208   line_number = 0;
3209   last_token = token_type_semicolon;
3210   token_stack_free (&token_stack);
3211   here_eaten = 0;
3212   end_of_file = true;
3213 }