gnu/dist/gettext/gettext-tools/src/x-php.c

   1 /* xgettext PHP backend.
   2    Copyright (C) 2001-2003 Free Software Foundation, Inc.
   3
   4    This file was written by Bruno Haible <bruno@clisp.org>, 2002.
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with this program; if not, write to the Free Software Foundation,
  18    Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  19
  20 #ifdef HAVE_CONFIG_H
  21 # include "config.h"
  22 #endif
  23
  24 #include <errno.h>
  25 #include <stdbool.h>
  26 #include <stdio.h>
  27 #include <stdlib.h>
  28
  29 #include "message.h"
  30 #include "xgettext.h"
  31 #include "x-php.h"
  32 #include "error.h"
  33 #include "xalloc.h"
  34 #include "exit.h"
  35 #include "gettext.h"
  36
  37 #define _(s) gettext(s)
  38
  39 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
  40
  41
  42 /* The PHP syntax is defined in phpdoc/manual/langref.html.
  43    See also php-4.1.0/Zend/zend_language_scanner.l.  */
  44
  45
  46 /* ====================== Keyword set customization.  ====================== */
  47
  48 /* If true extract all strings.  */
  49 static bool extract_all = false;
  50
  51 static hash_table keywords;
  52 static bool default_keywords = true;
  53
  54
  55 void
  56 x_php_extract_all ()
  57 {
  58   extract_all = true;
  59 }
  60
  61
  62 void
  63 x_php_keyword (const char *name)
  64 {
  65   if (name == NULL)
  66     default_keywords = false;
  67   else
  68     {
  69       const char *end;
  70       int argnum1;
  71       int argnum2;
  72       const char *colon;
  73
  74       if (keywords.table == NULL)
  75         init_hash (&keywords, 100);
  76
  77       split_keywordspec (name, &end, &argnum1, &argnum2);
  78
  79       /* The characters between name and end should form a valid C identifier.
  80          A colon means an invalid parse in split_keywordspec().  */
  81       colon = strchr (name, ':');
  82       if (colon == NULL || colon >= end)
  83         {
  84           if (argnum1 == 0)
  85             argnum1 = 1;
  86           insert_entry (&keywords, name, end - name,
  87                         (void *) (long) (argnum1 + (argnum2 << 10)));
  88         }
  89     }
  90 }
  91
  92 /* Finish initializing the keywords hash table.
  93    Called after argument processing, before each file is processed.  */
  94 static void
  95 init_keywords ()
  96 {
  97   if (default_keywords)
  98     {
  99       x_php_keyword ("_");
 100       x_php_keyword ("gettext");
 101       x_php_keyword ("dgettext:2");
 102       x_php_keyword ("dcgettext:2");
 103       /* The following were added in PHP 4.2.0.  */
 104       x_php_keyword ("ngettext:1,2");
 105       x_php_keyword ("dngettext:2,3");
 106       x_php_keyword ("dcngettext:2,3");
 107       default_keywords = false;
 108     }
 109 }
 110
 111 void
 112 init_flag_table_php ()
 113 {
 114   xgettext_record_flag ("_:1:pass-php-format");
 115   xgettext_record_flag ("gettext:1:pass-php-format");
 116   xgettext_record_flag ("dgettext:2:pass-php-format");
 117   xgettext_record_flag ("dcgettext:2:pass-php-format");
 118   xgettext_record_flag ("ngettext:1:pass-php-format");
 119   xgettext_record_flag ("ngettext:2:pass-php-format");
 120   xgettext_record_flag ("dngettext:2:pass-php-format");
 121   xgettext_record_flag ("dngettext:3:pass-php-format");
 122   xgettext_record_flag ("dcngettext:2:pass-php-format");
 123   xgettext_record_flag ("dcngettext:3:pass-php-format");
 124   xgettext_record_flag ("sprintf:1:php-format");
 125   xgettext_record_flag ("printf:1:php-format");
 126 }
 127
 128
 129 /* ======================== Reading of characters.  ======================== */
 130
 131
 132 /* Real filename, used in error messages about the input file.  */
 133 static const char *real_file_name;
 134
 135 /* Logical filename and line number, used to label the extracted messages.  */
 136 static char *logical_file_name;
 137 static int line_number;
 138
 139 /* The input file stream.  */
 140 static FILE *fp;
 141
 142
 143 /* 1. line_number handling.  */
 144
 145 static unsigned char phase1_pushback[2];
 146 static int phase1_pushback_length;
 147
 148 static int
 149 phase1_getc ()
 150 {
 151   int c;
 152
 153   if (phase1_pushback_length)
 154     c = phase1_pushback[--phase1_pushback_length];
 155   else
 156     {
 157       c = getc (fp);
 158
 159       if (c == EOF)
 160         {
 161           if (ferror (fp))
 162             error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
 163                    real_file_name);
 164           return EOF;
 165         }
 166     }
 167
 168   if (c == '\n')
 169     line_number++;
 170
 171   return c;
 172 }
 173
 174 /* Supports 2 characters of pushback.  */
 175 static void
 176 phase1_ungetc (int c)
 177 {
 178   if (c != EOF)
 179     {
 180       if (c == '\n')
 181         --line_number;
 182
 183       if (phase1_pushback_length == SIZEOF (phase1_pushback))
 184         abort ();
 185       phase1_pushback[phase1_pushback_length++] = c;
 186     }
 187 }
 188
 189
 190 /* 2. Ignore HTML sections.  They are equivalent to PHP echo commands and
 191    therefore don't contain translatable strings.  */
 192
 193 static void
 194 skip_html ()
 195 {
 196   for (;;)
 197     {
 198       int c = phase1_getc ();
 199
 200       if (c == EOF)
 201         return;
 202
 203       if (c == '<')
 204         {
 205           int c2 = phase1_getc ();
 206
 207           if (c2 == EOF)
 208             break;
 209
 210           if (c2 == '?')
 211             {
 212               /* <?php is the normal way to enter PHP mode. <? and <?= are
 213                  recognized by PHP depending on a configuration setting.  */
 214               int c3 = phase1_getc ();
 215
 216               if (c3 != '=')
 217                 phase1_ungetc (c3);
 218
 219               return;
 220             }
 221
 222           if (c2 == '%')
 223             {
 224               /* <% and <%= are recognized by PHP depending on a configuration
 225                  setting.  */
 226               int c3 = phase1_getc ();
 227
 228               if (c3 != '=')
 229                 phase1_ungetc (c3);
 230
 231               return;
 232             }
 233
 234           if (c2 == '<')
 235             {
 236               phase1_ungetc (c2);
 237               continue;
 238             }
 239
 240           /* < script language = php >
 241              < script language = "php" >
 242              < script language = 'php' >
 243              are always recognized.  */
 244           while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r')
 245             c2 = phase1_getc ();
 246           if (c2 != 's' && c2 != 'S')
 247             {
 248               phase1_ungetc (c2);
 249               continue;
 250             }
 251           c2 = phase1_getc ();
 252           if (c2 != 'c' && c2 != 'C')
 253             {
 254               phase1_ungetc (c2);
 255               continue;
 256             }
 257           c2 = phase1_getc ();
 258           if (c2 != 'r' && c2 != 'R')
 259             {
 260               phase1_ungetc (c2);
 261               continue;
 262             }
 263           c2 = phase1_getc ();
 264           if (c2 != 'i' && c2 != 'I')
 265             {
 266               phase1_ungetc (c2);
 267               continue;
 268             }
 269           c2 = phase1_getc ();
 270           if (c2 != 'p' && c2 != 'P')
 271             {
 272               phase1_ungetc (c2);
 273               continue;
 274             }
 275           c2 = phase1_getc ();
 276           if (c2 != 't' && c2 != 'T')
 277             {
 278               phase1_ungetc (c2);
 279               continue;
 280             }
 281           c2 = phase1_getc ();
 282           if (!(c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r'))
 283             {
 284               phase1_ungetc (c2);
 285               continue;
 286             }
 287           do
 288             c2 = phase1_getc ();
 289           while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r');
 290           if (c2 != 'l' && c2 != 'L')
 291             {
 292               phase1_ungetc (c2);
 293               continue;
 294             }
 295           c2 = phase1_getc ();
 296           if (c2 != 'a' && c2 != 'A')
 297             {
 298               phase1_ungetc (c2);
 299               continue;
 300             }
 301           c2 = phase1_getc ();
 302           if (c2 != 'n' && c2 != 'N')
 303             {
 304               phase1_ungetc (c2);
 305               continue;
 306             }
 307           c2 = phase1_getc ();
 308           if (c2 != 'g' && c2 != 'G')
 309             {
 310               phase1_ungetc (c2);
 311               continue;
 312             }
 313           c2 = phase1_getc ();
 314           if (c2 != 'u' && c2 != 'U')
 315             {
 316               phase1_ungetc (c2);
 317               continue;
 318             }
 319           c2 = phase1_getc ();
 320           if (c2 != 'a' && c2 != 'A')
 321             {
 322               phase1_ungetc (c2);
 323               continue;
 324             }
 325           c2 = phase1_getc ();
 326           if (c2 != 'g' && c2 != 'G')
 327             {
 328               phase1_ungetc (c2);
 329               continue;
 330             }
 331           c2 = phase1_getc ();
 332           if (c2 != 'e' && c2 != 'E')
 333             {
 334               phase1_ungetc (c2);
 335               continue;
 336             }
 337           c2 = phase1_getc ();
 338           while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r')
 339             c2 = phase1_getc ();
 340           if (c2 != '=')
 341             {
 342               phase1_ungetc (c2);
 343               continue;
 344             }
 345           c2 = phase1_getc ();
 346           while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r')
 347             c2 = phase1_getc ();
 348           if (c2 == '"')
 349             {
 350               c2 = phase1_getc ();
 351               if (c2 != 'p')
 352                 {
 353                   phase1_ungetc (c2);
 354                   continue;
 355                 }
 356               c2 = phase1_getc ();
 357               if (c2 != 'h')
 358                 {
 359                   phase1_ungetc (c2);
 360                   continue;
 361                 }
 362               c2 = phase1_getc ();
 363               if (c2 != 'p')
 364                 {
 365                   phase1_ungetc (c2);
 366                   continue;
 367                 }
 368               c2 = phase1_getc ();
 369               if (c2 != '"')
 370                 {
 371                   phase1_ungetc (c2);
 372                   continue;
 373                 }
 374             }
 375           else if (c2 == '\'')
 376             {
 377               c2 = phase1_getc ();
 378               if (c2 != 'p')
 379                 {
 380                   phase1_ungetc (c2);
 381                   continue;
 382                 }
 383               c2 = phase1_getc ();
 384               if (c2 != 'h')
 385                 {
 386                   phase1_ungetc (c2);
 387                   continue;
 388                 }
 389               c2 = phase1_getc ();
 390               if (c2 != 'p')
 391                 {
 392                   phase1_ungetc (c2);
 393                   continue;
 394                 }
 395               c2 = phase1_getc ();
 396               if (c2 != '\'')
 397                 {
 398                   phase1_ungetc (c2);
 399                   continue;
 400                 }
 401             }
 402           else
 403             {
 404               if (c2 != 'p')
 405                 {
 406                   phase1_ungetc (c2);
 407                   continue;
 408                 }
 409               c2 = phase1_getc ();
 410               if (c2 != 'h')
 411                 {
 412                   phase1_ungetc (c2);
 413                   continue;
 414                 }
 415               c2 = phase1_getc ();
 416               if (c2 != 'p')
 417                 {
 418                   phase1_ungetc (c2);
 419                   continue;
 420                 }
 421             }
 422           c2 = phase1_getc ();
 423           while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r')
 424             c2 = phase1_getc ();
 425           if (c2 != '>')
 426             {
 427               phase1_ungetc (c2);
 428               continue;
 429             }
 430           return;
 431         }
 432     }
 433 }
 434
 435 #if 0
 436
 437 static unsigned char phase2_pushback[1];
 438 static int phase2_pushback_length;
 439
 440 static int
 441 phase2_getc ()
 442 {
 443   int c;
 444
 445   if (phase2_pushback_length)
 446     return phase2_pushback[--phase2_pushback_length];
 447
 448   c = phase1_getc ();
 449   switch (c)
 450     {
 451     case '?':
 452     case '%':
 453       {
 454         int c2 = phase1_getc ();
 455         if (c2 == '>')
 456           {
 457             /* ?> and %> terminate PHP mode and switch back to HTML mode.  */
 458             skip_html ();
 459             return ' ';
 460           }
 461         phase1_ungetc (c2);
 462       }
 463       break;
 464
 465     case '<':
 466       {
 467         int c2 = phase1_getc ();
 468
 469         /* < / script > terminates PHP mode and switches back to HTML mode.  */
 470         while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r')
 471           c2 = phase1_getc ();
 472         if (c2 == '/')
 473           {
 474             do
 475               c2 = phase1_getc ();
 476             while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r');
 477             if (c2 == 's' || c2 == 'S')
 478               {
 479                 c2 = phase1_getc ();
 480                 if (c2 == 'c' || c2 == 'C')
 481                   {
 482                     c2 = phase1_getc ();
 483                     if (c2 == 'r' || c2 == 'R')
 484                       {
 485                         c2 = phase1_getc ();
 486                         if (c2 == 'i' || c2 == 'I')
 487                           {
 488                             c2 = phase1_getc ();
 489                             if (c2 == 'p' || c2 == 'P')
 490                               {
 491                                 c2 = phase1_getc ();
 492                                 if (c2 == 't' || c2 == 'T')
 493                                   {
 494                                     do
 495                                       c2 = phase1_getc ();
 496                                     while (c2 == ' ' || c2 == '\t'
 497                                            || c2 == '\n' || c2 == '\r');
 498                                     if (c2 == '>')
 499                                       {
 500                                         skip_html ();
 501                                         return ' ';
 502                                       }
 503                                   }
 504                               }
 505                           }
 506                       }
 507                   }
 508               }
 509           }
 510         phase1_ungetc (c2);
 511       }
 512       break;
 513     }
 514
 515   return c;
 516 }
 517
 518 static void
 519 phase2_ungetc (int c)
 520 {
 521   if (c != EOF)
 522     {
 523       if (phase2_pushback_length == SIZEOF (phase2_pushback))
 524         abort ();
 525       phase2_pushback[phase2_pushback_length++] = c;
 526     }
 527 }
 528
 529 #endif
 530
 531
 532 /* Accumulating comments.  */
 533
 534 static char *buffer;
 535 static size_t bufmax;
 536 static size_t buflen;
 537
 538 static inline void
 539 comment_start ()
 540 {
 541   buflen = 0;
 542 }
 543
 544 static inline void
 545 comment_add (int c)
 546 {
 547   if (buflen >= bufmax)
 548     {
 549       bufmax = 2 * bufmax + 10;
 550       buffer = xrealloc (buffer, bufmax);
 551     }
 552   buffer[buflen++] = c;
 553 }
 554
 555 static inline void
 556 comment_line_end (size_t chars_to_remove)
 557 {
 558   buflen -= chars_to_remove;
 559   while (buflen >= 1
 560          && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
 561     --buflen;
 562   if (chars_to_remove == 0 && buflen >= bufmax)
 563     {
 564       bufmax = 2 * bufmax + 10;
 565       buffer = xrealloc (buffer, bufmax);
 566     }
 567   buffer[buflen] = '\0';
 568   xgettext_comment_add (buffer);
 569 }
 570
 571
 572 /* 3. Replace each comment that is not inside a string literal with a
 573    space character.  We need to remember the comment for later, because
 574    it may be attached to a keyword string.  */
 575
 576 /* These are for tracking whether comments count as immediately before
 577    keyword.  */
 578 static int last_comment_line;
 579 static int last_non_comment_line;
 580
 581 static unsigned char phase3_pushback[1];
 582 static int phase3_pushback_length;
 583
 584 static int
 585 phase3_getc ()
 586 {
 587   int lineno;
 588   int c;
 589
 590   if (phase3_pushback_length)
 591     return phase3_pushback[--phase3_pushback_length];
 592
 593   c = phase1_getc ();
 594
 595   if (c == '#')
 596     {
 597       /* sh comment.  */
 598       bool last_was_qmark = false;
 599
 600       comment_start ();
 601       lineno = line_number;
 602       for (;;)
 603         {
 604           c = phase1_getc ();
 605           if (c == '\n' || c == EOF)
 606             {
 607               comment_line_end (0);
 608               break;
 609             }
 610           if (last_was_qmark && c == '>')
 611             {
 612               comment_line_end (1);
 613               skip_html ();
 614               break;
 615             }
 616           /* We skip all leading white space, but not EOLs.  */
 617           if (!(buflen == 0 && (c == ' ' || c == '\t')))
 618             comment_add (c);
 619           last_was_qmark = (c == '?' || c == '%');
 620         }
 621       last_comment_line = lineno;
 622       return '\n';
 623     }
 624   else if (c == '/')
 625     {
 626       c = phase1_getc ();
 627
 628       switch (c)
 629         {
 630         default:
 631           phase1_ungetc (c);
 632           return '/';
 633
 634         case '*':
 635           {
 636             /* C comment.  */
 637             bool last_was_star;
 638
 639             comment_start ();
 640             lineno = line_number;
 641             last_was_star = false;
 642             for (;;)
 643               {
 644                 c = phase1_getc ();
 645                 if (c == EOF)
 646                   break;
 647                 /* We skip all leading white space, but not EOLs.  */
 648                 if (buflen == 0 && (c == ' ' || c == '\t'))
 649                   continue;
 650                 comment_add (c);
 651                 switch (c)
 652                   {
 653                   case '\n':
 654                     comment_line_end (1);
 655                     comment_start ();
 656                     lineno = line_number;
 657                     last_was_star = false;
 658                     continue;
 659
 660                   case '*':
 661                     last_was_star = true;
 662                     continue;
 663
 664                   case '/':
 665                     if (last_was_star)
 666                       {
 667                         comment_line_end (2);
 668                         break;
 669                       }
 670                     /* FALLTHROUGH */
 671
 672                   default:
 673                     last_was_star = false;
 674                     continue;
 675                   }
 676                 break;
 677               }
 678             last_comment_line = lineno;
 679             return ' ';
 680           }
 681
 682         case '/':
 683           {
 684             /* C++ comment.  */
 685             bool last_was_qmark = false;
 686
 687             comment_start ();
 688             lineno = line_number;
 689             for (;;)
 690               {
 691                 c = phase1_getc ();
 692                 if (c == '\n' || c == EOF)
 693                   {
 694                     comment_line_end (0);
 695                     break;
 696                   }
 697                 if (last_was_qmark && c == '>')
 698                   {
 699                     comment_line_end (1);
 700                     skip_html ();
 701                     break;
 702                   }
 703                 /* We skip all leading white space, but not EOLs.  */
 704                 if (!(buflen == 0 && (c == ' ' || c == '\t')))
 705                   comment_add (c);
 706                 last_was_qmark = (c == '?' || c == '%');
 707               }
 708             last_comment_line = lineno;
 709             return '\n';
 710           }
 711         }
 712     }
 713   else
 714     return c;
 715 }
 716
 717 #ifdef unused
 718 static void
 719 phase3_ungetc (int c)
 720 {
 721   if (c != EOF)
 722     {
 723       if (phase3_pushback_length == SIZEOF (phase3_pushback))
 724         abort ();
 725       phase3_pushback[phase3_pushback_length++] = c;
 726     }
 727 }
 728 #endif
 729
 730
 731 /* ========================== Reading of tokens.  ========================== */
 732
 733
 734 enum token_type_ty
 735 {
 736   token_type_eof,
 737   token_type_lparen,            /* ( */
 738   token_type_rparen,            /* ) */
 739   token_type_comma,             /* , */
 740   token_type_string_literal,    /* "abc" */
 741   token_type_symbol,            /* symbol, number */
 742   token_type_other              /* misc. operator */
 743 };
 744 typedef enum token_type_ty token_type_ty;
 745
 746 typedef struct token_ty token_ty;
 747 struct token_ty
 748 {
 749   token_type_ty type;
 750   char *string;         /* for token_type_string_literal, token_type_symbol */
 751   int line_number;
 752 };
 753
 754
 755 /* Free the memory pointed to by a 'struct token_ty'.  */
 756 static inline void
 757 free_token (token_ty *tp)
 758 {
 759   if (tp->type == token_type_string_literal || tp->type == token_type_symbol)
 760     free (tp->string);
 761 }
 762
 763
 764 /* 4. Combine characters into tokens.  Discard whitespace.  */
 765
 766 static void
 767 x_php_lex (token_ty *tp)
 768 {
 769   static char *buffer;
 770   static int bufmax;
 771   int bufpos;
 772   int c;
 773
 774   tp->string = NULL;
 775
 776   for (;;)
 777     {
 778       tp->line_number = line_number;
 779       c = phase3_getc ();
 780       switch (c)
 781         {
 782         case EOF:
 783           tp->type = token_type_eof;
 784           return;
 785
 786         case '\n':
 787           if (last_non_comment_line > last_comment_line)
 788             xgettext_comment_reset ();
 789           /* FALLTHROUGH */
 790         case ' ':
 791         case '\t':
 792         case '\r':
 793           /* Ignore whitespace.  */
 794           continue;
 795         }
 796
 797       last_non_comment_line = tp->line_number;
 798
 799       switch (c)
 800         {
 801         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
 802         case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
 803         case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
 804         case 'V': case 'W': case 'X': case 'Y': case 'Z':
 805         case '_':
 806         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
 807         case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
 808         case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
 809         case 'v': case 'w': case 'x': case 'y': case 'z':
 810           bufpos = 0;
 811           for (;;)
 812             {
 813               if (bufpos >= bufmax)
 814                 {
 815                   bufmax = 2 * bufmax + 10;
 816                   buffer = xrealloc (buffer, bufmax);
 817                 }
 818               buffer[bufpos++] = c;
 819               c = phase1_getc ();
 820               switch (c)
 821                 {
 822                 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
 823                 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
 824                 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
 825                 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
 826                 case 'Y': case 'Z':
 827                 case '_':
 828                 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
 829                 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
 830                 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
 831                 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
 832                 case 'y': case 'z':
 833                 case '0': case '1': case '2': case '3': case '4':
 834                 case '5': case '6': case '7': case '8': case '9':
 835                   continue;
 836
 837                 default:
 838                   phase1_ungetc (c);
 839                   break;
 840                 }
 841               break;
 842             }
 843           if (bufpos >= bufmax)
 844             {
 845               bufmax = 2 * bufmax + 10;
 846               buffer = xrealloc (buffer, bufmax);
 847             }
 848           buffer[bufpos] = 0;
 849           tp->string = xstrdup (buffer);
 850           tp->type = token_type_symbol;
 851           return;
 852
 853         case '\'':
 854           /* Single-quoted string literal.  */
 855           bufpos = 0;
 856           for (;;)
 857             {
 858               c = phase1_getc ();
 859               if (c == EOF || c == '\'')
 860                 break;
 861               if (c == '\\')
 862                 {
 863                   c = phase1_getc ();
 864                   if (c != '\\' && c != '\'')
 865                     {
 866                       phase1_ungetc (c);
 867                       c = '\\';
 868                     }
 869                 }
 870               if (bufpos >= bufmax)
 871                 {
 872                   bufmax = 2 * bufmax + 10;
 873                   buffer = xrealloc (buffer, bufmax);
 874                 }
 875               buffer[bufpos++] = c;
 876             }
 877           if (bufpos >= bufmax)
 878             {
 879               bufmax = 2 * bufmax + 10;
 880               buffer = xrealloc (buffer, bufmax);
 881             }
 882           buffer[bufpos] = 0;
 883           tp->type = token_type_string_literal;
 884           tp->string = xstrdup (buffer);
 885           return;
 886
 887         case '"':
 888           /* Double-quoted string literal.  */
 889           tp->type = token_type_string_literal;
 890           bufpos = 0;
 891           for (;;)
 892             {
 893               c = phase1_getc ();
 894               if (c == EOF || c == '"')
 895                 break;
 896               if (c == '$')
 897                 {
 898                   c = phase1_getc ();
 899                   if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
 900                       || c == '_' || c == '{' || c >= 0x7f)
 901                     {
 902                       /* String with variables.  */
 903                       tp->type = token_type_other;
 904                       continue;
 905                     }
 906                   phase1_ungetc (c);
 907                   c = '$';
 908                 }
 909               if (c == '{')
 910                 {
 911                   c = phase1_getc ();
 912                   if (c == '$')
 913                     {
 914                       /* String with expressions.  */
 915                       tp->type = token_type_other;
 916                       continue;
 917                     }
 918                   phase1_ungetc (c);
 919                   c = '{';
 920                 }
 921               if (c == '\\')
 922                 {
 923                   int n, j;
 924
 925                   c = phase1_getc ();
 926                   switch (c)
 927                     {
 928                     case '"':
 929                     case '\\':
 930                     case '$':
 931                       break;
 932
 933                     case '0': case '1': case '2': case '3':
 934                     case '4': case '5': case '6': case '7':
 935                       n = 0;
 936                       for (j = 0; j < 3; ++j)
 937                         {
 938                           n = n * 8 + c - '0';
 939                           c = phase1_getc ();
 940                           switch (c)
 941                             {
 942                             default:
 943                               break;
 944
 945                             case '0': case '1': case '2': case '3':
 946                             case '4': case '5': case '6': case '7':
 947                               continue;
 948                             }
 949                           break;
 950                         }
 951                       phase1_ungetc (c);
 952                       c = n;
 953                       break;
 954
 955                     case 'x':
 956                       n = 0;
 957                       for (j = 0; j < 2; ++j)
 958                         {
 959                           c = phase1_getc ();
 960                           switch (c)
 961                             {
 962                             case '0': case '1': case '2': case '3': case '4':
 963                             case '5': case '6': case '7': case '8': case '9':
 964                               n = n * 16 + c - '0';
 965                               break;
 966                             case 'A': case 'B': case 'C': case 'D': case 'E':
 967                             case 'F':
 968                               n = n * 16 + 10 + c - 'A';
 969                               break;
 970                             case 'a': case 'b': case 'c': case 'd': case 'e':
 971                             case 'f':
 972                               n = n * 16 + 10 + c - 'a';
 973                               break;
 974                             default:
 975                               phase1_ungetc (c);
 976                               c = 0;
 977                               break;
 978                             }
 979                           if (c == 0)
 980                             break;
 981                         }
 982                       if (j == 0)
 983                         {
 984                           phase1_ungetc ('x');
 985                           c = '\\';
 986                         }
 987                       else
 988                         c = n;
 989                       break;
 990
 991                     case 'n':
 992                       c = '\n';
 993                       break;
 994                     case 't':
 995                       c = '\t';
 996                       break;
 997                     case 'r':
 998                       c = '\r';
 999                       break;
1000
1001                     default:
1002                       phase1_ungetc (c);
1003                       c = '\\';
1004                       break;
1005                     }
1006                 }
1007               if (bufpos >= bufmax)
1008                 {
1009                   bufmax = 2 * bufmax + 10;
1010                   buffer = xrealloc (buffer, bufmax);
1011                 }
1012               buffer[bufpos++] = c;
1013             }
1014           if (bufpos >= bufmax)
1015             {
1016               bufmax = 2 * bufmax + 10;
1017               buffer = xrealloc (buffer, bufmax);
1018             }
1019           buffer[bufpos] = 0;
1020           if (tp->type == token_type_string_literal)
1021             tp->string = xstrdup (buffer);
1022           return;
1023
1024         case '?':
1025         case '%':
1026           {
1027             int c2 = phase1_getc ();
1028             if (c2 == '>')
1029               {
1030                 /* ?> and %> terminate PHP mode and switch back to HTML
1031                    mode.  */
1032                 skip_html ();
1033               }
1034             else
1035               phase1_ungetc (c2);
1036             tp->type = token_type_other;
1037             return;
1038           }
1039
1040         case '(':
1041           tp->type = token_type_lparen;
1042           return;
1043
1044         case ')':
1045           tp->type = token_type_rparen;
1046           return;
1047
1048         case ',':
1049           tp->type = token_type_comma;
1050           return;
1051
1052         case '<':
1053           {
1054             int c2 = phase1_getc ();
1055             if (c2 == '<')
1056               {
1057                 int c3 = phase1_getc ();
1058                 if (c3 == '<')
1059                   {
1060                     /* Start of here document.
1061                        Parse whitespace, then label, then newline.  */
1062                     do
1063                       c = phase3_getc ();
1064                     while (c == ' ' || c == '\t' || c == '\n' || c == '\r');
1065
1066                     bufpos = 0;
1067                     do
1068                       {
1069                         if (bufpos >= bufmax)
1070                           {
1071                             bufmax = 2 * bufmax + 10;
1072                             buffer = xrealloc (buffer, bufmax);
1073                           }
1074                         buffer[bufpos++] = c;
1075                         c = phase3_getc ();
1076                       }
1077                     while (c != EOF && c != '\n' && c != '\r');
1078                     /* buffer[0..bufpos-1] now contains the label.  */
1079
1080                     /* Now skip the here document.  */
1081                     for (;;)
1082                       {
1083                         c = phase1_getc ();
1084                         if (c == EOF)
1085                           break;
1086                         if (c == '\n' || c == '\r')
1087                           {
1088                             int bufidx = 0;
1089
1090                             while (bufidx < bufpos)
1091                               {
1092                                 c = phase1_getc ();
1093                                 if (c == EOF)
1094                                   break;
1095                                 if (c != buffer[bufidx])
1096                                   {
1097                                     phase1_ungetc (c);
1098                                     break;
1099                                   }
1100                               }
1101                             c = phase1_getc ();
1102                             if (c != ';')
1103                               phase1_ungetc (c);
1104                             c = phase1_getc ();
1105                             if (c == '\n' || c == '\r')
1106                               break;
1107                           }
1108                       }
1109
1110                     /* FIXME: Ideally we should turn the here document into a
1111                        string literal if it didn't contain $ substitution.  And
1112                        we should also respect backslash escape sequences like
1113                        in double-quoted strings.  */
1114                     tp->type = token_type_other;
1115                     return;
1116                   }
1117                 phase1_ungetc (c3);
1118               }
1119
1120             /* < / script > terminates PHP mode and switches back to HTML
1121                mode.  */
1122             while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r')
1123               c2 = phase1_getc ();
1124             if (c2 == '/')
1125               {
1126                 do
1127                   c2 = phase1_getc ();
1128                 while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r');
1129                 if (c2 == 's' || c2 == 'S')
1130                   {
1131                     c2 = phase1_getc ();
1132                     if (c2 == 'c' || c2 == 'C')
1133                       {
1134                         c2 = phase1_getc ();
1135                         if (c2 == 'r' || c2 == 'R')
1136                           {
1137                             c2 = phase1_getc ();
1138                             if (c2 == 'i' || c2 == 'I')
1139                               {
1140                                 c2 = phase1_getc ();
1141                                 if (c2 == 'p' || c2 == 'P')
1142                                   {
1143                                     c2 = phase1_getc ();
1144                                     if (c2 == 't' || c2 == 'T')
1145                                       {
1146                                         do
1147                                           c2 = phase1_getc ();
1148                                         while (c2 == ' ' || c2 == '\t'
1149                                                || c2 == '\n' || c2 == '\r');
1150                                         if (c2 == '>')
1151                                           {
1152                                             skip_html ();
1153                                           }
1154                                         else
1155                                           phase1_ungetc (c2);
1156                                       }
1157                                     else
1158                                       phase1_ungetc (c2);
1159                                   }
1160                                 else
1161                                   phase1_ungetc (c2);
1162                               }
1163                             else
1164                               phase1_ungetc (c2);
1165                           }
1166                         else
1167                           phase1_ungetc (c2);
1168                       }
1169                     else
1170                       phase1_ungetc (c2);
1171                   }
1172                 else
1173                   phase1_ungetc (c2);
1174               }
1175             else
1176               phase1_ungetc (c2);
1177
1178             tp->type = token_type_other;
1179             return;
1180           }
1181
1182         case '`':
1183           /* Execution operator.  */
1184         default:
1185           /* We could carefully recognize each of the 2 and 3 character
1186              operators, but it is not necessary, as we only need to recognize
1187              gettext invocations.  Don't bother.  */
1188           tp->type = token_type_other;
1189           return;
1190         }
1191     }
1192 }
1193
1194
1195 /* ========================= Extracting strings.  ========================== */
1196
1197
1198 /* Context lookup table.  Set by extract_php to the table passed in by its
        caller, and queried by extract_parenthesized with the name of each
        symbol token in order to obtain the flag context list iterator that is
        handed down when that symbol is followed by '('.  */
1199 static flag_context_list_table_ty *flag_context_list_table;
1200
1201
1202 /* The file is broken into tokens.  Scan the token stream, looking for
1203    a keyword, followed by a left paren, followed by a string.  When we
1204    see this sequence, we have something to remember.  We assume we are
1205    looking at a valid PHP program, and leave the complaints about
1206    the grammar to the PHP interpreter.
1207
1208      Normal handling: Look for
1209        keyword ( ... msgid ... )
1210      Plural handling: Look for
1211        keyword ( ... msgid ... msgid_plural ... )
1212
1213    We use recursion because the arguments before msgid or between msgid
1214    and msgid_plural can contain subexpressions of the same form.  */
1215
1216
1217 /* Extract messages until the next balanced closing parenthesis.
1218    Extracted messages are added to MLP.
1219    When a specific argument shall be extracted, COMMAS_TO_SKIP >= 0 and,
1220    if also a plural argument shall be extracted, PLURAL_COMMAS > 0,
1221    otherwise PLURAL_COMMAS = 0.
1222    When no specific argument shall be extracted, COMMAS_TO_SKIP < 0.
1223    Return true upon eof, false upon closing parenthesis.  */
1224 static bool
1225 extract_parenthesized (message_list_ty *mlp,
1226                        flag_context_ty outer_context,
1227                        flag_context_list_iterator_ty context_iter,
1228                        int commas_to_skip, int plural_commas)
1229 {
1230   /* Remember the message containing the msgid, for msgid_plural.  */
1231   message_ty *plural_mp = NULL;
1232
1233   /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
1234   int state;
1235   /* Parameters of the keyword just seen.  Defined only in state 1.  */
1236   int next_commas_to_skip = -1;
1237   int next_plural_commas = 0;
1238   /* Context iterator that will be used if the next token is a '('.  */
1239   flag_context_list_iterator_ty next_context_iter =
1240     passthrough_context_list_iterator;
1241   /* Current context.  */
1242   flag_context_ty inner_context =
1243     inherited_context (outer_context,
1244                        flag_context_list_iterator_advance (&context_iter));
1245
1246   /* Start state is 0.  */
1247   state = 0;
1248
1249   for (;;)
1250     {
1251       token_ty token;
1252
1253       x_php_lex (&token);
1254       switch (token.type)
1255         {
1256         case token_type_symbol:
1257           {
1258             void *keyword_value;
1259
1260             if (find_entry (&keywords, token.string, strlen (token.string),
1261                             &keyword_value)
1262                 == 0)
1263               {
1264                 int argnum1 = (int) (long) keyword_value & ((1 << 10) - 1);
1265                 int argnum2 = (int) (long) keyword_value >> 10;
1266
1267                 next_commas_to_skip = argnum1 - 1;
1268                 next_plural_commas = (argnum2 > argnum1 ? argnum2 - argnum1 : 0);
1269                 state = 1;
1270               }
1271             else
1272               state = 0;
1273           }
1274           next_context_iter =
1275             flag_context_list_iterator (
1276               flag_context_list_table_lookup (
1277                 flag_context_list_table,
1278                 token.string, strlen (token.string)));
1279           free (token.string);
1280           continue;
1281
1282         case token_type_lparen:
1283           if (extract_parenthesized (mlp, inner_context, next_context_iter,
1284                                      state ? next_commas_to_skip : -1,
1285                                      state ? next_plural_commas: 0))
1286             return true;
1287           next_context_iter = null_context_list_iterator;
1288           state = 0;
1289           continue;
1290
1291         case token_type_rparen:
1292           return false;
1293
1294         case token_type_comma:
1295           if (commas_to_skip >= 0)
1296             {
1297               if (commas_to_skip > 0)
1298                 commas_to_skip--;
1299               else
1300                 if (plural_mp != NULL && plural_commas > 0)
1301                   {
1302                     commas_to_skip = plural_commas - 1;
1303                     plural_commas = 0;
1304                   }
1305                 else
1306                   commas_to_skip = -1;
1307             }
1308           inner_context =
1309             inherited_context (outer_context,
1310                                flag_context_list_iterator_advance (
1311                                  &context_iter));
1312           next_context_iter = passthrough_context_list_iterator;
1313           state = 0;
1314           continue;
1315
1316         case token_type_string_literal:
1317           {
1318             lex_pos_ty pos;
1319             pos.file_name = logical_file_name;
1320             pos.line_number = token.line_number;
1321
1322             if (extract_all)
1323               remember_a_message (mlp, token.string, inner_context, &pos);
1324             else
1325               {
1326                 if (commas_to_skip == 0)
1327                   {
1328                     if (plural_mp == NULL)
1329                       {
1330                         /* Seen an msgid.  */
1331                         message_ty *mp =
1332                           remember_a_message (mlp, token.string,
1333                                               inner_context, &pos);
1334                         if (plural_commas > 0)
1335                           plural_mp = mp;
1336                       }
1337                     else
1338                       {
1339                         /* Seen an msgid_plural.  */
1340                         remember_a_message_plural (plural_mp, token.string,
1341                                                    inner_context, &pos);
1342                         plural_mp = NULL;
1343                       }
1344                   }
1345                 else
1346                   free (token.string);
1347               }
1348           }
1349           next_context_iter = null_context_list_iterator;
1350           state = 0;
1351           continue;
1352
1353         case token_type_other:
1354           next_context_iter = null_context_list_iterator;
1355           state = 0;
1356           continue;
1357
1358         case token_type_eof:
1359           return true;
1360
1361         default:
1362           abort ();
1363         }
1364     }
1365 }
1366
1367
1368 void
1369 extract_php (FILE *f,
1370              const char *real_filename, const char *logical_filename,
1371              flag_context_list_table_ty *flag_table,
1372              msgdomain_list_ty *mdlp)
1373 {
1374   message_list_ty *mlp = mdlp->item[0]->messages;
1375
1376   fp = f;
1377   real_file_name = real_filename;
1378   logical_file_name = xstrdup (logical_filename);
1379   line_number = 1;
1380
1381   last_comment_line = -1;
1382   last_non_comment_line = -1;
1383
1384   flag_context_list_table = flag_table;
1385
1386   init_keywords ();
1387
1388   /* Initial mode is HTML mode, not PHP mode.  */
1389   skip_html ();
1390
1391   /* Eat tokens until eof is seen.  When extract_parenthesized returns
1392      due to an unbalanced closing parenthesis, just restart it.  */
1393   while (!extract_parenthesized (mlp, null_context, null_context_list_iterator,
1394                                  -1, 0))
1395     ;
1396
1397   /* Close scanner.  */
1398   fp = NULL;
1399   real_file_name = NULL;
1400   logical_file_name = NULL;
1401   line_number = 0;
1402 }