gnu/dist/gettext/gettext-tools/src/x-python.c

   1 /* xgettext Python backend.
   2    Copyright (C) 2002-2003 Free Software Foundation, Inc.
   3
   4    This file was written by Bruno Haible <haible@clisp.cons.org>, 2002.
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with this program; if not, write to the Free Software Foundation,
  18    Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  19
  20 #ifdef HAVE_CONFIG_H
  21 # include "config.h"
  22 #endif
  23
  24 #include <assert.h>
  25 #include <errno.h>
  26 #include <stdbool.h>
  27 #include <stdio.h>
  28 #include <stdlib.h>
  29 #include <string.h>
  30
  31 #include "message.h"
  32 #include "xgettext.h"
  33 #include "x-python.h"
  34 #include "error.h"
  35 #include "error-progname.h"
  36 #include "xalloc.h"
  37 #include "exit.h"
  38 #include "po-charset.h"
  39 #include "uniname.h"
  40 #include "utf16-ucs4.h"
  41 #include "ucs4-utf8.h"
  42 #include "gettext.h"
  43
  44 #define _(s) gettext(s)
  45
  46 #define max(a,b) ((a) > (b) ? (a) : (b))
  47
  48 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
  49
  50
  51 /* The Python syntax is defined in the Python Reference Manual
  52    /usr/share/doc/packages/python/html/ref/index.html.
  53    See also Python-2.0/Parser/tokenizer.c, Python-2.0/Python/compile.c,
  54    Python-2.0/Objects/unicodeobject.c.  */
  55
  56
  57 /* ====================== Keyword set customization.  ====================== */
  58
  59 /* If true extract all strings.  */
  60 static bool extract_all = false;
  61
  62 static hash_table keywords;
  63 static bool default_keywords = true;
  64
  65
  66 void
  67 x_python_extract_all ()
  68 {
  69   extract_all = true;
  70 }
  71
  72
  73 void
  74 x_python_keyword (const char *name)
  75 {
  76   if (name == NULL)
  77     default_keywords = false;
  78   else
  79     {
  80       const char *end;
  81       int argnum1;
  82       int argnum2;
  83       const char *colon;
  84
  85       if (keywords.table == NULL)
  86         init_hash (&keywords, 100);
  87
  88       split_keywordspec (name, &end, &argnum1, &argnum2);
  89
  90       /* The characters between name and end should form a valid C identifier.
  91          A colon means an invalid parse in split_keywordspec().  */
  92       colon = strchr (name, ':');
  93       if (colon == NULL || colon >= end)
  94         {
  95           if (argnum1 == 0)
  96             argnum1 = 1;
  97           insert_entry (&keywords, name, end - name,
  98                         (void *) (long) (argnum1 + (argnum2 << 10)));
  99         }
 100     }
 101 }
 102
 103 /* Finish initializing the keywords hash table.
 104    Called after argument processing, before each file is processed.  */
 105 static void
 106 init_keywords ()
 107 {
 108   if (default_keywords)
 109     {
 110       x_python_keyword ("gettext");
 111       x_python_keyword ("ugettext");
 112       x_python_keyword ("dgettext:2");
 113       x_python_keyword ("ngettext:1,2");
 114       x_python_keyword ("ungettext:1,2");
 115       x_python_keyword ("dngettext:2,3");
 116       x_python_keyword ("_");
 117       default_keywords = false;
 118     }
 119 }
 120
 121 void
 122 init_flag_table_python ()
 123 {
 124   xgettext_record_flag ("gettext:1:pass-python-format");
 125   xgettext_record_flag ("ugettext:1:pass-python-format");
 126   xgettext_record_flag ("dgettext:2:pass-python-format");
 127   xgettext_record_flag ("ngettext:1:pass-python-format");
 128   xgettext_record_flag ("ngettext:2:pass-python-format");
 129   xgettext_record_flag ("ungettext:1:pass-python-format");
 130   xgettext_record_flag ("ungettext:2:pass-python-format");
 131   xgettext_record_flag ("dngettext:2:pass-python-format");
 132   xgettext_record_flag ("dngettext:3:pass-python-format");
 133   xgettext_record_flag ("_:1:pass-python-format");
 134   /* xgettext_record_flag ("%:1:python-format"); // % is an infix operator! */
 135 }
 136
 137
 138 /* ======================== Reading of characters.  ======================== */
 139
 140 /* Real filename, used in error messages about the input file.  */
 141 static const char *real_file_name;
 142
 143 /* Logical filename and line number, used to label the extracted messages.  */
 144 static char *logical_file_name;
 145 static int line_number;
 146
 147 /* The input file stream.  */
 148 static FILE *fp;
 149
 150
 151 /* 1. line_number handling.  Also allow a lookahead.  */
 152
 153 static unsigned char phase1_pushback[max (9, UNINAME_MAX + 3)];
 154 static int phase1_pushback_length;
 155
 156 static int
 157 phase1_getc ()
 158 {
 159   int c;
 160
 161   if (phase1_pushback_length)
 162     c = phase1_pushback[--phase1_pushback_length];
 163   else
 164     {
 165       c = getc (fp);
 166
 167       if (c == EOF)
 168         {
 169           if (ferror (fp))
 170             error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
 171                    real_file_name);
 172           return EOF;
 173         }
 174     }
 175
 176   if (c == '\n')
 177     line_number++;
 178
 179   return c;
 180 }
 181
 182 /* Supports max (9, UNINAME_MAX + 3) characters of pushback.  */
 183 static void
 184 phase1_ungetc (int c)
 185 {
 186   if (c != EOF)
 187     {
 188       if (c == '\n')
 189         --line_number;
 190
 191       if (phase1_pushback_length == SIZEOF (phase1_pushback))
 192         abort ();
 193       phase1_pushback[phase1_pushback_length++] = c;
 194     }
 195 }
 196
 197
 198 /* Accumulating comments.  */
 199
 200 static char *buffer;
 201 static size_t bufmax;
 202 static size_t buflen;
 203
 204 static inline void
 205 comment_start ()
 206 {
 207   buflen = 0;
 208 }
 209
 210 static inline void
 211 comment_add (int c)
 212 {
 213   /* We assume the program source is in ISO-8859-1 (for consistency with
 214      Python's \ooo and \xnn syntax inside strings), but we produce a POT
 215      file in UTF-8 encoding.  */
 216   size_t len = ((unsigned char) c < 0x80 ? 1 : 2);
 217   if (buflen + len > bufmax)
 218     {
 219       bufmax = 2 * bufmax + 10;
 220       buffer = xrealloc (buffer, bufmax);
 221     }
 222   if ((unsigned char) c < 0x80)
 223     buffer[buflen++] = c;
 224   else
 225     {
 226       buffer[buflen++] = 0xc0 | ((unsigned char) c >> 6);
 227       buffer[buflen++] = 0x80 | ((unsigned char) c & 0x3f);
 228     }
 229 }
 230
 231 static inline void
 232 comment_line_end ()
 233 {
 234   while (buflen >= 1
 235          && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
 236     --buflen;
 237   if (buflen >= bufmax)
 238     {
 239       bufmax = 2 * bufmax + 10;
 240       buffer = xrealloc (buffer, bufmax);
 241     }
 242   buffer[buflen] = '\0';
 243   savable_comment_add (buffer);
 244 }
 245
 246 /* These are for tracking whether comments count as immediately before
 247    keyword.  */
 248 static int last_comment_line;
 249 static int last_non_comment_line;
 250
 251
 252 /* 2. Outside strings, replace backslash-newline with nothing and a comment
 253       with nothing.  */
 254
 255 static int
 256 phase2_getc ()
 257 {
 258   int c;
 259
 260   for (;;)
 261     {
 262       c = phase1_getc ();
 263       if (c == '\\')
 264         {
 265           c = phase1_getc ();
 266           if (c != '\n')
 267             {
 268               phase1_ungetc (c);
 269               /* This shouldn't happen usually, because "A backslash is
 270                  illegal elsewhere on a line outside a string literal."  */
 271               return '\\';
 272             }
 273           /* Eat backslash-newline.  */
 274         }
 275       else if (c == '#')
 276         {
 277           /* Eat a comment.  */
 278           last_comment_line = line_number;
 279           comment_start ();
 280           for (;;)
 281             {
 282               c = phase1_getc ();
 283               if (c == EOF || c == '\n')
 284                 break;
 285               /* We skip all leading white space, but not EOLs.  */
 286               if (!(buflen == 0 && (c == ' ' || c == '\t')))
 287                 comment_add (c);
 288             }
 289           comment_line_end ();
 290           return c;
 291         }
 292       else
 293         return c;
 294     }
 295 }
 296
  297 /* Push C back onto the phase 1 pushback stack; this simply forwards to
         phase1_ungetc.  Supports only one pushback character.  */
  298 static void
  299 phase2_ungetc (int c)
  300 {
  301   phase1_ungetc (c);
  302 }
 303
 304
 305 /* ========================== Reading of tokens.  ========================== */
 306
  307 /* The kinds of token that phase5_get produces.  */
  308 enum token_type_ty
  309 {
  310   token_type_eof,               /* end of input */
  311   token_type_lparen,            /* ( */
  312   token_type_rparen,            /* ) */
  313   token_type_comma,             /* , */
  314   token_type_string,            /* "abc", 'abc', """abc""", '''abc''' */
  315   token_type_symbol,            /* symbol, number */
  316   token_type_other              /* misc. operator; also [ ] { } and the
                                        newline ending a logical line */
  317 };
  318 typedef enum token_type_ty token_type_ty;
  319
  320 typedef struct token_ty token_ty;
      /* A token.  phase5_get fills in 'type' (and 'line_number' before
         reading the token); 'string' and 'comment' are only set for the
         token types named beside them.  */
  321 struct token_ty
  322 {
  323   token_type_ty type;
  324   char *string;         /* for token_type_string, token_type_symbol;
                                allocated with xstrdup or xmalloc */
  325   refcounted_string_list_ty *comment;   /* for token_type_string;
                                                obtained from add_reference */
  326   int line_number;      /* value of line_number when reading of the token
                                began */
  327 };
 328
 329
 330 /* There are two different input syntaxes for strings, "abc" and r"abc",
 331    and two different input syntaxes for Unicode strings, u"abc" and ur"abc".
 332    Which escape sequences are understood, i.e. what is interpreted specially
 333    after backslash?
 334     "abc"     \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn
 335     r"abc"
 336     u"abc"    \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn \unnnn \Unnnnnnnn \N{...}
 337     ur"abc"                                           \unnnn
 338    The \unnnn values are UTF-16 values; a single \Unnnnnnnn can expand to two
 339    \unnnn items.  The \ooo and \xnn values are ISO-8859-1 values: u"\xff" and
 340    u"\u00ff" are the same.  */
 341
 342 #define P7_EOF (-1)
 343 #define P7_STRING_END (-2)
 344
 345 static int
 346 phase7_getuc (int quote_char,
 347               bool triple, bool interpret_ansic, bool interpret_unicode,
 348               unsigned int *backslash_counter)
 349 {
 350   int c;
 351
 352   for (;;)
 353     {
 354       /* Use phase 1, because phase 2 elides comments.  */
 355       c = phase1_getc ();
 356
 357       if (c == EOF)
 358         return P7_EOF;
 359
 360       if (c == quote_char && (interpret_ansic || (*backslash_counter & 1) == 0))
 361         {
 362           if (triple)
 363             {
 364               int c1 = phase1_getc ();
 365               if (c1 == quote_char)
 366                 {
 367                   int c2 = phase1_getc ();
 368                   if (c2 == quote_char)
 369                     return P7_STRING_END;
 370                   phase1_ungetc (c2);
 371                 }
 372               phase1_ungetc (c1);
 373               return c;
 374             }
 375           else
 376             return P7_STRING_END;
 377         }
 378
 379       if (c == '\n')
 380         {
 381           if (triple)
 382             {
 383               *backslash_counter = 0;
 384               return '\n';
 385             }
 386           /* In r"..." and ur"..." strings, newline is only allowed
 387              immediately after an odd number of backslashes (although the
 388              backslashes are not interpreted!).  */
 389           if (!(interpret_ansic || (*backslash_counter & 1) == 0))
 390             {
 391               *backslash_counter = 0;
 392               return '\n';
 393             }
 394           phase1_ungetc (c);
 395           error_with_progname = false;
 396           error (0, 0, _("%s:%d: warning: unterminated string"),
 397                  logical_file_name, line_number);
 398           error_with_progname = true;
 399           return P7_STRING_END;
 400         }
 401
 402       if (c != '\\')
 403         {
 404           *backslash_counter = 0;
 405           return c;
 406         }
 407
 408       /* Backslash handling.  */
 409
 410       if (!interpret_ansic && !interpret_unicode)
 411         {
 412           ++*backslash_counter;
 413           return '\\';
 414         }
 415
 416       /* Dispatch according to the character following the backslash.  */
 417       c = phase1_getc ();
 418       if (c == EOF)
 419         {
 420           ++*backslash_counter;
 421           return '\\';
 422         }
 423
 424       if (interpret_ansic)
 425         switch (c)
 426           {
 427           case '\n':
 428             continue;
 429           case '\\':
 430             ++*backslash_counter;
 431             return c;
 432           case '\'': case '"':
 433             *backslash_counter = 0;
 434             return c;
 435           case 'a':
 436             *backslash_counter = 0;
 437             return '\a';
 438           case 'b':
 439             *backslash_counter = 0;
 440             return '\b';
 441           case 'f':
 442             *backslash_counter = 0;
 443             return '\f';
 444           case 'n':
 445             *backslash_counter = 0;
 446             return '\n';
 447           case 'r':
 448             *backslash_counter = 0;
 449             return '\r';
 450           case 't':
 451             *backslash_counter = 0;
 452             return '\t';
 453           case 'v':
 454             *backslash_counter = 0;
 455             return '\v';
 456           case '0': case '1': case '2': case '3': case '4':
 457           case '5': case '6': case '7':
 458             {
 459               int n = c - '0';
 460
 461               c = phase1_getc ();
 462               if (c != EOF)
 463                 {
 464                   if (c >= '0' && c <= '7')
 465                     {
 466                       n = (n << 3) + (c - '0');
 467                       c = phase1_getc ();
 468                       if (c != EOF)
 469                         {
 470                           if (c >= '0' && c <= '7')
 471                             n = (n << 3) + (c - '0');
 472                           else
 473                             phase1_ungetc (c);
 474                         }
 475                     }
 476                   else
 477                     phase1_ungetc (c);
 478                 }
 479               *backslash_counter = 0;
 480               return (unsigned char) n;
 481             }
 482           case 'x':
 483             {
 484               int c1 = phase1_getc ();
 485               int n1;
 486
 487               if (c1 >= '0' && c1 <= '9')
 488                 n1 = c1 - '0';
 489               else if (c1 >= 'A' && c1 <= 'F')
 490                 n1 = c1 - 'A' + 10;
 491               else if (c1 >= 'a' && c1 <= 'f')
 492                 n1 = c1 - 'a' + 10;
 493               else
 494                 n1 = -1;
 495
 496               if (n1 >= 0)
 497                 {
 498                   int c2 = phase1_getc ();
 499                   int n2;
 500
 501                   if (c2 >= '0' && c2 <= '9')
 502                     n2 = c2 - '0';
 503                   else if (c2 >= 'A' && c2 <= 'F')
 504                     n2 = c2 - 'A' + 10;
 505                   else if (c2 >= 'a' && c2 <= 'f')
 506                     n2 = c2 - 'a' + 10;
 507                   else
 508                     n2 = -1;
 509
 510                   if (n2 >= 0)
 511                     {
 512                       *backslash_counter = 0;
 513                       return (unsigned char) ((n1 << 4) + n2);
 514                     }
 515
 516                   phase1_ungetc (c2);
 517                 }
 518               phase1_ungetc (c1);
 519               phase1_ungetc (c);
 520               ++*backslash_counter;
 521               return '\\';
 522             }
 523           }
 524
 525       if (interpret_unicode)
 526         {
 527           if (c == 'u')
 528             {
 529               unsigned char buf[4];
 530               unsigned int n = 0;
 531               int i;
 532
 533               for (i = 0; i < 4; i++)
 534                 {
 535                   int c1 = phase1_getc ();
 536
 537                   if (c1 >= '0' && c1 <= '9')
 538                     n = (n << 4) + (c1 - '0');
 539                   else if (c1 >= 'A' && c1 <= 'F')
 540                     n = (n << 4) + (c1 - 'A' + 10);
 541                   else if (c1 >= 'a' && c1 <= 'f')
 542                     n = (n << 4) + (c1 - 'a' + 10);
 543                   else
 544                     {
 545                       phase1_ungetc (c1);
 546                       while (--i >= 0)
 547                         phase1_ungetc (buf[i]);
 548                       phase1_ungetc (c);
 549                       ++*backslash_counter;
 550                       return '\\';
 551                     }
 552
 553                   buf[i] = c1;
 554                 }
 555               *backslash_counter = 0;
 556               return n;
 557             }
 558
 559           if (interpret_ansic)
 560             {
 561               if (c == 'U')
 562                 {
 563                   unsigned char buf[8];
 564                   unsigned int n = 0;
 565                   int i;
 566
 567                   for (i = 0; i < 8; i++)
 568                     {
 569                       int c1 = phase1_getc ();
 570
 571                       if (c1 >= '0' && c1 <= '9')
 572                         n = (n << 4) + (c1 - '0');
 573                       else if (c1 >= 'A' && c1 <= 'F')
 574                         n = (n << 4) + (c1 - 'A' + 10);
 575                       else if (c1 >= 'a' && c1 <= 'f')
 576                         n = (n << 4) + (c1 - 'a' + 10);
 577                       else
 578                         {
 579                           phase1_ungetc (c1);
 580                           while (--i >= 0)
 581                             phase1_ungetc (buf[i]);
 582                           phase1_ungetc (c);
 583                           ++*backslash_counter;
 584                           return '\\';
 585                         }
 586
 587                       buf[i] = c1;
 588                     }
 589                   if (n < 0x110000)
 590                     {
 591                       *backslash_counter = 0;
 592                       return n;
 593                     }
 594
 595                   error_with_progname = false;
 596                   error (0, 0, _("%s:%d: warning: invalid Unicode character"),
 597                          logical_file_name, line_number);
 598                   error_with_progname = true;
 599
 600                   while (--i >= 0)
 601                     phase1_ungetc (buf[i]);
 602                   phase1_ungetc (c);
 603                   ++*backslash_counter;
 604                   return '\\';
 605                 }
 606
 607               if (c == 'N')
 608                 {
 609                   int c1 = phase1_getc ();
 610                   if (c1 == '{')
 611                     {
 612                       unsigned char buf[UNINAME_MAX + 1];
 613                       int i;
 614                       unsigned int n;
 615
 616                       for (i = 0; i < UNINAME_MAX; i++)
 617                         {
 618                           int c2 = phase1_getc ();
 619                           if (!(c2 >= ' ' && c2 <= '~'))
 620                             {
 621                               phase1_ungetc (c2);
 622                               while (--i >= 0)
 623                                 phase1_ungetc (buf[i]);
 624                               phase1_ungetc (c1);
 625                               phase1_ungetc (c);
 626                               ++*backslash_counter;
 627                               return '\\';
 628                             }
 629                           if (c2 == '}')
 630                             break;
 631                           buf[i] = c2;
 632                         }
 633                       buf[i] = '\0';
 634
 635                       n = unicode_name_character ((char *) buf);
 636                       if (n != UNINAME_INVALID)
 637                         {
 638                           *backslash_counter = 0;
 639                           return n;
 640                         }
 641
 642                       phase1_ungetc ('}');
 643                       while (--i >= 0)
 644                         phase1_ungetc (buf[i]);
 645                     }
 646                   phase1_ungetc (c1);
 647                   phase1_ungetc (c);
 648                   ++*backslash_counter;
 649                   return '\\';
 650                 }
 651             }
 652         }
 653
 654       phase1_ungetc (c);
 655       ++*backslash_counter;
 656       return '\\';
 657     }
 658 }
 659
 660
  661 /* Combine characters into tokens.  Discard whitespace except newlines at
  662    the end of logical lines.  */
  663
  664 /* Number of pending open parentheses/braces/brackets.  */
  665 static int open_pbb;
  666 /* Storage for the single token that phase5_unget can push back.  */
  667 static token_ty phase5_pushback[1];
  668 static int phase5_pushback_length;
  669 /* Store the next token in *TP.  A token pushed back with phase5_unget is
         returned first.  Otherwise characters are read through phase 2
         (outside strings) and phase 7 (inside strings).  tp->string is set
         for symbol and string tokens, tp->comment for string tokens only.
         A newline inside parentheses, brackets or braces is skipped; any
         other newline is returned as a token_type_other token.  */
  670 static void
  671 phase5_get (token_ty *tp)
  672 {
  673   int c;
  674
  675   if (phase5_pushback_length)
  676     {
  677       *tp = phase5_pushback[--phase5_pushback_length];
  678       return;
  679     }
  680
  681   for (;;)
  682     {
            /* Remember the line on which the token starts, before any of
               its characters is read.  */
  683       tp->line_number = line_number;
  684       c = phase2_getc ();
  685
  686       switch (c)
  687         {
  688         case EOF:
  689           tp->type = token_type_eof;
  690           return;
  691
  692         case ' ':
  693         case '\t':
  694         case '\f':
  695           /* Ignore whitespace and comments.  */
  696           continue;
  697
  698         case '\n':
                /* The last non-comment token lies on a later line than the
                   last comment: reset the saved comments (presumably this
                   discards them; savable_comment_reset is defined
                   elsewhere).  */
  699           if (last_non_comment_line > last_comment_line)
  700             savable_comment_reset ();
  701           /* Ignore newline if and only if it is used for implicit line
  702              joining.  */
  703           if (open_pbb > 0)
  704             continue;
  705           tp->type = token_type_other;
  706           return;
  707         }
  708
            /* Everything from here on is a real token.  */
  709       last_non_comment_line = tp->line_number;
  710
  711       switch (c)
  712         {
  713         case '.':
  714           {
                  /* Peek at the next character: a dot followed by a digit
                     starts a number, any other dot is an operator.  */
  715             int c1 = phase2_getc ();
  716             phase2_ungetc (c1);
  717             if (!(c1 >= '0' && c1 <= '9'))
  718               {
  719
  720                 tp->type = token_type_other;
  721                 return;
  722               }
  723           }
  724           /* FALLTHROUGH */
              /* 'R', 'U', 'r' and 'u' are missing from this list on purpose:
                 they may introduce a string and are handled in the string
                 section below, which jumps to the 'symbol' label when no
                 quote follows.  */
  725         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
  726         case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
  727         case 'M': case 'N': case 'O': case 'P': case 'Q':
  728         case 'S': case 'T':           case 'V': case 'W': case 'X':
  729         case 'Y': case 'Z':
  730         case '_':
  731         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
  732         case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
  733         case 'm': case 'n': case 'o': case 'p': case 'q':
  734         case 's': case 't':           case 'v': case 'w': case 'x':
  735         case 'y': case 'z':
  736         case '0': case '1': case '2': case '3': case '4':
  737         case '5': case '6': case '7': case '8': case '9':
  738         symbol:
  739           /* Symbol, or part of a number.  */
  740           {
                  /* Scratch buffer reused across calls; it hides the
                     file-level comment buffer of the same name.  */
  741             static char *buffer;
  742             static int bufmax;
  743             int bufpos;
  744
  745             bufpos = 0;
  746             for (;;)
  747               {
  748                 if (bufpos >= bufmax)
  749                   {
  750                     bufmax = 2 * bufmax + 10;
  751                     buffer = xrealloc (buffer, bufmax);
  752                   }
  753                 buffer[bufpos++] = c;
  754                 c = phase2_getc ();
  755                 switch (c)
  756                   {
  757                   case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
  758                   case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
  759                   case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
  760                   case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
  761                   case 'Y': case 'Z':
  762                   case '_':
  763                   case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
  764                   case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
  765                   case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
  766                   case 's': case 't': case 'u': case 'v': case 'w': case 'x':
  767                   case 'y': case 'z':
  768                   case '0': case '1': case '2': case '3': case '4':
  769                   case '5': case '6': case '7': case '8': case '9':
  770                     continue;
  771                   default:
                          /* First character that is not part of the symbol:
                             leave it for the next token.  */
  772                     phase2_ungetc (c);
  773                     break;
  774                   }
  775                 break;
  776               }
  777             if (bufpos >= bufmax)
  778               {
  779                 bufmax = 2 * bufmax + 10;
  780                 buffer = xrealloc (buffer, bufmax);
  781               }
  782             buffer[bufpos] = '\0';
  783             tp->string = xstrdup (buffer);
  784             tp->type = token_type_symbol;
  785             return;
  786           }
  787
  788         /* Strings.  */
              /* The case labels of the string section are placed inside this
                 block so that they share the local variables declared at
                 its top.  */
  789           {
  790             static unsigned short *buffer;
  791             static int bufmax;
  792             int bufpos;
  793             int quote_char;
  794             bool interpret_ansic;
  795             bool interpret_unicode;
  796             bool triple;
  797             unsigned int backslash_counter;
  798
                  /* r"..." or r'...': no escape sequence is interpreted.
                     Without a quote right after the letter, the letter
                     starts a symbol.  */
  799             case 'R': case 'r':
  800               {
  801                 int c1 = phase1_getc ();
  802                 if (c1 == '"' || c1 == '\'')
  803                   {
  804                     quote_char = c1;
  805                     interpret_ansic = false;
  806                     interpret_unicode = false;
  807                     goto string;
  808                   }
  809                 phase1_ungetc (c1);
  810                 goto symbol;
  811               }
  812
                  /* u"..." interprets all escapes, ur"..." only \unnnn (see
                     phase7_getuc).  Otherwise the letter starts a symbol.  */
  813             case 'U': case 'u':
  814               {
  815                 int c1 = phase1_getc ();
  816                 if (c1 == '"' || c1 == '\'')
  817                   {
  818                     quote_char = c1;
  819                     interpret_ansic = true;
  820                     interpret_unicode = true;
  821                     goto string;
  822                   }
  823                 if (c1 == 'R' || c1 == 'r')
  824                   {
  825                     int c2 = phase1_getc ();
  826                     if (c2 == '"' || c2 == '\'')
  827                       {
  828                         quote_char = c2;
  829                         interpret_ansic = false;
  830                         interpret_unicode = true;
  831                         goto string;
  832                       }
  833                     phase1_ungetc (c2);
  834                   }
  835                 phase1_ungetc (c1);
  836                 goto symbol;
  837               }
  838
  839             case '"': case '\'':
  840               quote_char = c;
  841               interpret_ansic = true;
  842               interpret_unicode = false;
  843             string:
                    /* The literal is triple-quoted if two more quote
                       characters follow; otherwise whatever was read is
                       pushed back and becomes part of the string body.  */
  844               triple = false;
  845               {
  846                 int c1 = phase1_getc ();
  847                 if (c1 == quote_char)
  848                   {
  849                     int c2 = phase1_getc ();
  850                     if (c2 == quote_char)
  851                       triple = true;
  852                     else
  853                       {
  854                         phase1_ungetc (c2);
  855                         phase1_ungetc (c1);
  856                       }
  857                   }
  858                 else
  859                   phase1_ungetc (c1);
  860               }
  861               backslash_counter = 0;
  862               /* Start accumulating the string.  We store the string in
  863                  UTF-16 before converting it to UTF-8.  Why not convert
  864                  every character directly to UTF-8? Because a string can
  865                  contain surrogates like u"\uD800\uDF00", and we must
  866                  combine them to a single UTF-8 character.  */
  867               bufpos = 0;
  868               for (;;)
  869                 {
  870                   int uc = phase7_getuc (quote_char, triple, interpret_ansic,
  871                                          interpret_unicode, &backslash_counter);
  872                   unsigned int len;
  873
                        /* End of input inside a string ends the token just
                           like the closing quote does.  */
  874                   if (uc == P7_EOF || uc == P7_STRING_END)
  875                     break;
  876
  877                   assert (uc >= 0 && uc < 0x110000);
                        /* Characters beyond U+FFFF need two UTF-16 words.  */
  878                   len = (uc < 0x10000 ? 1 : 2);
  879                   if (bufpos + len > bufmax)
  880                     {
  881                       bufmax = 2 * bufmax + 10;
  882                       buffer =
  883                         xrealloc (buffer, bufmax * sizeof (unsigned short));
  884                     }
  885                   if (uc < 0x10000)
  886                     buffer[bufpos++] = uc;
  887                   else
  888                     {
  889                       buffer[bufpos++] = 0xd800 + ((uc - 0x10000) >> 10);
  890                       buffer[bufpos++] = 0xdc00 + ((uc - 0x10000) & 0x3ff);
  891                     }
  892                 }
  893               /* Now convert from UTF-16 to UTF-8.  */
  894               {
  895                 int pos;
  896                 unsigned char *utf8_string;
  897                 unsigned char *q;
  898
  899                 /* Each UTF-16 word needs 3 bytes at worst.  */
  900                 utf8_string = (unsigned char *) xmalloc (3 * bufpos + 1);
  901                 for (pos = 0, q = utf8_string; pos < bufpos; )
  902                   {
  903                     unsigned int uc;
  904                     int n;
  905
  906                     pos += u16_mbtouc (&uc, buffer + pos, bufpos - pos);
  907                     n = u8_uctomb (q, uc, 6);
  908                     assert (n > 0);
  909                     q += n;
  910                   }
  911                 *q = '\0';
  912                 assert (q - utf8_string <= 3 * bufpos);
                      /* The token takes ownership of the freshly allocated
                         UTF-8 string.  */
  913                 tp->string = (char *) utf8_string;
  914               }
  915               tp->comment = add_reference (savable_comment);
  916               tp->type = token_type_string;
  917               return;
  918           }
  919
  920         case '(':
  921           open_pbb++;
  922           tp->type = token_type_lparen;
  923           return;
  924
  925         case ')':
                /* Never let the nesting count go negative on unbalanced
                   input.  */
  926           if (open_pbb > 0)
  927             open_pbb--;
  928           tp->type = token_type_rparen;
  929           return;
  930
  931         case ',':
  932           tp->type = token_type_comma;
  933           return;
  934
              /* Brackets and braces only matter for implicit line joining.  */
  935         case '[': case '{':
  936           open_pbb++;
  937           tp->type = token_type_other;
  938           return;
  939
  940         case ']': case '}':
  941           if (open_pbb > 0)
  942             open_pbb--;
  943           tp->type = token_type_other;
  944           return;
  945
  946         default:
  947           /* We could carefully recognize each of the 2 and 3 character
  948              operators, but it is not necessary, as we only need to recognize
  949              gettext invocations.  Don't bother.  */
  950           tp->type = token_type_other;
  951           return;
  952         }
  953     }
  954 }
 955
 956 /* Supports only one pushback token.  */
 957 static void
 958 phase5_unget (token_ty *tp)
 959 {
 960   if (tp->type != token_type_eof)
 961     {
 962       if (phase5_pushback_length == SIZEOF (phase5_pushback))
 963         abort ();
 964       phase5_pushback[phase5_pushback_length++] = *tp;
 965     }
 966 }
 967
 968
 969 /* Combine adjacent strings to form a single string.  Note that the end
 970    of a logical line appears as a token of its own, therefore strings that
 971    belong to different logical lines will not be concatenated.  */
 972
 973 static void
 974 x_python_lex (token_ty *tp)
 975 {
 976   phase5_get (tp);
 977   if (tp->type != token_type_string)
 978     return;
 979   for (;;)
 980     {
 981       token_ty tmp;
 982       size_t len;
 983
 984       phase5_get (&tmp);
 985       if (tmp.type != token_type_string)
 986         {
 987           phase5_unget (&tmp);
 988           return;
 989         }
 990       len = strlen (tp->string);
 991       tp->string = xrealloc (tp->string, len + strlen (tmp.string) + 1);
 992       strcpy (tp->string + len, tmp.string);
 993       free (tmp.string);
 994     }
 995 }
 996
 997
 998 /* ========================= Extracting strings.  ========================== */
 999
1000
/* Context lookup table.  Set by extract_python from its FLAG_TABLE
   argument and looked up by extract_parenthesized for every symbol
   token.  */
static flag_context_list_table_ty *flag_context_list_table;
1003
1004
/* The file is broken into tokens.  Scan the token stream, looking for
   a keyword, followed by a left paren, followed by a string.  When we
   see this sequence, we have something to remember.  We assume we are
   looking at a valid Python program, and leave the complaints about
   the grammar to the Python interpreter.

     Normal handling: Look for
       keyword ( ... msgid ... )
     Plural handling: Look for
       keyword ( ... msgid ... msgid_plural ... )

   We use recursion because the arguments before msgid or between msgid
   and msgid_plural can contain subexpressions of the same form.  */
1018
1019
1020 /* Extract messages until the next balanced closing parenthesis.
1021    Extracted messages are added to MLP.
1022    When a specific argument shall be extracted, COMMAS_TO_SKIP >= 0 and,
1023    if also a plural argument shall be extracted, PLURAL_COMMAS > 0,
1024    otherwise PLURAL_COMMAS = 0.
1025    When no specific argument shall be extracted, COMMAS_TO_SKIP < 0.
1026    Return true upon eof, false upon closing parenthesis.  */
1027 static bool
1028 extract_parenthesized (message_list_ty *mlp,
1029                        flag_context_ty outer_context,
1030                        flag_context_list_iterator_ty context_iter,
1031                        int commas_to_skip, int plural_commas)
1032 {
1033   /* Remember the message containing the msgid, for msgid_plural.  */
1034   message_ty *plural_mp = NULL;
1035
1036   /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
1037   int state;
1038   /* Parameters of the keyword just seen.  Defined only in state 1.  */
1039   int next_commas_to_skip = -1;
1040   int next_plural_commas = 0;
1041   /* Context iterator that will be used if the next token is a '('.  */
1042   flag_context_list_iterator_ty next_context_iter =
1043     passthrough_context_list_iterator;
1044   /* Current context.  */
1045   flag_context_ty inner_context =
1046     inherited_context (outer_context,
1047                        flag_context_list_iterator_advance (&context_iter));
1048
1049   /* Start state is 0.  */
1050   state = 0;
1051
1052   for (;;)
1053     {
1054       token_ty token;
1055
1056       x_python_lex (&token);
1057       switch (token.type)
1058         {
1059         case token_type_symbol:
1060           {
1061             void *keyword_value;
1062
1063             if (find_entry (&keywords, token.string, strlen (token.string),
1064                             &keyword_value)
1065                 == 0)
1066               {
1067                 int argnum1 = (int) (long) keyword_value & ((1 << 10) - 1);
1068                 int argnum2 = (int) (long) keyword_value >> 10;
1069
1070                 next_commas_to_skip = argnum1 - 1;
1071                 next_plural_commas = (argnum2 > argnum1 ? argnum2 - argnum1 : 0);
1072                 state = 1;
1073               }
1074             else
1075               state = 0;
1076           }
1077           next_context_iter =
1078             flag_context_list_iterator (
1079               flag_context_list_table_lookup (
1080                 flag_context_list_table,
1081                 token.string, strlen (token.string)));
1082           free (token.string);
1083           continue;
1084
1085         case token_type_lparen:
1086           if (extract_parenthesized (mlp, inner_context, next_context_iter,
1087                                      state ? next_commas_to_skip : -1,
1088                                      state ? next_plural_commas : 0))
1089             return true;
1090           next_context_iter = null_context_list_iterator;
1091           state = 0;
1092           continue;
1093
1094         case token_type_rparen:
1095           return false;
1096
1097         case token_type_comma:
1098           if (commas_to_skip >= 0)
1099             {
1100               if (commas_to_skip > 0)
1101                 commas_to_skip--;
1102               else
1103                 if (plural_mp != NULL && plural_commas > 0)
1104                   {
1105                     commas_to_skip = plural_commas - 1;
1106                     plural_commas = 0;
1107                   }
1108                 else
1109                   commas_to_skip = -1;
1110             }
1111           inner_context =
1112             inherited_context (outer_context,
1113                                flag_context_list_iterator_advance (
1114                                  &context_iter));
1115           next_context_iter = passthrough_context_list_iterator;
1116           state = 0;
1117           continue;
1118
1119         case token_type_string:
1120           {
1121             lex_pos_ty pos;
1122             pos.file_name = logical_file_name;
1123             pos.line_number = token.line_number;
1124
1125             if (extract_all)
1126               {
1127                 savable_comment_to_xgettext_comment (token.comment);
1128                 remember_a_message (mlp, token.string, inner_context, &pos);
1129                 savable_comment_reset ();
1130               }
1131             else
1132               {
1133                 if (commas_to_skip == 0)
1134                   {
1135                     if (plural_mp == NULL)
1136                       {
1137                         /* Seen an msgid.  */
1138                         message_ty *mp;
1139
1140                         savable_comment_to_xgettext_comment (token.comment);
1141                         mp = remember_a_message (mlp, token.string,
1142                                                 inner_context, &pos);
1143                         savable_comment_reset ();
1144                         if (plural_commas > 0)
1145                           plural_mp = mp;
1146                       }
1147                     else
1148                       {
1149                         /* Seen an msgid_plural.  */
1150                         remember_a_message_plural (plural_mp, token.string,
1151                                                    inner_context, &pos);
1152                         plural_mp = NULL;
1153                       }
1154                   }
1155                 else
1156                   free (token.string);
1157               }
1158           }
1159           drop_reference (token.comment);
1160           next_context_iter = null_context_list_iterator;
1161           state = 0;
1162           continue;
1163
1164         case token_type_eof:
1165           return true;
1166
1167         case token_type_other:
1168           next_context_iter = null_context_list_iterator;
1169           state = 0;
1170           continue;
1171
1172         default:
1173           abort ();
1174         }
1175     }
1176 }
1177
1178
1179 void
1180 extract_python (FILE *f,
1181                 const char *real_filename, const char *logical_filename,
1182                 flag_context_list_table_ty *flag_table,
1183                 msgdomain_list_ty *mdlp)
1184 {
1185   message_list_ty *mlp = mdlp->item[0]->messages;
1186
1187   /* We convert our strings to UTF-8 encoding.  */
1188   xgettext_current_source_encoding = po_charset_utf8;
1189
1190   fp = f;
1191   real_file_name = real_filename;
1192   logical_file_name = xstrdup (logical_filename);
1193   line_number = 1;
1194
1195   last_comment_line = -1;
1196   last_non_comment_line = -1;
1197
1198   open_pbb = 0;
1199
1200   flag_context_list_table = flag_table;
1201
1202   init_keywords ();
1203
1204   /* Eat tokens until eof is seen.  When extract_parenthesized returns
1205      due to an unbalanced closing parenthesis, just restart it.  */
1206   while (!extract_parenthesized (mlp, null_context, null_context_list_iterator,
1207                                  -1, 0))
1208     ;
1209
1210   fp = NULL;
1211   real_file_name = NULL;
1212   logical_file_name = NULL;
1213   line_number = 0;
1214 }