gnu/dist/gettext/gettext-tools/src/x-java.c

   1 /* xgettext Java backend.
   2    Copyright (C) 2003 Free Software Foundation, Inc.
   3    Written by Bruno Haible <bruno@clisp.org>, 2003.
   4
   5    This program is free software; you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation; either version 2, or (at your option)
   8    any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program; if not, write to the Free Software Foundation,
  17    Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  18
  19 #ifdef HAVE_CONFIG_H
  20 # include "config.h"
  21 #endif
  22
  23 #include <errno.h>
  24 #include <stdbool.h>
  25 #include <stdio.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28
  29 #include "message.h"
  30 #include "xgettext.h"
  31 #include "x-java.h"
  32 #include "error.h"
  33 #include "xalloc.h"
  34 #include "exit.h"
  35 #include "hash.h"
  36 #include "po-charset.h"
  37 #include "utf16-ucs4.h"
  38 #include "ucs4-utf8.h"
  39 #include "gettext.h"
  40
  41 #define _(s) gettext(s)
  42
  43 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
  44
  45
  46 /* The Java syntax is defined in the
  47      Java Language Specification, Second Edition,
  48      (available from http://java.sun.com/),
  49      chapter 3 "Lexical Structure".  */
  50
  51
  52 /* ====================== Keyword set customization.  ====================== */
  53
  54 /* If true extract all strings.  */
  55 static bool extract_all = false;
  56
  57 static hash_table keywords;
  58 static bool default_keywords = true;
  59
  60
  61 void
  62 x_java_extract_all ()
  63 {
  64   extract_all = true;
  65 }
  66
  67
  68 void
  69 x_java_keyword (const char *name)
  70 {
  71   if (name == NULL)
  72     default_keywords = false;
  73   else
  74     {
  75       const char *end;
  76       int argnum1;
  77       int argnum2;
  78       const char *colon;
  79
  80       if (keywords.table == NULL)
  81         init_hash (&keywords, 100);
  82
  83       split_keywordspec (name, &end, &argnum1, &argnum2);
  84
  85       /* The characters between name and end should form a valid Java
  86          identifier sequence with dots.
  87          A colon means an invalid parse in split_keywordspec().  */
  88       colon = strchr (name, ':');
  89       if (colon == NULL || colon >= end)
  90         {
  91           if (argnum1 == 0)
  92             argnum1 = 1;
  93           insert_entry (&keywords, name, end - name,
  94                         (void *) (long) (argnum1 + (argnum2 << 10)));
  95         }
  96     }
  97 }
  98
  99 /* Finish initializing the keywords hash table.
 100    Called after argument processing, before each file is processed.  */
 101 static void
 102 init_keywords ()
 103 {
 104   if (default_keywords)
 105     {
 106       x_java_keyword ("GettextResource.gettext:2");     /* static method */
 107       x_java_keyword ("GettextResource.ngettext:2,3");  /* static method */
 108       x_java_keyword ("gettext");
 109       x_java_keyword ("ngettext:1,2");
 110       x_java_keyword ("getString");     /* ResourceBundle.getString */
 111       default_keywords = false;
 112     }
 113 }
 114
 115 void
 116 init_flag_table_java ()
 117 {
 118   xgettext_record_flag ("GettextResource.gettext:2:pass-java-format");
 119   xgettext_record_flag ("GettextResource.ngettext:2:pass-java-format");
 120   xgettext_record_flag ("GettextResource.ngettext:3:pass-java-format");
 121   xgettext_record_flag ("gettext:1:pass-java-format");
 122   xgettext_record_flag ("ngettext:1:pass-java-format");
 123   xgettext_record_flag ("ngettext:2:pass-java-format");
 124   xgettext_record_flag ("getString:1:pass-java-format");
 125   xgettext_record_flag ("MessageFormat:1:java-format");
 126   xgettext_record_flag ("MessageFormat.format:1:java-format");
 127 }
 128
 129
 130 /* ======================== Reading of characters.  ======================== */
 131
 132 /* Real filename, used in error messages about the input file.  */
 133 static const char *real_file_name;
 134
 135 /* Logical filename and line number, used to label the extracted messages.  */
 136 static char *logical_file_name;
 137 static int line_number;
 138
 139 /* The input file stream.  */
 140 static FILE *fp;
 141
 142
 143 /* Fetch the next single-byte character from the input file.
 144    Pushback can consist of an unlimited number of 'u' followed by up to 4
 145    other characters.  */
 146
 147 /* Special coding of multiple 'u's in the pushback buffer.  */
 148 #define MULTIPLE_U(count) (0x1000 + (count))
 149
 150 static int phase1_pushback[5];
 151 static unsigned int phase1_pushback_length;
 152
 153 static int
 154 phase1_getc ()
 155 {
 156   int c;
 157
 158   if (phase1_pushback_length)
 159     {
 160       c = phase1_pushback[--phase1_pushback_length];
 161       if (c >= MULTIPLE_U (0))
 162         {
 163           if (c > MULTIPLE_U (1))
 164             phase1_pushback[phase1_pushback_length++] = c - 1;
 165           return 'u';
 166         }
 167       else
 168         return c;
 169     }
 170
 171   c = getc (fp);
 172
 173   if (c == EOF)
 174     {
 175       if (ferror (fp))
 176         error (EXIT_FAILURE, errno, _("\
 177 error while reading \"%s\""), real_file_name);
 178     }
 179
 180   return c;
 181 }
 182
 183 /* Supports any number of 'u' and up to 4 arbitrary characters of pushback.  */
 184 static void
 185 phase1_ungetc (int c)
 186 {
 187   if (c != EOF)
 188     {
 189       if (c == 'u')
 190         {
 191           if (phase1_pushback_length > 0
 192               && phase1_pushback[phase1_pushback_length - 1] >= MULTIPLE_U (0))
 193             phase1_pushback[phase1_pushback_length - 1]++;
 194           else
 195             {
 196               if (phase1_pushback_length == SIZEOF (phase1_pushback))
 197                 abort ();
 198               phase1_pushback[phase1_pushback_length++] = MULTIPLE_U (1);
 199             }
 200         }
 201       else
 202         {
 203           if (phase1_pushback_length == SIZEOF (phase1_pushback))
 204             abort ();
 205           phase1_pushback[phase1_pushback_length++] = c;
 206         }
 207     }
 208 }
 209
 210
 211 /* Fetch the next single-byte character or Unicode character from the file.
 212    (Here, as in the Java Language Specification, when we say "Unicode
 213    character", we actually mean "UTF-16 encoding unit".)  */
 214
 215 /* Return value of phase 2, 3, 4 when EOF is reached.  */
 216 #define P2_EOF 0xffff
 217
 218 /* Convert an UTF-16 code point to a return value that can be distinguished
 219    from a single-byte return value.  */
 220 #define UNICODE(code) (0x10000 + (code))
 221
 222 /* Test a return value of phase 2, 3, 4 whether it designates an UTF-16 code
 223    point.  */
 224 #define IS_UNICODE(p2_result) ((p2_result) >= 0x10000)
 225
 226 /* Extract the UTF-16 code of a return value that satisfies IS_UNICODE.  */
 227 #define UTF16_VALUE(p2_result) ((p2_result) - 0x10000)
 228
 229 /* Reduces a return value of phase 2, 3, 4 by unmasking the UNICODE bit,
 230    so that it can be more easily compared against an ASCII character.
 231    (RED (c) == 'x')  is equivalent to  (c == 'x' || c == UNICODE ('x')).  */
 232 #define RED(p2_result) ((p2_result) & 0xffff)
 233
 234 static int phase2_pushback[1];
 235 static int phase2_pushback_length;
 236
 237 static int
 238 phase2_getc ()
 239 {
 240   int c;
 241
 242   if (phase2_pushback_length)
 243     return phase2_pushback[--phase2_pushback_length];
 244
 245   c = phase1_getc ();
 246   if (c == EOF)
 247     return P2_EOF;
 248   if (c == '\\')
 249     {
 250       c = phase1_getc ();
 251       if (c == 'u')
 252         {
 253           unsigned int u_count = 1;
 254           unsigned char buf[4];
 255           unsigned int n;
 256           int i;
 257
 258           for (;;)
 259             {
 260               c = phase1_getc ();
 261               if (c != 'u')
 262                 break;
 263               u_count++;
 264             }
 265           phase1_ungetc (c);
 266
 267           n = 0;
 268           for (i = 0; i < 4; i++)
 269             {
 270               c = phase1_getc ();
 271
 272               if (c >= '0' && c <= '9')
 273                 n = (n << 4) + (c - '0');
 274               else if (c >= 'A' && c <= 'F')
 275                 n = (n << 4) + (c - 'A' + 10);
 276               else if (c >= 'a' && c <= 'f')
 277                 n = (n << 4) + (c - 'a' + 10);
 278               else
 279                 {
 280                   phase1_ungetc (c);
 281                   while (--i >= 0)
 282                     phase1_ungetc (buf[i]);
 283                   for (; u_count > 0; u_count--)
 284                     phase1_ungetc ('u');
 285                   return '\\';
 286                 }
 287
 288               buf[i] = c;
 289             }
 290           return UNICODE (n);
 291         }
 292       phase1_ungetc (c);
 293       return '\\';
 294     }
 295   return c;
 296 }
 297
 298 /* Supports only one pushback character.  */
 299 static void
 300 phase2_ungetc (int c)
 301 {
 302   if (c != P2_EOF)
 303     {
 304       if (phase2_pushback_length == SIZEOF (phase2_pushback))
 305         abort ();
 306       phase2_pushback[phase2_pushback_length++] = c;
 307     }
 308 }
 309
 310
 311 /* Fetch the next single-byte character or Unicode character from the file.
 312    With line number handling.
 313    Convert line terminators to '\n' or UNICODE ('\n').  */
 314
 315 static int phase3_pushback[2];
 316 static int phase3_pushback_length;
 317
 318 static int
 319 phase3_getc ()
 320 {
 321   int c;
 322
 323   if (phase3_pushback_length)
 324     {
 325       c = phase3_pushback[--phase3_pushback_length];
 326       if (c == '\n')
 327         ++line_number;
 328       return c;
 329     }
 330
 331   c = phase2_getc ();
 332
 333   /* Handle line terminators.  */
 334   if (RED (c) == '\r')
 335     {
 336       int c1 = phase2_getc ();
 337
 338       if (RED (c1) != '\n')
 339         phase2_ungetc (c1);
 340
 341       /* Seen line terminator CR or CR/LF.  */
 342       if (c == '\r' || c1 == '\n')
 343         {
 344           ++line_number;
 345           return '\n';
 346         }
 347       else
 348         return UNICODE ('\n');
 349     }
 350   else if (RED (c) == '\n')
 351     {
 352       /* Seen line terminator LF.  */
 353       if (c == '\n')
 354         {
 355           ++line_number;
 356           return '\n';
 357         }
 358       else
 359         return UNICODE ('\n');
 360     }
 361
 362   return c;
 363 }
 364
 365 /* Supports 2 characters of pushback.  */
 366 static void
 367 phase3_ungetc (int c)
 368 {
 369   if (c != P2_EOF)
 370     {
 371       if (c == '\n')
 372         --line_number;
 373       if (phase3_pushback_length == SIZEOF (phase3_pushback))
 374         abort ();
 375       phase3_pushback[phase3_pushback_length++] = c;
 376     }
 377 }
 378
 379
 380 /* ========================= Accumulating strings.  ======================== */
 381
 382 /* A string buffer type that allows appending bytes (in the
 383    xgettext_current_source_encoding) or Unicode characters.
 384    Returns the entire string in UTF-8 encoding.  */
 385
 386 struct string_buffer
 387 {
 388   /* The part of the string that has already been converted to UTF-8.  */
 389   char *utf8_buffer;
 390   size_t utf8_buflen;
 391   size_t utf8_allocated;
 392   /* The first half of an UTF-16 surrogate character.  */
 393   unsigned short utf16_surr;
 394   /* The part of the string that is still in the source encoding.  */
 395   char *curr_buffer;
 396   size_t curr_buflen;
 397   size_t curr_allocated;
 398 };
 399
 400 /* Initialize a 'struct string_buffer' to empty.  */
 401 static inline void
 402 init_string_buffer (struct string_buffer *bp)
 403 {
 404   bp->utf8_buffer = NULL;
 405   bp->utf8_buflen = 0;
 406   bp->utf8_allocated = 0;
 407   bp->utf16_surr = 0;
 408   bp->curr_buffer = NULL;
 409   bp->curr_buflen = 0;
 410   bp->curr_allocated = 0;
 411 }
 412
 413 /* Auxiliary function: Append a byte to bp->curr.  */
 414 static inline void
 415 string_buffer_append_byte (struct string_buffer *bp, unsigned char c)
 416 {
 417   if (bp->curr_buflen == bp->curr_allocated)
 418     {
 419       bp->curr_allocated = 2 * bp->curr_allocated + 10;
 420       bp->curr_buffer = xrealloc (bp->curr_buffer, bp->curr_allocated);
 421     }
 422   bp->curr_buffer[bp->curr_buflen++] = c;
 423 }
 424
 425 /* Auxiliary function: Ensure count more bytes are available in bp->utf8.  */
 426 static inline void
 427 string_buffer_append_unicode_grow (struct string_buffer *bp, size_t count)
 428 {
 429   if (bp->utf8_buflen + count > bp->utf8_allocated)
 430     {
 431       size_t new_allocated = 2 * bp->utf8_allocated + 10;
 432       if (new_allocated < bp->utf8_buflen + count)
 433         new_allocated = bp->utf8_buflen + count;
 434       bp->utf8_allocated = new_allocated;
 435       bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated);
 436     }
 437 }
 438
 439 /* Auxiliary function: Append a Unicode character to bp->utf8.
 440    uc must be < 0x110000.  */
 441 static inline void
 442 string_buffer_append_unicode (struct string_buffer *bp, unsigned int uc)
 443 {
 444   unsigned char utf8buf[6];
 445   int count = u8_uctomb (utf8buf, uc, 6);
 446
 447   if (count < 0)
 448     /* The caller should have ensured that uc is not out-of-range.  */
 449     abort ();
 450
 451   string_buffer_append_unicode_grow (bp, count);
 452   memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count);
 453   bp->utf8_buflen += count;
 454 }
 455
 456 /* Auxiliary function: Flush bp->utf16_surr into bp->utf8_buffer.  */
 457 static inline void
 458 string_buffer_flush_utf16_surr (struct string_buffer *bp)
 459 {
 460   if (bp->utf16_surr != 0)
 461     {
 462       /* A half surrogate is invalid, therefore use U+FFFD instead.  */
 463       string_buffer_append_unicode (bp, 0xfffd);
 464       bp->utf16_surr = 0;
 465     }
 466 }
 467
 468 /* Auxiliary function: Flush bp->curr_buffer into bp->utf8_buffer.  */
 469 static inline void
 470 string_buffer_flush_curr_buffer (struct string_buffer *bp, int lineno)
 471 {
 472   if (bp->curr_buflen > 0)
 473     {
 474       char *curr;
 475       size_t count;
 476
 477       string_buffer_append_byte (bp, '\0');
 478
 479       /* Convert from the source encoding to UTF-8.  */
 480       curr = from_current_source_encoding (bp->curr_buffer,
 481                                            logical_file_name, lineno);
 482
 483       /* Append it to bp->utf8_buffer.  */
 484       count = strlen (curr);
 485       string_buffer_append_unicode_grow (bp, count);
 486       memcpy (bp->utf8_buffer + bp->utf8_buflen, curr, count);
 487       bp->utf8_buflen += count;
 488
 489       if (curr != bp->curr_buffer)
 490         free (curr);
 491       bp->curr_buflen = 0;
 492     }
 493 }
 494
 495 /* Append a character or Unicode character to a 'struct string_buffer'.  */
 496 static void
 497 string_buffer_append (struct string_buffer *bp, int c)
 498 {
 499   if (IS_UNICODE (c))
 500     {
 501       /* Append a Unicode character.  */
 502
 503       /* Switch from multibyte character mode to Unicode character mode.  */
 504       string_buffer_flush_curr_buffer (bp, line_number);
 505
 506       /* Test whether this character and the previous one form a Unicode
 507          surrogate character pair.  */
 508       if (bp->utf16_surr != 0
 509           && (c >= UNICODE (0xdc00) && c < UNICODE (0xe000)))
 510         {
 511           unsigned short utf16buf[2];
 512           unsigned int uc;
 513
 514           utf16buf[0] = bp->utf16_surr;
 515           utf16buf[1] = UTF16_VALUE (c);
 516           if (u16_mbtouc_aux (&uc, utf16buf, 2) != 2)
 517             abort ();
 518
 519           string_buffer_append_unicode (bp, uc);
 520           bp->utf16_surr = 0;
 521         }
 522       else
 523         {
 524           string_buffer_flush_utf16_surr (bp);
 525
 526           if (c >= UNICODE (0xd800) && c < UNICODE (0xdc00))
 527             bp->utf16_surr = UTF16_VALUE (c);
 528           else
 529             string_buffer_append_unicode (bp, UTF16_VALUE (c));
 530         }
 531     }
 532   else
 533     {
 534       /* Append a single byte.  */
 535
 536       /* Switch from Unicode character mode to multibyte character mode.  */
 537       string_buffer_flush_utf16_surr (bp);
 538
 539       /* When a newline is seen, convert the accumulated multibyte sequence.
 540          This ensures a correct line number in the error message in case of
 541          a conversion error.  The "- 1" is to account for the newline.  */
 542       if (c == '\n')
 543         string_buffer_flush_curr_buffer (bp, line_number - 1);
 544
 545       string_buffer_append_byte (bp, (unsigned char) c);
 546     }
 547 }
 548
 549 /* Return the string buffer's contents.  */
 550 static char *
 551 string_buffer_result (struct string_buffer *bp)
 552 {
 553   /* Flush all into bp->utf8_buffer.  */
 554   string_buffer_flush_utf16_surr (bp);
 555   string_buffer_flush_curr_buffer (bp, line_number);
 556   /* NUL-terminate it.  */
 557   string_buffer_append_unicode_grow (bp, 1);
 558   bp->utf8_buffer[bp->utf8_buflen] = '\0';
 559   /* Return it.  */
 560   return bp->utf8_buffer;
 561 }
 562
 563 /* Free the memory pointed to by a 'struct string_buffer'.  */
 564 static inline void
 565 free_string_buffer (struct string_buffer *bp)
 566 {
 567   free (bp->utf8_buffer);
 568   free (bp->curr_buffer);
 569 }
 570
 571
 572 /* ======================== Accumulating comments.  ======================== */
 573
 574
 575 /* Accumulating a single comment line.  */
 576
 577 static struct string_buffer comment_buffer;
 578
 579 static inline void
 580 comment_start ()
 581 {
 582   comment_buffer.utf8_buflen = 0;
 583   comment_buffer.utf16_surr = 0;
 584   comment_buffer.curr_buflen = 0;
 585 }
 586
 587 static inline bool
 588 comment_at_start ()
 589 {
 590   return (comment_buffer.utf8_buflen == 0 && comment_buffer.utf16_surr == 0
 591           && comment_buffer.curr_buflen == 0);
 592 }
 593
 594 static inline void
 595 comment_add (int c)
 596 {
 597   string_buffer_append (&comment_buffer, c);
 598 }
 599
 600 static inline void
 601 comment_line_end (size_t chars_to_remove)
 602 {
 603   char *buffer = string_buffer_result (&comment_buffer);
 604   size_t buflen = strlen (buffer);
 605
 606   buflen -= chars_to_remove;
 607   while (buflen >= 1
 608          && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
 609     --buflen;
 610   buffer[buflen] = '\0';
 611   savable_comment_add (buffer);
 612 }
 613
 614
 615 /* These are for tracking whether comments count as immediately before
 616    keyword.  */
 617 static int last_comment_line;
 618 static int last_non_comment_line;
 619
 620
 621 /* Replace each comment that is not inside a character constant or string
 622    literal with a space or newline character.  */
 623
 624 static int
 625 phase4_getc ()
 626 {
 627   int c0;
 628   int c;
 629   bool last_was_star;
 630
 631   c0 = phase3_getc ();
 632   if (RED (c0) != '/')
 633     return c0;
 634   c = phase3_getc ();
 635   switch (RED (c))
 636     {
 637     default:
 638       phase3_ungetc (c);
 639       return c0;
 640
 641     case '*':
 642       /* C style comment.  */
 643       comment_start ();
 644       last_was_star = false;
 645       for (;;)
 646         {
 647           c = phase3_getc ();
 648           if (c == P2_EOF)
 649             break;
 650           /* We skip all leading white space, but not EOLs.  */
 651           if (!(comment_at_start () && (RED (c) == ' ' || RED (c) == '\t')))
 652             comment_add (c);
 653           switch (RED (c))
 654             {
 655             case '\n':
 656               comment_line_end (1);
 657               comment_start ();
 658               last_was_star = false;
 659               continue;
 660
 661             case '*':
 662               last_was_star = true;
 663               continue;
 664
 665             case '/':
 666               if (last_was_star)
 667                 {
 668                   comment_line_end (2);
 669                   break;
 670                 }
 671               /* FALLTHROUGH */
 672
 673             default:
 674               last_was_star = false;
 675               continue;
 676             }
 677           break;
 678         }
 679       last_comment_line = line_number;
 680       return ' ';
 681
 682     case '/':
 683       /* C++ style comment.  */
 684       last_comment_line = line_number;
 685       comment_start ();
 686       for (;;)
 687         {
 688           c = phase3_getc ();
 689           if (RED (c) == '\n' || c == P2_EOF)
 690             break;
 691           /* We skip all leading white space, but not EOLs.  */
 692           if (!(comment_at_start () && (RED (c) == ' ' || RED (c) == '\t')))
 693             comment_add (c);
 694         }
 695       phase3_ungetc (c); /* push back the newline, to decrement line_number */
 696       comment_line_end (0);
 697       phase3_getc (); /* read the newline again */
 698       return '\n';
 699     }
 700 }
 701
 702 /* Supports only one pushback character.  */
 703 static void
 704 phase4_ungetc (int c)
 705 {
 706   phase3_ungetc (c);
 707 }
 708
 709
 710 /* ========================== Reading of tokens.  ========================== */
 711
 712 enum token_type_ty
 713 {
 714   token_type_eof,
 715   token_type_lparen,            /* ( */
 716   token_type_rparen,            /* ) */
 717   token_type_lbrace,            /* { */
 718   token_type_rbrace,            /* } */
 719   token_type_comma,             /* , */
 720   token_type_dot,               /* . */
 721   token_type_string_literal,    /* "abc" */
 722   token_type_number,            /* 1.23 */
 723   token_type_symbol,            /* identifier, keyword, null */
 724   token_type_plus,              /* + */
 725   token_type_other              /* character literal, misc. operator */
 726 };
 727 typedef enum token_type_ty token_type_ty;
 728
 729 typedef struct token_ty token_ty;
 730 struct token_ty
 731 {
 732   token_type_ty type;
 733   char *string;         /* for token_type_string_literal, token_type_symbol */
 734   refcounted_string_list_ty *comment;   /* for token_type_string_literal */
 735   int line_number;
 736 };
 737
 738
 739 /* Free the memory pointed to by a 'struct token_ty'.  */
 740 static inline void
 741 free_token (token_ty *tp)
 742 {
 743   if (tp->type == token_type_string_literal || tp->type == token_type_symbol)
 744     free (tp->string);
 745   if (tp->type == token_type_string_literal)
 746     drop_reference (tp->comment);
 747 }
 748
 749
 750 /* Read an escape sequence inside a string literal or character literal.  */
 751 static inline int
 752 do_getc_escaped ()
 753 {
 754   int c;
 755
 756   /* Use phase 3, because phase 4 elides comments.  */
 757   c = phase3_getc ();
 758   if (c == P2_EOF)
 759     return UNICODE ('\\');
 760   switch (RED (c))
 761     {
 762     case 'b':
 763       return UNICODE (0x08);
 764     case 't':
 765       return UNICODE (0x09);
 766     case 'n':
 767       return UNICODE (0x0a);
 768     case 'f':
 769       return UNICODE (0x0c);
 770     case 'r':
 771       return UNICODE (0x0d);
 772     case '"':
 773       return UNICODE ('"');
 774     case '\'':
 775       return UNICODE ('\'');
 776     case '\\':
 777       return UNICODE ('\\');
 778     case '0': case '1': case '2': case '3':
 779     case '4': case '5': case '6': case '7':
 780       {
 781         int n = RED (c) - '0';
 782         bool maybe3digits = (n < 4);
 783
 784         c = phase3_getc ();
 785         if (RED (c) >= '0' && RED (c) <= '7')
 786           {
 787             n = (n << 3) + (RED (c) - '0');
 788             if (maybe3digits)
 789               {
 790                 c = phase3_getc ();
 791                 if (RED (c) >= '0' && RED (c) <= '7')
 792                   n = (n << 3) + (RED (c) - '0');
 793                 else
 794                   phase3_ungetc (c);
 795               }
 796           }
 797         else
 798           phase3_ungetc (c);
 799
 800         return UNICODE (n);
 801       }
 802     default:
 803       /* Invalid escape sequence.  */
 804       phase3_ungetc (c);
 805       return UNICODE ('\\');
 806     }
 807 }
 808
 809 /* Read a string literal or character literal.  */
 810 static void
 811 accumulate_escaped (struct string_buffer *literal, int delimiter)
 812 {
 813   int c;
 814
 815   for (;;)
 816     {
 817       /* Use phase 3, because phase 4 elides comments.  */
 818       c = phase3_getc ();
 819       if (c == P2_EOF || RED (c) == delimiter)
 820         break;
 821       if (RED (c) == '\n')
 822         {
 823           phase3_ungetc (c);
 824           error_with_progname = false;
 825           if (delimiter == '\'')
 826             error (0, 0, _("%s:%d: warning: unterminated character constant"),
 827                    logical_file_name, line_number);
 828           else
 829             error (0, 0, _("%s:%d: warning: unterminated string constant"),
 830                    logical_file_name, line_number);
 831           error_with_progname = true;
 832           break;
 833         }
 834       if (RED (c) == '\\')
 835         c = do_getc_escaped ();
 836       string_buffer_append (literal, c);
 837     }
 838 }
 839
 840
 841 /* Combine characters into tokens.  Discard whitespace.  */
 842
 843 static token_ty phase5_pushback[3];
 844 static int phase5_pushback_length;
 845
 846 static void
 847 phase5_get (token_ty *tp)
 848 {
 849   int c;
 850
 851   if (phase5_pushback_length)
 852     {
 853       *tp = phase5_pushback[--phase5_pushback_length];
 854       return;
 855     }
 856   tp->string = NULL;
 857
 858   for (;;)
 859     {
 860       tp->line_number = line_number;
 861       c = phase4_getc ();
 862
 863       if (c == P2_EOF)
 864         {
 865           tp->type = token_type_eof;
 866           return;
 867         }
 868
 869       switch (RED (c))
 870         {
 871         case '\n':
 872           if (last_non_comment_line > last_comment_line)
 873             savable_comment_reset ();
 874           /* FALLTHROUGH */
 875         case ' ':
 876         case '\t':
 877         case '\f':
 878           /* Ignore whitespace and comments.  */
 879           continue;
 880         }
 881
 882       last_non_comment_line = tp->line_number;
 883
 884       switch (RED (c))
 885         {
 886         case '(':
 887           tp->type = token_type_lparen;
 888           return;
 889
 890         case ')':
 891           tp->type = token_type_rparen;
 892           return;
 893
 894         case '{':
 895           tp->type = token_type_lbrace;
 896           return;
 897
 898         case '}':
 899           tp->type = token_type_rbrace;
 900           return;
 901
 902         case ',':
 903           tp->type = token_type_comma;
 904           return;
 905
 906         case '.':
 907           c = phase4_getc ();
 908           if (!(RED (c) >= '0' && RED (c) <= '9'))
 909             {
 910               phase4_ungetc (c);
 911               tp->type = token_type_dot;
 912               return;
 913             }
 914           /* FALLTHROUGH */
 915
 916         case '0': case '1': case '2': case '3': case '4':
 917         case '5': case '6': case '7': case '8': case '9':
 918           {
 919             /* Don't need to verify the complicated syntax of integers and
 920                floating-point numbers.  We assume a valid Java input.
 921                The simplified syntax that we recognize as number is: any
 922                sequence of alphanumeric characters, additionally '+' and '-'
 923                immediately after 'e' or 'E' except in hexadecimal numbers.  */
 924             bool hexadecimal = false;
 925
 926             for (;;)
 927               {
 928                 c = phase4_getc ();
 929                 if (RED (c) >= '0' && RED (c) <= '9')
 930                   continue;
 931                 if ((RED (c) >= 'A' && RED (c) <= 'Z')
 932                     || (RED (c) >= 'a' && RED (c) <= 'z'))
 933                   {
 934                     if (RED (c) == 'X' || RED (c) == 'x')
 935                       hexadecimal = true;
 936                     if ((RED (c) == 'E' || RED (c) == 'e') && !hexadecimal)
 937                       {
 938                         c = phase4_getc ();
 939                         if (!(RED (c) == '+' || RED (c) == '-'))
 940                           phase4_ungetc (c);
 941                       }
 942                     continue;
 943                   }
 944                 if (RED (c) == '.')
 945                   continue;
 946                 break;
 947               }
 948             phase4_ungetc (c);
 949             tp->type = token_type_number;
 950             return;
 951           }
 952
 953         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
 954         case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
 955         case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
 956         case 'V': case 'W': case 'X': case 'Y': case 'Z':
 957         case '_':
 958         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
 959         case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
 960         case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
 961         case 'v': case 'w': case 'x': case 'y': case 'z':
 962           /* Although Java allows identifiers containing many Unicode
 963              characters, we recognize only identifiers consisting of ASCII
 964              characters.  This avoids conversion hassles w.r.t. the --keyword
 965              arguments, and shouldn't be a big problem in practice.  */
 966           {
 967             static char *buffer;
 968             static int bufmax;
 969             int bufpos = 0;
 970             for (;;)
 971               {
 972                 if (bufpos >= bufmax)
 973                   {
 974                     bufmax = 2 * bufmax + 10;
 975                     buffer = xrealloc (buffer, bufmax);
 976                   }
 977                 buffer[bufpos++] = RED (c);
 978                 c = phase4_getc ();
 979                 if (!((RED (c) >= 'A' && RED (c) <= 'Z')
 980                       || (RED (c) >= 'a' && RED (c) <= 'z')
 981                       || (RED (c) >= '0' && RED (c) <= '9')
 982                       || RED (c) == '_'))
 983                   break;
 984               }
 985             phase4_ungetc (c);
 986             if (bufpos >= bufmax)
 987               {
 988                 bufmax = 2 * bufmax + 10;
 989                 buffer = xrealloc (buffer, bufmax);
 990               }
 991             buffer[bufpos] = '\0';
 992             tp->string = xstrdup (buffer);
 993             tp->type = token_type_symbol;
 994             return;
 995           }
 996
 997         case '"':
 998           /* String literal.  */
 999           {
1000             struct string_buffer literal;
1001
1002             init_string_buffer (&literal);
1003             accumulate_escaped (&literal, '"');
1004             tp->string = xstrdup (string_buffer_result (&literal));
1005             free_string_buffer (&literal);
1006             tp->comment = add_reference (savable_comment);
1007             tp->type = token_type_string_literal;
1008             return;
1009           }
1010
1011         case '\'':
1012           /* Character literal.  */
1013           {
1014             struct string_buffer literal;
1015
1016             init_string_buffer (&literal);
1017             accumulate_escaped (&literal, '\'');
1018             free_string_buffer (&literal);
1019             tp->type = token_type_other;
1020             return;
1021           }
1022
1023         case '+':
1024           c = phase4_getc ();
1025           if (RED (c) == '+')
1026             /* Operator ++ */
1027             tp->type = token_type_other;
1028           else if (RED (c) == '=')
1029             /* Operator += */
1030             tp->type = token_type_other;
1031           else
1032             {
1033               /* Operator + */
1034               phase4_ungetc (c);
1035               tp->type = token_type_plus;
1036             }
1037           return;
1038
1039         default:
1040           /* Misc. operator.  */
1041           tp->type = token_type_other;
1042           return;
1043         }
1044     }
1045 }
1046
1047 /* Supports 3 tokens of pushback.  */
1048 static void
1049 phase5_unget (token_ty *tp)
1050 {
1051   if (tp->type != token_type_eof)
1052     {
1053       if (phase5_pushback_length == SIZEOF (phase5_pushback))
1054         abort ();
1055       phase5_pushback[phase5_pushback_length++] = *tp;
1056     }
1057 }
1058
1059
1060 /* Compile-time optimization of string literal concatenation.
1061    Combine "string1" + ... + "stringN" to the concatenated string if
1062      - the token before this expression is not ')' (because then the first
1063        string could be part of a cast expression),
1064      - the token after this expression is not '.' (because then the last
1065        string could be part of a method call expression).  */
1066
1067 static token_ty phase6_pushback[2];
1068 static int phase6_pushback_length;
1069
1070 static token_type_ty phase6_last;
1071
1072 static void
1073 phase6_get (token_ty *tp)
1074 {
1075   if (phase6_pushback_length)
1076     {
1077       *tp = phase6_pushback[--phase6_pushback_length];
1078       return;
1079     }
1080
1081   phase5_get (tp);
1082   if (tp->type == token_type_string_literal && phase6_last != token_type_rparen)
1083     {
1084       char *sum = tp->string;
1085       size_t sum_len = strlen (sum);
1086
1087       for (;;)
1088         {
1089           token_ty token2;
1090
1091           phase5_get (&token2);
1092           if (token2.type == token_type_plus)
1093             {
1094               token_ty token3;
1095
1096               phase5_get (&token3);
1097               if (token3.type == token_type_string_literal)
1098                 {
1099                   token_ty token_after;
1100
1101                   phase5_get (&token_after);
1102                   if (token_after.type != token_type_dot)
1103                     {
1104                       char *addend = token3.string;
1105                       size_t addend_len = strlen (addend);
1106
1107                       sum = (char *) xrealloc (sum, sum_len + addend_len + 1);
1108                       memcpy (sum + sum_len, addend, addend_len + 1);
1109                       sum_len += addend_len;
1110
1111                       phase5_unget (&token_after);
1112                       free_token (&token3);
1113                       free_token (&token2);
1114                       continue;
1115                     }
1116                   phase5_unget (&token_after);
1117                 }
1118               phase5_unget (&token3);
1119             }
1120           phase5_unget (&token2);
1121           break;
1122         }
1123       tp->string = sum;
1124     }
1125   phase6_last = tp->type;
1126 }
1127
1128 /* Supports 2 tokens of pushback.  */
1129 static void
1130 phase6_unget (token_ty *tp)
1131 {
1132   if (tp->type != token_type_eof)
1133     {
1134       if (phase6_pushback_length == SIZEOF (phase6_pushback))
1135         abort ();
1136       phase6_pushback[phase6_pushback_length++] = *tp;
1137     }
1138 }
1139
1140
/* Store the next token of the Java token stream in *TP.
   This is the lexer entry point used by the extraction code; it simply
   forwards to the last lexer phase, phase6_get.  */
static void
x_java_lex (token_ty *tp)
{
  phase6_get (tp);
}
1146
/* Hand the token *TP back to the lexer, so that a later x_java_lex call
   returns it again.  Forwards to phase6_unget.
   Supports 2 tokens of pushback.  */
static void
x_java_unlex (token_ty *tp)
{
  phase6_unget (tp);
}
1153
1154
1155 /* ========================= Extracting strings.  ========================== */
1156
1157
/* Context lookup table.  It is set by extract_java from its FLAG_TABLE
   argument and consulted by extract_parenthesized, which looks up
   (possibly dotted) symbol names in it.  */
static flag_context_list_table_ty *flag_context_list_table;
1160
1161
/* The file is broken into tokens.  Scan the token stream, looking for
   a keyword, followed by a left paren, followed by a string.  When we
   see this sequence, we have something to remember.  We assume we are
   looking at a valid Java program, and leave the complaints about
   the grammar to the compiler.

     Normal handling: Look for
       keyword ( ... msgid ... )
     Plural handling: Look for
       keyword ( ... msgid ... msgid_plural ... )

   We use recursion because the arguments before msgid or between msgid
   and msgid_plural can contain subexpressions of the same form.  */
1175
1176
1177 /* Extract messages until the next balanced closing parenthesis or brace,
1178    depending on TERMINATOR.
1179    Extracted messages are added to MLP.
1180    When a specific argument shall be extracted, COMMAS_TO_SKIP >= 0 and,
1181    if also a plural argument shall be extracted, PLURAL_COMMAS > 0,
1182    otherwise PLURAL_COMMAS = 0.
1183    When no specific argument shall be extracted, COMMAS_TO_SKIP < 0.
1184    Return true upon eof, false upon closing parenthesis or brace.  */
1185 static bool
1186 extract_parenthesized (message_list_ty *mlp, token_type_ty terminator,
1187                        flag_context_ty outer_context,
1188                        flag_context_list_iterator_ty context_iter,
1189                        int commas_to_skip, int plural_commas)
1190 {
1191   /* Remember the message containing the msgid, for msgid_plural.  */
1192   message_ty *plural_mp = NULL;
1193
1194   /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
1195   int state;
1196   /* Parameters of the keyword just seen.  Defined only in state 1.  */
1197   int next_commas_to_skip = -1;
1198   int next_plural_commas = 0;
1199   /* Context iterator that will be used if the next token is a '('.  */
1200   flag_context_list_iterator_ty next_context_iter =
1201     passthrough_context_list_iterator;
1202   /* Current context.  */
1203   flag_context_ty inner_context =
1204     inherited_context (outer_context,
1205                        flag_context_list_iterator_advance (&context_iter));
1206
1207   /* Start state is 0.  */
1208   state = 0;
1209
1210   for (;;)
1211     {
1212       token_ty token;
1213
1214       x_java_lex (&token);
1215       switch (token.type)
1216         {
1217         case token_type_symbol:
1218           {
1219             /* Combine symbol1 . ... . symbolN to a single strings, so that
1220                we can recognize static function calls like
1221                GettextResource.gettext.  The information present for
1222                symbolI.....symbolN has precedence over the information for
1223                symbolJ.....symbolN with J > I.  */
1224             char *sum = token.string;
1225             size_t sum_len = strlen (sum);
1226             const char *dottedname;
1227             flag_context_list_ty *context_list;
1228
1229             for (;;)
1230               {
1231                 token_ty token2;
1232
1233                 x_java_lex (&token2);
1234                 if (token2.type == token_type_dot)
1235                   {
1236                     token_ty token3;
1237
1238                     x_java_lex (&token3);
1239                     if (token3.type == token_type_symbol)
1240                       {
1241                         char *addend = token3.string;
1242                         size_t addend_len = strlen (addend);
1243
1244                         sum =
1245                           (char *) xrealloc (sum, sum_len + 1 + addend_len + 1);
1246                         sum[sum_len] = '.';
1247                         memcpy (sum + sum_len + 1, addend, addend_len + 1);
1248                         sum_len += 1 + addend_len;
1249
1250                         free_token (&token3);
1251                         free_token (&token2);
1252                         continue;
1253                       }
1254                     x_java_unlex (&token3);
1255                   }
1256                 x_java_unlex (&token2);
1257                 break;
1258               }
1259
1260             for (dottedname = sum;;)
1261               {
1262                 void *keyword_value;
1263
1264                 if (find_entry (&keywords, dottedname, strlen (dottedname),
1265                                 &keyword_value)
1266                     == 0)
1267                   {
1268                     int argnum1 = (int) (long) keyword_value & ((1 << 10) - 1);
1269                     int argnum2 = (int) (long) keyword_value >> 10;
1270
1271                     next_commas_to_skip = argnum1 - 1;
1272                     next_plural_commas = (argnum2 > argnum1 ? argnum2 - argnum1 : 0);
1273                     state = 1;
1274                     break;
1275                   }
1276
1277                 dottedname = strchr (dottedname, '.');
1278                 if (dottedname == NULL)
1279                   {
1280                     state = 0;
1281                     break;
1282                   }
1283                 dottedname++;
1284               }
1285
1286             for (dottedname = sum;;)
1287               {
1288                 context_list =
1289                   flag_context_list_table_lookup (
1290                     flag_context_list_table,
1291                     dottedname, strlen (dottedname));
1292                 if (context_list != NULL)
1293                   break;
1294
1295                 dottedname = strchr (dottedname, '.');
1296                 if (dottedname == NULL)
1297                   break;
1298                 dottedname++;
1299               }
1300             next_context_iter = flag_context_list_iterator (context_list);
1301
1302             free (sum);
1303             continue;
1304           }
1305
1306         case token_type_lparen:
1307           if (extract_parenthesized (mlp, token_type_rparen,
1308                                      inner_context, next_context_iter,
1309                                      state ? next_commas_to_skip : -1,
1310                                      state ? next_plural_commas : 0))
1311             return true;
1312           next_context_iter = null_context_list_iterator;
1313           state = 0;
1314           continue;
1315
1316         case token_type_rparen:
1317           if (terminator == token_type_rparen)
1318             return false;
1319           if (terminator == token_type_rbrace)
1320             {
1321               error_with_progname = false;
1322               error (0, 0,
1323                      _("%s:%d: warning: ')' found where '}' was expected"),
1324                      logical_file_name, token.line_number);
1325               error_with_progname = true;
1326             }
1327           next_context_iter = null_context_list_iterator;
1328           state = 0;
1329           continue;
1330
1331         case token_type_lbrace:
1332           if (extract_parenthesized (mlp, token_type_rbrace,
1333                                      null_context, null_context_list_iterator,
1334                                      -1, 0))
1335             return true;
1336           next_context_iter = null_context_list_iterator;
1337           state = 0;
1338           continue;
1339
1340         case token_type_rbrace:
1341           if (terminator == token_type_rbrace)
1342             return false;
1343           if (terminator == token_type_rparen)
1344             {
1345               error_with_progname = false;
1346               error (0, 0,
1347                      _("%s:%d: warning: '}' found where ')' was expected"),
1348                      logical_file_name, token.line_number);
1349               error_with_progname = true;
1350             }
1351           next_context_iter = null_context_list_iterator;
1352           state = 0;
1353           continue;
1354
1355         case token_type_comma:
1356           if (commas_to_skip >= 0)
1357             {
1358               if (commas_to_skip > 0)
1359                 commas_to_skip--;
1360               else
1361                 if (plural_mp != NULL && plural_commas > 0)
1362                   {
1363                     commas_to_skip = plural_commas - 1;
1364                     plural_commas = 0;
1365                   }
1366                 else
1367                   commas_to_skip = -1;
1368             }
1369           inner_context =
1370             inherited_context (outer_context,
1371                                flag_context_list_iterator_advance (
1372                                  &context_iter));
1373           next_context_iter = passthrough_context_list_iterator;
1374           state = 0;
1375           continue;
1376
1377         case token_type_string_literal:
1378           {
1379             lex_pos_ty pos;
1380             pos.file_name = logical_file_name;
1381             pos.line_number = token.line_number;
1382
1383             if (extract_all)
1384               {
1385                 xgettext_current_source_encoding = po_charset_utf8;
1386                 savable_comment_to_xgettext_comment (token.comment);
1387                 remember_a_message (mlp, token.string, inner_context, &pos);
1388                 savable_comment_reset ();
1389                 xgettext_current_source_encoding = xgettext_global_source_encoding;
1390               }
1391             else
1392               {
1393                 if (commas_to_skip == 0)
1394                   {
1395                     if (plural_mp == NULL)
1396                       {
1397                         /* Seen an msgid.  */
1398                         message_ty *mp;
1399
1400                         xgettext_current_source_encoding = po_charset_utf8;
1401                         savable_comment_to_xgettext_comment (token.comment);
1402                         mp = remember_a_message (mlp, token.string,
1403                                                  inner_context, &pos);
1404                         savable_comment_reset ();
1405                         xgettext_current_source_encoding = xgettext_global_source_encoding;
1406                         if (plural_commas > 0)
1407                           plural_mp = mp;
1408                       }
1409                     else
1410                       {
1411                         /* Seen an msgid_plural.  */
1412                         xgettext_current_source_encoding = po_charset_utf8;
1413                         remember_a_message_plural (plural_mp, token.string,
1414                                                    inner_context, &pos);
1415                         xgettext_current_source_encoding = xgettext_global_source_encoding;
1416                         plural_mp = NULL;
1417                       }
1418                   }
1419                 else
1420                   free (token.string);
1421               }
1422           }
1423           drop_reference (token.comment);
1424           next_context_iter = null_context_list_iterator;
1425           state = 0;
1426           continue;
1427
1428         case token_type_eof:
1429           return true;
1430
1431         case token_type_dot:
1432         case token_type_number:
1433         case token_type_plus:
1434         case token_type_other:
1435           next_context_iter = null_context_list_iterator;
1436           state = 0;
1437           continue;
1438
1439         default:
1440           abort ();
1441         }
1442     }
1443 }
1444
1445
1446 void
1447 extract_java (FILE *f,
1448               const char *real_filename, const char *logical_filename,
1449               flag_context_list_table_ty *flag_table,
1450               msgdomain_list_ty *mdlp)
1451 {
1452   message_list_ty *mlp = mdlp->item[0]->messages;
1453
1454   fp = f;
1455   real_file_name = real_filename;
1456   logical_file_name = xstrdup (logical_filename);
1457   line_number = 1;
1458
1459   last_comment_line = -1;
1460   last_non_comment_line = -1;
1461
1462   phase6_last = token_type_eof;
1463
1464   flag_context_list_table = flag_table;
1465
1466   init_keywords ();
1467
1468   /* Eat tokens until eof is seen.  When extract_parenthesized returns
1469      due to an unbalanced closing parenthesis, just restart it.  */
1470   while (!extract_parenthesized (mlp, token_type_eof,
1471                                  null_context, null_context_list_iterator,
1472                                  -1, 0))
1473     ;
1474
1475   fp = NULL;
1476   real_file_name = NULL;
1477   logical_file_name = NULL;
1478   line_number = 0;
1479 }