Sync usage with man page.
[netbsd-mini2440.git] / gnu / dist / gettext / gettext-tools / src / x-java.c
blob821cca3d2af40c9facc07169df62189a1725f7d0
1 /* xgettext Java backend.
2 Copyright (C) 2003 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2003.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
8 any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software Foundation,
17 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
19 #ifdef HAVE_CONFIG_H
20 # include "config.h"
21 #endif
23 #include <errno.h>
24 #include <stdbool.h>
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
29 #include "message.h"
30 #include "xgettext.h"
31 #include "x-java.h"
32 #include "error.h"
33 #include "xalloc.h"
34 #include "exit.h"
35 #include "hash.h"
36 #include "po-charset.h"
37 #include "utf16-ucs4.h"
38 #include "ucs4-utf8.h"
39 #include "gettext.h"
41 #define _(s) gettext(s)
/* Number of elements of the array A.  A must be a genuine array, not a
   pointer.  The argument is parenthesized in both places so that any
   array-valued expression can be passed safely.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
46 /* The Java syntax is defined in the
47 Java Language Specification, Second Edition,
48 (available from http://java.sun.com/),
49 chapter 3 "Lexical Structure". */
52 /* ====================== Keyword set customization. ====================== */
54 /* If true extract all strings. */
55 static bool extract_all = false;
57 static hash_table keywords;
58 static bool default_keywords = true;
61 void
62 x_java_extract_all ()
64 extract_all = true;
68 void
69 x_java_keyword (const char *name)
71 if (name == NULL)
72 default_keywords = false;
73 else
75 const char *end;
76 int argnum1;
77 int argnum2;
78 const char *colon;
80 if (keywords.table == NULL)
81 init_hash (&keywords, 100);
83 split_keywordspec (name, &end, &argnum1, &argnum2);
85 /* The characters between name and end should form a valid Java
86 identifier sequence with dots.
87 A colon means an invalid parse in split_keywordspec(). */
88 colon = strchr (name, ':');
89 if (colon == NULL || colon >= end)
91 if (argnum1 == 0)
92 argnum1 = 1;
93 insert_entry (&keywords, name, end - name,
94 (void *) (long) (argnum1 + (argnum2 << 10)));
99 /* Finish initializing the keywords hash table.
100 Called after argument processing, before each file is processed. */
101 static void
102 init_keywords ()
104 if (default_keywords)
106 x_java_keyword ("GettextResource.gettext:2"); /* static method */
107 x_java_keyword ("GettextResource.ngettext:2,3"); /* static method */
108 x_java_keyword ("gettext");
109 x_java_keyword ("ngettext:1,2");
110 x_java_keyword ("getString"); /* ResourceBundle.getString */
111 default_keywords = false;
/* Record the format-string flags of the well-known Java entry points.
   The specifications are registered in the order listed.  */
void
init_flag_table_java ()
{
  static const char *const flag_specs[] =
    {
      "GettextResource.gettext:2:pass-java-format",
      "GettextResource.ngettext:2:pass-java-format",
      "GettextResource.ngettext:3:pass-java-format",
      "gettext:1:pass-java-format",
      "ngettext:1:pass-java-format",
      "ngettext:2:pass-java-format",
      "getString:1:pass-java-format",
      "MessageFormat:1:java-format",
      "MessageFormat.format:1:java-format"
    };
  size_t i;

  for (i = 0; i < SIZEOF (flag_specs); i++)
    xgettext_record_flag (flag_specs[i]);
}
130 /* ======================== Reading of characters. ======================== */
/* Name of the file actually being read; appears in I/O error messages.  */
static const char *real_file_name;

/* Name and current line under which extracted messages are recorded.  */
static char *logical_file_name;
static int line_number;

/* Stream the lexer reads from.  */
static FILE *fp;
/* Phase 1: fetch the next single-byte character from the input file.
   The pushback stack can hold an unlimited run of 'u' characters (stored
   as a single counted entry) plus up to 4 other characters.  */

/* Pushback entry that stands for COUNT consecutive 'u' characters.  */
#define MULTIPLE_U(count) (0x1000 + (count))

static int phase1_pushback[5];
static unsigned int phase1_pushback_length;
153 static int
154 phase1_getc ()
156 int c;
158 if (phase1_pushback_length)
160 c = phase1_pushback[--phase1_pushback_length];
161 if (c >= MULTIPLE_U (0))
163 if (c > MULTIPLE_U (1))
164 phase1_pushback[phase1_pushback_length++] = c - 1;
165 return 'u';
167 else
168 return c;
171 c = getc (fp);
173 if (c == EOF)
175 if (ferror (fp))
176 error (EXIT_FAILURE, errno, _("\
177 error while reading \"%s\""), real_file_name);
180 return c;
183 /* Supports any number of 'u' and up to 4 arbitrary characters of pushback. */
184 static void
185 phase1_ungetc (int c)
187 if (c != EOF)
189 if (c == 'u')
191 if (phase1_pushback_length > 0
192 && phase1_pushback[phase1_pushback_length - 1] >= MULTIPLE_U (0))
193 phase1_pushback[phase1_pushback_length - 1]++;
194 else
196 if (phase1_pushback_length == SIZEOF (phase1_pushback))
197 abort ();
198 phase1_pushback[phase1_pushback_length++] = MULTIPLE_U (1);
201 else
203 if (phase1_pushback_length == SIZEOF (phase1_pushback))
204 abort ();
205 phase1_pushback[phase1_pushback_length++] = c;
/* Phase 2: fetch the next unit of input, which is either a single raw byte
   or a "Unicode character" written as a \uXXXX escape.  (As in the Java
   Language Specification, "Unicode character" here really means "UTF-16
   code unit".)  */

/* Value returned by phases 2, 3 and 4 at end of input.  */
#define P2_EOF 0xffff

/* Tag a UTF-16 code unit so that the result cannot be mistaken for a raw
   single-byte value.  */
#define UNICODE(code) (0x10000 + (code))

/* True if a result of phase 2, 3 or 4 is a tagged UTF-16 code unit.  */
#define IS_UNICODE(p2_result) ((p2_result) >= 0x10000)

/* The UTF-16 code unit carried by a result that satisfies IS_UNICODE.  */
#define UTF16_VALUE(p2_result) ((p2_result) - 0x10000)

/* Strip the UNICODE tag from a result of phase 2, 3 or 4, for comparing
   it against an ASCII character:
   (RED (c) == 'x') is equivalent to (c == 'x' || c == UNICODE ('x')).  */
#define RED(p2_result) ((p2_result) & 0xffff)

static int phase2_pushback[1];
static int phase2_pushback_length;
237 static int
238 phase2_getc ()
240 int c;
242 if (phase2_pushback_length)
243 return phase2_pushback[--phase2_pushback_length];
245 c = phase1_getc ();
246 if (c == EOF)
247 return P2_EOF;
248 if (c == '\\')
250 c = phase1_getc ();
251 if (c == 'u')
253 unsigned int u_count = 1;
254 unsigned char buf[4];
255 unsigned int n;
256 int i;
258 for (;;)
260 c = phase1_getc ();
261 if (c != 'u')
262 break;
263 u_count++;
265 phase1_ungetc (c);
267 n = 0;
268 for (i = 0; i < 4; i++)
270 c = phase1_getc ();
272 if (c >= '0' && c <= '9')
273 n = (n << 4) + (c - '0');
274 else if (c >= 'A' && c <= 'F')
275 n = (n << 4) + (c - 'A' + 10);
276 else if (c >= 'a' && c <= 'f')
277 n = (n << 4) + (c - 'a' + 10);
278 else
280 phase1_ungetc (c);
281 while (--i >= 0)
282 phase1_ungetc (buf[i]);
283 for (; u_count > 0; u_count--)
284 phase1_ungetc ('u');
285 return '\\';
288 buf[i] = c;
290 return UNICODE (n);
292 phase1_ungetc (c);
293 return '\\';
295 return c;
298 /* Supports only one pushback character. */
299 static void
300 phase2_ungetc (int c)
302 if (c != P2_EOF)
304 if (phase2_pushback_length == SIZEOF (phase2_pushback))
305 abort ();
306 phase2_pushback[phase2_pushback_length++] = c;
/* Phase 3: like phase 2, but additionally keeps line_number up to date
   and normalizes every line terminator to '\n' or UNICODE ('\n').  */

static int phase3_pushback[2];
static int phase3_pushback_length;
318 static int
319 phase3_getc ()
321 int c;
323 if (phase3_pushback_length)
325 c = phase3_pushback[--phase3_pushback_length];
326 if (c == '\n')
327 ++line_number;
328 return c;
331 c = phase2_getc ();
333 /* Handle line terminators. */
334 if (RED (c) == '\r')
336 int c1 = phase2_getc ();
338 if (RED (c1) != '\n')
339 phase2_ungetc (c1);
341 /* Seen line terminator CR or CR/LF. */
342 if (c == '\r' || c1 == '\n')
344 ++line_number;
345 return '\n';
347 else
348 return UNICODE ('\n');
350 else if (RED (c) == '\n')
352 /* Seen line terminator LF. */
353 if (c == '\n')
355 ++line_number;
356 return '\n';
358 else
359 return UNICODE ('\n');
362 return c;
365 /* Supports 2 characters of pushback. */
366 static void
367 phase3_ungetc (int c)
369 if (c != P2_EOF)
371 if (c == '\n')
372 --line_number;
373 if (phase3_pushback_length == SIZEOF (phase3_pushback))
374 abort ();
375 phase3_pushback[phase3_pushback_length++] = c;
380 /* ========================= Accumulating strings. ======================== */
/* A growable string to which both raw bytes (in the
   xgettext_current_source_encoding) and UTF-16 code units can be
   appended.  The final result is delivered entirely in UTF-8.  */
struct string_buffer
{
  /* Prefix of the string that is already in UTF-8.  */
  char *utf8_buffer;
  size_t utf8_buflen;
  size_t utf8_allocated;
  /* Pending high (first) half of a UTF-16 surrogate pair, or 0.  */
  unsigned short utf16_surr;
  /* Tail of the string that is still in the source encoding.  */
  char *curr_buffer;
  size_t curr_buflen;
  size_t curr_allocated;
};
400 /* Initialize a 'struct string_buffer' to empty. */
401 static inline void
402 init_string_buffer (struct string_buffer *bp)
404 bp->utf8_buffer = NULL;
405 bp->utf8_buflen = 0;
406 bp->utf8_allocated = 0;
407 bp->utf16_surr = 0;
408 bp->curr_buffer = NULL;
409 bp->curr_buflen = 0;
410 bp->curr_allocated = 0;
413 /* Auxiliary function: Append a byte to bp->curr. */
414 static inline void
415 string_buffer_append_byte (struct string_buffer *bp, unsigned char c)
417 if (bp->curr_buflen == bp->curr_allocated)
419 bp->curr_allocated = 2 * bp->curr_allocated + 10;
420 bp->curr_buffer = xrealloc (bp->curr_buffer, bp->curr_allocated);
422 bp->curr_buffer[bp->curr_buflen++] = c;
425 /* Auxiliary function: Ensure count more bytes are available in bp->utf8. */
426 static inline void
427 string_buffer_append_unicode_grow (struct string_buffer *bp, size_t count)
429 if (bp->utf8_buflen + count > bp->utf8_allocated)
431 size_t new_allocated = 2 * bp->utf8_allocated + 10;
432 if (new_allocated < bp->utf8_buflen + count)
433 new_allocated = bp->utf8_buflen + count;
434 bp->utf8_allocated = new_allocated;
435 bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated);
439 /* Auxiliary function: Append a Unicode character to bp->utf8.
440 uc must be < 0x110000. */
441 static inline void
442 string_buffer_append_unicode (struct string_buffer *bp, unsigned int uc)
444 unsigned char utf8buf[6];
445 int count = u8_uctomb (utf8buf, uc, 6);
447 if (count < 0)
448 /* The caller should have ensured that uc is not out-of-range. */
449 abort ();
451 string_buffer_append_unicode_grow (bp, count);
452 memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count);
453 bp->utf8_buflen += count;
456 /* Auxiliary function: Flush bp->utf16_surr into bp->utf8_buffer. */
457 static inline void
458 string_buffer_flush_utf16_surr (struct string_buffer *bp)
460 if (bp->utf16_surr != 0)
462 /* A half surrogate is invalid, therefore use U+FFFD instead. */
463 string_buffer_append_unicode (bp, 0xfffd);
464 bp->utf16_surr = 0;
468 /* Auxiliary function: Flush bp->curr_buffer into bp->utf8_buffer. */
469 static inline void
470 string_buffer_flush_curr_buffer (struct string_buffer *bp, int lineno)
472 if (bp->curr_buflen > 0)
474 char *curr;
475 size_t count;
477 string_buffer_append_byte (bp, '\0');
479 /* Convert from the source encoding to UTF-8. */
480 curr = from_current_source_encoding (bp->curr_buffer,
481 logical_file_name, lineno);
483 /* Append it to bp->utf8_buffer. */
484 count = strlen (curr);
485 string_buffer_append_unicode_grow (bp, count);
486 memcpy (bp->utf8_buffer + bp->utf8_buflen, curr, count);
487 bp->utf8_buflen += count;
489 if (curr != bp->curr_buffer)
490 free (curr);
491 bp->curr_buflen = 0;
495 /* Append a character or Unicode character to a 'struct string_buffer'. */
496 static void
497 string_buffer_append (struct string_buffer *bp, int c)
499 if (IS_UNICODE (c))
501 /* Append a Unicode character. */
503 /* Switch from multibyte character mode to Unicode character mode. */
504 string_buffer_flush_curr_buffer (bp, line_number);
506 /* Test whether this character and the previous one form a Unicode
507 surrogate character pair. */
508 if (bp->utf16_surr != 0
509 && (c >= UNICODE (0xdc00) && c < UNICODE (0xe000)))
511 unsigned short utf16buf[2];
512 unsigned int uc;
514 utf16buf[0] = bp->utf16_surr;
515 utf16buf[1] = UTF16_VALUE (c);
516 if (u16_mbtouc_aux (&uc, utf16buf, 2) != 2)
517 abort ();
519 string_buffer_append_unicode (bp, uc);
520 bp->utf16_surr = 0;
522 else
524 string_buffer_flush_utf16_surr (bp);
526 if (c >= UNICODE (0xd800) && c < UNICODE (0xdc00))
527 bp->utf16_surr = UTF16_VALUE (c);
528 else
529 string_buffer_append_unicode (bp, UTF16_VALUE (c));
532 else
534 /* Append a single byte. */
536 /* Switch from Unicode character mode to multibyte character mode. */
537 string_buffer_flush_utf16_surr (bp);
539 /* When a newline is seen, convert the accumulated multibyte sequence.
540 This ensures a correct line number in the error message in case of
541 a conversion error. The "- 1" is to account for the newline. */
542 if (c == '\n')
543 string_buffer_flush_curr_buffer (bp, line_number - 1);
545 string_buffer_append_byte (bp, (unsigned char) c);
549 /* Return the string buffer's contents. */
550 static char *
551 string_buffer_result (struct string_buffer *bp)
553 /* Flush all into bp->utf8_buffer. */
554 string_buffer_flush_utf16_surr (bp);
555 string_buffer_flush_curr_buffer (bp, line_number);
556 /* NUL-terminate it. */
557 string_buffer_append_unicode_grow (bp, 1);
558 bp->utf8_buffer[bp->utf8_buflen] = '\0';
559 /* Return it. */
560 return bp->utf8_buffer;
563 /* Free the memory pointed to by a 'struct string_buffer'. */
564 static inline void
565 free_string_buffer (struct string_buffer *bp)
567 free (bp->utf8_buffer);
568 free (bp->curr_buffer);
572 /* ======================== Accumulating comments. ======================== */
575 /* Accumulating a single comment line. */
577 static struct string_buffer comment_buffer;
579 static inline void
580 comment_start ()
582 comment_buffer.utf8_buflen = 0;
583 comment_buffer.utf16_surr = 0;
584 comment_buffer.curr_buflen = 0;
587 static inline bool
588 comment_at_start ()
590 return (comment_buffer.utf8_buflen == 0 && comment_buffer.utf16_surr == 0
591 && comment_buffer.curr_buflen == 0);
594 static inline void
595 comment_add (int c)
597 string_buffer_append (&comment_buffer, c);
600 static inline void
601 comment_line_end (size_t chars_to_remove)
603 char *buffer = string_buffer_result (&comment_buffer);
604 size_t buflen = strlen (buffer);
606 buflen -= chars_to_remove;
607 while (buflen >= 1
608 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
609 --buflen;
610 buffer[buflen] = '\0';
611 savable_comment_add (buffer);
/* Line numbers of the most recent comment and of the most recent
   non-comment token.  Together they decide whether a comment counts as
   immediately preceding a keyword.  */
static int last_comment_line;
static int last_non_comment_line;
621 /* Replace each comment that is not inside a character constant or string
622 literal with a space or newline character. */
624 static int
625 phase4_getc ()
627 int c0;
628 int c;
629 bool last_was_star;
631 c0 = phase3_getc ();
632 if (RED (c0) != '/')
633 return c0;
634 c = phase3_getc ();
635 switch (RED (c))
637 default:
638 phase3_ungetc (c);
639 return c0;
641 case '*':
642 /* C style comment. */
643 comment_start ();
644 last_was_star = false;
645 for (;;)
647 c = phase3_getc ();
648 if (c == P2_EOF)
649 break;
650 /* We skip all leading white space, but not EOLs. */
651 if (!(comment_at_start () && (RED (c) == ' ' || RED (c) == '\t')))
652 comment_add (c);
653 switch (RED (c))
655 case '\n':
656 comment_line_end (1);
657 comment_start ();
658 last_was_star = false;
659 continue;
661 case '*':
662 last_was_star = true;
663 continue;
665 case '/':
666 if (last_was_star)
668 comment_line_end (2);
669 break;
671 /* FALLTHROUGH */
673 default:
674 last_was_star = false;
675 continue;
677 break;
679 last_comment_line = line_number;
680 return ' ';
682 case '/':
683 /* C++ style comment. */
684 last_comment_line = line_number;
685 comment_start ();
686 for (;;)
688 c = phase3_getc ();
689 if (RED (c) == '\n' || c == P2_EOF)
690 break;
691 /* We skip all leading white space, but not EOLs. */
692 if (!(comment_at_start () && (RED (c) == ' ' || RED (c) == '\t')))
693 comment_add (c);
695 phase3_ungetc (c); /* push back the newline, to decrement line_number */
696 comment_line_end (0);
697 phase3_getc (); /* read the newline again */
698 return '\n';
/* Push one phase 4 result back by handing it to the phase 3 pushback.
   Only one pending character is supported at this level.  */
static void
phase4_ungetc (int c)
{
  phase3_ungetc (c);
}
710 /* ========================== Reading of tokens. ========================== */
/* Kinds of tokens that phase 5 distinguishes.  */
enum token_type_ty
{
  token_type_eof,               /* end of input */
  token_type_lparen,            /* ( */
  token_type_rparen,            /* ) */
  token_type_lbrace,            /* { */
  token_type_rbrace,            /* } */
  token_type_comma,             /* , */
  token_type_dot,               /* . */
  token_type_string_literal,    /* "abc" */
  token_type_number,            /* 1.23 */
  token_type_symbol,            /* identifier, keyword, null */
  token_type_plus,              /* + */
  token_type_other              /* character literal, misc. operator */
};
typedef enum token_type_ty token_type_ty;
729 typedef struct token_ty token_ty;
730 struct token_ty
732 token_type_ty type;
733 char *string; /* for token_type_string_literal, token_type_symbol */
734 refcounted_string_list_ty *comment; /* for token_type_string_literal */
735 int line_number;
739 /* Free the memory pointed to by a 'struct token_ty'. */
740 static inline void
741 free_token (token_ty *tp)
743 if (tp->type == token_type_string_literal || tp->type == token_type_symbol)
744 free (tp->string);
745 if (tp->type == token_type_string_literal)
746 drop_reference (tp->comment);
750 /* Read an escape sequence inside a string literal or character literal. */
751 static inline int
752 do_getc_escaped ()
754 int c;
756 /* Use phase 3, because phase 4 elides comments. */
757 c = phase3_getc ();
758 if (c == P2_EOF)
759 return UNICODE ('\\');
760 switch (RED (c))
762 case 'b':
763 return UNICODE (0x08);
764 case 't':
765 return UNICODE (0x09);
766 case 'n':
767 return UNICODE (0x0a);
768 case 'f':
769 return UNICODE (0x0c);
770 case 'r':
771 return UNICODE (0x0d);
772 case '"':
773 return UNICODE ('"');
774 case '\'':
775 return UNICODE ('\'');
776 case '\\':
777 return UNICODE ('\\');
778 case '0': case '1': case '2': case '3':
779 case '4': case '5': case '6': case '7':
781 int n = RED (c) - '0';
782 bool maybe3digits = (n < 4);
784 c = phase3_getc ();
785 if (RED (c) >= '0' && RED (c) <= '7')
787 n = (n << 3) + (RED (c) - '0');
788 if (maybe3digits)
790 c = phase3_getc ();
791 if (RED (c) >= '0' && RED (c) <= '7')
792 n = (n << 3) + (RED (c) - '0');
793 else
794 phase3_ungetc (c);
797 else
798 phase3_ungetc (c);
800 return UNICODE (n);
802 default:
803 /* Invalid escape sequence. */
804 phase3_ungetc (c);
805 return UNICODE ('\\');
809 /* Read a string literal or character literal. */
810 static void
811 accumulate_escaped (struct string_buffer *literal, int delimiter)
813 int c;
815 for (;;)
817 /* Use phase 3, because phase 4 elides comments. */
818 c = phase3_getc ();
819 if (c == P2_EOF || RED (c) == delimiter)
820 break;
821 if (RED (c) == '\n')
823 phase3_ungetc (c);
824 error_with_progname = false;
825 if (delimiter == '\'')
826 error (0, 0, _("%s:%d: warning: unterminated character constant"),
827 logical_file_name, line_number);
828 else
829 error (0, 0, _("%s:%d: warning: unterminated string constant"),
830 logical_file_name, line_number);
831 error_with_progname = true;
832 break;
834 if (RED (c) == '\\')
835 c = do_getc_escaped ();
836 string_buffer_append (literal, c);
841 /* Combine characters into tokens. Discard whitespace. */
843 static token_ty phase5_pushback[3];
844 static int phase5_pushback_length;
846 static void
847 phase5_get (token_ty *tp)
849 int c;
851 if (phase5_pushback_length)
853 *tp = phase5_pushback[--phase5_pushback_length];
854 return;
856 tp->string = NULL;
858 for (;;)
860 tp->line_number = line_number;
861 c = phase4_getc ();
863 if (c == P2_EOF)
865 tp->type = token_type_eof;
866 return;
869 switch (RED (c))
871 case '\n':
872 if (last_non_comment_line > last_comment_line)
873 savable_comment_reset ();
874 /* FALLTHROUGH */
875 case ' ':
876 case '\t':
877 case '\f':
878 /* Ignore whitespace and comments. */
879 continue;
882 last_non_comment_line = tp->line_number;
884 switch (RED (c))
886 case '(':
887 tp->type = token_type_lparen;
888 return;
890 case ')':
891 tp->type = token_type_rparen;
892 return;
894 case '{':
895 tp->type = token_type_lbrace;
896 return;
898 case '}':
899 tp->type = token_type_rbrace;
900 return;
902 case ',':
903 tp->type = token_type_comma;
904 return;
906 case '.':
907 c = phase4_getc ();
908 if (!(RED (c) >= '0' && RED (c) <= '9'))
910 phase4_ungetc (c);
911 tp->type = token_type_dot;
912 return;
914 /* FALLTHROUGH */
916 case '0': case '1': case '2': case '3': case '4':
917 case '5': case '6': case '7': case '8': case '9':
919 /* Don't need to verify the complicated syntax of integers and
920 floating-point numbers. We assume a valid Java input.
921 The simplified syntax that we recognize as number is: any
922 sequence of alphanumeric characters, additionally '+' and '-'
923 immediately after 'e' or 'E' except in hexadecimal numbers. */
924 bool hexadecimal = false;
926 for (;;)
928 c = phase4_getc ();
929 if (RED (c) >= '0' && RED (c) <= '9')
930 continue;
931 if ((RED (c) >= 'A' && RED (c) <= 'Z')
932 || (RED (c) >= 'a' && RED (c) <= 'z'))
934 if (RED (c) == 'X' || RED (c) == 'x')
935 hexadecimal = true;
936 if ((RED (c) == 'E' || RED (c) == 'e') && !hexadecimal)
938 c = phase4_getc ();
939 if (!(RED (c) == '+' || RED (c) == '-'))
940 phase4_ungetc (c);
942 continue;
944 if (RED (c) == '.')
945 continue;
946 break;
948 phase4_ungetc (c);
949 tp->type = token_type_number;
950 return;
953 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
954 case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
955 case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
956 case 'V': case 'W': case 'X': case 'Y': case 'Z':
957 case '_':
958 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
959 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
960 case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
961 case 'v': case 'w': case 'x': case 'y': case 'z':
962 /* Although Java allows identifiers containing many Unicode
963 characters, we recognize only identifiers consisting of ASCII
964 characters. This avoids conversion hassles w.r.t. the --keyword
965 arguments, and shouldn't be a big problem in practice. */
967 static char *buffer;
968 static int bufmax;
969 int bufpos = 0;
970 for (;;)
972 if (bufpos >= bufmax)
974 bufmax = 2 * bufmax + 10;
975 buffer = xrealloc (buffer, bufmax);
977 buffer[bufpos++] = RED (c);
978 c = phase4_getc ();
979 if (!((RED (c) >= 'A' && RED (c) <= 'Z')
980 || (RED (c) >= 'a' && RED (c) <= 'z')
981 || (RED (c) >= '0' && RED (c) <= '9')
982 || RED (c) == '_'))
983 break;
985 phase4_ungetc (c);
986 if (bufpos >= bufmax)
988 bufmax = 2 * bufmax + 10;
989 buffer = xrealloc (buffer, bufmax);
991 buffer[bufpos] = '\0';
992 tp->string = xstrdup (buffer);
993 tp->type = token_type_symbol;
994 return;
997 case '"':
998 /* String literal. */
1000 struct string_buffer literal;
1002 init_string_buffer (&literal);
1003 accumulate_escaped (&literal, '"');
1004 tp->string = xstrdup (string_buffer_result (&literal));
1005 free_string_buffer (&literal);
1006 tp->comment = add_reference (savable_comment);
1007 tp->type = token_type_string_literal;
1008 return;
1011 case '\'':
1012 /* Character literal. */
1014 struct string_buffer literal;
1016 init_string_buffer (&literal);
1017 accumulate_escaped (&literal, '\'');
1018 free_string_buffer (&literal);
1019 tp->type = token_type_other;
1020 return;
1023 case '+':
1024 c = phase4_getc ();
1025 if (RED (c) == '+')
1026 /* Operator ++ */
1027 tp->type = token_type_other;
1028 else if (RED (c) == '=')
1029 /* Operator += */
1030 tp->type = token_type_other;
1031 else
1033 /* Operator + */
1034 phase4_ungetc (c);
1035 tp->type = token_type_plus;
1037 return;
1039 default:
1040 /* Misc. operator. */
1041 tp->type = token_type_other;
1042 return;
1047 /* Supports 3 tokens of pushback. */
1048 static void
1049 phase5_unget (token_ty *tp)
1051 if (tp->type != token_type_eof)
1053 if (phase5_pushback_length == SIZEOF (phase5_pushback))
1054 abort ();
1055 phase5_pushback[phase5_pushback_length++] = *tp;
1060 /* Compile-time optimization of string literal concatenation.
1061 Combine "string1" + ... + "stringN" to the concatenated string if
1062 - the token before this expression is not ')' (because then the first
1063 string could be part of a cast expression),
1064 - the token after this expression is not '.' (because then the last
1065 string could be part of a method call expression). */
1067 static token_ty phase6_pushback[2];
1068 static int phase6_pushback_length;
1070 static token_type_ty phase6_last;
1072 static void
1073 phase6_get (token_ty *tp)
1075 if (phase6_pushback_length)
1077 *tp = phase6_pushback[--phase6_pushback_length];
1078 return;
1081 phase5_get (tp);
1082 if (tp->type == token_type_string_literal && phase6_last != token_type_rparen)
1084 char *sum = tp->string;
1085 size_t sum_len = strlen (sum);
1087 for (;;)
1089 token_ty token2;
1091 phase5_get (&token2);
1092 if (token2.type == token_type_plus)
1094 token_ty token3;
1096 phase5_get (&token3);
1097 if (token3.type == token_type_string_literal)
1099 token_ty token_after;
1101 phase5_get (&token_after);
1102 if (token_after.type != token_type_dot)
1104 char *addend = token3.string;
1105 size_t addend_len = strlen (addend);
1107 sum = (char *) xrealloc (sum, sum_len + addend_len + 1);
1108 memcpy (sum + sum_len, addend, addend_len + 1);
1109 sum_len += addend_len;
1111 phase5_unget (&token_after);
1112 free_token (&token3);
1113 free_token (&token2);
1114 continue;
1116 phase5_unget (&token_after);
1118 phase5_unget (&token3);
1120 phase5_unget (&token2);
1121 break;
1123 tp->string = sum;
1125 phase6_last = tp->type;
1128 /* Supports 2 tokens of pushback. */
1129 static void
1130 phase6_unget (token_ty *tp)
1132 if (tp->type != token_type_eof)
1134 if (phase6_pushback_length == SIZEOF (phase6_pushback))
1135 abort ();
1136 phase6_pushback[phase6_pushback_length++] = *tp;
1141 static void
1142 x_java_lex (token_ty *tp)
1144 phase6_get (tp);
1147 /* Supports 2 tokens of pushback. */
1148 static void
1149 x_java_unlex (token_ty *tp)
1151 phase6_unget (tp);
1155 /* ========================= Extracting strings. ========================== */
1158 /* Context lookup table. */
1159 static flag_context_list_table_ty *flag_context_list_table;
/* The file is broken into tokens.  Scan the token stream, looking for
   a keyword, followed by a left paren, followed by a string.  When we
   see this sequence, we have something to remember.  We assume we are
   looking at a valid Java program, and leave the complaints about
   the grammar to the compiler.

     Normal handling: Look for
       keyword ( ... msgid ... )
     Plural handling: Look for
       keyword ( ... msgid ... msgid_plural ... )

   We use recursion because the arguments before msgid or between msgid
   and msgid_plural can contain subexpressions of the same form.  */
1177 /* Extract messages until the next balanced closing parenthesis or brace,
1178 depending on TERMINATOR.
1179 Extracted messages are added to MLP.
1180 When a specific argument shall be extracted, COMMAS_TO_SKIP >= 0 and,
1181 if also a plural argument shall be extracted, PLURAL_COMMAS > 0,
1182 otherwise PLURAL_COMMAS = 0.
1183 When no specific argument shall be extracted, COMMAS_TO_SKIP < 0.
1184 Return true upon eof, false upon closing parenthesis or brace. */
1185 static bool
1186 extract_parenthesized (message_list_ty *mlp, token_type_ty terminator,
1187 flag_context_ty outer_context,
1188 flag_context_list_iterator_ty context_iter,
1189 int commas_to_skip, int plural_commas)
1191 /* Remember the message containing the msgid, for msgid_plural. */
1192 message_ty *plural_mp = NULL;
1194 /* 0 when no keyword has been seen. 1 right after a keyword is seen. */
1195 int state;
1196 /* Parameters of the keyword just seen. Defined only in state 1. */
1197 int next_commas_to_skip = -1;
1198 int next_plural_commas = 0;
1199 /* Context iterator that will be used if the next token is a '('. */
1200 flag_context_list_iterator_ty next_context_iter =
1201 passthrough_context_list_iterator;
1202 /* Current context. */
1203 flag_context_ty inner_context =
1204 inherited_context (outer_context,
1205 flag_context_list_iterator_advance (&context_iter));
1207 /* Start state is 0. */
1208 state = 0;
1210 for (;;)
1212 token_ty token;
1214 x_java_lex (&token);
1215 switch (token.type)
1217 case token_type_symbol:
1219 /* Combine symbol1 . ... . symbolN to a single strings, so that
1220 we can recognize static function calls like
1221 GettextResource.gettext. The information present for
1222 symbolI.....symbolN has precedence over the information for
1223 symbolJ.....symbolN with J > I. */
1224 char *sum = token.string;
1225 size_t sum_len = strlen (sum);
1226 const char *dottedname;
1227 flag_context_list_ty *context_list;
1229 for (;;)
1231 token_ty token2;
1233 x_java_lex (&token2);
1234 if (token2.type == token_type_dot)
1236 token_ty token3;
1238 x_java_lex (&token3);
1239 if (token3.type == token_type_symbol)
1241 char *addend = token3.string;
1242 size_t addend_len = strlen (addend);
1244 sum =
1245 (char *) xrealloc (sum, sum_len + 1 + addend_len + 1);
1246 sum[sum_len] = '.';
1247 memcpy (sum + sum_len + 1, addend, addend_len + 1);
1248 sum_len += 1 + addend_len;
1250 free_token (&token3);
1251 free_token (&token2);
1252 continue;
1254 x_java_unlex (&token3);
1256 x_java_unlex (&token2);
1257 break;
1260 for (dottedname = sum;;)
1262 void *keyword_value;
1264 if (find_entry (&keywords, dottedname, strlen (dottedname),
1265 &keyword_value)
1266 == 0)
1268 int argnum1 = (int) (long) keyword_value & ((1 << 10) - 1);
1269 int argnum2 = (int) (long) keyword_value >> 10;
1271 next_commas_to_skip = argnum1 - 1;
1272 next_plural_commas = (argnum2 > argnum1 ? argnum2 - argnum1 : 0);
1273 state = 1;
1274 break;
1277 dottedname = strchr (dottedname, '.');
1278 if (dottedname == NULL)
1280 state = 0;
1281 break;
1283 dottedname++;
1286 for (dottedname = sum;;)
1288 context_list =
1289 flag_context_list_table_lookup (
1290 flag_context_list_table,
1291 dottedname, strlen (dottedname));
1292 if (context_list != NULL)
1293 break;
1295 dottedname = strchr (dottedname, '.');
1296 if (dottedname == NULL)
1297 break;
1298 dottedname++;
1300 next_context_iter = flag_context_list_iterator (context_list);
1302 free (sum);
1303 continue;
1306 case token_type_lparen:
1307 if (extract_parenthesized (mlp, token_type_rparen,
1308 inner_context, next_context_iter,
1309 state ? next_commas_to_skip : -1,
1310 state ? next_plural_commas : 0))
1311 return true;
1312 next_context_iter = null_context_list_iterator;
1313 state = 0;
1314 continue;
1316 case token_type_rparen:
1317 if (terminator == token_type_rparen)
1318 return false;
1319 if (terminator == token_type_rbrace)
1321 error_with_progname = false;
1322 error (0, 0,
1323 _("%s:%d: warning: ')' found where '}' was expected"),
1324 logical_file_name, token.line_number);
1325 error_with_progname = true;
1327 next_context_iter = null_context_list_iterator;
1328 state = 0;
1329 continue;
1331 case token_type_lbrace:
1332 if (extract_parenthesized (mlp, token_type_rbrace,
1333 null_context, null_context_list_iterator,
1334 -1, 0))
1335 return true;
1336 next_context_iter = null_context_list_iterator;
1337 state = 0;
1338 continue;
1340 case token_type_rbrace:
1341 if (terminator == token_type_rbrace)
1342 return false;
1343 if (terminator == token_type_rparen)
1345 error_with_progname = false;
1346 error (0, 0,
1347 _("%s:%d: warning: '}' found where ')' was expected"),
1348 logical_file_name, token.line_number);
1349 error_with_progname = true;
1351 next_context_iter = null_context_list_iterator;
1352 state = 0;
1353 continue;
1355 case token_type_comma:
1356 if (commas_to_skip >= 0)
1358 if (commas_to_skip > 0)
1359 commas_to_skip--;
1360 else
1361 if (plural_mp != NULL && plural_commas > 0)
1363 commas_to_skip = plural_commas - 1;
1364 plural_commas = 0;
1366 else
1367 commas_to_skip = -1;
1369 inner_context =
1370 inherited_context (outer_context,
1371 flag_context_list_iterator_advance (
1372 &context_iter));
1373 next_context_iter = passthrough_context_list_iterator;
1374 state = 0;
1375 continue;
1377 case token_type_string_literal:
1379 lex_pos_ty pos;
1380 pos.file_name = logical_file_name;
1381 pos.line_number = token.line_number;
1383 if (extract_all)
1385 xgettext_current_source_encoding = po_charset_utf8;
1386 savable_comment_to_xgettext_comment (token.comment);
1387 remember_a_message (mlp, token.string, inner_context, &pos);
1388 savable_comment_reset ();
1389 xgettext_current_source_encoding = xgettext_global_source_encoding;
1391 else
1393 if (commas_to_skip == 0)
1395 if (plural_mp == NULL)
1397 /* Seen an msgid. */
1398 message_ty *mp;
1400 xgettext_current_source_encoding = po_charset_utf8;
1401 savable_comment_to_xgettext_comment (token.comment);
1402 mp = remember_a_message (mlp, token.string,
1403 inner_context, &pos);
1404 savable_comment_reset ();
1405 xgettext_current_source_encoding = xgettext_global_source_encoding;
1406 if (plural_commas > 0)
1407 plural_mp = mp;
1409 else
1411 /* Seen an msgid_plural. */
1412 xgettext_current_source_encoding = po_charset_utf8;
1413 remember_a_message_plural (plural_mp, token.string,
1414 inner_context, &pos);
1415 xgettext_current_source_encoding = xgettext_global_source_encoding;
1416 plural_mp = NULL;
1419 else
1420 free (token.string);
1423 drop_reference (token.comment);
1424 next_context_iter = null_context_list_iterator;
1425 state = 0;
1426 continue;
1428 case token_type_eof:
1429 return true;
1431 case token_type_dot:
1432 case token_type_number:
1433 case token_type_plus:
1434 case token_type_other:
1435 next_context_iter = null_context_list_iterator;
1436 state = 0;
1437 continue;
1439 default:
1440 abort ();
1446 void
1447 extract_java (FILE *f,
1448 const char *real_filename, const char *logical_filename,
1449 flag_context_list_table_ty *flag_table,
1450 msgdomain_list_ty *mdlp)
1452 message_list_ty *mlp = mdlp->item[0]->messages;
1454 fp = f;
1455 real_file_name = real_filename;
1456 logical_file_name = xstrdup (logical_filename);
1457 line_number = 1;
1459 last_comment_line = -1;
1460 last_non_comment_line = -1;
1462 phase6_last = token_type_eof;
1464 flag_context_list_table = flag_table;
1466 init_keywords ();
1468 /* Eat tokens until eof is seen. When extract_parenthesized returns
1469 due to an unbalanced closing parenthesis, just restart it. */
1470 while (!extract_parenthesized (mlp, token_type_eof,
1471 null_context, null_context_list_iterator,
1472 -1, 0))
1475 fp = NULL;
1476 real_file_name = NULL;
1477 logical_file_name = NULL;
1478 line_number = 0;