1 /* xgettext Java backend.
2 Copyright (C) 2003 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2003.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software Foundation,
17 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
36 #include "po-charset.h"
37 #include "utf16-ucs4.h"
38 #include "ucs4-utf8.h"
/* Shorthand for gettext (s); wrapped around user-visible message strings
   such as the texts passed to error (). */
41 #define _(s) gettext(s)
/* Number of elements of the array A, computed with sizeof.  A must be an
   actual array object; for a pointer the quotient is not an element count. */
43 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
46 /* The Java syntax is defined in the
47 Java Language Specification, Second Edition,
48 (available from http://java.sun.com/),
49 chapter 3 "Lexical Structure". */
52 /* ====================== Keyword set customization. ====================== */
54 /* If true extract all strings. */
55 static bool extract_all
= false;
57 static hash_table keywords
;
58 static bool default_keywords
= true;
69 x_java_keyword (const char *name
)
72 default_keywords
= false;
80 if (keywords
.table
== NULL
)
81 init_hash (&keywords
, 100);
83 split_keywordspec (name
, &end
, &argnum1
, &argnum2
);
85 /* The characters between name and end should form a valid Java
86 identifier sequence with dots.
87 A colon means an invalid parse in split_keywordspec(). */
88 colon
= strchr (name
, ':');
89 if (colon
== NULL
|| colon
>= end
)
93 insert_entry (&keywords
, name
, end
- name
,
94 (void *) (long) (argnum1
+ (argnum2
<< 10)));
99 /* Finish initializing the keywords hash table.
100 Called after argument processing, before each file is processed. */
104 if (default_keywords
)
106 x_java_keyword ("GettextResource.gettext:2"); /* static method */
107 x_java_keyword ("GettextResource.ngettext:2,3"); /* static method */
108 x_java_keyword ("gettext");
109 x_java_keyword ("ngettext:1,2");
110 x_java_keyword ("getString"); /* ResourceBundle.getString */
111 default_keywords
= false;
116 init_flag_table_java ()
118 xgettext_record_flag ("GettextResource.gettext:2:pass-java-format");
119 xgettext_record_flag ("GettextResource.ngettext:2:pass-java-format");
120 xgettext_record_flag ("GettextResource.ngettext:3:pass-java-format");
121 xgettext_record_flag ("gettext:1:pass-java-format");
122 xgettext_record_flag ("ngettext:1:pass-java-format");
123 xgettext_record_flag ("ngettext:2:pass-java-format");
124 xgettext_record_flag ("getString:1:pass-java-format");
125 xgettext_record_flag ("MessageFormat:1:java-format");
126 xgettext_record_flag ("MessageFormat.format:1:java-format");
130 /* ======================== Reading of characters. ======================== */
132 /* Real filename, used in error messages about the input file. */
133 static const char *real_file_name
;
135 /* Logical filename and line number, used to label the extracted messages. */
136 static char *logical_file_name
;
137 static int line_number
;
139 /* The input file stream. */
143 /* Fetch the next single-byte character from the input file.
144 Pushback can consist of an unlimited number of 'u' followed by up to 4 other characters. */
147 /* Special coding of multiple 'u's in the pushback buffer: a run of COUNT
   pushed-back 'u' characters occupies a single slot holding 0x1000 + COUNT.
   Any slot value >= MULTIPLE_U (0) therefore denotes such a run rather than
   an ordinary character. */
148 #define MULTIPLE_U(count) (0x1000 + (count))
150 static int phase1_pushback
[5];
151 static unsigned int phase1_pushback_length
;
158 if (phase1_pushback_length
)
160 c
= phase1_pushback
[--phase1_pushback_length
];
161 if (c
>= MULTIPLE_U (0))
163 if (c
> MULTIPLE_U (1))
164 phase1_pushback
[phase1_pushback_length
++] = c
- 1;
176 error (EXIT_FAILURE
, errno
, _("\
177 error while reading \"%s\""), real_file_name
);
183 /* Supports any number of 'u' and up to 4 arbitrary characters of pushback. */
185 phase1_ungetc (int c
)
191 if (phase1_pushback_length
> 0
192 && phase1_pushback
[phase1_pushback_length
- 1] >= MULTIPLE_U (0))
193 phase1_pushback
[phase1_pushback_length
- 1]++;
196 if (phase1_pushback_length
== SIZEOF (phase1_pushback
))
198 phase1_pushback
[phase1_pushback_length
++] = MULTIPLE_U (1);
203 if (phase1_pushback_length
== SIZEOF (phase1_pushback
))
205 phase1_pushback
[phase1_pushback_length
++] = c
;
211 /* Fetch the next single-byte character or Unicode character from the file.
212 (Here, as in the Java Language Specification, when we say "Unicode
213 character", we actually mean "UTF-16 encoding unit".) */
215 /* Return value of phase 2, 3, 4 when EOF is reached.
   It lies below 0x10000, so IS_UNICODE is false for it. */
216 #define P2_EOF 0xffff
218 /* Convert a UTF-16 code unit to a return value that can be distinguished
219 from a single-byte return value. */
220 #define UNICODE(code) (0x10000 + (code))
222 /* Test a return value of phase 2, 3, 4 whether it designates a UTF-16 code unit. */
224 #define IS_UNICODE(p2_result) ((p2_result) >= 0x10000)
226 /* Extract the UTF-16 code of a return value that satisfies IS_UNICODE. */
227 #define UTF16_VALUE(p2_result) ((p2_result) - 0x10000)
229 /* Reduces a return value of phase 2, 3, 4 by unmasking the UNICODE bit,
230 so that it can be more easily compared against an ASCII character.
231 (RED (c) == 'x') is equivalent to (c == 'x' || c == UNICODE ('x')).
   Caution: RED (UNICODE (0xffff)) has the same value as P2_EOF, so test for
   end of file on the unreduced value (c == P2_EOF), never on RED (c). */
232 #define RED(p2_result) ((p2_result) & 0xffff)
234 static int phase2_pushback
[1];
235 static int phase2_pushback_length
;
242 if (phase2_pushback_length
)
243 return phase2_pushback
[--phase2_pushback_length
];
253 unsigned int u_count
= 1;
254 unsigned char buf
[4];
268 for (i
= 0; i
< 4; i
++)
272 if (c
>= '0' && c
<= '9')
273 n
= (n
<< 4) + (c
- '0');
274 else if (c
>= 'A' && c
<= 'F')
275 n
= (n
<< 4) + (c
- 'A' + 10);
276 else if (c
>= 'a' && c
<= 'f')
277 n
= (n
<< 4) + (c
- 'a' + 10);
282 phase1_ungetc (buf
[i
]);
283 for (; u_count
> 0; u_count
--)
298 /* Supports only one pushback character. */
300 phase2_ungetc (int c
)
304 if (phase2_pushback_length
== SIZEOF (phase2_pushback
))
306 phase2_pushback
[phase2_pushback_length
++] = c
;
311 /* Fetch the next single-byte character or Unicode character from the file.
312 With line number handling.
313 Convert line terminators to '\n' or UNICODE ('\n'). */
315 static int phase3_pushback
[2];
316 static int phase3_pushback_length
;
323 if (phase3_pushback_length
)
325 c
= phase3_pushback
[--phase3_pushback_length
];
333 /* Handle line terminators. */
336 int c1
= phase2_getc ();
338 if (RED (c1
) != '\n')
341 /* Seen line terminator CR or CR/LF. */
342 if (c
== '\r' || c1
== '\n')
348 return UNICODE ('\n');
350 else if (RED (c
) == '\n')
352 /* Seen line terminator LF. */
359 return UNICODE ('\n');
365 /* Supports 2 characters of pushback. */
367 phase3_ungetc (int c
)
373 if (phase3_pushback_length
== SIZEOF (phase3_pushback
))
375 phase3_pushback
[phase3_pushback_length
++] = c
;
380 /* ========================= Accumulating strings. ======================== */
382 /* A string buffer type that allows appending bytes (in the
383 xgettext_current_source_encoding) or Unicode characters.
384 Returns the entire string in UTF-8 encoding. */
388 /* The part of the string that has already been converted to UTF-8. */
391 size_t utf8_allocated
;
392 /* The first half of an UTF-16 surrogate character. */
393 unsigned short utf16_surr
;
394 /* The part of the string that is still in the source encoding. */
397 size_t curr_allocated
;
400 /* Initialize a 'struct string_buffer' to empty. */
402 init_string_buffer (struct string_buffer
*bp
)
404 bp
->utf8_buffer
= NULL
;
406 bp
->utf8_allocated
= 0;
408 bp
->curr_buffer
= NULL
;
410 bp
->curr_allocated
= 0;
413 /* Auxiliary function: Append a byte to bp->curr. */
415 string_buffer_append_byte (struct string_buffer
*bp
, unsigned char c
)
417 if (bp
->curr_buflen
== bp
->curr_allocated
)
419 bp
->curr_allocated
= 2 * bp
->curr_allocated
+ 10;
420 bp
->curr_buffer
= xrealloc (bp
->curr_buffer
, bp
->curr_allocated
);
422 bp
->curr_buffer
[bp
->curr_buflen
++] = c
;
425 /* Auxiliary function: Ensure count more bytes are available in bp->utf8. */
427 string_buffer_append_unicode_grow (struct string_buffer
*bp
, size_t count
)
429 if (bp
->utf8_buflen
+ count
> bp
->utf8_allocated
)
431 size_t new_allocated
= 2 * bp
->utf8_allocated
+ 10;
432 if (new_allocated
< bp
->utf8_buflen
+ count
)
433 new_allocated
= bp
->utf8_buflen
+ count
;
434 bp
->utf8_allocated
= new_allocated
;
435 bp
->utf8_buffer
= xrealloc (bp
->utf8_buffer
, new_allocated
);
439 /* Auxiliary function: Append a Unicode character to bp->utf8.
440 uc must be < 0x110000. */
442 string_buffer_append_unicode (struct string_buffer
*bp
, unsigned int uc
)
444 unsigned char utf8buf
[6];
445 int count
= u8_uctomb (utf8buf
, uc
, 6);
448 /* The caller should have ensured that uc is not out-of-range. */
451 string_buffer_append_unicode_grow (bp
, count
);
452 memcpy (bp
->utf8_buffer
+ bp
->utf8_buflen
, utf8buf
, count
);
453 bp
->utf8_buflen
+= count
;
456 /* Auxiliary function: Flush bp->utf16_surr into bp->utf8_buffer. */
458 string_buffer_flush_utf16_surr (struct string_buffer
*bp
)
460 if (bp
->utf16_surr
!= 0)
462 /* A half surrogate is invalid, therefore use U+FFFD instead. */
463 string_buffer_append_unicode (bp
, 0xfffd);
468 /* Auxiliary function: Flush bp->curr_buffer into bp->utf8_buffer. */
470 string_buffer_flush_curr_buffer (struct string_buffer
*bp
, int lineno
)
472 if (bp
->curr_buflen
> 0)
477 string_buffer_append_byte (bp
, '\0');
479 /* Convert from the source encoding to UTF-8. */
480 curr
= from_current_source_encoding (bp
->curr_buffer
,
481 logical_file_name
, lineno
);
483 /* Append it to bp->utf8_buffer. */
484 count
= strlen (curr
);
485 string_buffer_append_unicode_grow (bp
, count
);
486 memcpy (bp
->utf8_buffer
+ bp
->utf8_buflen
, curr
, count
);
487 bp
->utf8_buflen
+= count
;
489 if (curr
!= bp
->curr_buffer
)
495 /* Append a character or Unicode character to a 'struct string_buffer'. */
497 string_buffer_append (struct string_buffer
*bp
, int c
)
501 /* Append a Unicode character. */
503 /* Switch from multibyte character mode to Unicode character mode. */
504 string_buffer_flush_curr_buffer (bp
, line_number
);
506 /* Test whether this character and the previous one form a Unicode
507 surrogate character pair. */
508 if (bp
->utf16_surr
!= 0
509 && (c
>= UNICODE (0xdc00) && c
< UNICODE (0xe000)))
511 unsigned short utf16buf
[2];
514 utf16buf
[0] = bp
->utf16_surr
;
515 utf16buf
[1] = UTF16_VALUE (c
);
516 if (u16_mbtouc_aux (&uc
, utf16buf
, 2) != 2)
519 string_buffer_append_unicode (bp
, uc
);
524 string_buffer_flush_utf16_surr (bp
);
526 if (c
>= UNICODE (0xd800) && c
< UNICODE (0xdc00))
527 bp
->utf16_surr
= UTF16_VALUE (c
);
529 string_buffer_append_unicode (bp
, UTF16_VALUE (c
));
534 /* Append a single byte. */
536 /* Switch from Unicode character mode to multibyte character mode. */
537 string_buffer_flush_utf16_surr (bp
);
539 /* When a newline is seen, convert the accumulated multibyte sequence.
540 This ensures a correct line number in the error message in case of
541 a conversion error. The "- 1" is to account for the newline. */
543 string_buffer_flush_curr_buffer (bp
, line_number
- 1);
545 string_buffer_append_byte (bp
, (unsigned char) c
);
549 /* Return the string buffer's contents. */
551 string_buffer_result (struct string_buffer
*bp
)
553 /* Flush all into bp->utf8_buffer. */
554 string_buffer_flush_utf16_surr (bp
);
555 string_buffer_flush_curr_buffer (bp
, line_number
);
556 /* NUL-terminate it. */
557 string_buffer_append_unicode_grow (bp
, 1);
558 bp
->utf8_buffer
[bp
->utf8_buflen
] = '\0';
560 return bp
->utf8_buffer
;
563 /* Free the memory pointed to by a 'struct string_buffer'. */
565 free_string_buffer (struct string_buffer
*bp
)
567 free (bp
->utf8_buffer
);
568 free (bp
->curr_buffer
);
572 /* ======================== Accumulating comments. ======================== */
575 /* Accumulating a single comment line. */
577 static struct string_buffer comment_buffer
;
582 comment_buffer
.utf8_buflen
= 0;
583 comment_buffer
.utf16_surr
= 0;
584 comment_buffer
.curr_buflen
= 0;
590 return (comment_buffer
.utf8_buflen
== 0 && comment_buffer
.utf16_surr
== 0
591 && comment_buffer
.curr_buflen
== 0);
597 string_buffer_append (&comment_buffer
, c
);
601 comment_line_end (size_t chars_to_remove
)
603 char *buffer
= string_buffer_result (&comment_buffer
);
604 size_t buflen
= strlen (buffer
);
606 buflen
-= chars_to_remove
;
608 && (buffer
[buflen
- 1] == ' ' || buffer
[buflen
- 1] == '\t'))
610 buffer
[buflen
] = '\0';
611 savable_comment_add (buffer
);
615 /* These are for tracking whether comments count as immediately before a keyword. */
617 static int last_comment_line
;
618 static int last_non_comment_line
;
621 /* Replace each comment that is not inside a character constant or string
622 literal with a space or newline character. */
642 /* C style comment. */
644 last_was_star
= false;
650 /* We skip all leading white space, but not EOLs. */
651 if (!(comment_at_start () && (RED (c
) == ' ' || RED (c
) == '\t')))
656 comment_line_end (1);
658 last_was_star
= false;
662 last_was_star
= true;
668 comment_line_end (2);
674 last_was_star
= false;
679 last_comment_line
= line_number
;
683 /* C++ style comment. */
684 last_comment_line
= line_number
;
689 if (RED (c
) == '\n' || c
== P2_EOF
)
691 /* We skip all leading white space, but not EOLs. */
692 if (!(comment_at_start () && (RED (c
) == ' ' || RED (c
) == '\t')))
695 phase3_ungetc (c
); /* push back the newline, to decrement line_number */
696 comment_line_end (0);
697 phase3_getc (); /* read the newline again */
702 /* Supports only one pushback character. */
704 phase4_ungetc (int c
)
710 /* ========================== Reading of tokens. ========================== */
715 token_type_lparen
, /* ( */
716 token_type_rparen
, /* ) */
717 token_type_lbrace
, /* { */
718 token_type_rbrace
, /* } */
719 token_type_comma
, /* , */
720 token_type_dot
, /* . */
721 token_type_string_literal
, /* "abc" */
722 token_type_number
, /* 1.23 */
723 token_type_symbol
, /* identifier, keyword, null */
724 token_type_plus
, /* + */
725 token_type_other
/* character literal, misc. operator */
727 typedef enum token_type_ty token_type_ty
;
729 typedef struct token_ty token_ty
;
733 char *string
; /* for token_type_string_literal, token_type_symbol */
734 refcounted_string_list_ty
*comment
; /* for token_type_string_literal */
739 /* Free the memory pointed to by a 'struct token_ty'. */
741 free_token (token_ty
*tp
)
743 if (tp
->type
== token_type_string_literal
|| tp
->type
== token_type_symbol
)
745 if (tp
->type
== token_type_string_literal
)
746 drop_reference (tp
->comment
);
750 /* Read an escape sequence inside a string literal or character literal. */
756 /* Use phase 3, because phase 4 elides comments. */
759 return UNICODE ('\\');
763 return UNICODE (0x08);
765 return UNICODE (0x09);
767 return UNICODE (0x0a);
769 return UNICODE (0x0c);
771 return UNICODE (0x0d);
773 return UNICODE ('"');
775 return UNICODE ('\'');
777 return UNICODE ('\\');
778 case '0': case '1': case '2': case '3':
779 case '4': case '5': case '6': case '7':
781 int n
= RED (c
) - '0';
782 bool maybe3digits
= (n
< 4);
785 if (RED (c
) >= '0' && RED (c
) <= '7')
787 n
= (n
<< 3) + (RED (c
) - '0');
791 if (RED (c
) >= '0' && RED (c
) <= '7')
792 n
= (n
<< 3) + (RED (c
) - '0');
803 /* Invalid escape sequence. */
805 return UNICODE ('\\');
809 /* Read a string literal or character literal. */
811 accumulate_escaped (struct string_buffer
*literal
, int delimiter
)
817 /* Use phase 3, because phase 4 elides comments. */
819 if (c
== P2_EOF
|| RED (c
) == delimiter
)
824 error_with_progname
= false;
825 if (delimiter
== '\'')
826 error (0, 0, _("%s:%d: warning: unterminated character constant"),
827 logical_file_name
, line_number
);
829 error (0, 0, _("%s:%d: warning: unterminated string constant"),
830 logical_file_name
, line_number
);
831 error_with_progname
= true;
835 c
= do_getc_escaped ();
836 string_buffer_append (literal
, c
);
841 /* Combine characters into tokens. Discard whitespace. */
843 static token_ty phase5_pushback
[3];
844 static int phase5_pushback_length
;
847 phase5_get (token_ty
*tp
)
851 if (phase5_pushback_length
)
853 *tp
= phase5_pushback
[--phase5_pushback_length
];
860 tp
->line_number
= line_number
;
865 tp
->type
= token_type_eof
;
872 if (last_non_comment_line
> last_comment_line
)
873 savable_comment_reset ();
878 /* Ignore whitespace and comments. */
882 last_non_comment_line
= tp
->line_number
;
887 tp
->type
= token_type_lparen
;
891 tp
->type
= token_type_rparen
;
895 tp
->type
= token_type_lbrace
;
899 tp
->type
= token_type_rbrace
;
903 tp
->type
= token_type_comma
;
908 if (!(RED (c
) >= '0' && RED (c
) <= '9'))
911 tp
->type
= token_type_dot
;
916 case '0': case '1': case '2': case '3': case '4':
917 case '5': case '6': case '7': case '8': case '9':
919 /* Don't need to verify the complicated syntax of integers and
920 floating-point numbers. We assume a valid Java input.
921 The simplified syntax that we recognize as number is: any
922 sequence of alphanumeric characters, additionally '+' and '-'
923 immediately after 'e' or 'E' except in hexadecimal numbers. */
924 bool hexadecimal
= false;
929 if (RED (c
) >= '0' && RED (c
) <= '9')
931 if ((RED (c
) >= 'A' && RED (c
) <= 'Z')
932 || (RED (c
) >= 'a' && RED (c
) <= 'z'))
934 if (RED (c
) == 'X' || RED (c
) == 'x')
936 if ((RED (c
) == 'E' || RED (c
) == 'e') && !hexadecimal
)
939 if (!(RED (c
) == '+' || RED (c
) == '-'))
949 tp
->type
= token_type_number
;
953 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
954 case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
955 case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
956 case 'V': case 'W': case 'X': case 'Y': case 'Z':
958 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
959 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
960 case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
961 case 'v': case 'w': case 'x': case 'y': case 'z':
962 /* Although Java allows identifiers containing many Unicode
963 characters, we recognize only identifiers consisting of ASCII
964 characters. This avoids conversion hassles w.r.t. the --keyword
965 arguments, and shouldn't be a big problem in practice. */
972 if (bufpos
>= bufmax
)
974 bufmax
= 2 * bufmax
+ 10;
975 buffer
= xrealloc (buffer
, bufmax
);
977 buffer
[bufpos
++] = RED (c
);
979 if (!((RED (c
) >= 'A' && RED (c
) <= 'Z')
980 || (RED (c
) >= 'a' && RED (c
) <= 'z')
981 || (RED (c
) >= '0' && RED (c
) <= '9')
986 if (bufpos
>= bufmax
)
988 bufmax
= 2 * bufmax
+ 10;
989 buffer
= xrealloc (buffer
, bufmax
);
991 buffer
[bufpos
] = '\0';
992 tp
->string
= xstrdup (buffer
);
993 tp
->type
= token_type_symbol
;
998 /* String literal. */
1000 struct string_buffer literal
;
1002 init_string_buffer (&literal
);
1003 accumulate_escaped (&literal
, '"');
1004 tp
->string
= xstrdup (string_buffer_result (&literal
));
1005 free_string_buffer (&literal
);
1006 tp
->comment
= add_reference (savable_comment
);
1007 tp
->type
= token_type_string_literal
;
1012 /* Character literal. */
1014 struct string_buffer literal
;
1016 init_string_buffer (&literal
);
1017 accumulate_escaped (&literal
, '\'');
1018 free_string_buffer (&literal
);
1019 tp
->type
= token_type_other
;
1027 tp
->type
= token_type_other
;
1028 else if (RED (c
) == '=')
1030 tp
->type
= token_type_other
;
1035 tp
->type
= token_type_plus
;
1040 /* Misc. operator. */
1041 tp
->type
= token_type_other
;
1047 /* Supports 3 tokens of pushback. */
1049 phase5_unget (token_ty
*tp
)
1051 if (tp
->type
!= token_type_eof
)
1053 if (phase5_pushback_length
== SIZEOF (phase5_pushback
))
1055 phase5_pushback
[phase5_pushback_length
++] = *tp
;
1060 /* Compile-time optimization of string literal concatenation.
1061 Combine "string1" + ... + "stringN" to the concatenated string if
1062 - the token before this expression is not ')' (because then the first
1063 string could be part of a cast expression),
1064 - the token after this expression is not '.' (because then the last
1065 string could be part of a method call expression). */
1067 static token_ty phase6_pushback
[2];
1068 static int phase6_pushback_length
;
1070 static token_type_ty phase6_last
;
1073 phase6_get (token_ty
*tp
)
1075 if (phase6_pushback_length
)
1077 *tp
= phase6_pushback
[--phase6_pushback_length
];
1082 if (tp
->type
== token_type_string_literal
&& phase6_last
!= token_type_rparen
)
1084 char *sum
= tp
->string
;
1085 size_t sum_len
= strlen (sum
);
1091 phase5_get (&token2
);
1092 if (token2
.type
== token_type_plus
)
1096 phase5_get (&token3
);
1097 if (token3
.type
== token_type_string_literal
)
1099 token_ty token_after
;
1101 phase5_get (&token_after
);
1102 if (token_after
.type
!= token_type_dot
)
1104 char *addend
= token3
.string
;
1105 size_t addend_len
= strlen (addend
);
1107 sum
= (char *) xrealloc (sum
, sum_len
+ addend_len
+ 1);
1108 memcpy (sum
+ sum_len
, addend
, addend_len
+ 1);
1109 sum_len
+= addend_len
;
1111 phase5_unget (&token_after
);
1112 free_token (&token3
);
1113 free_token (&token2
);
1116 phase5_unget (&token_after
);
1118 phase5_unget (&token3
);
1120 phase5_unget (&token2
);
1125 phase6_last
= tp
->type
;
1128 /* Supports 2 tokens of pushback. */
1130 phase6_unget (token_ty
*tp
)
1132 if (tp
->type
!= token_type_eof
)
1134 if (phase6_pushback_length
== SIZEOF (phase6_pushback
))
1136 phase6_pushback
[phase6_pushback_length
++] = *tp
;
1142 x_java_lex (token_ty
*tp
)
1147 /* Supports 2 tokens of pushback. */
1149 x_java_unlex (token_ty
*tp
)
1155 /* ========================= Extracting strings. ========================== */
1158 /* Context lookup table. */
1159 static flag_context_list_table_ty
*flag_context_list_table
;
1162 /* The file is broken into tokens. Scan the token stream, looking for
1163 a keyword, followed by a left paren, followed by a string. When we
1164 see this sequence, we have something to remember. We assume we are
1165 looking at a valid Java program, and leave the complaints about
1166 the grammar to the compiler.
1168 Normal handling: Look for
1169 keyword ( ... msgid ... )
1170 Plural handling: Look for
1171 keyword ( ... msgid ... msgid_plural ... )
1173 We use recursion because the arguments before msgid or between msgid
1174 and msgid_plural can contain subexpressions of the same form. */
1177 /* Extract messages until the next balanced closing parenthesis or brace,
1178 depending on TERMINATOR.
1179 Extracted messages are added to MLP.
1180 When a specific argument shall be extracted, COMMAS_TO_SKIP >= 0 and,
1181 if also a plural argument shall be extracted, PLURAL_COMMAS > 0,
1182 otherwise PLURAL_COMMAS = 0.
1183 When no specific argument shall be extracted, COMMAS_TO_SKIP < 0.
1184 Return true upon eof, false upon closing parenthesis or brace. */
1186 extract_parenthesized (message_list_ty
*mlp
, token_type_ty terminator
,
1187 flag_context_ty outer_context
,
1188 flag_context_list_iterator_ty context_iter
,
1189 int commas_to_skip
, int plural_commas
)
1191 /* Remember the message containing the msgid, for msgid_plural. */
1192 message_ty
*plural_mp
= NULL
;
1194 /* 0 when no keyword has been seen. 1 right after a keyword is seen. */
1196 /* Parameters of the keyword just seen. Defined only in state 1. */
1197 int next_commas_to_skip
= -1;
1198 int next_plural_commas
= 0;
1199 /* Context iterator that will be used if the next token is a '('. */
1200 flag_context_list_iterator_ty next_context_iter
=
1201 passthrough_context_list_iterator
;
1202 /* Current context. */
1203 flag_context_ty inner_context
=
1204 inherited_context (outer_context
,
1205 flag_context_list_iterator_advance (&context_iter
));
1207 /* Start state is 0. */
1214 x_java_lex (&token
);
1217 case token_type_symbol
:
1219 /* Combine symbol1 . ... . symbolN to a single string, so that
1220 we can recognize static function calls like
1221 GettextResource.gettext. The information present for
1222 symbolI.....symbolN has precedence over the information for
1223 symbolJ.....symbolN with J > I. */
1224 char *sum
= token
.string
;
1225 size_t sum_len
= strlen (sum
);
1226 const char *dottedname
;
1227 flag_context_list_ty
*context_list
;
1233 x_java_lex (&token2
);
1234 if (token2
.type
== token_type_dot
)
1238 x_java_lex (&token3
);
1239 if (token3
.type
== token_type_symbol
)
1241 char *addend
= token3
.string
;
1242 size_t addend_len
= strlen (addend
);
1245 (char *) xrealloc (sum
, sum_len
+ 1 + addend_len
+ 1);
1247 memcpy (sum
+ sum_len
+ 1, addend
, addend_len
+ 1);
1248 sum_len
+= 1 + addend_len
;
1250 free_token (&token3
);
1251 free_token (&token2
);
1254 x_java_unlex (&token3
);
1256 x_java_unlex (&token2
);
1260 for (dottedname
= sum
;;)
1262 void *keyword_value
;
1264 if (find_entry (&keywords
, dottedname
, strlen (dottedname
),
1268 int argnum1
= (int) (long) keyword_value
& ((1 << 10) - 1);
1269 int argnum2
= (int) (long) keyword_value
>> 10;
1271 next_commas_to_skip
= argnum1
- 1;
1272 next_plural_commas
= (argnum2
> argnum1
? argnum2
- argnum1
: 0);
1277 dottedname
= strchr (dottedname
, '.');
1278 if (dottedname
== NULL
)
1286 for (dottedname
= sum
;;)
1289 flag_context_list_table_lookup (
1290 flag_context_list_table
,
1291 dottedname
, strlen (dottedname
));
1292 if (context_list
!= NULL
)
1295 dottedname
= strchr (dottedname
, '.');
1296 if (dottedname
== NULL
)
1300 next_context_iter
= flag_context_list_iterator (context_list
);
1306 case token_type_lparen
:
1307 if (extract_parenthesized (mlp
, token_type_rparen
,
1308 inner_context
, next_context_iter
,
1309 state
? next_commas_to_skip
: -1,
1310 state
? next_plural_commas
: 0))
1312 next_context_iter
= null_context_list_iterator
;
1316 case token_type_rparen
:
1317 if (terminator
== token_type_rparen
)
1319 if (terminator
== token_type_rbrace
)
1321 error_with_progname
= false;
1323 _("%s:%d: warning: ')' found where '}' was expected"),
1324 logical_file_name
, token
.line_number
);
1325 error_with_progname
= true;
1327 next_context_iter
= null_context_list_iterator
;
1331 case token_type_lbrace
:
1332 if (extract_parenthesized (mlp
, token_type_rbrace
,
1333 null_context
, null_context_list_iterator
,
1336 next_context_iter
= null_context_list_iterator
;
1340 case token_type_rbrace
:
1341 if (terminator
== token_type_rbrace
)
1343 if (terminator
== token_type_rparen
)
1345 error_with_progname
= false;
1347 _("%s:%d: warning: '}' found where ')' was expected"),
1348 logical_file_name
, token
.line_number
);
1349 error_with_progname
= true;
1351 next_context_iter
= null_context_list_iterator
;
1355 case token_type_comma
:
1356 if (commas_to_skip
>= 0)
1358 if (commas_to_skip
> 0)
1361 if (plural_mp
!= NULL
&& plural_commas
> 0)
1363 commas_to_skip
= plural_commas
- 1;
1367 commas_to_skip
= -1;
1370 inherited_context (outer_context
,
1371 flag_context_list_iterator_advance (
1373 next_context_iter
= passthrough_context_list_iterator
;
1377 case token_type_string_literal
:
1380 pos
.file_name
= logical_file_name
;
1381 pos
.line_number
= token
.line_number
;
1385 xgettext_current_source_encoding
= po_charset_utf8
;
1386 savable_comment_to_xgettext_comment (token
.comment
);
1387 remember_a_message (mlp
, token
.string
, inner_context
, &pos
);
1388 savable_comment_reset ();
1389 xgettext_current_source_encoding
= xgettext_global_source_encoding
;
1393 if (commas_to_skip
== 0)
1395 if (plural_mp
== NULL
)
1397 /* Seen an msgid. */
1400 xgettext_current_source_encoding
= po_charset_utf8
;
1401 savable_comment_to_xgettext_comment (token
.comment
);
1402 mp
= remember_a_message (mlp
, token
.string
,
1403 inner_context
, &pos
);
1404 savable_comment_reset ();
1405 xgettext_current_source_encoding
= xgettext_global_source_encoding
;
1406 if (plural_commas
> 0)
1411 /* Seen an msgid_plural. */
1412 xgettext_current_source_encoding
= po_charset_utf8
;
1413 remember_a_message_plural (plural_mp
, token
.string
,
1414 inner_context
, &pos
);
1415 xgettext_current_source_encoding
= xgettext_global_source_encoding
;
1420 free (token
.string
);
1423 drop_reference (token
.comment
);
1424 next_context_iter
= null_context_list_iterator
;
1428 case token_type_eof
:
1431 case token_type_dot
:
1432 case token_type_number
:
1433 case token_type_plus
:
1434 case token_type_other
:
1435 next_context_iter
= null_context_list_iterator
;
1447 extract_java (FILE *f
,
1448 const char *real_filename
, const char *logical_filename
,
1449 flag_context_list_table_ty
*flag_table
,
1450 msgdomain_list_ty
*mdlp
)
1452 message_list_ty
*mlp
= mdlp
->item
[0]->messages
;
1455 real_file_name
= real_filename
;
1456 logical_file_name
= xstrdup (logical_filename
);
1459 last_comment_line
= -1;
1460 last_non_comment_line
= -1;
1462 phase6_last
= token_type_eof
;
1464 flag_context_list_table
= flag_table
;
1468 /* Eat tokens until eof is seen. When extract_parenthesized returns
1469 due to an unbalanced closing parenthesis, just restart it. */
1470 while (!extract_parenthesized (mlp
, token_type_eof
,
1471 null_context
, null_context_list_iterator
,
1476 real_file_name
= NULL
;
1477 logical_file_name
= NULL
;