1 /* xgettext C/C++/ObjectiveC backend.
2 Copyright (C) 1995-1998, 2000-2004 Free Software Foundation, Inc.
4 This file was written by Peter Miller <millerp@canb.auug.org.au>
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software Foundation,
18 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
34 #include "error-progname.h"
40 #define _(s) gettext(s)
42 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
45 /* The ANSI C standard defines several phases of translation:
47 1. Terminate line by \n, regardless of the external representation
48 of a text line. Stdio does this for us.
50 2. Convert trigraphs to their single character equivalents.
52 3. Concatenate each line ending in backslash (\) with the following
55 4. Replace each comment with a space character.
57 5. Parse each resulting logical line as preprocessing tokens and white space.
60 6. Recognize and carry out directives (it also expands macros on
61 non-directive lines, which we do not do here).
63 7. Replace escape sequences within character strings with their
64 single character equivalents (we do this in step 5, because we
65 don't have to worry about the #include argument).
67 8. Concatenate adjacent string literals to form single string
68 literals (because we don't expand macros, there are a few things
71 9. Convert the remaining preprocessing tokens to C tokens and
72 discard any white space from the translation unit.
74 This lexer implements the above, and presents the scanner (in
75 xgettext.c) with a stream of C tokens. The comments are
76 accumulated in a buffer, and given to xgettext when asked for. */
79 /* ========================= Lexer customization. ========================= */
81 static bool trigraphs
= false;
90 /* ====================== Keyword set customization. ====================== */
92 /* If true extract all strings. */
93 static bool extract_all
= false;
95 static hash_table c_keywords
;
96 static hash_table objc_keywords
;
97 static bool default_keywords
= true;
108 add_keyword (const char *name
, hash_table
*keywords
)
111 default_keywords
= false;
119 if (keywords
->table
== NULL
)
120 init_hash (keywords
, 100);
122 split_keywordspec (name
, &end
, &argnum1
, &argnum2
);
124 /* The characters between name and end should form a valid C identifier.
125 A colon means an invalid parse in split_keywordspec(). */
126 colon
= strchr (name
, ':');
127 if (colon
== NULL
|| colon
>= end
)
131 insert_entry (keywords
, name
, end
- name
,
132 (void *) (long) (argnum1
+ (argnum2
<< 10)));
138 x_c_keyword (const char *name
)
140 add_keyword (name
, &c_keywords
);
144 x_objc_keyword (const char *name
)
146 add_keyword (name
, &objc_keywords
);
149 /* Finish initializing the keywords hash tables.
150 Called after argument processing, before each file is processed. */
154 if (default_keywords
)
156 x_c_keyword ("gettext");
157 x_c_keyword ("dgettext:2");
158 x_c_keyword ("dcgettext:2");
159 x_c_keyword ("ngettext:1,2");
160 x_c_keyword ("dngettext:2,3");
161 x_c_keyword ("dcngettext:2,3");
162 x_c_keyword ("gettext_noop");
164 x_objc_keyword ("gettext");
165 x_objc_keyword ("dgettext:2");
166 x_objc_keyword ("dcgettext:2");
167 x_objc_keyword ("ngettext:1,2");
168 x_objc_keyword ("dngettext:2,3");
169 x_objc_keyword ("dcngettext:2,3");
170 x_objc_keyword ("gettext_noop");
171 x_objc_keyword ("NSLocalizedString"); /* similar to gettext */
172 x_objc_keyword ("_"); /* similar to gettext */
173 x_objc_keyword ("NSLocalizedStaticString"); /* similar to gettext_noop */
174 x_objc_keyword ("__"); /* similar to gettext_noop */
176 default_keywords
= false;
183 xgettext_record_flag ("gettext:1:pass-c-format");
184 xgettext_record_flag ("dgettext:2:pass-c-format");
185 xgettext_record_flag ("dcgettext:2:pass-c-format");
186 xgettext_record_flag ("ngettext:1:pass-c-format");
187 xgettext_record_flag ("ngettext:2:pass-c-format");
188 xgettext_record_flag ("dngettext:2:pass-c-format");
189 xgettext_record_flag ("dngettext:3:pass-c-format");
190 xgettext_record_flag ("dcngettext:2:pass-c-format");
191 xgettext_record_flag ("dcngettext:3:pass-c-format");
192 xgettext_record_flag ("gettext_noop:1:pass-c-format");
194 xgettext_record_flag ("fprintf:2:c-format");
195 xgettext_record_flag ("vfprintf:2:c-format");
196 xgettext_record_flag ("printf:1:c-format");
197 xgettext_record_flag ("vprintf:1:c-format");
198 xgettext_record_flag ("sprintf:2:c-format");
199 xgettext_record_flag ("vsprintf:2:c-format");
200 xgettext_record_flag ("snprintf:3:c-format");
201 xgettext_record_flag ("vsnprintf:3:c-format");
202 #if 0 /* These functions are not standard. */
204 xgettext_record_flag ("asprintf:2:c-format");
205 xgettext_record_flag ("vasprintf:2:c-format");
206 xgettext_record_flag ("dprintf:2:c-format");
207 xgettext_record_flag ("vdprintf:2:c-format");
208 xgettext_record_flag ("obstack_printf:2:c-format");
209 xgettext_record_flag ("obstack_vprintf:2:c-format");
211 xgettext_record_flag ("error:3:c-format");
212 xgettext_record_flag ("error_at_line:5:c-format");
214 xgettext_record_flag ("argp_error:2:c-format");
215 xgettext_record_flag ("argp_failure:2:c-format");
220 init_flag_table_objc ()
222 /* Since the settings done in init_flag_table_c() also have an effect for
223 the ObjectiveC parser, we don't have to repeat them here. */
224 xgettext_record_flag ("gettext:1:pass-objc-format");
225 xgettext_record_flag ("dgettext:2:pass-objc-format");
226 xgettext_record_flag ("dcgettext:2:pass-objc-format");
227 xgettext_record_flag ("ngettext:1:pass-objc-format");
228 xgettext_record_flag ("ngettext:2:pass-objc-format");
229 xgettext_record_flag ("dngettext:2:pass-objc-format");
230 xgettext_record_flag ("dngettext:3:pass-objc-format");
231 xgettext_record_flag ("dcngettext:2:pass-objc-format");
232 xgettext_record_flag ("dcngettext:3:pass-objc-format");
233 xgettext_record_flag ("gettext_noop:1:pass-objc-format");
234 xgettext_record_flag ("NSLocalizedString:1:pass-c-format");
235 xgettext_record_flag ("NSLocalizedString:1:pass-objc-format");
236 xgettext_record_flag ("_:1:pass-c-format");
237 xgettext_record_flag ("_:1:pass-objc-format");
238 xgettext_record_flag ("stringWithFormat::1:objc-format");
239 xgettext_record_flag ("initWithFormat::1:objc-format");
240 xgettext_record_flag ("stringByAppendingFormat::1:objc-format");
241 xgettext_record_flag ("localizedStringWithFormat::1:objc-format");
242 xgettext_record_flag ("appendFormat::1:objc-format");
246 init_flag_table_gcc_internal ()
248 xgettext_record_flag ("gettext:1:pass-gcc-internal-format");
249 xgettext_record_flag ("dgettext:2:pass-gcc-internal-format");
250 xgettext_record_flag ("dcgettext:2:pass-gcc-internal-format");
251 xgettext_record_flag ("ngettext:1:pass-gcc-internal-format");
252 xgettext_record_flag ("ngettext:2:pass-gcc-internal-format");
253 xgettext_record_flag ("dngettext:2:pass-gcc-internal-format");
254 xgettext_record_flag ("dngettext:3:pass-gcc-internal-format");
255 xgettext_record_flag ("dcngettext:2:pass-gcc-internal-format");
256 xgettext_record_flag ("dcngettext:3:pass-gcc-internal-format");
257 xgettext_record_flag ("gettext_noop:1:pass-gcc-internal-format");
258 #if 0 /* This should better be done inside GCC. */
259 /* grepping for ATTRIBUTE_PRINTF in gcc-3.3/gcc/?*.h */
261 xgettext_record_flag ("status_warning:2:gcc-internal-format");
263 xgettext_record_flag ("pedwarn_c99:1:pass-gcc-internal-format");
265 //xgettext_record_flag ("error:1:c-format"); // 3 different versions
266 xgettext_record_flag ("notice:1:c-format");
267 //xgettext_record_flag ("fatal:1:c-format"); // 2 different versions
268 xgettext_record_flag ("fatal_perror:1:c-format");
270 xgettext_record_flag ("cpp_error:3:c-format");
271 xgettext_record_flag ("cpp_error_with_line:5:c-format");
273 xgettext_record_flag ("diagnostic_set_info:2:pass-gcc-internal-format");
274 xgettext_record_flag ("output_printf:2:gcc-internal-format");
275 xgettext_record_flag ("output_verbatim:2:pass-gcc-internal-format");
276 xgettext_record_flag ("verbatim:1:gcc-internal-format");
277 xgettext_record_flag ("inform:1:pass-gcc-internal-format");
279 //xgettext_record_flag ("fatal:1:c-format"); // 2 different versions
280 //xgettext_record_flag ("error:1:c-format"); // 3 different versions
282 xgettext_record_flag ("attr_printf:2:pass-c-format");
284 xgettext_record_flag ("error_at_line:2:pass-c-format");
285 xgettext_record_flag ("xvasprintf:2:pass-c-format");
286 xgettext_record_flag ("xasprintf:1:pass-c-format");
287 xgettext_record_flag ("oprintf:2:pass-c-format");
289 xgettext_record_flag ("message_with_line:2:pass-c-format");
291 xgettext_record_flag ("output_operand_lossage:1:c-format");
293 xgettext_record_flag ("ra_debug_msg:2:pass-c-format");
295 xgettext_record_flag ("fnotice:2:c-format");
296 xgettext_record_flag ("fatal_io_error:2:gcc-internal-format");
297 xgettext_record_flag ("error_for_asm:2:pass-gcc-internal-format");
298 xgettext_record_flag ("warning_for_asm:2:pass-gcc-internal-format");
299 xgettext_record_flag ("error_with_file_and_line:3:pass-gcc-internal-format");
300 xgettext_record_flag ("error_with_decl:2:pass-gcc-internal-format");
301 xgettext_record_flag ("pedwarn:1:gcc-internal-format");
302 xgettext_record_flag ("pedwarn_with_file_and_line:3:gcc-internal-format");
303 xgettext_record_flag ("pedwarn_with_decl:2:gcc-internal-format");
304 xgettext_record_flag ("sorry:1:gcc-internal-format");
305 xgettext_record_flag ("error:1:pass-gcc-internal-format");
306 xgettext_record_flag ("fatal_error:1:pass-gcc-internal-format");
307 xgettext_record_flag ("internal_error:1:pass-gcc-internal-format");
308 xgettext_record_flag ("warning:1:pass-gcc-internal-format");
309 xgettext_record_flag ("warning_with_file_and_line:3:pass-gcc-internal-format");
310 xgettext_record_flag ("warning_with_decl:2:pass-gcc-internal-format");
312 xgettext_record_flag ("ffecom_get_invented_identifier:1:pass-c-format");
314 xgettext_record_flag ("ffests_printf:2:pass-c-format");
315 /* java/java-tree.h */
316 xgettext_record_flag ("parse_error_context:2:pass-c-format");
321 /* ======================== Reading of characters. ======================== */
323 /* Real filename, used in error messages about the input file. */
324 static const char *real_file_name
;
326 /* Logical filename and line number, used to label the extracted messages. */
327 static char *logical_file_name
;
328 static int line_number
;
330 /* The input file stream. */
334 /* 0. Terminate line by \n, regardless of whether the external representation of
335 a line terminator is LF (Unix), CR (Mac) or CR/LF (DOS/Windows).
336 It is debatable whether supporting CR/LF line terminators in C sources
337 on Unix is ISO C or POSIX compliant, but since GCC 3.3 now supports it
338 unconditionally, it must be OK.
339 The so-called "text mode" in stdio on DOS/Windows translates CR/LF to \n
340 automatically, but here we also need this conversion on Unix. As a side
341 effect, on DOS/Windows we also parse CR/CR/LF into a single \n, but this
354 error (EXIT_FAILURE
, errno
, _("error while reading \"%s\""),
363 if (c1
!= EOF
&& c1
!= '\n')
366 /* Seen line terminator CR or CR/LF. */
374 /* Supports only one pushback character, and not '\n'. */
376 phase0_ungetc (int c
)
383 /* 1. line_number handling. Combine backslash-newline to nothing. */
385 static unsigned char phase1_pushback
[2];
386 static int phase1_pushback_length
;
394 if (phase1_pushback_length
)
396 c
= phase1_pushback
[--phase1_pushback_length
];
427 /* Supports 2 characters of pushback. */
429 phase1_ungetc (int c
)
441 if (phase1_pushback_length
== SIZEOF (phase1_pushback
))
443 phase1_pushback
[phase1_pushback_length
++] = c
;
449 /* 2. Convert trigraphs to their single character equivalents. Most
450 sane human beings vomit copiously at the mention of trigraphs, which
451 is why they are an option. */
453 static unsigned char phase2_pushback
[1];
454 static int phase2_pushback_length
;
462 if (phase2_pushback_length
)
463 return phase2_pushback
[--phase2_pushback_length
];
465 return phase1_getc ();
504 /* Supports only one pushback character. */
506 phase2_ungetc (int c
)
510 if (phase2_pushback_length
== SIZEOF (phase2_pushback
))
512 phase2_pushback
[phase2_pushback_length
++] = c
;
517 /* 3. Concatenate each line ending in backslash (\) with the following
518 line. Basically, all you need to do is elide "\\\n" sequences from
521 static unsigned char phase3_pushback
[2];
522 static int phase3_pushback_length
;
528 if (phase3_pushback_length
)
529 return phase3_pushback
[--phase3_pushback_length
];
532 int c
= phase2_getc ();
545 /* Supports 2 characters of pushback. */
547 phase3_ungetc (int c
)
551 if (phase3_pushback_length
== SIZEOF (phase3_pushback
))
553 phase3_pushback
[phase3_pushback_length
++] = c
;
558 /* Accumulating comments. */
561 static size_t bufmax
;
562 static size_t buflen
;
573 if (buflen
>= bufmax
)
575 bufmax
= 2 * bufmax
+ 10;
576 buffer
= xrealloc (buffer
, bufmax
);
578 buffer
[buflen
++] = c
;
582 comment_line_end (size_t chars_to_remove
)
584 buflen
-= chars_to_remove
;
586 && (buffer
[buflen
- 1] == ' ' || buffer
[buflen
- 1] == '\t'))
588 if (chars_to_remove
== 0 && buflen
>= bufmax
)
590 bufmax
= 2 * bufmax
+ 10;
591 buffer
= xrealloc (buffer
, bufmax
);
593 buffer
[buflen
] = '\0';
594 savable_comment_add (buffer
);
598 /* These are for tracking whether comments count as immediately before
600 static int last_comment_line
;
601 static int last_non_comment_line
;
602 static int newline_count
;
605 /* 4. Replace each comment that is not inside a character constant or
606 string literal with a space character. We need to remember the
607 comment for later, because it may be attached to a keyword string.
608 We also optionally understand C++ comments. */
629 last_was_star
= false;
635 /* We skip all leading white space, but not EOLs. */
636 if (!(buflen
== 0 && (c
== ' ' || c
== '\t')))
641 comment_line_end (1);
643 last_was_star
= false;
647 last_was_star
= true;
653 comment_line_end (2);
659 last_was_star
= false;
664 last_comment_line
= newline_count
;
668 /* C++ or ISO C 99 comment. */
673 if (c
== '\n' || c
== EOF
)
675 /* We skip all leading white space, but not EOLs. */
676 if (!(buflen
== 0 && (c
== ' ' || c
== '\t')))
679 comment_line_end (0);
680 last_comment_line
= newline_count
;
686 /* Supports only one pushback character. */
688 phase4_ungetc (int c
)
694 /* ========================== Reading of tokens. ========================== */
697 /* True if ObjectiveC extensions are recognized. */
698 static bool objc_extensions
;
702 token_type_character_constant
, /* 'x' */
705 token_type_hash
, /* # */
706 token_type_lparen
, /* ( */
707 token_type_rparen
, /* ) */
708 token_type_comma
, /* , */
709 token_type_colon
, /* : */
710 token_type_name
, /* abc */
711 token_type_number
, /* 2.7 */
712 token_type_string_literal
, /* "abc" */
713 token_type_symbol
, /* < > = etc. */
714 token_type_objc_special
, /* @ */
715 token_type_white_space
717 typedef enum token_type_ty token_type_ty
;
719 typedef struct token_ty token_ty
;
723 char *string
; /* for token_type_name, token_type_string_literal */
724 refcounted_string_list_ty
*comment
; /* for token_type_string_literal,
725 token_type_objc_special */
731 /* 7. Replace escape sequences within character strings with their
732 single character equivalents. This is called from phase 5, because
733 we don't have to worry about the #include argument. There are
734 pathological cases which could bite us (like the DOS directory
735 separator), but just pretend it can't happen. */
737 #define P7_QUOTES (1000 + '"')
738 #define P7_QUOTE (1000 + '\'')
739 #define P7_NEWLINE (1000 + '\n')
746 /* Use phase 3, because phase 4 elides comments. */
749 /* Return a magic newline indicator, so that we can distinguish
750 between the user requesting a newline in the string (e.g. using
751 "\n" or "\012") from the user failing to terminate the string or
752 character constant. The ANSI C standard says: 3.1.3.4 Character
753 Constants contain ``any character except single quote, backslash or
754 newline; or an escape sequence'' and 3.1.4 String Literals contain
755 ``any character except double quote, backslash or newline; or an
758 Most compilers give a fatal error in this case, however gcc is
759 stupidly silent, even though this is a very common typo. OK, so
760 gcc --pedantic will tell me, but that gripes about too much other
761 stuff. Could I have a ``gcc -Wnewline-in-string'' option, or
762 better yet a ``gcc -fno-newline-in-string'' option, please? Gcc is
763 also inconsistent between string literals and character constants:
764 you may not embed newlines in character constants; try it, you get
765 a useful diagnostic. --PMiller */
779 /* Unknown escape sequences really should be an error, but just
780 ignore them, and let the real compiler complain. */
795 /* The \e escape is peculiar to gcc, and assumes an ASCII
796 character set (or superset). We don't provide support for it
819 case '0': case '1': case '2': case '3': case '4':
820 case '5': case '6': case '7': case '8': case '9':
821 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
822 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
834 case '0': case '1': case '2': case '3': case '4':
835 case '5': case '6': case '7': case '8': case '9':
836 n
= n
* 16 + c
- '0';
839 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
840 n
= n
* 16 + 10 + c
- 'A';
843 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
844 n
= n
* 16 + 10 + c
- 'a';
851 case '0': case '1': case '2': case '3':
852 case '4': case '5': case '6': case '7':
854 for (j
= 0; j
< 3; ++j
)
863 case '0': case '1': case '2': case '3':
864 case '4': case '5': case '6': case '7':
876 phase7_ungetc (int c
)
882 /* Free the memory pointed to by a 'struct token_ty'. */
884 free_token (token_ty
*tp
)
886 if (tp
->type
== token_type_name
|| tp
->type
== token_type_string_literal
)
888 if (tp
->type
== token_type_string_literal
889 || tp
->type
== token_type_objc_special
)
890 drop_reference (tp
->comment
);
894 /* 5. Parse each resulting logical line as preprocessing tokens and
895 white space. Preprocessing tokens and C tokens don't always match. */
897 static token_ty phase5_pushback
[1];
898 static int phase5_pushback_length
;
902 phase5_get (token_ty
*tp
)
909 if (phase5_pushback_length
)
911 *tp
= phase5_pushback
[--phase5_pushback_length
];
916 tp
->line_number
= line_number
;
921 tp
->type
= token_type_eof
;
925 tp
->type
= token_type_eoln
;
947 tp
->type
= token_type_white_space
;
950 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
951 case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
952 case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
953 case 'V': case 'W': case 'X': case 'Y': case 'Z':
955 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
956 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
957 case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
958 case 'v': case 'w': case 'x': case 'y': case 'z':
962 if (bufpos
>= bufmax
)
964 bufmax
= 2 * bufmax
+ 10;
965 buffer
= xrealloc (buffer
, bufmax
);
967 buffer
[bufpos
++] = c
;
971 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
972 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
973 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
974 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
977 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
978 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
979 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
980 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
982 case '0': case '1': case '2': case '3': case '4':
983 case '5': case '6': case '7': case '8': case '9':
992 if (bufpos
>= bufmax
)
994 bufmax
= 2 * bufmax
+ 10;
995 buffer
= xrealloc (buffer
, bufmax
);
998 tp
->string
= xstrdup (buffer
);
999 tp
->type
= token_type_name
;
1008 tp
->type
= token_type_symbol
;
1011 case '0': case '1': case '2': case '3': case '4':
1012 case '5': case '6': case '7': case '8': case '9':
1018 case '0': case '1': case '2': case '3': case '4':
1019 case '5': case '6': case '7': case '8': case '9':
1020 /* The preprocessing number token is more "generous" than the C
1021 number tokens. This is mostly due to token pasting (another
1022 thing we can ignore here). */
1026 if (bufpos
>= bufmax
)
1028 bufmax
= 2 * bufmax
+ 10;
1029 buffer
= xrealloc (buffer
, bufmax
);
1031 buffer
[bufpos
++] = c
;
1037 if (bufpos
>= bufmax
)
1039 bufmax
= 2 * bufmax
+ 10;
1040 buffer
= xrealloc (buffer
, bufmax
);
1042 buffer
[bufpos
++] = c
;
/* A character can never equal both '+' and '-', so the former `||' test
   was always true; `&&' holds only when c is neither sign character. */
1044 if (c
!= '+' && c
!= '-')
1051 case 'A': case 'B': case 'C': case 'D': case 'F':
1052 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1053 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1054 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1056 case 'a': case 'b': case 'c': case 'd': case 'f':
1057 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1058 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1059 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1061 case '0': case '1': case '2': case '3': case '4':
1062 case '5': case '6': case '7': case '8': case '9':
1072 if (bufpos
>= bufmax
)
1074 bufmax
= 2 * bufmax
+ 10;
1075 buffer
= xrealloc (buffer
, bufmax
);
1078 tp
->type
= token_type_number
;
1079 tp
->number
= atol (buffer
);
1083 /* We could worry about the 'L' before wide character constants,
1084 but ignoring it has no effect unless one of the keywords is
1085 "L". Just pretend it won't happen. Also, we don't need to
1086 remember the character constant. */
1090 if (c
== P7_NEWLINE
)
1092 error_with_progname
= false;
1093 error (0, 0, _("%s:%d: warning: unterminated character constant"),
1094 logical_file_name
, line_number
- 1);
1095 error_with_progname
= true;
1096 phase7_ungetc ('\n');
1099 if (c
== EOF
|| c
== P7_QUOTE
)
1102 tp
->type
= token_type_character_constant
;
1106 /* We could worry about the 'L' before wide string constants,
1107 but since gettext's argument is not a wide character string,
1108 let the compiler complain about the argument not matching the
1109 prototype. Just pretend it won't happen. */
1114 if (c
== P7_NEWLINE
)
1116 error_with_progname
= false;
1117 error (0, 0, _("%s:%d: warning: unterminated string literal"),
1118 logical_file_name
, line_number
- 1);
1119 error_with_progname
= true;
1120 phase7_ungetc ('\n');
1123 if (c
== EOF
|| c
== P7_QUOTES
)
1127 if (bufpos
>= bufmax
)
1129 bufmax
= 2 * bufmax
+ 10;
1130 buffer
= xrealloc (buffer
, bufmax
);
1132 buffer
[bufpos
++] = c
;
1134 if (bufpos
>= bufmax
)
1136 bufmax
= 2 * bufmax
+ 10;
1137 buffer
= xrealloc (buffer
, bufmax
);
1140 tp
->type
= token_type_string_literal
;
1141 tp
->string
= xstrdup (buffer
);
1142 tp
->comment
= add_reference (savable_comment
);
1146 tp
->type
= token_type_lparen
;
1150 tp
->type
= token_type_rparen
;
1154 tp
->type
= token_type_comma
;
1158 tp
->type
= token_type_hash
;
1162 tp
->type
= token_type_colon
;
1166 if (objc_extensions
)
1168 tp
->type
= token_type_objc_special
;
1169 tp
->comment
= add_reference (savable_comment
);
1175 /* We could carefully recognize each of the 2 and 3 character
1176 operators, but it is not necessary, as we only need to recognize
1177 gettext invocations. Don't bother. */
1178 tp
->type
= token_type_symbol
;
1184 /* Supports only one pushback token. */
1186 phase5_unget (token_ty
*tp
)
1188 if (tp
->type
!= token_type_eof
)
1190 if (phase5_pushback_length
== SIZEOF (phase5_pushback
))
1192 phase5_pushback
[phase5_pushback_length
++] = *tp
;
1197 /* X. Recognize a leading # symbol. Leave leading hash as a hash, but
1198 turn hash in the middle of a line into a plain symbol token. This
1199 makes the phase 6 easier. */
1202 phaseX_get (token_ty
*tp
)
1204 static bool middle
; /* false at the beginning of a line, true otherwise. */
1208 if (tp
->type
== token_type_eoln
|| tp
->type
== token_type_eof
)
1214 /* Turn hash in the middle of a line into a plain symbol token. */
1215 if (tp
->type
== token_type_hash
)
1216 tp
->type
= token_type_symbol
;
1220 /* When we see leading whitespace followed by a hash sign,
1221 discard the leading white space token. The hash is all
1222 phase 6 is interested in. */
1223 if (tp
->type
== token_type_white_space
)
1228 if (next
.type
== token_type_hash
)
1231 phase5_unget (&next
);
1239 /* 6. Recognize and carry out directives (it also expands macros on
1240 non-directive lines, which we do not do here). The only directives
1241 we care about are the #line and #define directives. We throw all the
1244 static token_ty phase6_pushback
[2];
1245 static int phase6_pushback_length
;
1249 phase6_get (token_ty
*tp
)
1251 static token_ty
*buf
;
1256 if (phase6_pushback_length
)
1258 *tp
= phase6_pushback
[--phase6_pushback_length
];
1263 /* Get the next token. If it is not a '#' at the beginning of a
1264 line (ignoring whitespace), return immediately. */
1266 if (tp
->type
!= token_type_hash
)
1269 /* Accumulate the rest of the directive in a buffer, until the
1270 "define" keyword is seen or until end of line. */
1275 if (tp
->type
== token_type_eoln
|| tp
->type
== token_type_eof
)
1278 /* Before the "define" keyword and inside other directives
1279 white space is irrelevant. So just throw it away. */
1280 if (tp
->type
!= token_type_white_space
)
1282 /* If it is a #define directive, return immediately,
1283 thus treating the body of the #define directive like
1286 && tp
->type
== token_type_name
1287 && strcmp (tp
->string
, "define") == 0)
1291 if (bufpos
>= bufmax
)
1293 bufmax
= 2 * bufmax
+ 10;
1294 buf
= xrealloc (buf
, bufmax
* sizeof (buf
[0]));
1296 buf
[bufpos
++] = *tp
;
1300 /* If it is a #line directive, with no macros to expand, act on
1301 it. Ignore all other directives. */
1302 if (bufpos
>= 3 && buf
[0].type
== token_type_name
1303 && strcmp (buf
[0].string
, "line") == 0
1304 && buf
[1].type
== token_type_number
1305 && buf
[2].type
== token_type_string_literal
)
1307 logical_file_name
= xstrdup (buf
[2].string
);
1308 line_number
= buf
[1].number
;
1310 if (bufpos
>= 2 && buf
[0].type
== token_type_number
1311 && buf
[1].type
== token_type_string_literal
)
1313 logical_file_name
= xstrdup (buf
[1].string
);
1314 line_number
= buf
[0].number
;
1317 /* Release the storage held by the directive. */
1318 for (j
= 0; j
< bufpos
; ++j
)
1319 free_token (&buf
[j
]);
1321 /* We must reset the selected comments. */
1322 savable_comment_reset ();
1327 /* Supports 2 tokens of pushback. */
1329 phase6_unget (token_ty
*tp
)
1331 if (tp
->type
!= token_type_eof
)
1333 if (phase6_pushback_length
== SIZEOF (phase6_pushback
))
1335 phase6_pushback
[phase6_pushback_length
++] = *tp
;
1340 /* 8a. Convert ISO C 99 section 7.8.1 format string directives to string
1341 literal placeholders. */
1343 /* Test for an ISO C 99 section 7.8.1 format string directive. */
1345 is_inttypes_macro (const char *name
)
1348 P R I { d | i | o | u | x | X }
1349 { { | LEAST | FAST } { 8 | 16 | 32 | 64 } | MAX | PTR } */
1350 if (name
[0] == 'P' && name
[1] == 'R' && name
[2] == 'I')
1353 if (name
[0] == 'd' || name
[0] == 'i' || name
[0] == 'o' || name
[0] == 'u'
1354 || name
[0] == 'x' || name
[0] == 'X')
1357 if (name
[0] == 'M' && name
[1] == 'A' && name
[2] == 'X'
1360 if (name
[0] == 'P' && name
[1] == 'T' && name
[2] == 'R'
1363 if (name
[0] == 'L' && name
[1] == 'E' && name
[2] == 'A'
1364 && name
[3] == 'S' && name
[4] == 'T')
1366 else if (name
[0] == 'F' && name
[1] == 'A' && name
[2] == 'S'
1369 if (name
[0] == '8' && name
[1] == '\0')
1371 if (name
[0] == '1' && name
[1] == '6' && name
[2] == '\0')
1373 if (name
[0] == '3' && name
[1] == '2' && name
[2] == '\0')
1375 if (name
[0] == '6' && name
[1] == '4' && name
[2] == '\0')
1383 phase8a_get (token_ty
*tp
)
1386 if (tp
->type
== token_type_name
&& is_inttypes_macro (tp
->string
))
1388 /* Turn PRIdXXX into "<PRIdXXX>". */
1389 size_t len
= strlen (tp
->string
);
1390 char *new_string
= (char *) xmalloc (len
+ 3);
1391 new_string
[0] = '<';
1392 memcpy (new_string
+ 1, tp
->string
, len
);
1393 new_string
[len
+ 1] = '>';
1394 new_string
[len
+ 2] = '\0';
1396 tp
->string
= new_string
;
1397 tp
->comment
= add_reference (savable_comment
);
1398 tp
->type
= token_type_string_literal
;
1402 /* Supports 2 tokens of pushback. */
1404 phase8a_unget (token_ty
*tp
)
1410 /* 8b. Drop whitespace. */
1412 phase8b_get (token_ty
*tp
)
1418 if (tp
->type
== token_type_white_space
)
1420 if (tp
->type
== token_type_eoln
)
1422 /* We have to track the last occurrence of a string. One
1423 mode of xgettext allows grouping an extracted message
1424 with a comment for documentation. The rule which states
1425 which comment is assumed to be grouped with the message
1426 says it should immediately precede it. Our
1427 interpretation: between the last line of the comment and
1428 the line in which the keyword is found must be no line
1429 with non-white space tokens. */
1431 if (last_non_comment_line
> last_comment_line
)
1432 savable_comment_reset ();
1439 /* Supports 2 tokens of pushback. */
1441 phase8b_unget (token_ty
*tp
)
1447 /* 8c. In ObjectiveC mode, drop '@' before a literal string. We need to
1448 do this before performing concatenation of adjacent string literals. */
1450 phase8c_get (token_ty
*tp
)
1455 if (tp
->type
!= token_type_objc_special
)
1458 if (tmp
.type
!= token_type_string_literal
)
1460 phase8b_unget (&tmp
);
1463 /* Drop the '@' token and return immediately the following string. */
1464 drop_reference (tmp
.comment
);
1465 tmp
.comment
= tp
->comment
;
1469 /* Supports only one pushback token. */
1471 phase8c_unget (token_ty
*tp
)
1477 /* 8. Concatenate adjacent string literals to form single string
1478 literals (because we don't expand macros, there are a few things we
1482 phase8_get (token_ty
*tp
)
1485 if (tp
->type
!= token_type_string_literal
)
1493 if (tmp
.type
!= token_type_string_literal
)
1495 phase8c_unget (&tmp
);
1498 len
= strlen (tp
->string
);
1499 tp
->string
= xrealloc (tp
->string
, len
+ strlen (tmp
.string
) + 1);
1500 strcpy (tp
->string
+ len
, tmp
.string
);
1506 /* ===================== Reading of high-level tokens. ==================== */
1509 enum xgettext_token_type_ty
1511 xgettext_token_type_eof
,
1512 xgettext_token_type_keyword
,
1513 xgettext_token_type_symbol
,
1514 xgettext_token_type_lparen
,
1515 xgettext_token_type_rparen
,
1516 xgettext_token_type_comma
,
1517 xgettext_token_type_colon
,
1518 xgettext_token_type_string_literal
,
1519 xgettext_token_type_other
1521 typedef enum xgettext_token_type_ty xgettext_token_type_ty
;
1523 typedef struct xgettext_token_ty xgettext_token_ty
;
1524 struct xgettext_token_ty
1526 xgettext_token_type_ty type
;
1528 /* These fields are used only for xgettext_token_type_keyword. */
1532 /* This field is used only for xgettext_token_type_string_literal,
1533 xgettext_token_type_keyword, xgettext_token_type_symbol. */
1536 /* This field is used only for xgettext_token_type_string_literal. */
1537 refcounted_string_list_ty
*comment
;
1539 /* These fields are only for
1540 xgettext_token_type_keyword,
1541 xgettext_token_type_string_literal. */
1546 /* 9. Convert the remaining preprocessing tokens to C tokens and
1547 discard any white space from the translation unit. */
1550 x_c_lex (xgettext_token_ty
*tp
)
1555 void *keyword_value
;
1557 phase8_get (&token
);
1560 case token_type_eof
:
1561 tp
->type
= xgettext_token_type_eof
;
1564 case token_type_name
:
1565 last_non_comment_line
= newline_count
;
1567 if (find_entry (objc_extensions
? &objc_keywords
: &c_keywords
,
1568 token
.string
, strlen (token
.string
), &keyword_value
)
1571 tp
->type
= xgettext_token_type_keyword
;
1572 tp
->argnum1
= (int) (long) keyword_value
& ((1 << 10) - 1);
1573 tp
->argnum2
= (int) (long) keyword_value
>> 10;
1574 tp
->pos
.file_name
= logical_file_name
;
1575 tp
->pos
.line_number
= token
.line_number
;
1578 tp
->type
= xgettext_token_type_symbol
;
1579 tp
->string
= token
.string
;
1582 case token_type_lparen
:
1583 last_non_comment_line
= newline_count
;
1585 tp
->type
= xgettext_token_type_lparen
;
1588 case token_type_rparen
:
1589 last_non_comment_line
= newline_count
;
1591 tp
->type
= xgettext_token_type_rparen
;
1594 case token_type_comma
:
1595 last_non_comment_line
= newline_count
;
1597 tp
->type
= xgettext_token_type_comma
;
1600 case token_type_colon
:
1601 last_non_comment_line
= newline_count
;
1603 tp
->type
= xgettext_token_type_colon
;
1606 case token_type_string_literal
:
1607 last_non_comment_line
= newline_count
;
1609 tp
->type
= xgettext_token_type_string_literal
;
1610 tp
->string
= token
.string
;
1611 tp
->comment
= token
.comment
;
1612 tp
->pos
.file_name
= logical_file_name
;
1613 tp
->pos
.line_number
= token
.line_number
;
1616 case token_type_objc_special
:
1617 drop_reference (token
.comment
);
1621 last_non_comment_line
= newline_count
;
1623 tp
->type
= xgettext_token_type_other
;
1630 /* ========================= Extracting strings. ========================== */
1633 /* Context lookup table. */
/* Set by extract_whole_file from its flag_table argument and consulted by
   extract_parenthesized through flag_context_list_table_lookup, keyed by
   the name of the symbol or keyword just read.  */
1634 static flag_context_list_table_ty
*flag_context_list_table
;
1637 /* The file is broken into tokens. Scan the token stream, looking for
1638 a keyword, followed by a left paren, followed by a string. When we
1639 see this sequence, we have something to remember. We assume we are
1640 looking at a valid C or C++ program, and leave the complaints about
1641 the grammar to the compiler.
1643 Normal handling: Look for
1644 keyword ( ... msgid ... )
1645 Plural handling: Look for
1646 keyword ( ... msgid ... msgid_plural ... )
1648 We use recursion because the arguments before msgid or between msgid
1649 and msgid_plural can contain subexpressions of the same form. */
1652 /* Extract messages until the next balanced closing parenthesis.
1653 Extracted messages are added to MLP.
1654 When a specific argument shall be extracted, COMMAS_TO_SKIP >= 0 and,
1655 if also a plural argument shall be extracted, PLURAL_COMMAS > 0,
1656 otherwise PLURAL_COMMAS = 0.
1657 When no specific argument shall be extracted, COMMAS_TO_SKIP < 0.
1658 Return true upon eof, false upon closing parenthesis. */
/* OUTER_CONTEXT and CONTEXT_ITER determine the flag context that applies
   to the arguments: the context of the current argument is derived with
   inherited_context from OUTER_CONTEXT and the next element delivered by
   CONTEXT_ITER.  Tokens come from x_c_lex; on '(' the function calls
   itself for the nested parenthesized expression.
   NOTE(review): this listing omits many statements of the function (the
   declaration and updates of 'state', the loop, the switch, the
   continue/return statements and some assignments); the comments below
   describe only what is visible.  */
1660 extract_parenthesized (message_list_ty
*mlp
,
1661 flag_context_ty outer_context
,
1662 flag_context_list_iterator_ty context_iter
,
1663 int commas_to_skip
, int plural_commas
)
1665 /* Remember the message containing the msgid, for msgid_plural. */
1666 message_ty
*plural_mp
= NULL
;
1668 /* 0 when no keyword has been seen. 1 right after a keyword is seen. */
1670 /* Parameters of the keyword just seen. Defined only in state 1. */
1671 int next_commas_to_skip
= -1;
1672 int next_plural_commas
= 0;
1673 /* Context iterator that will be used if the next token is a '('. */
1674 flag_context_list_iterator_ty next_context_iter
=
1675 passthrough_context_list_iterator
;
1676 /* Context iterator that will be used if the next token is a ':'.
1677 (Objective C selector syntax.) */
1678 flag_context_list_iterator_ty selectorcall_context_iter
=
1679 passthrough_context_list_iterator
;
1680 /* Current context. */
1681 flag_context_ty inner_context
=
1682 inherited_context (outer_context
,
1683 flag_context_list_iterator_advance (&context_iter
));
1685 /* Start state is 0. */
1690 xgettext_token_ty token
;
1695 case xgettext_token_type_keyword
:
/* Turn the keyword's argument numbers into comma counts: argnum1 - 1
   commas precede the msgid argument, and the plural argument, if argnum2
   lies beyond argnum1, follows argnum2 - argnum1 commas later.  A plural
   count of 0 means no plural argument.  */
1696 next_commas_to_skip
= token
.argnum1
- 1;
1697 next_plural_commas
= (token
.argnum2
> token
.argnum1
1698 ? token
.argnum2
- token
.argnum1
: 0);
/* Continue with the handling shared with plain symbols.  */
1700 goto keyword_or_symbol
;
1702 case xgettext_token_type_symbol
:
/* Look up the flag contexts registered for this name.
   NOTE(review): the left-hand side of this assignment is not visible;
   presumably next_context_iter -- confirm.  */
1706 flag_context_list_iterator (
1707 flag_context_list_table_lookup (
1708 flag_context_list_table
,
1709 token
.string
, strlen (token
.string
)));
1710 if (objc_extensions
)
/* For the Objective C selector syntax, look the name up a second time
   with a ':' appended.  The string is enlarged by two bytes to hold the
   ':' and the terminating NUL.  */
1712 size_t token_string_len
= strlen (token
.string
);
1713 token
.string
= xrealloc (token
.string
, token_string_len
+ 2);
1714 token
.string
[token_string_len
] = ':';
1715 token
.string
[token_string_len
+ 1] = '\0';
1716 selectorcall_context_iter
=
1717 flag_context_list_iterator (
1718 flag_context_list_table_lookup (
1719 flag_context_list_table
,
1720 token
.string
, token_string_len
+ 1));
/* The name was only needed for the lookups.  */
1722 free (token
.string
);
1725 case xgettext_token_type_lparen
:
/* Process the nested parenthesized expression.  The keyword parameters
   are passed down only when 'state' is nonzero; otherwise -1 and 0 are
   passed, i.e. no specific argument is to be extracted.  */
1726 if (extract_parenthesized (mlp
, inner_context
, next_context_iter
,
1727 state
? next_commas_to_skip
: -1,
1728 state
? next_plural_commas
: 0))
1730 next_context_iter
= null_context_list_iterator
;
1731 selectorcall_context_iter
= null_context_list_iterator
;
1735 case xgettext_token_type_rparen
:
1738 case xgettext_token_type_comma
:
/* A comma moves on to the next argument.  The comma counters are only
   maintained while a specific argument is being looked for.
   NOTE(review): the statements guarded by the tests below are only
   partly visible in this listing.  */
1739 if (commas_to_skip
>= 0)
1741 if (commas_to_skip
> 0)
/* Once a msgid has been remembered and a plural argument is expected,
   start counting the commas up to the plural argument.  */
1744 if (plural_mp
!= NULL
&& plural_commas
> 0)
1746 commas_to_skip
= plural_commas
- 1;
1750 commas_to_skip
= -1;
/* Flag context for the next argument, derived from OUTER_CONTEXT.
   NOTE(review): the left-hand side of this assignment and the iterator
   argument are not visible; presumably inner_context and &context_iter --
   confirm.  */
1753 inherited_context (outer_context
,
1754 flag_context_list_iterator_advance (
1756 next_context_iter
= passthrough_context_list_iterator
;
1757 selectorcall_context_iter
= passthrough_context_list_iterator
;
1761 case xgettext_token_type_colon
:
1762 if (objc_extensions
)
/* In Objective C mode the iterator found for "name:" becomes the
   current argument iterator.  Unlike the comma case, the new context is
   derived from inner_context, not from OUTER_CONTEXT.  */
1764 context_iter
= selectorcall_context_iter
;
1766 inherited_context (inner_context
,
1767 flag_context_list_iterator_advance (
1769 next_context_iter
= passthrough_context_list_iterator
;
1770 selectorcall_context_iter
= passthrough_context_list_iterator
;
/* NOTE(review): presumably the branch taken when objc_extensions is
   false; the 'else' itself is not visible.  */
1774 next_context_iter
= null_context_list_iterator
;
1775 selectorcall_context_iter
= null_context_list_iterator
;
1780 case xgettext_token_type_string_literal
:
/* Record the literal in MLP together with its saved comment.
   NOTE(review): the test that selects this sequence over the
   COMMAS_TO_SKIP-driven one below is not visible in this listing.  */
1783 savable_comment_to_xgettext_comment (token
.comment
);
1784 remember_a_message (mlp
, token
.string
, inner_context
, &token
.pos
);
1785 savable_comment_reset ();
/* COMMAS_TO_SKIP == 0 means the current argument is the one that shall
   be extracted.  */
1789 if (commas_to_skip
== 0)
1791 if (plural_mp
== NULL
)
1793 /* Seen an msgid. */
1796 savable_comment_to_xgettext_comment (token
.comment
);
1797 mp
= remember_a_message (mlp
, token
.string
,
1798 inner_context
, &token
.pos
);
1799 savable_comment_reset ();
/* NOTE(review): the statement guarded by this test is not visible;
   presumably it stores MP in plural_mp for the msgid_plural -- confirm.  */
1800 if (plural_commas
> 0)
1805 /* Seen an msgid_plural. */
1806 remember_a_message_plural (plural_mp
, token
.string
,
1807 inner_context
, &token
.pos
);
/* NOTE(review): presumably reached for literals that are not
   remembered; the controlling 'else' is not visible.  */
1812 free (token
.string
);
/* Release the comment reference that came with the token.  */
1814 drop_reference (token
.comment
);
1815 next_context_iter
= null_context_list_iterator
;
1816 selectorcall_context_iter
= null_context_list_iterator
;
1820 case xgettext_token_type_other
:
1821 next_context_iter
= null_context_list_iterator
;
1822 selectorcall_context_iter
= null_context_list_iterator
;
1826 case xgettext_token_type_eof
:
/* Scan one input file and add the extracted messages to the message list
   of the first domain of MDLP.  REAL_FILENAME and LOGICAL_FILENAME are
   stored in the file-level variables real_file_name and logical_file_name
   for the duration of the scan, and FLAG_TABLE becomes the current
   flag_context_list_table.
   NOTE(review): the use of F and some initializations are not visible in
   this listing.  */
1837 extract_whole_file (FILE *f
,
1838 const char *real_filename
, const char *logical_filename
,
1839 flag_context_list_table_ty
*flag_table
,
1840 msgdomain_list_ty
*mdlp
)
1842 message_list_ty
*mlp
= mdlp
->item
[0]->messages
;
1845 real_file_name
= real_filename
;
/* A private copy of the logical file name is made.  x_c_lex stores this
   pointer in the positions of the tokens it returns, and the copy is not
   freed in the visible part of this function.  */
1846 logical_file_name
= xstrdup (logical_filename
);
/* Reset the line bookkeeping for the new file.  */
1850 last_comment_line
= -1;
1851 last_non_comment_line
= -1;
1853 flag_context_list_table
= flag_table
;
1857 /* Eat tokens until eof is seen. When extract_parenthesized returns
1858 due to an unbalanced closing parenthesis, just restart it. */
1859 while (!extract_parenthesized (mlp
, null_context
, null_context_list_iterator
,
1863 /* Close scanner. */
1865 real_file_name
= NULL
;
1866 logical_file_name
= NULL
;
1873 const char *real_filename
, const char *logical_filename
,
1874 flag_context_list_table_ty
*flag_table
,
1875 msgdomain_list_ty
*mdlp
)
/* Entry point for scanning without the Objective C extensions: clear
   objc_extensions, so that x_c_lex looks names up in c_keywords and
   extract_parenthesized skips its selector handling, then scan the file
   with extract_whole_file, passing all arguments through unchanged.
   NOTE(review): the line carrying this function's name and its first
   parameter is not visible in this listing.  */
1877 objc_extensions
= false;
1878 extract_whole_file (f
, real_filename
, logical_filename
, flag_table
, mdlp
);
/* Entry point for scanning with the Objective C extensions: set
   objc_extensions, so that x_c_lex looks names up in objc_keywords and
   extract_parenthesized handles the "name:" selector syntax, then scan
   the file with extract_whole_file, passing all arguments through
   unchanged.  */
1882 extract_objc (FILE *f
,
1883 const char *real_filename
, const char *logical_filename
,
1884 flag_context_list_table_ty
*flag_table
,
1885 msgdomain_list_ty
*mdlp
)
1887 objc_extensions
= true;
1888 extract_whole_file (f
, real_filename
, logical_filename
, flag_table
, mdlp
);