1 /* xgettext Python backend.
2 Copyright (C) 2002-2003 Free Software Foundation, Inc.
4 This file was written by Bruno Haible <haible@clisp.cons.org>, 2002.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software Foundation,
18 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
35 #include "error-progname.h"
38 #include "po-charset.h"
40 #include "utf16-ucs4.h"
41 #include "ucs4-utf8.h"
/* Shorthand: _(s) expands to gettext (s).  */
#define _(s) gettext(s)

/* Larger of A and B.  Each argument may be evaluated twice, so do not
   pass expressions that have side effects.  */
#define max(a,b) ((a) > (b) ? (a) : (b))

/* Number of elements of the array A.  A must be a real array, not a
   pointer.  The argument is parenthesized before it is indexed, so that
   any array-valued expression can be passed safely.  */
#define SIZEOF(a) (sizeof(a) / sizeof((a)[0]))
51 /* The Python syntax is defined in the Python Reference Manual
52 /usr/share/doc/packages/python/html/ref/index.html.
53 See also Python-2.0/Parser/tokenizer.c, Python-2.0/Python/compile.c,
54 Python-2.0/Objects/unicodeobject.c. */
57 /* ====================== Keyword set customization. ====================== */
59 /* If true, extract all strings. */
60 static bool extract_all
= false;
62 static hash_table keywords
;
63 static bool default_keywords
= true;
67 x_python_extract_all ()
74 x_python_keyword (const char *name
)
77 default_keywords
= false;
85 if (keywords
.table
== NULL
)
86 init_hash (&keywords
, 100);
88 split_keywordspec (name
, &end
, &argnum1
, &argnum2
);
90 /* The characters between name and end should form a valid C identifier.
91 A colon means an invalid parse in split_keywordspec(). */
92 colon
= strchr (name
, ':');
93 if (colon
== NULL
|| colon
>= end
)
97 insert_entry (&keywords
, name
, end
- name
,
98 (void *) (long) (argnum1
+ (argnum2
<< 10)));
103 /* Finish initializing the keywords hash table.
104 Called after argument processing, before each file is processed. */
108 if (default_keywords
)
110 x_python_keyword ("gettext");
111 x_python_keyword ("ugettext");
112 x_python_keyword ("dgettext:2");
113 x_python_keyword ("ngettext:1,2");
114 x_python_keyword ("ungettext:1,2");
115 x_python_keyword ("dngettext:2,3");
116 x_python_keyword ("_");
117 default_keywords
= false;
122 init_flag_table_python ()
124 xgettext_record_flag ("gettext:1:pass-python-format");
125 xgettext_record_flag ("ugettext:1:pass-python-format");
126 xgettext_record_flag ("dgettext:2:pass-python-format");
127 xgettext_record_flag ("ngettext:1:pass-python-format");
128 xgettext_record_flag ("ngettext:2:pass-python-format");
129 xgettext_record_flag ("ungettext:1:pass-python-format");
130 xgettext_record_flag ("ungettext:2:pass-python-format");
131 xgettext_record_flag ("dngettext:2:pass-python-format");
132 xgettext_record_flag ("dngettext:3:pass-python-format");
133 xgettext_record_flag ("_:1:pass-python-format");
134 /* xgettext_record_flag ("%:1:python-format"); // % is an infix operator! */
138 /* ======================== Reading of characters. ======================== */
140 /* Real filename, used in error messages about the input file. */
141 static const char *real_file_name
;
143 /* Logical filename and line number, used to label the extracted messages. */
144 static char *logical_file_name
;
145 static int line_number
;
147 /* The input file stream. */
151 /* 1. line_number handling. Also allow a lookahead. */
153 static unsigned char phase1_pushback
[max (9, UNINAME_MAX
+ 3)];
154 static int phase1_pushback_length
;
161 if (phase1_pushback_length
)
162 c
= phase1_pushback
[--phase1_pushback_length
];
170 error (EXIT_FAILURE
, errno
, _("error while reading \"%s\""),
182 /* Supports max (9, UNINAME_MAX + 3) characters of pushback. */
184 phase1_ungetc (int c
)
191 if (phase1_pushback_length
== SIZEOF (phase1_pushback
))
193 phase1_pushback
[phase1_pushback_length
++] = c
;
198 /* Accumulating comments. */
201 static size_t bufmax
;
202 static size_t buflen
;
213 /* We assume the program source is in ISO-8859-1 (for consistency with
214 Python's \ooo and \xnn syntax inside strings), but we produce a POT
215 file in UTF-8 encoding. */
216 size_t len
= ((unsigned char) c
< 0x80 ? 1 : 2);
217 if (buflen
+ len
> bufmax
)
219 bufmax
= 2 * bufmax
+ 10;
220 buffer
= xrealloc (buffer
, bufmax
);
222 if ((unsigned char) c
< 0x80)
223 buffer
[buflen
++] = c
;
226 buffer
[buflen
++] = 0xc0 | ((unsigned char) c
>> 6);
227 buffer
[buflen
++] = 0x80 | ((unsigned char) c
& 0x3f);
235 && (buffer
[buflen
- 1] == ' ' || buffer
[buflen
- 1] == '\t'))
237 if (buflen
>= bufmax
)
239 bufmax
= 2 * bufmax
+ 10;
240 buffer
= xrealloc (buffer
, bufmax
);
242 buffer
[buflen
] = '\0';
243 savable_comment_add (buffer
);
246 /* These are for tracking whether comments count as immediately before
248 static int last_comment_line
;
249 static int last_non_comment_line
;
252 /* 2. Outside strings, replace backslash-newline with nothing and a comment
269 /* This usually shouldn't happen, because "A backslash is
270 illegal elsewhere on a line outside a string literal." */
273 /* Eat backslash-newline. */
278 last_comment_line
= line_number
;
283 if (c
== EOF
|| c
== '\n')
285 /* We skip all leading white space, but not EOLs. */
286 if (!(buflen
== 0 && (c
== ' ' || c
== '\t')))
297 /* Supports only one pushback character. */
299 phase2_ungetc (int c
)
305 /* ========================== Reading of tokens. ========================== */
311 token_type_lparen
, /* ( */
312 token_type_rparen
, /* ) */
313 token_type_comma
, /* , */
314 token_type_string
, /* "abc", 'abc', """abc""", '''abc''' */
315 token_type_symbol
, /* symbol, number */
316 token_type_other
/* misc. operator */
318 typedef enum token_type_ty token_type_ty
;
320 typedef struct token_ty token_ty
;
324 char *string
; /* for token_type_string, token_type_symbol */
325 refcounted_string_list_ty
*comment
; /* for token_type_string */
330 /* There are two different input syntaxes for strings, "abc" and r"abc",
331 and two different input syntaxes for Unicode strings, u"abc" and ur"abc".
332 Which escape sequences are understood, i.e. what is interpreted specially
334 "abc" \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn
336 u"abc" \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn \unnnn \Unnnnnnnn \N{...}
338 The \unnnn values are UTF-16 values; a single \Unnnnnnnn can expand to two
339 \unnnn items. The \ooo and \xnn values are ISO-8859-1 values: u"\xff" and
340 u"\u00ff" are the same. */
343 #define P7_STRING_END (-2)
346 phase7_getuc (int quote_char
,
347 bool triple
, bool interpret_ansic
, bool interpret_unicode
,
348 unsigned int *backslash_counter
)
354 /* Use phase 1, because phase 2 elides comments. */
360 if (c
== quote_char
&& (interpret_ansic
|| (*backslash_counter
& 1) == 0))
364 int c1
= phase1_getc ();
365 if (c1
== quote_char
)
367 int c2
= phase1_getc ();
368 if (c2
== quote_char
)
369 return P7_STRING_END
;
376 return P7_STRING_END
;
383 *backslash_counter
= 0;
386 /* In r"..." and ur"..." strings, newline is only allowed
387 immediately after an odd number of backslashes (although the
388 backslashes are not interpreted!). */
389 if (!(interpret_ansic
|| (*backslash_counter
& 1) == 0))
391 *backslash_counter
= 0;
395 error_with_progname
= false;
396 error (0, 0, _("%s:%d: warning: unterminated string"),
397 logical_file_name
, line_number
);
398 error_with_progname
= true;
399 return P7_STRING_END
;
404 *backslash_counter
= 0;
408 /* Backslash handling. */
410 if (!interpret_ansic
&& !interpret_unicode
)
412 ++*backslash_counter
;
416 /* Dispatch according to the character following the backslash. */
420 ++*backslash_counter
;
430 ++*backslash_counter
;
433 *backslash_counter
= 0;
436 *backslash_counter
= 0;
439 *backslash_counter
= 0;
442 *backslash_counter
= 0;
445 *backslash_counter
= 0;
448 *backslash_counter
= 0;
451 *backslash_counter
= 0;
454 *backslash_counter
= 0;
456 case '0': case '1': case '2': case '3': case '4':
457 case '5': case '6': case '7':
464 if (c
>= '0' && c
<= '7')
466 n
= (n
<< 3) + (c
- '0');
470 if (c
>= '0' && c
<= '7')
471 n
= (n
<< 3) + (c
- '0');
479 *backslash_counter
= 0;
480 return (unsigned char) n
;
484 int c1
= phase1_getc ();
487 if (c1
>= '0' && c1
<= '9')
489 else if (c1
>= 'A' && c1
<= 'F')
491 else if (c1
>= 'a' && c1
<= 'f')
498 int c2
= phase1_getc ();
501 if (c2
>= '0' && c2
<= '9')
503 else if (c2
>= 'A' && c2
<= 'F')
505 else if (c2
>= 'a' && c2
<= 'f')
512 *backslash_counter
= 0;
513 return (unsigned char) ((n1
<< 4) + n2
);
520 ++*backslash_counter
;
525 if (interpret_unicode
)
529 unsigned char buf
[4];
533 for (i
= 0; i
< 4; i
++)
535 int c1
= phase1_getc ();
537 if (c1
>= '0' && c1
<= '9')
538 n
= (n
<< 4) + (c1
- '0');
539 else if (c1
>= 'A' && c1
<= 'F')
540 n
= (n
<< 4) + (c1
- 'A' + 10);
541 else if (c1
>= 'a' && c1
<= 'f')
542 n
= (n
<< 4) + (c1
- 'a' + 10);
547 phase1_ungetc (buf
[i
]);
549 ++*backslash_counter
;
555 *backslash_counter
= 0;
563 unsigned char buf
[8];
567 for (i
= 0; i
< 8; i
++)
569 int c1
= phase1_getc ();
571 if (c1
>= '0' && c1
<= '9')
572 n
= (n
<< 4) + (c1
- '0');
573 else if (c1
>= 'A' && c1
<= 'F')
574 n
= (n
<< 4) + (c1
- 'A' + 10);
575 else if (c1
>= 'a' && c1
<= 'f')
576 n
= (n
<< 4) + (c1
- 'a' + 10);
581 phase1_ungetc (buf
[i
]);
583 ++*backslash_counter
;
591 *backslash_counter
= 0;
595 error_with_progname
= false;
596 error (0, 0, _("%s:%d: warning: invalid Unicode character"),
597 logical_file_name
, line_number
);
598 error_with_progname
= true;
601 phase1_ungetc (buf
[i
]);
603 ++*backslash_counter
;
609 int c1
= phase1_getc ();
612 unsigned char buf
[UNINAME_MAX
+ 1];
616 for (i
= 0; i
< UNINAME_MAX
; i
++)
618 int c2
= phase1_getc ();
619 if (!(c2
>= ' ' && c2
<= '~'))
623 phase1_ungetc (buf
[i
]);
626 ++*backslash_counter
;
635 n
= unicode_name_character ((char *) buf
);
636 if (n
!= UNINAME_INVALID
)
638 *backslash_counter
= 0;
644 phase1_ungetc (buf
[i
]);
648 ++*backslash_counter
;
655 ++*backslash_counter
;
661 /* Combine characters into tokens. Discard whitespace except newlines at
662 the end of logical lines. */
664 /* Number of pending open parentheses/braces/brackets. */
667 static token_ty phase5_pushback
[1];
668 static int phase5_pushback_length
;
671 phase5_get (token_ty
*tp
)
675 if (phase5_pushback_length
)
677 *tp
= phase5_pushback
[--phase5_pushback_length
];
683 tp
->line_number
= line_number
;
689 tp
->type
= token_type_eof
;
695 /* Ignore whitespace and comments. */
699 if (last_non_comment_line
> last_comment_line
)
700 savable_comment_reset ();
701 /* Ignore newline if and only if it is used for implicit line
705 tp
->type
= token_type_other
;
709 last_non_comment_line
= tp
->line_number
;
715 int c1
= phase2_getc ();
717 if (!(c1
>= '0' && c1
<= '9'))
720 tp
->type
= token_type_other
;
725 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
726 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
727 case 'M': case 'N': case 'O': case 'P': case 'Q':
728 case 'S': case 'T': case 'V': case 'W': case 'X':
731 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
732 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
733 case 'm': case 'n': case 'o': case 'p': case 'q':
734 case 's': case 't': case 'v': case 'w': case 'x':
736 case '0': case '1': case '2': case '3': case '4':
737 case '5': case '6': case '7': case '8': case '9':
739 /* Symbol, or part of a number. */
748 if (bufpos
>= bufmax
)
750 bufmax
= 2 * bufmax
+ 10;
751 buffer
= xrealloc (buffer
, bufmax
);
753 buffer
[bufpos
++] = c
;
757 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
758 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
759 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
760 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
763 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
764 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
765 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
766 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
768 case '0': case '1': case '2': case '3': case '4':
769 case '5': case '6': case '7': case '8': case '9':
777 if (bufpos
>= bufmax
)
779 bufmax
= 2 * bufmax
+ 10;
780 buffer
= xrealloc (buffer
, bufmax
);
782 buffer
[bufpos
] = '\0';
783 tp
->string
= xstrdup (buffer
);
784 tp
->type
= token_type_symbol
;
790 static unsigned short *buffer
;
794 bool interpret_ansic
;
795 bool interpret_unicode
;
797 unsigned int backslash_counter
;
801 int c1
= phase1_getc ();
802 if (c1
== '"' || c1
== '\'')
805 interpret_ansic
= false;
806 interpret_unicode
= false;
815 int c1
= phase1_getc ();
816 if (c1
== '"' || c1
== '\'')
819 interpret_ansic
= true;
820 interpret_unicode
= true;
823 if (c1
== 'R' || c1
== 'r')
825 int c2
= phase1_getc ();
826 if (c2
== '"' || c2
== '\'')
829 interpret_ansic
= false;
830 interpret_unicode
= true;
841 interpret_ansic
= true;
842 interpret_unicode
= false;
846 int c1
= phase1_getc ();
847 if (c1
== quote_char
)
849 int c2
= phase1_getc ();
850 if (c2
== quote_char
)
861 backslash_counter
= 0;
862 /* Start accumulating the string. We store the string in
863 UTF-16 before converting it to UTF-8. Why not convert
864 every character directly to UTF-8? Because a string can
865 contain surrogates like u"\uD800\uDF00", and we must
866 combine them into a single UTF-8 character. */
870 int uc
= phase7_getuc (quote_char
, triple
, interpret_ansic
,
871 interpret_unicode
, &backslash_counter
);
874 if (uc
== P7_EOF
|| uc
== P7_STRING_END
)
877 assert (uc
>= 0 && uc
< 0x110000);
878 len
= (uc
< 0x10000 ? 1 : 2);
879 if (bufpos
+ len
> bufmax
)
881 bufmax
= 2 * bufmax
+ 10;
883 xrealloc (buffer
, bufmax
* sizeof (unsigned short));
886 buffer
[bufpos
++] = uc
;
889 buffer
[bufpos
++] = 0xd800 + ((uc
- 0x10000) >> 10);
890 buffer
[bufpos
++] = 0xdc00 + ((uc
- 0x10000) & 0x3ff);
893 /* Now convert from UTF-16 to UTF-8. */
896 unsigned char *utf8_string
;
899 /* Each UTF-16 word needs 3 bytes at worst. */
900 utf8_string
= (unsigned char *) xmalloc (3 * bufpos
+ 1);
901 for (pos
= 0, q
= utf8_string
; pos
< bufpos
; )
906 pos
+= u16_mbtouc (&uc
, buffer
+ pos
, bufpos
- pos
);
907 n
= u8_uctomb (q
, uc
, 6);
912 assert (q
- utf8_string
<= 3 * bufpos
);
913 tp
->string
= (char *) utf8_string
;
915 tp
->comment
= add_reference (savable_comment
);
916 tp
->type
= token_type_string
;
922 tp
->type
= token_type_lparen
;
928 tp
->type
= token_type_rparen
;
932 tp
->type
= token_type_comma
;
937 tp
->type
= token_type_other
;
943 tp
->type
= token_type_other
;
947 /* We could carefully recognize each of the 2 and 3 character
948 operators, but it is not necessary, as we only need to recognize
949 gettext invocations. Don't bother. */
950 tp
->type
= token_type_other
;
956 /* Supports only one pushback token. */
958 phase5_unget (token_ty
*tp
)
960 if (tp
->type
!= token_type_eof
)
962 if (phase5_pushback_length
== SIZEOF (phase5_pushback
))
964 phase5_pushback
[phase5_pushback_length
++] = *tp
;
969 /* Combine adjacent strings to form a single string. Note that the end
970 of a logical line appears as a token of its own, therefore strings that
971 belong to different logical lines will not be concatenated. */
974 x_python_lex (token_ty
*tp
)
977 if (tp
->type
!= token_type_string
)
985 if (tmp
.type
!= token_type_string
)
990 len
= strlen (tp
->string
);
991 tp
->string
= xrealloc (tp
->string
, len
+ strlen (tmp
.string
) + 1);
992 strcpy (tp
->string
+ len
, tmp
.string
);
998 /* ========================= Extracting strings. ========================== */
1001 /* Context lookup table. */
1002 static flag_context_list_table_ty
*flag_context_list_table
;
1005 /* The file is broken into tokens. Scan the token stream, looking for
1006 a keyword, followed by a left paren, followed by a string. When we
1007 see this sequence, we have something to remember. We assume we are
1008 looking at a valid Python program, and leave the complaints about
1009 the grammar to the Python interpreter.
1011 Normal handling: Look for
1012 keyword ( ... msgid ... )
1013 Plural handling: Look for
1014 keyword ( ... msgid ... msgid_plural ... )
1016 We use recursion because the arguments before msgid or between msgid
1017 and msgid_plural can contain subexpressions of the same form. */
1020 /* Extract messages until the next balanced closing parenthesis.
1021 Extracted messages are added to MLP.
1022 When a specific argument shall be extracted, COMMAS_TO_SKIP >= 0 and,
1023 if also a plural argument shall be extracted, PLURAL_COMMAS > 0,
1024 otherwise PLURAL_COMMAS = 0.
1025 When no specific argument shall be extracted, COMMAS_TO_SKIP < 0.
1026 Return true upon eof, false upon closing parenthesis. */
1028 extract_parenthesized (message_list_ty
*mlp
,
1029 flag_context_ty outer_context
,
1030 flag_context_list_iterator_ty context_iter
,
1031 int commas_to_skip
, int plural_commas
)
1033 /* Remember the message containing the msgid, for msgid_plural. */
1034 message_ty
*plural_mp
= NULL
;
1036 /* 0 when no keyword has been seen. 1 right after a keyword is seen. */
1038 /* Parameters of the keyword just seen. Defined only in state 1. */
1039 int next_commas_to_skip
= -1;
1040 int next_plural_commas
= 0;
1041 /* Context iterator that will be used if the next token is a '('. */
1042 flag_context_list_iterator_ty next_context_iter
=
1043 passthrough_context_list_iterator
;
1044 /* Current context. */
1045 flag_context_ty inner_context
=
1046 inherited_context (outer_context
,
1047 flag_context_list_iterator_advance (&context_iter
));
1049 /* Start state is 0. */
1056 x_python_lex (&token
);
1059 case token_type_symbol
:
1061 void *keyword_value
;
1063 if (find_entry (&keywords
, token
.string
, strlen (token
.string
),
1067 int argnum1
= (int) (long) keyword_value
& ((1 << 10) - 1);
1068 int argnum2
= (int) (long) keyword_value
>> 10;
1070 next_commas_to_skip
= argnum1
- 1;
1071 next_plural_commas
= (argnum2
> argnum1
? argnum2
- argnum1
: 0);
1078 flag_context_list_iterator (
1079 flag_context_list_table_lookup (
1080 flag_context_list_table
,
1081 token
.string
, strlen (token
.string
)));
1082 free (token
.string
);
1085 case token_type_lparen
:
1086 if (extract_parenthesized (mlp
, inner_context
, next_context_iter
,
1087 state
? next_commas_to_skip
: -1,
1088 state
? next_plural_commas
: 0))
1090 next_context_iter
= null_context_list_iterator
;
1094 case token_type_rparen
:
1097 case token_type_comma
:
1098 if (commas_to_skip
>= 0)
1100 if (commas_to_skip
> 0)
1103 if (plural_mp
!= NULL
&& plural_commas
> 0)
1105 commas_to_skip
= plural_commas
- 1;
1109 commas_to_skip
= -1;
1112 inherited_context (outer_context
,
1113 flag_context_list_iterator_advance (
1115 next_context_iter
= passthrough_context_list_iterator
;
1119 case token_type_string
:
1122 pos
.file_name
= logical_file_name
;
1123 pos
.line_number
= token
.line_number
;
1127 savable_comment_to_xgettext_comment (token
.comment
);
1128 remember_a_message (mlp
, token
.string
, inner_context
, &pos
);
1129 savable_comment_reset ();
1133 if (commas_to_skip
== 0)
1135 if (plural_mp
== NULL
)
1137 /* Seen an msgid. */
1140 savable_comment_to_xgettext_comment (token
.comment
);
1141 mp
= remember_a_message (mlp
, token
.string
,
1142 inner_context
, &pos
);
1143 savable_comment_reset ();
1144 if (plural_commas
> 0)
1149 /* Seen an msgid_plural. */
1150 remember_a_message_plural (plural_mp
, token
.string
,
1151 inner_context
, &pos
);
1156 free (token
.string
);
1159 drop_reference (token
.comment
);
1160 next_context_iter
= null_context_list_iterator
;
1164 case token_type_eof
:
1167 case token_type_other
:
1168 next_context_iter
= null_context_list_iterator
;
1180 extract_python (FILE *f
,
1181 const char *real_filename
, const char *logical_filename
,
1182 flag_context_list_table_ty
*flag_table
,
1183 msgdomain_list_ty
*mdlp
)
1185 message_list_ty
*mlp
= mdlp
->item
[0]->messages
;
1187 /* We convert our strings to UTF-8 encoding. */
1188 xgettext_current_source_encoding
= po_charset_utf8
;
1191 real_file_name
= real_filename
;
1192 logical_file_name
= xstrdup (logical_filename
);
1195 last_comment_line
= -1;
1196 last_non_comment_line
= -1;
1200 flag_context_list_table
= flag_table
;
1204 /* Eat tokens until eof is seen. When extract_parenthesized returns
1205 due to an unbalanced closing parenthesis, just restart it. */
1206 while (!extract_parenthesized (mlp
, null_context
, null_context_list_iterator
,
1211 real_file_name
= NULL
;
1212 logical_file_name
= NULL
;