Sync usage with man page.
[netbsd-mini2440.git] / gnu / dist / gettext / gettext-tools / src / x-python.c
blobd2b6a85ac28d98d0e3ca0294f35d65e06835b13f
/* xgettext Python backend.
   Copyright (C) 2002-2003 Free Software Foundation, Inc.

   This file was written by Bruno Haible <haible@clisp.cons.org>, 2002.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software Foundation,
   Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
20 #ifdef HAVE_CONFIG_H
21 # include "config.h"
22 #endif
24 #include <assert.h>
25 #include <errno.h>
26 #include <stdbool.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string.h>
31 #include "message.h"
32 #include "xgettext.h"
33 #include "x-python.h"
34 #include "error.h"
35 #include "error-progname.h"
36 #include "xalloc.h"
37 #include "exit.h"
38 #include "po-charset.h"
39 #include "uniname.h"
40 #include "utf16-ucs4.h"
41 #include "ucs4-utf8.h"
42 #include "gettext.h"
/* Shorthand for looking up the translation of a user-visible message.  */
#define _(s) gettext(s)

/* Larger of A and B.  Each argument may be evaluated twice, so the
   arguments must be free of side effects.  */
#define max(a,b) ((a) > (b) ? (a) : (b))

/* Number of elements of the array A.  A must be a real array, not a
   pointer.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
/* The Python syntax is defined in the Python Reference Manual
   /usr/share/doc/packages/python/html/ref/index.html.
   See also Python-2.0/Parser/tokenizer.c, Python-2.0/Python/compile.c,
   Python-2.0/Objects/unicodeobject.c.  */
57 /* ====================== Keyword set customization. ====================== */
59 /* If true extract all strings. */
60 static bool extract_all = false;
62 static hash_table keywords;
63 static bool default_keywords = true;
66 void
67 x_python_extract_all ()
69 extract_all = true;
73 void
74 x_python_keyword (const char *name)
76 if (name == NULL)
77 default_keywords = false;
78 else
80 const char *end;
81 int argnum1;
82 int argnum2;
83 const char *colon;
85 if (keywords.table == NULL)
86 init_hash (&keywords, 100);
88 split_keywordspec (name, &end, &argnum1, &argnum2);
90 /* The characters between name and end should form a valid C identifier.
91 A colon means an invalid parse in split_keywordspec(). */
92 colon = strchr (name, ':');
93 if (colon == NULL || colon >= end)
95 if (argnum1 == 0)
96 argnum1 = 1;
97 insert_entry (&keywords, name, end - name,
98 (void *) (long) (argnum1 + (argnum2 << 10)));
103 /* Finish initializing the keywords hash table.
104 Called after argument processing, before each file is processed. */
105 static void
106 init_keywords ()
108 if (default_keywords)
110 x_python_keyword ("gettext");
111 x_python_keyword ("ugettext");
112 x_python_keyword ("dgettext:2");
113 x_python_keyword ("ngettext:1,2");
114 x_python_keyword ("ungettext:1,2");
115 x_python_keyword ("dngettext:2,3");
116 x_python_keyword ("_");
117 default_keywords = false;
/* Record the format string flags of the built-in keywords: each listed
   argument position is registered with the "pass-python-format" flag.  */
void
init_flag_table_python (void)
{
  xgettext_record_flag ("gettext:1:pass-python-format");
  xgettext_record_flag ("ugettext:1:pass-python-format");
  xgettext_record_flag ("dgettext:2:pass-python-format");
  xgettext_record_flag ("ngettext:1:pass-python-format");
  xgettext_record_flag ("ngettext:2:pass-python-format");
  xgettext_record_flag ("ungettext:1:pass-python-format");
  xgettext_record_flag ("ungettext:2:pass-python-format");
  xgettext_record_flag ("dngettext:2:pass-python-format");
  xgettext_record_flag ("dngettext:3:pass-python-format");
  xgettext_record_flag ("_:1:pass-python-format");
  /* xgettext_record_flag ("%:1:python-format"); // % is an infix operator! */
}
/* ======================== Reading of characters.  ======================== */

/* Real filename, used in error messages about the input file.  */
static const char *real_file_name;

/* Logical filename and line number, used to label the extracted messages.  */
static char *logical_file_name;
static int line_number;

/* The input file stream.  */
static FILE *fp;
151 /* 1. line_number handling. Also allow a lookahead. */
153 static unsigned char phase1_pushback[max (9, UNINAME_MAX + 3)];
154 static int phase1_pushback_length;
156 static int
157 phase1_getc ()
159 int c;
161 if (phase1_pushback_length)
162 c = phase1_pushback[--phase1_pushback_length];
163 else
165 c = getc (fp);
167 if (c == EOF)
169 if (ferror (fp))
170 error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
171 real_file_name);
172 return EOF;
176 if (c == '\n')
177 line_number++;
179 return c;
182 /* Supports max (9, UNINAME_MAX + 3) characters of pushback. */
183 static void
184 phase1_ungetc (int c)
186 if (c != EOF)
188 if (c == '\n')
189 --line_number;
191 if (phase1_pushback_length == SIZEOF (phase1_pushback))
192 abort ();
193 phase1_pushback[phase1_pushback_length++] = c;
/* Accumulating comments.  */

/* Growable buffer holding the text of the comment line being read:
   BUFLEN bytes are in use, BUFMAX bytes are allocated.  */
static char *buffer;
static size_t bufmax;
static size_t buflen;

/* Begin accumulating a new comment line; the allocation is kept.  */
static inline void
comment_start (void)
{
  buflen = 0;
}
210 static inline void
211 comment_add (int c)
213 /* We assume the program source is in ISO-8859-1 (for consistency with
214 Python's \ooo and \xnn syntax inside strings), but we produce a POT
215 file in UTF-8 encoding. */
216 size_t len = ((unsigned char) c < 0x80 ? 1 : 2);
217 if (buflen + len > bufmax)
219 bufmax = 2 * bufmax + 10;
220 buffer = xrealloc (buffer, bufmax);
222 if ((unsigned char) c < 0x80)
223 buffer[buflen++] = c;
224 else
226 buffer[buflen++] = 0xc0 | ((unsigned char) c >> 6);
227 buffer[buflen++] = 0x80 | ((unsigned char) c & 0x3f);
231 static inline void
232 comment_line_end ()
234 while (buflen >= 1
235 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
236 --buflen;
237 if (buflen >= bufmax)
239 bufmax = 2 * bufmax + 10;
240 buffer = xrealloc (buffer, bufmax);
242 buffer[buflen] = '\0';
243 savable_comment_add (buffer);
246 /* These are for tracking whether comments count as immediately before
247 keyword. */
248 static int last_comment_line;
249 static int last_non_comment_line;
252 /* 2. Outside strings, replace backslash-newline with nothing and a comment
253 with nothing. */
255 static int
256 phase2_getc ()
258 int c;
260 for (;;)
262 c = phase1_getc ();
263 if (c == '\\')
265 c = phase1_getc ();
266 if (c != '\n')
268 phase1_ungetc (c);
269 /* This shouldn't happen usually, because "A backslash is
270 illegal elsewhere on a line outside a string literal." */
271 return '\\';
273 /* Eat backslash-newline. */
275 else if (c == '#')
277 /* Eat a comment. */
278 last_comment_line = line_number;
279 comment_start ();
280 for (;;)
282 c = phase1_getc ();
283 if (c == EOF || c == '\n')
284 break;
285 /* We skip all leading white space, but not EOLs. */
286 if (!(buflen == 0 && (c == ' ' || c == '\t')))
287 comment_add (c);
289 comment_line_end ();
290 return c;
292 else
293 return c;
/* Push back the character C read through phase2_getc (); it is simply
   handed to phase1_ungetc ().
   Supports only one pushback character.  */
static void
phase2_ungetc (int c)
{
  phase1_ungetc (c);
}
305 /* ========================== Reading of tokens. ========================== */
308 enum token_type_ty
310 token_type_eof,
311 token_type_lparen, /* ( */
312 token_type_rparen, /* ) */
313 token_type_comma, /* , */
314 token_type_string, /* "abc", 'abc', """abc""", '''abc''' */
315 token_type_symbol, /* symbol, number */
316 token_type_other /* misc. operator */
318 typedef enum token_type_ty token_type_ty;
320 typedef struct token_ty token_ty;
321 struct token_ty
323 token_type_ty type;
324 char *string; /* for token_type_string, token_type_symbol */
325 refcounted_string_list_ty *comment; /* for token_type_string */
326 int line_number;
330 /* There are two different input syntaxes for strings, "abc" and r"abc",
331 and two different input syntaxes for Unicode strings, u"abc" and ur"abc".
332 Which escape sequences are understood, i.e. what is interpreted specially
333 after backslash?
334 "abc" \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn
335 r"abc"
336 u"abc" \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn \unnnn \Unnnnnnnn \N{...}
337 ur"abc" \unnnn
338 The \unnnn values are UTF-16 values; a single \Unnnnnnnn can expand to two
339 \unnnn items. The \ooo and \xnn values are ISO-8859-1 values: u"\xff" and
340 u"\u00ff" are the same. */
342 #define P7_EOF (-1)
343 #define P7_STRING_END (-2)
345 static int
346 phase7_getuc (int quote_char,
347 bool triple, bool interpret_ansic, bool interpret_unicode,
348 unsigned int *backslash_counter)
350 int c;
352 for (;;)
354 /* Use phase 1, because phase 2 elides comments. */
355 c = phase1_getc ();
357 if (c == EOF)
358 return P7_EOF;
360 if (c == quote_char && (interpret_ansic || (*backslash_counter & 1) == 0))
362 if (triple)
364 int c1 = phase1_getc ();
365 if (c1 == quote_char)
367 int c2 = phase1_getc ();
368 if (c2 == quote_char)
369 return P7_STRING_END;
370 phase1_ungetc (c2);
372 phase1_ungetc (c1);
373 return c;
375 else
376 return P7_STRING_END;
379 if (c == '\n')
381 if (triple)
383 *backslash_counter = 0;
384 return '\n';
386 /* In r"..." and ur"..." strings, newline is only allowed
387 immediately after an odd number of backslashes (although the
388 backslashes are not interpreted!). */
389 if (!(interpret_ansic || (*backslash_counter & 1) == 0))
391 *backslash_counter = 0;
392 return '\n';
394 phase1_ungetc (c);
395 error_with_progname = false;
396 error (0, 0, _("%s:%d: warning: unterminated string"),
397 logical_file_name, line_number);
398 error_with_progname = true;
399 return P7_STRING_END;
402 if (c != '\\')
404 *backslash_counter = 0;
405 return c;
408 /* Backslash handling. */
410 if (!interpret_ansic && !interpret_unicode)
412 ++*backslash_counter;
413 return '\\';
416 /* Dispatch according to the character following the backslash. */
417 c = phase1_getc ();
418 if (c == EOF)
420 ++*backslash_counter;
421 return '\\';
424 if (interpret_ansic)
425 switch (c)
427 case '\n':
428 continue;
429 case '\\':
430 ++*backslash_counter;
431 return c;
432 case '\'': case '"':
433 *backslash_counter = 0;
434 return c;
435 case 'a':
436 *backslash_counter = 0;
437 return '\a';
438 case 'b':
439 *backslash_counter = 0;
440 return '\b';
441 case 'f':
442 *backslash_counter = 0;
443 return '\f';
444 case 'n':
445 *backslash_counter = 0;
446 return '\n';
447 case 'r':
448 *backslash_counter = 0;
449 return '\r';
450 case 't':
451 *backslash_counter = 0;
452 return '\t';
453 case 'v':
454 *backslash_counter = 0;
455 return '\v';
456 case '0': case '1': case '2': case '3': case '4':
457 case '5': case '6': case '7':
459 int n = c - '0';
461 c = phase1_getc ();
462 if (c != EOF)
464 if (c >= '0' && c <= '7')
466 n = (n << 3) + (c - '0');
467 c = phase1_getc ();
468 if (c != EOF)
470 if (c >= '0' && c <= '7')
471 n = (n << 3) + (c - '0');
472 else
473 phase1_ungetc (c);
476 else
477 phase1_ungetc (c);
479 *backslash_counter = 0;
480 return (unsigned char) n;
482 case 'x':
484 int c1 = phase1_getc ();
485 int n1;
487 if (c1 >= '0' && c1 <= '9')
488 n1 = c1 - '0';
489 else if (c1 >= 'A' && c1 <= 'F')
490 n1 = c1 - 'A' + 10;
491 else if (c1 >= 'a' && c1 <= 'f')
492 n1 = c1 - 'a' + 10;
493 else
494 n1 = -1;
496 if (n1 >= 0)
498 int c2 = phase1_getc ();
499 int n2;
501 if (c2 >= '0' && c2 <= '9')
502 n2 = c2 - '0';
503 else if (c2 >= 'A' && c2 <= 'F')
504 n2 = c2 - 'A' + 10;
505 else if (c2 >= 'a' && c2 <= 'f')
506 n2 = c2 - 'a' + 10;
507 else
508 n2 = -1;
510 if (n2 >= 0)
512 *backslash_counter = 0;
513 return (unsigned char) ((n1 << 4) + n2);
516 phase1_ungetc (c2);
518 phase1_ungetc (c1);
519 phase1_ungetc (c);
520 ++*backslash_counter;
521 return '\\';
525 if (interpret_unicode)
527 if (c == 'u')
529 unsigned char buf[4];
530 unsigned int n = 0;
531 int i;
533 for (i = 0; i < 4; i++)
535 int c1 = phase1_getc ();
537 if (c1 >= '0' && c1 <= '9')
538 n = (n << 4) + (c1 - '0');
539 else if (c1 >= 'A' && c1 <= 'F')
540 n = (n << 4) + (c1 - 'A' + 10);
541 else if (c1 >= 'a' && c1 <= 'f')
542 n = (n << 4) + (c1 - 'a' + 10);
543 else
545 phase1_ungetc (c1);
546 while (--i >= 0)
547 phase1_ungetc (buf[i]);
548 phase1_ungetc (c);
549 ++*backslash_counter;
550 return '\\';
553 buf[i] = c1;
555 *backslash_counter = 0;
556 return n;
559 if (interpret_ansic)
561 if (c == 'U')
563 unsigned char buf[8];
564 unsigned int n = 0;
565 int i;
567 for (i = 0; i < 8; i++)
569 int c1 = phase1_getc ();
571 if (c1 >= '0' && c1 <= '9')
572 n = (n << 4) + (c1 - '0');
573 else if (c1 >= 'A' && c1 <= 'F')
574 n = (n << 4) + (c1 - 'A' + 10);
575 else if (c1 >= 'a' && c1 <= 'f')
576 n = (n << 4) + (c1 - 'a' + 10);
577 else
579 phase1_ungetc (c1);
580 while (--i >= 0)
581 phase1_ungetc (buf[i]);
582 phase1_ungetc (c);
583 ++*backslash_counter;
584 return '\\';
587 buf[i] = c1;
589 if (n < 0x110000)
591 *backslash_counter = 0;
592 return n;
595 error_with_progname = false;
596 error (0, 0, _("%s:%d: warning: invalid Unicode character"),
597 logical_file_name, line_number);
598 error_with_progname = true;
600 while (--i >= 0)
601 phase1_ungetc (buf[i]);
602 phase1_ungetc (c);
603 ++*backslash_counter;
604 return '\\';
607 if (c == 'N')
609 int c1 = phase1_getc ();
610 if (c1 == '{')
612 unsigned char buf[UNINAME_MAX + 1];
613 int i;
614 unsigned int n;
616 for (i = 0; i < UNINAME_MAX; i++)
618 int c2 = phase1_getc ();
619 if (!(c2 >= ' ' && c2 <= '~'))
621 phase1_ungetc (c2);
622 while (--i >= 0)
623 phase1_ungetc (buf[i]);
624 phase1_ungetc (c1);
625 phase1_ungetc (c);
626 ++*backslash_counter;
627 return '\\';
629 if (c2 == '}')
630 break;
631 buf[i] = c2;
633 buf[i] = '\0';
635 n = unicode_name_character ((char *) buf);
636 if (n != UNINAME_INVALID)
638 *backslash_counter = 0;
639 return n;
642 phase1_ungetc ('}');
643 while (--i >= 0)
644 phase1_ungetc (buf[i]);
646 phase1_ungetc (c1);
647 phase1_ungetc (c);
648 ++*backslash_counter;
649 return '\\';
654 phase1_ungetc (c);
655 ++*backslash_counter;
656 return '\\';
661 /* Combine characters into tokens. Discard whitespace except newlines at
662 the end of logical lines. */
664 /* Number of pending open parentheses/braces/brackets. */
665 static int open_pbb;
667 static token_ty phase5_pushback[1];
668 static int phase5_pushback_length;
670 static void
671 phase5_get (token_ty *tp)
673 int c;
675 if (phase5_pushback_length)
677 *tp = phase5_pushback[--phase5_pushback_length];
678 return;
681 for (;;)
683 tp->line_number = line_number;
684 c = phase2_getc ();
686 switch (c)
688 case EOF:
689 tp->type = token_type_eof;
690 return;
692 case ' ':
693 case '\t':
694 case '\f':
695 /* Ignore whitespace and comments. */
696 continue;
698 case '\n':
699 if (last_non_comment_line > last_comment_line)
700 savable_comment_reset ();
701 /* Ignore newline if and only if it is used for implicit line
702 joining. */
703 if (open_pbb > 0)
704 continue;
705 tp->type = token_type_other;
706 return;
709 last_non_comment_line = tp->line_number;
711 switch (c)
713 case '.':
715 int c1 = phase2_getc ();
716 phase2_ungetc (c1);
717 if (!(c1 >= '0' && c1 <= '9'))
720 tp->type = token_type_other;
721 return;
724 /* FALLTHROUGH */
725 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
726 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
727 case 'M': case 'N': case 'O': case 'P': case 'Q':
728 case 'S': case 'T': case 'V': case 'W': case 'X':
729 case 'Y': case 'Z':
730 case '_':
731 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
732 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
733 case 'm': case 'n': case 'o': case 'p': case 'q':
734 case 's': case 't': case 'v': case 'w': case 'x':
735 case 'y': case 'z':
736 case '0': case '1': case '2': case '3': case '4':
737 case '5': case '6': case '7': case '8': case '9':
738 symbol:
739 /* Symbol, or part of a number. */
741 static char *buffer;
742 static int bufmax;
743 int bufpos;
745 bufpos = 0;
746 for (;;)
748 if (bufpos >= bufmax)
750 bufmax = 2 * bufmax + 10;
751 buffer = xrealloc (buffer, bufmax);
753 buffer[bufpos++] = c;
754 c = phase2_getc ();
755 switch (c)
757 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
758 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
759 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
760 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
761 case 'Y': case 'Z':
762 case '_':
763 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
764 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
765 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
766 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
767 case 'y': case 'z':
768 case '0': case '1': case '2': case '3': case '4':
769 case '5': case '6': case '7': case '8': case '9':
770 continue;
771 default:
772 phase2_ungetc (c);
773 break;
775 break;
777 if (bufpos >= bufmax)
779 bufmax = 2 * bufmax + 10;
780 buffer = xrealloc (buffer, bufmax);
782 buffer[bufpos] = '\0';
783 tp->string = xstrdup (buffer);
784 tp->type = token_type_symbol;
785 return;
788 /* Strings. */
790 static unsigned short *buffer;
791 static int bufmax;
792 int bufpos;
793 int quote_char;
794 bool interpret_ansic;
795 bool interpret_unicode;
796 bool triple;
797 unsigned int backslash_counter;
799 case 'R': case 'r':
801 int c1 = phase1_getc ();
802 if (c1 == '"' || c1 == '\'')
804 quote_char = c1;
805 interpret_ansic = false;
806 interpret_unicode = false;
807 goto string;
809 phase1_ungetc (c1);
810 goto symbol;
813 case 'U': case 'u':
815 int c1 = phase1_getc ();
816 if (c1 == '"' || c1 == '\'')
818 quote_char = c1;
819 interpret_ansic = true;
820 interpret_unicode = true;
821 goto string;
823 if (c1 == 'R' || c1 == 'r')
825 int c2 = phase1_getc ();
826 if (c2 == '"' || c2 == '\'')
828 quote_char = c2;
829 interpret_ansic = false;
830 interpret_unicode = true;
831 goto string;
833 phase1_ungetc (c2);
835 phase1_ungetc (c1);
836 goto symbol;
839 case '"': case '\'':
840 quote_char = c;
841 interpret_ansic = true;
842 interpret_unicode = false;
843 string:
844 triple = false;
846 int c1 = phase1_getc ();
847 if (c1 == quote_char)
849 int c2 = phase1_getc ();
850 if (c2 == quote_char)
851 triple = true;
852 else
854 phase1_ungetc (c2);
855 phase1_ungetc (c1);
858 else
859 phase1_ungetc (c1);
861 backslash_counter = 0;
862 /* Start accumulating the string. We store the string in
863 UTF-16 before converting it to UTF-8. Why not converting
864 every character directly to UTF-8? Because a string can
865 contain surrogates like u"\uD800\uDF00", and we must
866 combine them to a single UTF-8 character. */
867 bufpos = 0;
868 for (;;)
870 int uc = phase7_getuc (quote_char, triple, interpret_ansic,
871 interpret_unicode, &backslash_counter);
872 unsigned int len;
874 if (uc == P7_EOF || uc == P7_STRING_END)
875 break;
877 assert (uc >= 0 && uc < 0x110000);
878 len = (uc < 0x10000 ? 1 : 2);
879 if (bufpos + len > bufmax)
881 bufmax = 2 * bufmax + 10;
882 buffer =
883 xrealloc (buffer, bufmax * sizeof (unsigned short));
885 if (uc < 0x10000)
886 buffer[bufpos++] = uc;
887 else
889 buffer[bufpos++] = 0xd800 + ((uc - 0x10000) >> 10);
890 buffer[bufpos++] = 0xdc00 + ((uc - 0x10000) & 0x3ff);
893 /* Now convert from UTF-16 to UTF-8. */
895 int pos;
896 unsigned char *utf8_string;
897 unsigned char *q;
899 /* Each UTF-16 word needs 3 bytes at worst. */
900 utf8_string = (unsigned char *) xmalloc (3 * bufpos + 1);
901 for (pos = 0, q = utf8_string; pos < bufpos; )
903 unsigned int uc;
904 int n;
906 pos += u16_mbtouc (&uc, buffer + pos, bufpos - pos);
907 n = u8_uctomb (q, uc, 6);
908 assert (n > 0);
909 q += n;
911 *q = '\0';
912 assert (q - utf8_string <= 3 * bufpos);
913 tp->string = (char *) utf8_string;
915 tp->comment = add_reference (savable_comment);
916 tp->type = token_type_string;
917 return;
920 case '(':
921 open_pbb++;
922 tp->type = token_type_lparen;
923 return;
925 case ')':
926 if (open_pbb > 0)
927 open_pbb--;
928 tp->type = token_type_rparen;
929 return;
931 case ',':
932 tp->type = token_type_comma;
933 return;
935 case '[': case '{':
936 open_pbb++;
937 tp->type = token_type_other;
938 return;
940 case ']': case '}':
941 if (open_pbb > 0)
942 open_pbb--;
943 tp->type = token_type_other;
944 return;
946 default:
947 /* We could carefully recognize each of the 2 and 3 character
948 operators, but it is not necessary, as we only need to recognize
949 gettext invocations. Don't bother. */
950 tp->type = token_type_other;
951 return;
956 /* Supports only one pushback token. */
957 static void
958 phase5_unget (token_ty *tp)
960 if (tp->type != token_type_eof)
962 if (phase5_pushback_length == SIZEOF (phase5_pushback))
963 abort ();
964 phase5_pushback[phase5_pushback_length++] = *tp;
969 /* Combine adjacent strings to form a single string. Note that the end
970 of a logical line appears as a token of its own, therefore strings that
971 belong to different logical lines will not be concatenated. */
973 static void
974 x_python_lex (token_ty *tp)
976 phase5_get (tp);
977 if (tp->type != token_type_string)
978 return;
979 for (;;)
981 token_ty tmp;
982 size_t len;
984 phase5_get (&tmp);
985 if (tmp.type != token_type_string)
987 phase5_unget (&tmp);
988 return;
990 len = strlen (tp->string);
991 tp->string = xrealloc (tp->string, len + strlen (tmp.string) + 1);
992 strcpy (tp->string + len, tmp.string);
993 free (tmp.string);
998 /* ========================= Extracting strings. ========================== */
1001 /* Context lookup table. */
1002 static flag_context_list_table_ty *flag_context_list_table;
1005 /* The file is broken into tokens. Scan the token stream, looking for
1006 a keyword, followed by a left paren, followed by a string. When we
1007 see this sequence, we have something to remember. We assume we are
1008 looking at a valid C or C++ program, and leave the complaints about
1009 the grammar to the compiler.
1011 Normal handling: Look for
1012 keyword ( ... msgid ... )
1013 Plural handling: Look for
1014 keyword ( ... msgid ... msgid_plural ... )
1016 We use recursion because the arguments before msgid or between msgid
1017 and msgid_plural can contain subexpressions of the same form. */
1020 /* Extract messages until the next balanced closing parenthesis.
1021 Extracted messages are added to MLP.
1022 When a specific argument shall be extracted, COMMAS_TO_SKIP >= 0 and,
1023 if also a plural argument shall be extracted, PLURAL_COMMAS > 0,
1024 otherwise PLURAL_COMMAS = 0.
1025 When no specific argument shall be extracted, COMMAS_TO_SKIP < 0.
1026 Return true upon eof, false upon closing parenthesis. */
1027 static bool
1028 extract_parenthesized (message_list_ty *mlp,
1029 flag_context_ty outer_context,
1030 flag_context_list_iterator_ty context_iter,
1031 int commas_to_skip, int plural_commas)
1033 /* Remember the message containing the msgid, for msgid_plural. */
1034 message_ty *plural_mp = NULL;
1036 /* 0 when no keyword has been seen. 1 right after a keyword is seen. */
1037 int state;
1038 /* Parameters of the keyword just seen. Defined only in state 1. */
1039 int next_commas_to_skip = -1;
1040 int next_plural_commas = 0;
1041 /* Context iterator that will be used if the next token is a '('. */
1042 flag_context_list_iterator_ty next_context_iter =
1043 passthrough_context_list_iterator;
1044 /* Current context. */
1045 flag_context_ty inner_context =
1046 inherited_context (outer_context,
1047 flag_context_list_iterator_advance (&context_iter));
1049 /* Start state is 0. */
1050 state = 0;
1052 for (;;)
1054 token_ty token;
1056 x_python_lex (&token);
1057 switch (token.type)
1059 case token_type_symbol:
1061 void *keyword_value;
1063 if (find_entry (&keywords, token.string, strlen (token.string),
1064 &keyword_value)
1065 == 0)
1067 int argnum1 = (int) (long) keyword_value & ((1 << 10) - 1);
1068 int argnum2 = (int) (long) keyword_value >> 10;
1070 next_commas_to_skip = argnum1 - 1;
1071 next_plural_commas = (argnum2 > argnum1 ? argnum2 - argnum1 : 0);
1072 state = 1;
1074 else
1075 state = 0;
1077 next_context_iter =
1078 flag_context_list_iterator (
1079 flag_context_list_table_lookup (
1080 flag_context_list_table,
1081 token.string, strlen (token.string)));
1082 free (token.string);
1083 continue;
1085 case token_type_lparen:
1086 if (extract_parenthesized (mlp, inner_context, next_context_iter,
1087 state ? next_commas_to_skip : -1,
1088 state ? next_plural_commas : 0))
1089 return true;
1090 next_context_iter = null_context_list_iterator;
1091 state = 0;
1092 continue;
1094 case token_type_rparen:
1095 return false;
1097 case token_type_comma:
1098 if (commas_to_skip >= 0)
1100 if (commas_to_skip > 0)
1101 commas_to_skip--;
1102 else
1103 if (plural_mp != NULL && plural_commas > 0)
1105 commas_to_skip = plural_commas - 1;
1106 plural_commas = 0;
1108 else
1109 commas_to_skip = -1;
1111 inner_context =
1112 inherited_context (outer_context,
1113 flag_context_list_iterator_advance (
1114 &context_iter));
1115 next_context_iter = passthrough_context_list_iterator;
1116 state = 0;
1117 continue;
1119 case token_type_string:
1121 lex_pos_ty pos;
1122 pos.file_name = logical_file_name;
1123 pos.line_number = token.line_number;
1125 if (extract_all)
1127 savable_comment_to_xgettext_comment (token.comment);
1128 remember_a_message (mlp, token.string, inner_context, &pos);
1129 savable_comment_reset ();
1131 else
1133 if (commas_to_skip == 0)
1135 if (plural_mp == NULL)
1137 /* Seen an msgid. */
1138 message_ty *mp;
1140 savable_comment_to_xgettext_comment (token.comment);
1141 mp = remember_a_message (mlp, token.string,
1142 inner_context, &pos);
1143 savable_comment_reset ();
1144 if (plural_commas > 0)
1145 plural_mp = mp;
1147 else
1149 /* Seen an msgid_plural. */
1150 remember_a_message_plural (plural_mp, token.string,
1151 inner_context, &pos);
1152 plural_mp = NULL;
1155 else
1156 free (token.string);
1159 drop_reference (token.comment);
1160 next_context_iter = null_context_list_iterator;
1161 state = 0;
1162 continue;
1164 case token_type_eof:
1165 return true;
1167 case token_type_other:
1168 next_context_iter = null_context_list_iterator;
1169 state = 0;
1170 continue;
1172 default:
1173 abort ();
1179 void
1180 extract_python (FILE *f,
1181 const char *real_filename, const char *logical_filename,
1182 flag_context_list_table_ty *flag_table,
1183 msgdomain_list_ty *mdlp)
1185 message_list_ty *mlp = mdlp->item[0]->messages;
1187 /* We convert our strings to UTF-8 encoding. */
1188 xgettext_current_source_encoding = po_charset_utf8;
1190 fp = f;
1191 real_file_name = real_filename;
1192 logical_file_name = xstrdup (logical_filename);
1193 line_number = 1;
1195 last_comment_line = -1;
1196 last_non_comment_line = -1;
1198 open_pbb = 0;
1200 flag_context_list_table = flag_table;
1202 init_keywords ();
1204 /* Eat tokens until eof is seen. When extract_parenthesized returns
1205 due to an unbalanced closing parenthesis, just restart it. */
1206 while (!extract_parenthesized (mlp, null_context, null_context_list_iterator,
1207 -1, 0))
1210 fp = NULL;
1211 real_file_name = NULL;
1212 logical_file_name = NULL;
1213 line_number = 0;