Sync usage with man page.
[netbsd-mini2440.git] / gnu / dist / gettext / gettext-tools / src / x-php.c
blob80175e7ad15f0276ddda91a517e005ab67f5a9cf
/* xgettext PHP backend.
   Copyright (C) 2001-2003 Free Software Foundation, Inc.

   This file was written by Bruno Haible <bruno@clisp.org>, 2002.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software Foundation,
   Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "message.h"
#include "xgettext.h"
#include "x-php.h"
#include "error.h"
#include "xalloc.h"
#include "exit.h"
#include "gettext.h"
/* Shorthand for translating a user-visible message.  */
#define _(s) gettext (s)

/* Number of elements of the array A.  A must be a real array, not a
   pointer; the argument is parenthesized so that any array-valued
   expression can be passed.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
/* The PHP syntax is defined in phpdoc/manual/langref.html.
   See also php-4.1.0/Zend/zend_language_scanner.l.  */
46 /* ====================== Keyword set customization. ====================== */
48 /* If true extract all strings. */
49 static bool extract_all = false;
51 static hash_table keywords;
52 static bool default_keywords = true;
55 void
56 x_php_extract_all ()
58 extract_all = true;
62 void
63 x_php_keyword (const char *name)
65 if (name == NULL)
66 default_keywords = false;
67 else
69 const char *end;
70 int argnum1;
71 int argnum2;
72 const char *colon;
74 if (keywords.table == NULL)
75 init_hash (&keywords, 100);
77 split_keywordspec (name, &end, &argnum1, &argnum2);
79 /* The characters between name and end should form a valid C identifier.
80 A colon means an invalid parse in split_keywordspec(). */
81 colon = strchr (name, ':');
82 if (colon == NULL || colon >= end)
84 if (argnum1 == 0)
85 argnum1 = 1;
86 insert_entry (&keywords, name, end - name,
87 (void *) (long) (argnum1 + (argnum2 << 10)));
92 /* Finish initializing the keywords hash table.
93 Called after argument processing, before each file is processed. */
94 static void
95 init_keywords ()
97 if (default_keywords)
99 x_php_keyword ("_");
100 x_php_keyword ("gettext");
101 x_php_keyword ("dgettext:2");
102 x_php_keyword ("dcgettext:2");
103 /* The following were added in PHP 4.2.0. */
104 x_php_keyword ("ngettext:1,2");
105 x_php_keyword ("dngettext:2,3");
106 x_php_keyword ("dcngettext:2,3");
107 default_keywords = false;
/* Record the format-string flags of the functions known to this backend.
   Each specification has the form "function:argnum:flag".  */
void
init_flag_table_php (void)
{
  xgettext_record_flag ("_:1:pass-php-format");
  xgettext_record_flag ("gettext:1:pass-php-format");
  xgettext_record_flag ("dgettext:2:pass-php-format");
  xgettext_record_flag ("dcgettext:2:pass-php-format");
  xgettext_record_flag ("ngettext:1:pass-php-format");
  xgettext_record_flag ("ngettext:2:pass-php-format");
  xgettext_record_flag ("dngettext:2:pass-php-format");
  xgettext_record_flag ("dngettext:3:pass-php-format");
  xgettext_record_flag ("dcngettext:2:pass-php-format");
  xgettext_record_flag ("dcngettext:3:pass-php-format");
  xgettext_record_flag ("sprintf:1:php-format");
  xgettext_record_flag ("printf:1:php-format");
}
/* ======================== Reading of characters.  ======================== */


/* Real filename, used in error messages about the input file.  */
static const char *real_file_name;

/* Logical filename and line number, used to label the extracted messages.  */
static char *logical_file_name;
static int line_number;

/* The input file stream.  */
static FILE *fp;
143 /* 1. line_number handling. */
145 static unsigned char phase1_pushback[2];
146 static int phase1_pushback_length;
148 static int
149 phase1_getc ()
151 int c;
153 if (phase1_pushback_length)
154 c = phase1_pushback[--phase1_pushback_length];
155 else
157 c = getc (fp);
159 if (c == EOF)
161 if (ferror (fp))
162 error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
163 real_file_name);
164 return EOF;
168 if (c == '\n')
169 line_number++;
171 return c;
174 /* Supports 2 characters of pushback. */
175 static void
176 phase1_ungetc (int c)
178 if (c != EOF)
180 if (c == '\n')
181 --line_number;
183 if (phase1_pushback_length == SIZEOF (phase1_pushback))
184 abort ();
185 phase1_pushback[phase1_pushback_length++] = c;
/* 2. Ignore HTML sections.  They are equivalent to PHP echo commands and
   therefore don't contain translatable strings.  */

/* Return true if C is a whitespace character that may occur inside a tag.  */
static inline bool
html_is_space (int c)
{
  return c == ' ' || c == '\t' || c == '\n' || c == '\r';
}

/* Starting with the already read character C, skip whitespace and return
   the first character that is not whitespace (possibly EOF).  */
static int
html_skip_space (int c)
{
  while (html_is_space (c))
    c = phase1_getc ();
  return c;
}

/* Check that the already read character C, followed by further input
   characters, spells WORD.  WORD must be non-empty; when IGNORE_CASE is
   true it must consist of lower case letters and each of them also matches
   its upper case form.
   Return true if all of WORD was read.  Otherwise push back the first
   character that did not match and return false.  */
static bool
html_expect_word (int c, const char *word, bool ignore_case)
{
  for (;;)
    {
      if (c != *word && !(ignore_case && c == *word - ('a' - 'A')))
        {
          phase1_ungetc (c);
          return false;
        }
      if (*++word == '\0')
        return true;
      c = phase1_getc ();
    }
}

/* Consume input in HTML mode.  Return after an opening tag that switches
   to PHP mode has been read, or at EOF.  */
static void
skip_html (void)
{
  for (;;)
    {
      bool is_php;
      int c = phase1_getc ();

      if (c == EOF)
        return;

      if (c != '<')
        continue;

      c = phase1_getc ();
      if (c == EOF)
        return;

      if (c == '?' || c == '%')
        {
          /* <?php is the normal way to enter PHP mode.  <? and <?= are
             recognized by PHP depending on a configuration setting, and so
             are <% and <%=.  Only an optional '=' is consumed here; the
             "php" of "<?php" is left in the input.  */
          int c3 = phase1_getc ();

          if (c3 != '=')
            phase1_ungetc (c3);

          return;
        }

      if (c == '<')
        {
          /* The second '<' may itself start a tag.  */
          phase1_ungetc (c);
          continue;
        }

      /* < script language = php >
         < script language = "php" >
         < script language = 'php' >
         are always recognized.  Whenever a character does not fit, it is
         pushed back and scanning for the next '<' resumes.  */
      c = html_skip_space (c);
      if (!html_expect_word (c, "script", true))
        continue;

      /* At least one whitespace character must follow "script".  */
      c = phase1_getc ();
      if (!html_is_space (c))
        {
          phase1_ungetc (c);
          continue;
        }
      c = html_skip_space (c);
      if (!html_expect_word (c, "language", true))
        continue;

      c = html_skip_space (phase1_getc ());
      if (c != '=')
        {
          phase1_ungetc (c);
          continue;
        }

      /* The value must be exactly php in lower case, optionally enclosed
         in a matching pair of quotes.  */
      c = html_skip_space (phase1_getc ());
      if (c == '"')
        is_php = html_expect_word (phase1_getc (), "php\"", false);
      else if (c == '\'')
        is_php = html_expect_word (phase1_getc (), "php'", false);
      else
        is_php = html_expect_word (c, "php", false);
      if (!is_php)
        continue;

      c = html_skip_space (phase1_getc ());
      if (c != '>')
        {
          phase1_ungetc (c);
          continue;
        }
      return;
    }
}
#if 0

/* Disabled alternative: a separate phase that detects the end of PHP mode
   and replaces the following HTML section by a single space.  The same
   detection is done inside x_php_lex instead.  */

static unsigned char phase2_pushback[1];
static int phase2_pushback_length;

static int
phase2_getc (void)
{
  int c;

  if (phase2_pushback_length)
    return phase2_pushback[--phase2_pushback_length];

  c = phase1_getc ();
  switch (c)
    {
    case '?':
    case '%':
      {
        int c2 = phase1_getc ();
        if (c2 == '>')
          {
            /* ?> and %> terminate PHP mode and switch back to HTML mode.  */
            skip_html ();
            return ' ';
          }
        phase1_ungetc (c2);
      }
      break;

    case '<':
      {
        int c2 = phase1_getc ();

        /* < / script > terminates PHP mode and switches back to HTML mode.  */
        while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r')
          c2 = phase1_getc ();
        if (c2 == '/')
          {
            do
              c2 = phase1_getc ();
            while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r');
            if (c2 == 's' || c2 == 'S')
              {
                c2 = phase1_getc ();
                if (c2 == 'c' || c2 == 'C')
                  {
                    c2 = phase1_getc ();
                    if (c2 == 'r' || c2 == 'R')
                      {
                        c2 = phase1_getc ();
                        if (c2 == 'i' || c2 == 'I')
                          {
                            c2 = phase1_getc ();
                            if (c2 == 'p' || c2 == 'P')
                              {
                                c2 = phase1_getc ();
                                if (c2 == 't' || c2 == 'T')
                                  {
                                    do
                                      c2 = phase1_getc ();
                                    while (c2 == ' ' || c2 == '\t'
                                           || c2 == '\n' || c2 == '\r');
                                    if (c2 == '>')
                                      {
                                        skip_html ();
                                        return ' ';
                                      }
                                  }
                              }
                          }
                      }
                  }
              }
          }
        /* Not a closing script tag: give back the character that did
           not fit.  */
        phase1_ungetc (c2);
      }
      break;
    }

  return c;
}

static void
phase2_ungetc (int c)
{
  if (c != EOF)
    {
      if (phase2_pushback_length == SIZEOF (phase2_pushback))
        abort ();
      phase2_pushback[phase2_pushback_length++] = c;
    }
}

#endif
/* Accumulating comments.  */

/* Text of the comment line being collected: buflen characters are in use
   out of bufmax allocated.  */
static char *buffer;
static size_t bufmax;
static size_t buflen;

/* Begin collecting a new comment line.  The allocation is kept.  */
static inline void
comment_start (void)
{
  buflen = 0;
}
544 static inline void
545 comment_add (int c)
547 if (buflen >= bufmax)
549 bufmax = 2 * bufmax + 10;
550 buffer = xrealloc (buffer, bufmax);
552 buffer[buflen++] = c;
555 static inline void
556 comment_line_end (size_t chars_to_remove)
558 buflen -= chars_to_remove;
559 while (buflen >= 1
560 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
561 --buflen;
562 if (chars_to_remove == 0 && buflen >= bufmax)
564 bufmax = 2 * bufmax + 10;
565 buffer = xrealloc (buffer, bufmax);
567 buffer[buflen] = '\0';
568 xgettext_comment_add (buffer);
572 /* 3. Replace each comment that is not inside a string literal with a
573 space character. We need to remember the comment for later, because
574 it may be attached to a keyword string. */
576 /* These are for tracking whether comments count as immediately before
577 keyword. */
578 static int last_comment_line;
579 static int last_non_comment_line;
581 static unsigned char phase3_pushback[1];
582 static int phase3_pushback_length;
584 static int
585 phase3_getc ()
587 int lineno;
588 int c;
590 if (phase3_pushback_length)
591 return phase3_pushback[--phase3_pushback_length];
593 c = phase1_getc ();
595 if (c == '#')
597 /* sh comment. */
598 bool last_was_qmark = false;
600 comment_start ();
601 lineno = line_number;
602 for (;;)
604 c = phase1_getc ();
605 if (c == '\n' || c == EOF)
607 comment_line_end (0);
608 break;
610 if (last_was_qmark && c == '>')
612 comment_line_end (1);
613 skip_html ();
614 break;
616 /* We skip all leading white space, but not EOLs. */
617 if (!(buflen == 0 && (c == ' ' || c == '\t')))
618 comment_add (c);
619 last_was_qmark = (c == '?' || c == '%');
621 last_comment_line = lineno;
622 return '\n';
624 else if (c == '/')
626 c = phase1_getc ();
628 switch (c)
630 default:
631 phase1_ungetc (c);
632 return '/';
634 case '*':
636 /* C comment. */
637 bool last_was_star;
639 comment_start ();
640 lineno = line_number;
641 last_was_star = false;
642 for (;;)
644 c = phase1_getc ();
645 if (c == EOF)
646 break;
647 /* We skip all leading white space, but not EOLs. */
648 if (buflen == 0 && (c == ' ' || c == '\t'))
649 continue;
650 comment_add (c);
651 switch (c)
653 case '\n':
654 comment_line_end (1);
655 comment_start ();
656 lineno = line_number;
657 last_was_star = false;
658 continue;
660 case '*':
661 last_was_star = true;
662 continue;
664 case '/':
665 if (last_was_star)
667 comment_line_end (2);
668 break;
670 /* FALLTHROUGH */
672 default:
673 last_was_star = false;
674 continue;
676 break;
678 last_comment_line = lineno;
679 return ' ';
682 case '/':
684 /* C++ comment. */
685 bool last_was_qmark = false;
687 comment_start ();
688 lineno = line_number;
689 for (;;)
691 c = phase1_getc ();
692 if (c == '\n' || c == EOF)
694 comment_line_end (0);
695 break;
697 if (last_was_qmark && c == '>')
699 comment_line_end (1);
700 skip_html ();
701 break;
703 /* We skip all leading white space, but not EOLs. */
704 if (!(buflen == 0 && (c == ' ' || c == '\t')))
705 comment_add (c);
706 last_was_qmark = (c == '?' || c == '%');
708 last_comment_line = lineno;
709 return '\n';
713 else
714 return c;
#ifdef unused
/* Push C back for phase3_getc.  Supports 1 character of pushback;
   pushing back EOF is a no-op.  */
static void
phase3_ungetc (int c)
{
  if (c != EOF)
    {
      if (phase3_pushback_length == SIZEOF (phase3_pushback))
        abort ();
      phase3_pushback[phase3_pushback_length++] = c;
    }
}
#endif
/* ========================== Reading of tokens.  ========================== */


enum token_type_ty
{
  token_type_eof,
  token_type_lparen,            /* ( */
  token_type_rparen,            /* ) */
  token_type_comma,             /* , */
  token_type_string_literal,    /* "abc" */
  token_type_symbol,            /* symbol, number */
  token_type_other              /* misc. operator */
};
typedef enum token_type_ty token_type_ty;

typedef struct token_ty token_ty;
struct token_ty
{
  token_type_ty type;
  char *string;         /* for token_type_string_literal, token_type_symbol */
  int line_number;
};


/* Free the memory pointed to by a 'struct token_ty'.
   Only string literal and symbol tokens own their string.  */
static inline void
free_token (token_ty *tp)
{
  if (tp->type == token_type_string_literal || tp->type == token_type_symbol)
    free (tp->string);
}
764 /* 4. Combine characters into tokens. Discard whitespace. */
766 static void
767 x_php_lex (token_ty *tp)
769 static char *buffer;
770 static int bufmax;
771 int bufpos;
772 int c;
774 tp->string = NULL;
776 for (;;)
778 tp->line_number = line_number;
779 c = phase3_getc ();
780 switch (c)
782 case EOF:
783 tp->type = token_type_eof;
784 return;
786 case '\n':
787 if (last_non_comment_line > last_comment_line)
788 xgettext_comment_reset ();
789 /* FALLTHROUGH */
790 case ' ':
791 case '\t':
792 case '\r':
793 /* Ignore whitespace. */
794 continue;
797 last_non_comment_line = tp->line_number;
799 switch (c)
801 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
802 case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
803 case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
804 case 'V': case 'W': case 'X': case 'Y': case 'Z':
805 case '_':
806 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
807 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
808 case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
809 case 'v': case 'w': case 'x': case 'y': case 'z':
810 bufpos = 0;
811 for (;;)
813 if (bufpos >= bufmax)
815 bufmax = 2 * bufmax + 10;
816 buffer = xrealloc (buffer, bufmax);
818 buffer[bufpos++] = c;
819 c = phase1_getc ();
820 switch (c)
822 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
823 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
824 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
825 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
826 case 'Y': case 'Z':
827 case '_':
828 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
829 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
830 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
831 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
832 case 'y': case 'z':
833 case '0': case '1': case '2': case '3': case '4':
834 case '5': case '6': case '7': case '8': case '9':
835 continue;
837 default:
838 phase1_ungetc (c);
839 break;
841 break;
843 if (bufpos >= bufmax)
845 bufmax = 2 * bufmax + 10;
846 buffer = xrealloc (buffer, bufmax);
848 buffer[bufpos] = 0;
849 tp->string = xstrdup (buffer);
850 tp->type = token_type_symbol;
851 return;
853 case '\'':
854 /* Single-quoted string literal. */
855 bufpos = 0;
856 for (;;)
858 c = phase1_getc ();
859 if (c == EOF || c == '\'')
860 break;
861 if (c == '\\')
863 c = phase1_getc ();
864 if (c != '\\' && c != '\'')
866 phase1_ungetc (c);
867 c = '\\';
870 if (bufpos >= bufmax)
872 bufmax = 2 * bufmax + 10;
873 buffer = xrealloc (buffer, bufmax);
875 buffer[bufpos++] = c;
877 if (bufpos >= bufmax)
879 bufmax = 2 * bufmax + 10;
880 buffer = xrealloc (buffer, bufmax);
882 buffer[bufpos] = 0;
883 tp->type = token_type_string_literal;
884 tp->string = xstrdup (buffer);
885 return;
887 case '"':
888 /* Double-quoted string literal. */
889 tp->type = token_type_string_literal;
890 bufpos = 0;
891 for (;;)
893 c = phase1_getc ();
894 if (c == EOF || c == '"')
895 break;
896 if (c == '$')
898 c = phase1_getc ();
899 if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
900 || c == '_' || c == '{' || c >= 0x7f)
902 /* String with variables. */
903 tp->type = token_type_other;
904 continue;
906 phase1_ungetc (c);
907 c = '$';
909 if (c == '{')
911 c = phase1_getc ();
912 if (c == '$')
914 /* String with expressions. */
915 tp->type = token_type_other;
916 continue;
918 phase1_ungetc (c);
919 c = '{';
921 if (c == '\\')
923 int n, j;
925 c = phase1_getc ();
926 switch (c)
928 case '"':
929 case '\\':
930 case '$':
931 break;
933 case '0': case '1': case '2': case '3':
934 case '4': case '5': case '6': case '7':
935 n = 0;
936 for (j = 0; j < 3; ++j)
938 n = n * 8 + c - '0';
939 c = phase1_getc ();
940 switch (c)
942 default:
943 break;
945 case '0': case '1': case '2': case '3':
946 case '4': case '5': case '6': case '7':
947 continue;
949 break;
951 phase1_ungetc (c);
952 c = n;
953 break;
955 case 'x':
956 n = 0;
957 for (j = 0; j < 2; ++j)
959 c = phase1_getc ();
960 switch (c)
962 case '0': case '1': case '2': case '3': case '4':
963 case '5': case '6': case '7': case '8': case '9':
964 n = n * 16 + c - '0';
965 break;
966 case 'A': case 'B': case 'C': case 'D': case 'E':
967 case 'F':
968 n = n * 16 + 10 + c - 'A';
969 break;
970 case 'a': case 'b': case 'c': case 'd': case 'e':
971 case 'f':
972 n = n * 16 + 10 + c - 'a';
973 break;
974 default:
975 phase1_ungetc (c);
976 c = 0;
977 break;
979 if (c == 0)
980 break;
982 if (j == 0)
984 phase1_ungetc ('x');
985 c = '\\';
987 else
988 c = n;
989 break;
991 case 'n':
992 c = '\n';
993 break;
994 case 't':
995 c = '\t';
996 break;
997 case 'r':
998 c = '\r';
999 break;
1001 default:
1002 phase1_ungetc (c);
1003 c = '\\';
1004 break;
1007 if (bufpos >= bufmax)
1009 bufmax = 2 * bufmax + 10;
1010 buffer = xrealloc (buffer, bufmax);
1012 buffer[bufpos++] = c;
1014 if (bufpos >= bufmax)
1016 bufmax = 2 * bufmax + 10;
1017 buffer = xrealloc (buffer, bufmax);
1019 buffer[bufpos] = 0;
1020 if (tp->type == token_type_string_literal)
1021 tp->string = xstrdup (buffer);
1022 return;
1024 case '?':
1025 case '%':
1027 int c2 = phase1_getc ();
1028 if (c2 == '>')
1030 /* ?> and %> terminate PHP mode and switch back to HTML
1031 mode. */
1032 skip_html ();
1034 else
1035 phase1_ungetc (c2);
1036 tp->type = token_type_other;
1037 return;
1040 case '(':
1041 tp->type = token_type_lparen;
1042 return;
1044 case ')':
1045 tp->type = token_type_rparen;
1046 return;
1048 case ',':
1049 tp->type = token_type_comma;
1050 return;
1052 case '<':
1054 int c2 = phase1_getc ();
1055 if (c2 == '<')
1057 int c3 = phase1_getc ();
1058 if (c3 == '<')
1060 /* Start of here document.
1061 Parse whitespace, then label, then newline. */
1063 c = phase3_getc ();
1064 while (c == ' ' || c == '\t' || c == '\n' || c == '\r');
1066 bufpos = 0;
1069 if (bufpos >= bufmax)
1071 bufmax = 2 * bufmax + 10;
1072 buffer = xrealloc (buffer, bufmax);
1074 buffer[bufpos++] = c;
1075 c = phase3_getc ();
1077 while (c != EOF && c != '\n' && c != '\r');
1078 /* buffer[0..bufpos-1] now contains the label. */
1080 /* Now skip the here document. */
1081 for (;;)
1083 c = phase1_getc ();
1084 if (c == EOF)
1085 break;
1086 if (c == '\n' || c == '\r')
1088 int bufidx = 0;
1090 while (bufidx < bufpos)
1092 c = phase1_getc ();
1093 if (c == EOF)
1094 break;
1095 if (c != buffer[bufidx])
1097 phase1_ungetc (c);
1098 break;
1101 c = phase1_getc ();
1102 if (c != ';')
1103 phase1_ungetc (c);
1104 c = phase1_getc ();
1105 if (c == '\n' || c == '\r')
1106 break;
1110 /* FIXME: Ideally we should turn the here document into a
1111 string literal if it didn't contain $ substitution. And
1112 we should also respect backslash escape sequences like
1113 in double-quoted strings. */
1114 tp->type = token_type_other;
1115 return;
1117 phase1_ungetc (c3);
1120 /* < / script > terminates PHP mode and switches back to HTML
1121 mode. */
1122 while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r')
1123 c2 = phase1_getc ();
1124 if (c2 == '/')
1127 c2 = phase1_getc ();
1128 while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r');
1129 if (c2 == 's' || c2 == 'S')
1131 c2 = phase1_getc ();
1132 if (c2 == 'c' || c2 == 'C')
1134 c2 = phase1_getc ();
1135 if (c2 == 'r' || c2 == 'R')
1137 c2 = phase1_getc ();
1138 if (c2 == 'i' || c2 == 'I')
1140 c2 = phase1_getc ();
1141 if (c2 == 'p' || c2 == 'P')
1143 c2 = phase1_getc ();
1144 if (c2 == 't' || c2 == 'T')
1147 c2 = phase1_getc ();
1148 while (c2 == ' ' || c2 == '\t'
1149 || c2 == '\n' || c2 == '\r');
1150 if (c2 == '>')
1152 skip_html ();
1154 else
1155 phase1_ungetc (c2);
1157 else
1158 phase1_ungetc (c2);
1160 else
1161 phase1_ungetc (c2);
1163 else
1164 phase1_ungetc (c2);
1166 else
1167 phase1_ungetc (c2);
1169 else
1170 phase1_ungetc (c2);
1172 else
1173 phase1_ungetc (c2);
1175 else
1176 phase1_ungetc (c2);
1178 tp->type = token_type_other;
1179 return;
1182 case '`':
1183 /* Execution operator. */
1184 default:
1185 /* We could carefully recognize each of the 2 and 3 character
1186 operators, but it is not necessary, as we only need to recognize
1187 gettext invocations. Don't bother. */
1188 tp->type = token_type_other;
1189 return;
1195 /* ========================= Extracting strings. ========================== */
1198 /* Context lookup table. */
1199 static flag_context_list_table_ty *flag_context_list_table;
1202 /* The file is broken into tokens. Scan the token stream, looking for
1203 a keyword, followed by a left paren, followed by a string. When we
1204 see this sequence, we have something to remember. We assume we are
1205 looking at a valid C or C++ program, and leave the complaints about
1206 the grammar to the compiler.
1208 Normal handling: Look for
1209 keyword ( ... msgid ... )
1210 Plural handling: Look for
1211 keyword ( ... msgid ... msgid_plural ... )
1213 We use recursion because the arguments before msgid or between msgid
1214 and msgid_plural can contain subexpressions of the same form. */
1217 /* Extract messages until the next balanced closing parenthesis.
1218 Extracted messages are added to MLP.
1219 When a specific argument shall be extracted, COMMAS_TO_SKIP >= 0 and,
1220 if also a plural argument shall be extracted, PLURAL_COMMAS > 0,
1221 otherwise PLURAL_COMMAS = 0.
1222 When no specific argument shall be extracted, COMMAS_TO_SKIP < 0.
1223 Return true upon eof, false upon closing parenthesis. */
1224 static bool
1225 extract_parenthesized (message_list_ty *mlp,
1226 flag_context_ty outer_context,
1227 flag_context_list_iterator_ty context_iter,
1228 int commas_to_skip, int plural_commas)
1230 /* Remember the message containing the msgid, for msgid_plural. */
1231 message_ty *plural_mp = NULL;
1233 /* 0 when no keyword has been seen. 1 right after a keyword is seen. */
1234 int state;
1235 /* Parameters of the keyword just seen. Defined only in state 1. */
1236 int next_commas_to_skip = -1;
1237 int next_plural_commas = 0;
1238 /* Context iterator that will be used if the next token is a '('. */
1239 flag_context_list_iterator_ty next_context_iter =
1240 passthrough_context_list_iterator;
1241 /* Current context. */
1242 flag_context_ty inner_context =
1243 inherited_context (outer_context,
1244 flag_context_list_iterator_advance (&context_iter));
1246 /* Start state is 0. */
1247 state = 0;
1249 for (;;)
1251 token_ty token;
1253 x_php_lex (&token);
1254 switch (token.type)
1256 case token_type_symbol:
1258 void *keyword_value;
1260 if (find_entry (&keywords, token.string, strlen (token.string),
1261 &keyword_value)
1262 == 0)
1264 int argnum1 = (int) (long) keyword_value & ((1 << 10) - 1);
1265 int argnum2 = (int) (long) keyword_value >> 10;
1267 next_commas_to_skip = argnum1 - 1;
1268 next_plural_commas = (argnum2 > argnum1 ? argnum2 - argnum1 : 0);
1269 state = 1;
1271 else
1272 state = 0;
1274 next_context_iter =
1275 flag_context_list_iterator (
1276 flag_context_list_table_lookup (
1277 flag_context_list_table,
1278 token.string, strlen (token.string)));
1279 free (token.string);
1280 continue;
1282 case token_type_lparen:
1283 if (extract_parenthesized (mlp, inner_context, next_context_iter,
1284 state ? next_commas_to_skip : -1,
1285 state ? next_plural_commas: 0))
1286 return true;
1287 next_context_iter = null_context_list_iterator;
1288 state = 0;
1289 continue;
1291 case token_type_rparen:
1292 return false;
1294 case token_type_comma:
1295 if (commas_to_skip >= 0)
1297 if (commas_to_skip > 0)
1298 commas_to_skip--;
1299 else
1300 if (plural_mp != NULL && plural_commas > 0)
1302 commas_to_skip = plural_commas - 1;
1303 plural_commas = 0;
1305 else
1306 commas_to_skip = -1;
1308 inner_context =
1309 inherited_context (outer_context,
1310 flag_context_list_iterator_advance (
1311 &context_iter));
1312 next_context_iter = passthrough_context_list_iterator;
1313 state = 0;
1314 continue;
1316 case token_type_string_literal:
1318 lex_pos_ty pos;
1319 pos.file_name = logical_file_name;
1320 pos.line_number = token.line_number;
1322 if (extract_all)
1323 remember_a_message (mlp, token.string, inner_context, &pos);
1324 else
1326 if (commas_to_skip == 0)
1328 if (plural_mp == NULL)
1330 /* Seen an msgid. */
1331 message_ty *mp =
1332 remember_a_message (mlp, token.string,
1333 inner_context, &pos);
1334 if (plural_commas > 0)
1335 plural_mp = mp;
1337 else
1339 /* Seen an msgid_plural. */
1340 remember_a_message_plural (plural_mp, token.string,
1341 inner_context, &pos);
1342 plural_mp = NULL;
1345 else
1346 free (token.string);
1349 next_context_iter = null_context_list_iterator;
1350 state = 0;
1351 continue;
1353 case token_type_other:
1354 next_context_iter = null_context_list_iterator;
1355 state = 0;
1356 continue;
1358 case token_type_eof:
1359 return true;
1361 default:
1362 abort ();
1368 void
1369 extract_php (FILE *f,
1370 const char *real_filename, const char *logical_filename,
1371 flag_context_list_table_ty *flag_table,
1372 msgdomain_list_ty *mdlp)
1374 message_list_ty *mlp = mdlp->item[0]->messages;
1376 fp = f;
1377 real_file_name = real_filename;
1378 logical_file_name = xstrdup (logical_filename);
1379 line_number = 1;
1381 last_comment_line = -1;
1382 last_non_comment_line = -1;
1384 flag_context_list_table = flag_table;
1386 init_keywords ();
1388 /* Initial mode is HTML mode, not PHP mode. */
1389 skip_html ();
1391 /* Eat tokens until eof is seen. When extract_parenthesized returns
1392 due to an unbalanced closing parenthesis, just restart it. */
1393 while (!extract_parenthesized (mlp, null_context, null_context_list_iterator,
1394 -1, 0))
1397 /* Close scanner. */
1398 fp = NULL;
1399 real_file_name = NULL;
1400 logical_file_name = NULL;
1401 line_number = 0;