1 /* xgettext PHP backend.
2 Copyright (C) 2001-2003 Free Software Foundation, Inc.
4 This file was written by Bruno Haible <bruno@clisp.org>, 2002.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software Foundation,
18 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
/* Shorthand for wrapping user-visible messages: _(s) expands to gettext (s). */
37 #define _(s) gettext(s)
/* Number of elements in the array A.  A must be a true array, not a pointer
   (for a pointer the quotient is meaningless).  The argument is parenthesized
   before it is indexed, so any array-valued expression can be passed without
   operator-precedence surprises.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
42 /* The PHP syntax is defined in phpdoc/manual/langref.html.
43 See also php-4.1.0/Zend/zend_language_scanner.l. */
46 /* ====================== Keyword set customization. ====================== */
48 /* If true, extract all strings. */
49 static bool extract_all
= false;
/* Table of recognized keywords.  Keys are keyword names; x_php_keyword stores
   each value as argnum1 + (argnum2 << 10) cast to a pointer, and
   extract_parenthesized unpacks it again with the same 10-bit split.  */
51 static hash_table keywords
;
/* Starts out true.  Assigned false in x_php_keyword, and again once the
   default gettext-family keywords have been registered.  */
52 static bool default_keywords
= true;
63 x_php_keyword (const char *name
)
66 default_keywords
= false;
74 if (keywords
.table
== NULL
)
75 init_hash (&keywords
, 100);
77 split_keywordspec (name
, &end
, &argnum1
, &argnum2
);
79 /* The characters between name and end should form a valid C identifier.
80 A colon means an invalid parse in split_keywordspec(). */
81 colon
= strchr (name
, ':');
82 if (colon
== NULL
|| colon
>= end
)
86 insert_entry (&keywords
, name
, end
- name
,
87 (void *) (long) (argnum1
+ (argnum2
<< 10)));
92 /* Finish initializing the keywords hash table.
93 Called after argument processing, before each file is processed. */
100 x_php_keyword ("gettext");
101 x_php_keyword ("dgettext:2");
102 x_php_keyword ("dcgettext:2");
103 /* The following were added in PHP 4.2.0. */
104 x_php_keyword ("ngettext:1,2");
105 x_php_keyword ("dngettext:2,3");
106 x_php_keyword ("dcngettext:2,3");
107 default_keywords
= false;
112 init_flag_table_php ()
114 xgettext_record_flag ("_:1:pass-php-format");
115 xgettext_record_flag ("gettext:1:pass-php-format");
116 xgettext_record_flag ("dgettext:2:pass-php-format");
117 xgettext_record_flag ("dcgettext:2:pass-php-format");
118 xgettext_record_flag ("ngettext:1:pass-php-format");
119 xgettext_record_flag ("ngettext:2:pass-php-format");
120 xgettext_record_flag ("dngettext:2:pass-php-format");
121 xgettext_record_flag ("dngettext:3:pass-php-format");
122 xgettext_record_flag ("dcngettext:2:pass-php-format");
123 xgettext_record_flag ("dcngettext:3:pass-php-format");
124 xgettext_record_flag ("sprintf:1:php-format");
125 xgettext_record_flag ("printf:1:php-format");
129 /* ======================== Reading of characters. ======================== */
132 /* Real filename, used in error messages about the input file.
   Set from extract_php's real_filename argument and reset to NULL when
   extract_php finishes.  */
133 static const char *real_file_name
;
135 /* Logical filename and line number, used to label the extracted messages.
   logical_file_name is an xstrdup'ed copy made by extract_php; it is what
   extract_parenthesized stores in pos.file_name before remembering a
   message.  line_number is the value x_php_lex copies into tp->line_number.  */
136 static char *logical_file_name
;
137 static int line_number
;
139 /* The input file stream. */
143 /* 1. line_number handling. */
/* Pushback stack for phase 1.  phase1_ungetc stores a character at index
   phase1_pushback_length and increments it; the phase 1 reader takes
   characters back from the top (last pushed, first returned).  Two slots,
   matching the 2 characters of pushback that phase1_ungetc supports.  */
145 static unsigned char phase1_pushback
[2];
/* Number of characters currently held in phase1_pushback.  */
146 static int phase1_pushback_length
;
153 if (phase1_pushback_length
)
154 c
= phase1_pushback
[--phase1_pushback_length
];
162 error (EXIT_FAILURE
, errno
, _("error while reading \"%s\""),
174 /* Supports 2 characters of pushback. */
176 phase1_ungetc (int c
)
183 if (phase1_pushback_length
== SIZEOF (phase1_pushback
))
185 phase1_pushback
[phase1_pushback_length
++] = c
;
190 /* 2. Ignore HTML sections. They are equivalent to PHP echo commands and
191 therefore don't contain translatable strings. */
198 int c
= phase1_getc ();
205 int c2
= phase1_getc ();
212 /* <?php is the normal way to enter PHP mode. <? and <?= are
213 recognized by PHP depending on a configuration setting. */
214 int c3
= phase1_getc ();
224 /* <% and <%= are recognized by PHP depending on a configuration setting. */
226 int c3
= phase1_getc ();
240 /* < script language = php >
241 < script language = "php" >
242 < script language = 'php' >
243 are always recognized. */
244 while (c2
== ' ' || c2
== '\t' || c2
== '\n' || c2
== '\r')
246 if (c2
!= 's' && c2
!= 'S')
252 if (c2
!= 'c' && c2
!= 'C')
258 if (c2
!= 'r' && c2
!= 'R')
264 if (c2
!= 'i' && c2
!= 'I')
270 if (c2
!= 'p' && c2
!= 'P')
276 if (c2
!= 't' && c2
!= 'T')
282 if (!(c2
== ' ' || c2
== '\t' || c2
== '\n' || c2
== '\r'))
289 while (c2
== ' ' || c2
== '\t' || c2
== '\n' || c2
== '\r');
290 if (c2
!= 'l' && c2
!= 'L')
296 if (c2
!= 'a' && c2
!= 'A')
302 if (c2
!= 'n' && c2
!= 'N')
308 if (c2
!= 'g' && c2
!= 'G')
314 if (c2
!= 'u' && c2
!= 'U')
320 if (c2
!= 'a' && c2
!= 'A')
326 if (c2
!= 'g' && c2
!= 'G')
332 if (c2
!= 'e' && c2
!= 'E')
338 while (c2
== ' ' || c2
== '\t' || c2
== '\n' || c2
== '\r')
346 while (c2
== ' ' || c2
== '\t' || c2
== '\n' || c2
== '\r')
423 while (c2
== ' ' || c2
== '\t' || c2
== '\n' || c2
== '\r')
/* One-slot pushback for phase 2.  phase2_ungetc stores a character here, and
   the phase 2 reader returns a pending character before reading anything
   new.  */
437 static unsigned char phase2_pushback
[1];
/* Number of characters currently held in phase2_pushback (0 or 1).  */
438 static int phase2_pushback_length
;
445 if (phase2_pushback_length
)
446 return phase2_pushback
[--phase2_pushback_length
];
454 int c2
= phase1_getc ();
457 /* ?> and %> terminate PHP mode and switch back to HTML mode. */
467 int c2
= phase1_getc ();
469 /* < / script > terminates PHP mode and switches back to HTML mode. */
470 while (c2
== ' ' || c2
== '\t' || c2
== '\n' || c2
== '\r')
476 while (c2
== ' ' || c2
== '\t' || c2
== '\n' || c2
== '\r');
477 if (c2
== 's' || c2
== 'S')
480 if (c2
== 'c' || c2
== 'C')
483 if (c2
== 'r' || c2
== 'R')
486 if (c2
== 'i' || c2
== 'I')
489 if (c2
== 'p' || c2
== 'P')
492 if (c2
== 't' || c2
== 'T')
496 while (c2
== ' ' || c2
== '\t'
497 || c2
== '\n' || c2
== '\r');
519 phase2_ungetc (int c
)
523 if (phase2_pushback_length
== SIZEOF (phase2_pushback
))
525 phase2_pushback
[phase2_pushback_length
++] = c
;
532 /* Accumulating comments. */
/* Allocated size of the comment buffer.  When buflen reaches it, it is grown
   to 2 * bufmax + 10 and the buffer is passed through xrealloc.  */
535 static size_t bufmax
;
/* Number of characters currently stored in the comment buffer.
   comment_line_end subtracts chars_to_remove from it before NUL-terminating
   the buffer and handing it to xgettext_comment_add.  */
536 static size_t buflen
;
547 if (buflen
>= bufmax
)
549 bufmax
= 2 * bufmax
+ 10;
550 buffer
= xrealloc (buffer
, bufmax
);
552 buffer
[buflen
++] = c
;
556 comment_line_end (size_t chars_to_remove
)
558 buflen
-= chars_to_remove
;
560 && (buffer
[buflen
- 1] == ' ' || buffer
[buflen
- 1] == '\t'))
562 if (chars_to_remove
== 0 && buflen
>= bufmax
)
564 bufmax
= 2 * bufmax
+ 10;
565 buffer
= xrealloc (buffer
, bufmax
);
567 buffer
[buflen
] = '\0';
568 xgettext_comment_add (buffer
);
572 /* 3. Replace each comment that is not inside a string literal with a
573 space character. We need to remember the comment for later, because
574 it may be attached to a keyword string. */
576 /* These are for tracking whether comments count as immediately before a keyword. */
/* Line number recorded by the phase 3 comment-handling code once a comment
   has been consumed (assigned from its local lineno).  extract_php
   initializes it to -1.  */
578 static int last_comment_line
;
/* Line number of the last token that x_php_lex did not skip as whitespace
   (assigned from tp->line_number).  extract_php initializes it to -1.  When
   it exceeds last_comment_line, x_php_lex calls xgettext_comment_reset.  */
579 static int last_non_comment_line
;
/* One-slot pushback for phase 3.  phase3_ungetc stores a character here, and
   the phase 3 reader returns a pending character before reading anything
   new.  */
581 static unsigned char phase3_pushback
[1];
/* Number of characters currently held in phase3_pushback (0 or 1).  */
582 static int phase3_pushback_length
;
590 if (phase3_pushback_length
)
591 return phase3_pushback
[--phase3_pushback_length
];
598 bool last_was_qmark
= false;
601 lineno
= line_number
;
605 if (c
== '\n' || c
== EOF
)
607 comment_line_end (0);
610 if (last_was_qmark
&& c
== '>')
612 comment_line_end (1);
616 /* We skip all leading white space, but not EOLs. */
617 if (!(buflen
== 0 && (c
== ' ' || c
== '\t')))
619 last_was_qmark
= (c
== '?' || c
== '%');
621 last_comment_line
= lineno
;
640 lineno
= line_number
;
641 last_was_star
= false;
647 /* We skip all leading white space, but not EOLs. */
648 if (buflen
== 0 && (c
== ' ' || c
== '\t'))
654 comment_line_end (1);
656 lineno
= line_number
;
657 last_was_star
= false;
661 last_was_star
= true;
667 comment_line_end (2);
673 last_was_star
= false;
678 last_comment_line
= lineno
;
685 bool last_was_qmark
= false;
688 lineno
= line_number
;
692 if (c
== '\n' || c
== EOF
)
694 comment_line_end (0);
697 if (last_was_qmark
&& c
== '>')
699 comment_line_end (1);
703 /* We skip all leading white space, but not EOLs. */
704 if (!(buflen
== 0 && (c
== ' ' || c
== '\t')))
706 last_was_qmark
= (c
== '?' || c
== '%');
708 last_comment_line
= lineno
;
719 phase3_ungetc (int c
)
723 if (phase3_pushback_length
== SIZEOF (phase3_pushback
))
725 phase3_pushback
[phase3_pushback_length
++] = c
;
731 /* ========================== Reading of tokens. ========================== */
737 token_type_lparen
, /* ( */
738 token_type_rparen
, /* ) */
739 token_type_comma
, /* , */
740 token_type_string_literal
, /* "abc" */
741 token_type_symbol
, /* symbol, number */
742 token_type_other
/* misc. operator */
/* Let the token kind enumeration be named without the 'enum' keyword.  */
744 typedef enum token_type_ty token_type_ty
;
/* Let the token structure be named without the 'struct' keyword.  */
746 typedef struct token_ty token_ty
;
750 char *string
; /* for token_type_string_literal, token_type_symbol */
755 /* Free the memory pointed to by a 'struct token_ty'. */
757 free_token (token_ty
*tp
)
759 if (tp
->type
== token_type_string_literal
|| tp
->type
== token_type_symbol
)
764 /* 4. Combine characters into tokens. Discard whitespace. */
767 x_php_lex (token_ty
*tp
)
778 tp
->line_number
= line_number
;
783 tp
->type
= token_type_eof
;
787 if (last_non_comment_line
> last_comment_line
)
788 xgettext_comment_reset ();
793 /* Ignore whitespace. */
797 last_non_comment_line
= tp
->line_number
;
801 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
802 case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
803 case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
804 case 'V': case 'W': case 'X': case 'Y': case 'Z':
806 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
807 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
808 case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
809 case 'v': case 'w': case 'x': case 'y': case 'z':
813 if (bufpos
>= bufmax
)
815 bufmax
= 2 * bufmax
+ 10;
816 buffer
= xrealloc (buffer
, bufmax
);
818 buffer
[bufpos
++] = c
;
822 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
823 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
824 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
825 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
828 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
829 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
830 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
831 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
833 case '0': case '1': case '2': case '3': case '4':
834 case '5': case '6': case '7': case '8': case '9':
843 if (bufpos
>= bufmax
)
845 bufmax
= 2 * bufmax
+ 10;
846 buffer
= xrealloc (buffer
, bufmax
);
849 tp
->string
= xstrdup (buffer
);
850 tp
->type
= token_type_symbol
;
854 /* Single-quoted string literal. */
859 if (c
== EOF
|| c
== '\'')
864 if (c
!= '\\' && c
!= '\'')
870 if (bufpos
>= bufmax
)
872 bufmax
= 2 * bufmax
+ 10;
873 buffer
= xrealloc (buffer
, bufmax
);
875 buffer
[bufpos
++] = c
;
877 if (bufpos
>= bufmax
)
879 bufmax
= 2 * bufmax
+ 10;
880 buffer
= xrealloc (buffer
, bufmax
);
883 tp
->type
= token_type_string_literal
;
884 tp
->string
= xstrdup (buffer
);
888 /* Double-quoted string literal. */
889 tp
->type
= token_type_string_literal
;
894 if (c
== EOF
|| c
== '"')
899 if ((c
>= 'A' && c
<= 'Z') || (c
>= 'a' && c
<= 'z')
900 || c
== '_' || c
== '{' || c
>= 0x7f)
902 /* String with variables. */
903 tp
->type
= token_type_other
;
914 /* String with expressions. */
915 tp
->type
= token_type_other
;
933 case '0': case '1': case '2': case '3':
934 case '4': case '5': case '6': case '7':
936 for (j
= 0; j
< 3; ++j
)
945 case '0': case '1': case '2': case '3':
946 case '4': case '5': case '6': case '7':
957 for (j
= 0; j
< 2; ++j
)
962 case '0': case '1': case '2': case '3': case '4':
963 case '5': case '6': case '7': case '8': case '9':
964 n
= n
* 16 + c
- '0';
966 case 'A': case 'B': case 'C': case 'D': case 'E':
968 n
= n
* 16 + 10 + c
- 'A';
970 case 'a': case 'b': case 'c': case 'd': case 'e':
972 n
= n
* 16 + 10 + c
- 'a';
1007 if (bufpos
>= bufmax
)
1009 bufmax
= 2 * bufmax
+ 10;
1010 buffer
= xrealloc (buffer
, bufmax
);
1012 buffer
[bufpos
++] = c
;
1014 if (bufpos
>= bufmax
)
1016 bufmax
= 2 * bufmax
+ 10;
1017 buffer
= xrealloc (buffer
, bufmax
);
1020 if (tp
->type
== token_type_string_literal
)
1021 tp
->string
= xstrdup (buffer
);
1027 int c2
= phase1_getc ();
1030 /* ?> and %> terminate PHP mode and switch back to HTML mode. */
1036 tp
->type
= token_type_other
;
1041 tp
->type
= token_type_lparen
;
1045 tp
->type
= token_type_rparen
;
1049 tp
->type
= token_type_comma
;
1054 int c2
= phase1_getc ();
1057 int c3
= phase1_getc ();
1060 /* Start of here document.
1061 Parse whitespace, then label, then newline. */
1064 while (c
== ' ' || c
== '\t' || c
== '\n' || c
== '\r');
1069 if (bufpos
>= bufmax
)
1071 bufmax
= 2 * bufmax
+ 10;
1072 buffer
= xrealloc (buffer
, bufmax
);
1074 buffer
[bufpos
++] = c
;
1077 while (c
!= EOF
&& c
!= '\n' && c
!= '\r');
1078 /* buffer[0..bufpos-1] now contains the label. */
1080 /* Now skip the here document. */
1086 if (c
== '\n' || c
== '\r')
1090 while (bufidx
< bufpos
)
1095 if (c
!= buffer
[bufidx
])
1105 if (c
== '\n' || c
== '\r')
1110 /* FIXME: Ideally we should turn the here document into a
1111 string literal if it didn't contain $ substitution. And
1112 we should also respect backslash escape sequences like
1113 in double-quoted strings. */
1114 tp
->type
= token_type_other
;
1120 /* < / script > terminates PHP mode and switches back to HTML mode. */
1122 while (c2
== ' ' || c2
== '\t' || c2
== '\n' || c2
== '\r')
1123 c2
= phase1_getc ();
1127 c2
= phase1_getc ();
1128 while (c2
== ' ' || c2
== '\t' || c2
== '\n' || c2
== '\r');
1129 if (c2
== 's' || c2
== 'S')
1131 c2
= phase1_getc ();
1132 if (c2
== 'c' || c2
== 'C')
1134 c2
= phase1_getc ();
1135 if (c2
== 'r' || c2
== 'R')
1137 c2
= phase1_getc ();
1138 if (c2
== 'i' || c2
== 'I')
1140 c2
= phase1_getc ();
1141 if (c2
== 'p' || c2
== 'P')
1143 c2
= phase1_getc ();
1144 if (c2
== 't' || c2
== 'T')
1147 c2
= phase1_getc ();
1148 while (c2
== ' ' || c2
== '\t'
1149 || c2
== '\n' || c2
== '\r');
1178 tp
->type
= token_type_other
;
1183 /* Execution operator. */
1185 /* We could carefully recognize each of the 2 and 3 character
1186 operators, but it is not necessary, as we only need to recognize
1187 gettext invocations. Don't bother. */
1188 tp
->type
= token_type_other
;
1195 /* ========================= Extracting strings. ========================== */
1198 /* Context lookup table.  Installed by extract_php from its flag_table
   argument; extract_parenthesized queries it through
   flag_context_list_table_lookup with the text of a symbol token.  */
1199 static flag_context_list_table_ty
*flag_context_list_table
;
1202 /* The file is broken into tokens. Scan the token stream, looking for
1203 a keyword, followed by a left paren, followed by a string. When we
1204 see this sequence, we have something to remember. We assume we are
1205 looking at a valid PHP program, and leave the complaints about
1206 the grammar to the PHP interpreter.
1208 Normal handling: Look for
1209 keyword ( ... msgid ... )
1210 Plural handling: Look for
1211 keyword ( ... msgid ... msgid_plural ... )
1213 We use recursion because the arguments before msgid or between msgid
1214 and msgid_plural can contain subexpressions of the same form. */
1217 /* Extract messages until the next balanced closing parenthesis.
1218 Extracted messages are added to MLP.
1219 When a specific argument shall be extracted, COMMAS_TO_SKIP >= 0 and,
1220 if also a plural argument shall be extracted, PLURAL_COMMAS > 0,
1221 otherwise PLURAL_COMMAS = 0.
1222 When no specific argument shall be extracted, COMMAS_TO_SKIP < 0.
1223 Return true upon eof, false upon closing parenthesis. */
1225 extract_parenthesized (message_list_ty
*mlp
,
1226 flag_context_ty outer_context
,
1227 flag_context_list_iterator_ty context_iter
,
1228 int commas_to_skip
, int plural_commas
)
1230 /* Remember the message containing the msgid, for msgid_plural. */
1231 message_ty
*plural_mp
= NULL
;
1233 /* 0 when no keyword has been seen. 1 right after a keyword is seen. */
1235 /* Parameters of the keyword just seen. Defined only in state 1. */
1236 int next_commas_to_skip
= -1;
1237 int next_plural_commas
= 0;
1238 /* Context iterator that will be used if the next token is a '('. */
1239 flag_context_list_iterator_ty next_context_iter
=
1240 passthrough_context_list_iterator
;
1241 /* Current context. */
1242 flag_context_ty inner_context
=
1243 inherited_context (outer_context
,
1244 flag_context_list_iterator_advance (&context_iter
));
1246 /* Start state is 0. */
1256 case token_type_symbol
:
1258 void *keyword_value
;
1260 if (find_entry (&keywords
, token
.string
, strlen (token
.string
),
1264 int argnum1
= (int) (long) keyword_value
& ((1 << 10) - 1);
1265 int argnum2
= (int) (long) keyword_value
>> 10;
1267 next_commas_to_skip
= argnum1
- 1;
1268 next_plural_commas
= (argnum2
> argnum1
? argnum2
- argnum1
: 0);
1275 flag_context_list_iterator (
1276 flag_context_list_table_lookup (
1277 flag_context_list_table
,
1278 token
.string
, strlen (token
.string
)));
1279 free (token
.string
);
1282 case token_type_lparen
:
1283 if (extract_parenthesized (mlp
, inner_context
, next_context_iter
,
1284 state
? next_commas_to_skip
: -1,
1285 state
? next_plural_commas
: 0))
1287 next_context_iter
= null_context_list_iterator
;
1291 case token_type_rparen
:
1294 case token_type_comma
:
1295 if (commas_to_skip
>= 0)
1297 if (commas_to_skip
> 0)
1300 if (plural_mp
!= NULL
&& plural_commas
> 0)
1302 commas_to_skip
= plural_commas
- 1;
1306 commas_to_skip
= -1;
1309 inherited_context (outer_context
,
1310 flag_context_list_iterator_advance (
1312 next_context_iter
= passthrough_context_list_iterator
;
1316 case token_type_string_literal
:
1319 pos
.file_name
= logical_file_name
;
1320 pos
.line_number
= token
.line_number
;
1323 remember_a_message (mlp
, token
.string
, inner_context
, &pos
);
1326 if (commas_to_skip
== 0)
1328 if (plural_mp
== NULL
)
1330 /* Seen an msgid. */
1332 remember_a_message (mlp
, token
.string
,
1333 inner_context
, &pos
);
1334 if (plural_commas
> 0)
1339 /* Seen an msgid_plural. */
1340 remember_a_message_plural (plural_mp
, token
.string
,
1341 inner_context
, &pos
);
1346 free (token
.string
);
1349 next_context_iter
= null_context_list_iterator
;
1353 case token_type_other
:
1354 next_context_iter
= null_context_list_iterator
;
1358 case token_type_eof
:
1369 extract_php (FILE *f
,
1370 const char *real_filename
, const char *logical_filename
,
1371 flag_context_list_table_ty
*flag_table
,
1372 msgdomain_list_ty
*mdlp
)
1374 message_list_ty
*mlp
= mdlp
->item
[0]->messages
;
1377 real_file_name
= real_filename
;
1378 logical_file_name
= xstrdup (logical_filename
);
1381 last_comment_line
= -1;
1382 last_non_comment_line
= -1;
1384 flag_context_list_table
= flag_table
;
1388 /* Initial mode is HTML mode, not PHP mode. */
1391 /* Eat tokens until eof is seen. When extract_parenthesized returns
1392 due to an unbalanced closing parenthesis, just restart it. */
1393 while (!extract_parenthesized (mlp
, null_context
, null_context_list_iterator
,
1397 /* Close scanner. */
1399 real_file_name
= NULL
;
1400 logical_file_name
= NULL
;