Sync usage with man page.
[netbsd-mini2440.git] / gnu / dist / gettext / gettext-tools / src / x-perl.c
blobb17315a5bca2b2aea4d4a7bc5314d0dbb10c8a61
1 /* xgettext Perl backend.
2 Copyright (C) 2002-2004 Free Software Foundation, Inc.
4 This file was written by Guido Flohr <guido@imperia.net>, 2002-2003.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software Foundation,
18 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
20 #ifdef HAVE_CONFIG_H
21 # include "config.h"
22 #endif
24 #include <errno.h>
25 #include <stdbool.h>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
30 #include "message.h"
31 #include "xgettext.h"
32 #include "x-perl.h"
33 #include "error.h"
34 #include "error-progname.h"
35 #include "xalloc.h"
36 #include "exit.h"
37 #include "po-charset.h"
38 #include "ucs4-utf8.h"
39 #include "uniname.h"
40 #include "getline.h"
41 #include "gettext.h"
43 #define _(s) gettext(s)
45 /* The Perl syntax is defined in perlsyn.pod. Try the command
46 "man perlsyn" or "perldoc perlsyn".
47 Also, the syntax after the 'sub' keyword is specified in perlsub.pod.
48 Try the command "man perlsub" or "perldoc perlsub". */
50 #define DEBUG_PERL 0
53 /* ====================== Keyword set customization. ====================== */
55 /* If true extract all strings. */
56 static bool extract_all = false;
58 static hash_table keywords;
59 static bool default_keywords = true;
62 void
63 x_perl_extract_all ()
65 extract_all = true;
69 void
70 x_perl_keyword (const char *name)
72 if (name == NULL)
73 default_keywords = false;
74 else
76 const char *end;
77 int argnum1;
78 int argnum2;
79 const char *colon;
81 if (keywords.table == NULL)
82 init_hash (&keywords, 100);
84 split_keywordspec (name, &end, &argnum1, &argnum2);
86 /* The characters between name and end should form a valid C identifier.
87 A colon means an invalid parse in split_keywordspec(). */
88 colon = strchr (name, ':');
89 if (colon == NULL || colon >= end)
91 if (argnum1 == 0)
92 argnum1 = 1;
93 insert_entry (&keywords, name, end - name,
94 (void *) (long) (argnum1 + (argnum2 << 10)));
99 /* Finish initializing the keywords hash table.
100 Called after argument processing, before each file is processed. */
101 static void
102 init_keywords ()
104 if (default_keywords)
106 x_perl_keyword ("gettext");
107 x_perl_keyword ("%gettext");
108 x_perl_keyword ("$gettext");
109 x_perl_keyword ("dgettext:2");
110 x_perl_keyword ("dcgettext:2");
111 x_perl_keyword ("ngettext:1,2");
112 x_perl_keyword ("dngettext:2,3");
113 x_perl_keyword ("dcngettext:2,3");
114 x_perl_keyword ("gettext_noop");
115 #if 0
116 x_perl_keyword ("__");
117 x_perl_keyword ("$__");
118 x_perl_keyword ("%__");
119 x_perl_keyword ("__x");
120 x_perl_keyword ("__n:1,2");
121 x_perl_keyword ("__nx:1,2");
122 x_perl_keyword ("__xn:1,2");
123 x_perl_keyword ("N__");
124 #endif
125 default_keywords = false;
129 void
130 init_flag_table_perl ()
132 xgettext_record_flag ("gettext:1:pass-perl-format");
133 xgettext_record_flag ("gettext:1:pass-perl-brace-format");
134 xgettext_record_flag ("%gettext:1:pass-perl-format");
135 xgettext_record_flag ("%gettext:1:pass-perl-brace-format");
136 xgettext_record_flag ("$gettext:1:pass-perl-format");
137 xgettext_record_flag ("$gettext:1:pass-perl-brace-format");
138 xgettext_record_flag ("dgettext:2:pass-perl-format");
139 xgettext_record_flag ("dgettext:2:pass-perl-brace-format");
140 xgettext_record_flag ("dcgettext:2:pass-perl-format");
141 xgettext_record_flag ("dcgettext:2:pass-perl-brace-format");
142 xgettext_record_flag ("ngettext:1:pass-perl-format");
143 xgettext_record_flag ("ngettext:2:pass-perl-format");
144 xgettext_record_flag ("ngettext:1:pass-perl-brace-format");
145 xgettext_record_flag ("ngettext:2:pass-perl-brace-format");
146 xgettext_record_flag ("dngettext:2:pass-perl-format");
147 xgettext_record_flag ("dngettext:3:pass-perl-format");
148 xgettext_record_flag ("dngettext:2:pass-perl-brace-format");
149 xgettext_record_flag ("dngettext:3:pass-perl-brace-format");
150 xgettext_record_flag ("dcngettext:2:pass-perl-format");
151 xgettext_record_flag ("dcngettext:3:pass-perl-format");
152 xgettext_record_flag ("dcngettext:2:pass-perl-brace-format");
153 xgettext_record_flag ("dcngettext:3:pass-perl-brace-format");
154 xgettext_record_flag ("gettext_noop:1:pass-perl-format");
155 xgettext_record_flag ("gettext_noop:1:pass-perl-brace-format");
156 xgettext_record_flag ("printf:1:perl-format"); /* argument 1 or 2 ?? */
157 xgettext_record_flag ("sprintf:1:perl-format");
158 #if 0
159 xgettext_record_flag ("__:1:pass-perl-format");
160 xgettext_record_flag ("__:1:pass-perl-brace-format");
161 xgettext_record_flag ("%__:1:pass-perl-format");
162 xgettext_record_flag ("%__:1:pass-perl-brace-format");
163 xgettext_record_flag ("$__:1:pass-perl-format");
164 xgettext_record_flag ("$__:1:pass-perl-brace-format");
165 xgettext_record_flag ("__x:1:perl-brace-format");
166 xgettext_record_flag ("__n:1:pass-perl-format");
167 xgettext_record_flag ("__n:2:pass-perl-format");
168 xgettext_record_flag ("__n:1:pass-perl-brace-format");
169 xgettext_record_flag ("__n:2:pass-perl-brace-format");
170 xgettext_record_flag ("__nx:1:perl-brace-format");
171 xgettext_record_flag ("__nx:2:perl-brace-format");
172 xgettext_record_flag ("__xn:1:perl-brace-format");
173 xgettext_record_flag ("__xn:2:perl-brace-format");
174 xgettext_record_flag ("N__:1:pass-perl-format");
175 xgettext_record_flag ("N__:1:pass-perl-brace-format");
176 #endif
180 /* ======================== Reading of characters. ======================== */
182 /* Real filename, used in error messages about the input file. */
183 static const char *real_file_name;
185 /* Logical filename and line number, used to label the extracted messages. */
186 static char *logical_file_name;
187 static int line_number;
189 /* The input file stream. */
190 static FILE *fp;
192 /* The current line buffer. */
193 static char *linebuf;
195 /* The size of the current line. */
196 static int linesize;
198 /* The position in the current line. */
199 static int linepos;
201 /* The size of the input buffer. */
202 static size_t linebuf_size;
204 /* Number of lines eaten for here documents. */
205 static int here_eaten;
207 /* Paranoia: EOF marker for __END__ or __DATA__. */
208 static bool end_of_file;
211 /* 1. line_number handling. */
213 /* Returns the next character from the input stream or EOF. */
214 static int
215 phase1_getc ()
217 line_number += here_eaten;
218 here_eaten = 0;
220 if (end_of_file)
221 return EOF;
223 if (linepos >= linesize)
225 linesize = getline (&linebuf, &linebuf_size, fp);
227 if (linesize < 0)
229 if (ferror (fp))
230 error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
231 real_file_name);
232 end_of_file = true;
233 return EOF;
236 linepos = 0;
237 ++line_number;
239 /* Undosify. This is important for catching the end of <<EOF and
240 <<'EOF'. We could rely on stdio doing this for us but you
241 it is not uncommon to to come across Perl scripts with CRLF
242 newline conventions on systems that do not follow this
243 convention. */
244 if (linesize >= 2 && linebuf[linesize - 1] == '\n'
245 && linebuf[linesize - 2] == '\r')
247 linebuf[linesize - 2] = '\n';
248 linebuf[linesize - 1] = '\0';
249 --linesize;
253 return linebuf[linepos++];
256 /* Supports only one pushback character. */
257 static void
258 phase1_ungetc (int c)
260 if (c != EOF)
262 if (linepos == 0)
263 /* Attempt to ungetc across line boundary. Shouldn't happen.
264 No two phase1_ungetc calls are permitted in a row. */
265 abort ();
267 --linepos;
271 /* Read a here document and return its contents.
272 The delimiter is an UTF-8 encoded string; the resulting string is UTF-8
273 encoded as well. */
275 static char *
276 get_here_document (const char *delimiter)
278 /* Accumulator for the entire here document, including a NUL byte
279 at the end. */
280 static char *buffer;
281 static size_t bufmax = 0;
282 size_t bufpos = 0;
283 /* Current line being appended. */
284 static char *my_linebuf = NULL;
285 static size_t my_linebuf_size = 0;
287 /* Allocate the initial buffer. Later on, bufmax > 0. */
288 if (bufmax == 0)
290 buffer = xrealloc (NULL, 1);
291 buffer[0] = '\0';
292 bufmax = 1;
295 for (;;)
297 int read_bytes = getline (&my_linebuf, &my_linebuf_size, fp);
298 char *my_line_utf8;
299 bool chomp;
301 if (read_bytes < 0)
303 if (ferror (fp))
305 error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
306 real_file_name);
308 else
310 error_with_progname = false;
311 error (EXIT_SUCCESS, 0, _("\
312 %s:%d: can't find string terminator \"%s\" anywhere before EOF"),
313 real_file_name, line_number, delimiter);
314 error_with_progname = true;
316 break;
320 ++here_eaten;
322 /* Convert to UTF-8. */
323 my_line_utf8 =
324 from_current_source_encoding (my_linebuf, logical_file_name,
325 line_number + here_eaten);
326 if (my_line_utf8 != my_linebuf)
328 if (strlen (my_line_utf8) >= my_linebuf_size)
330 my_linebuf_size = strlen (my_line_utf8) + 1;
331 my_linebuf = xrealloc (my_linebuf, my_linebuf_size);
333 strcpy (my_linebuf, my_line_utf8);
334 free (my_line_utf8);
337 /* Undosify. This is important for catching the end of <<EOF and
338 <<'EOF'. We could rely on stdio doing this for us but you
339 it is not uncommon to to come across Perl scripts with CRLF
340 newline conventions on systems that do not follow this
341 convention. */
342 if (read_bytes >= 2 && my_linebuf[read_bytes - 1] == '\n'
343 && my_linebuf[read_bytes - 2] == '\r')
345 my_linebuf[read_bytes - 2] = '\n';
346 my_linebuf[read_bytes - 1] = '\0';
347 --read_bytes;
350 /* Temporarily remove the trailing newline from my_linebuf. */
351 chomp = false;
352 if (read_bytes >= 1 && my_linebuf[read_bytes - 1] == '\n')
354 chomp = true;
355 my_linebuf[read_bytes - 1] = '\0';
358 /* See whether this line terminates the here document. */
359 if (strcmp (my_linebuf, delimiter) == 0)
360 break;
362 /* Add back the trailing newline to my_linebuf. */
363 if (chomp)
364 my_linebuf[read_bytes - 1] = '\n';
366 /* Ensure room for read_bytes + 1 bytes. */
367 if (bufpos + read_bytes >= bufmax)
370 bufmax = 2 * bufmax + 10;
371 while (bufpos + read_bytes >= bufmax);
372 buffer = xrealloc (buffer, bufmax);
374 /* Append this line to the accumulator. */
375 strcpy (buffer + bufpos, my_linebuf);
376 bufpos += read_bytes;
379 /* Done accumulating the here document. */
380 return xstrdup (buffer);
383 /* Skips pod sections. */
384 static void
385 skip_pod ()
387 line_number += here_eaten;
388 here_eaten = 0;
389 linepos = 0;
391 for (;;)
393 linesize = getline (&linebuf, &linebuf_size, fp);
395 if (linesize < 0)
397 if (ferror (fp))
398 error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
399 real_file_name);
400 return;
403 ++line_number;
405 if (strncmp ("=cut", linebuf, 4) == 0)
407 /* Force reading of a new line on next call to phase1_getc(). */
408 linepos = linesize;
409 return;
415 /* These are for tracking whether comments count as immediately before
416 keyword. */
417 static int last_comment_line;
418 static int last_non_comment_line;
421 /* 2. Replace each comment that is not inside a string literal or regular
422 expression with a newline character. We need to remember the comment
423 for later, because it may be attached to a keyword string. */
425 static int
426 phase2_getc ()
428 static char *buffer;
429 static size_t bufmax;
430 size_t buflen;
431 int lineno;
432 int c;
433 char *utf8_string;
435 c = phase1_getc ();
436 if (c == '#')
438 buflen = 0;
439 lineno = line_number;
440 /* Skip leading whitespace. */
441 for (;;)
443 c = phase1_getc ();
444 if (c == EOF)
445 break;
446 if (c != ' ' && c != '\t' && c != '\r' && c != '\f')
448 phase1_ungetc (c);
449 break;
452 /* Accumulate the comment. */
453 for (;;)
455 c = phase1_getc ();
456 if (c == '\n' || c == EOF)
457 break;
458 if (buflen >= bufmax)
460 bufmax = 2 * bufmax + 10;
461 buffer = xrealloc (buffer, bufmax);
463 buffer[buflen++] = c;
465 if (buflen >= bufmax)
467 bufmax = 2 * bufmax + 10;
468 buffer = xrealloc (buffer, bufmax);
470 buffer[buflen] = '\0';
471 /* Convert it to UTF-8. */
472 utf8_string =
473 from_current_source_encoding (buffer, logical_file_name, lineno);
474 /* Save it until we encounter the corresponding string. */
475 xgettext_current_source_encoding = po_charset_utf8;
476 xgettext_comment_add (utf8_string);
477 xgettext_current_source_encoding = xgettext_global_source_encoding;
478 last_comment_line = lineno;
480 return c;
483 /* Supports only one pushback character. */
484 static void
485 phase2_ungetc (int c)
487 if (c != EOF)
488 phase1_ungetc (c);
491 /* Whitespace recognition. */
493 #define case_whitespace \
494 case ' ': case '\t': case '\r': case '\n': case '\f'
496 static inline bool
497 is_whitespace (int c)
499 return (c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\f');
503 /* ========================== Reading of tokens. ========================== */
506 enum token_type_ty
508 token_type_eof,
509 token_type_lparen, /* ( */
510 token_type_rparen, /* ) */
511 token_type_comma, /* , */
512 token_type_fat_comma, /* => */
513 token_type_dereference, /* , */
514 token_type_semicolon, /* ; */
515 token_type_lbrace, /* { */
516 token_type_rbrace, /* } */
517 token_type_lbracket, /* [ */
518 token_type_rbracket, /* ] */
519 token_type_string, /* quote-like */
520 token_type_named_op, /* if, unless, while, ... */
521 token_type_variable, /* $... */
522 token_type_symbol, /* symbol, number */
523 token_type_regex_op, /* s, tr, y, m. */
524 token_type_dot, /* . */
525 token_type_other, /* regexp, misc. operator */
526 /* The following are not really token types, but variants used by
527 the parser. */
528 token_type_keyword_symbol /* keyword symbol */
530 typedef enum token_type_ty token_type_ty;
532 /* Subtypes for strings, important for interpolation. */
533 enum string_type_ty
535 string_type_verbatim, /* "<<'EOF'", "m'...'", "s'...''...'",
536 "tr/.../.../", "y/.../.../". */
537 string_type_q, /* "'..'", "q/.../". */
538 string_type_qq, /* '"..."', "`...`", "qq/.../", "qx/.../",
539 "<file*glob>". */
540 string_type_qr /* Not supported. */
543 /* Subtypes for symbols, important for dollar interpretation. */
544 enum symbol_type_ty
546 symbol_type_none, /* Nothing special. */
547 symbol_type_sub, /* 'sub'. */
548 symbol_type_function /* Function name after 'sub'. */
551 typedef struct token_ty token_ty;
552 struct token_ty
554 token_type_ty type;
555 int sub_type; /* for token_type_string, token_type_symbol */
556 char *string; /* for: in encoding:
557 token_type_named_op ASCII
558 token_type_string UTF-8
559 token_type_symbol ASCII
560 token_type_variable global_source_encoding
562 int line_number;
565 #if DEBUG_PERL
566 static const char *
567 token2string (const token_ty *token)
569 switch (token->type)
571 case token_type_eof:
572 return "token_type_eof";
573 case token_type_lparen:
574 return "token_type_lparen";
575 case token_type_rparen:
576 return "token_type_rparen";
577 case token_type_comma:
578 return "token_type_comma";
579 case token_type_fat_comma:
580 return "token_type_fat_comma";
581 case token_type_dereference:
582 return "token_type_dereference";
583 case token_type_semicolon:
584 return "token_type_semicolon";
585 case token_type_lbrace:
586 return "token_type_lbrace";
587 case token_type_rbrace:
588 return "token_type_rbrace";
589 case token_type_lbracket:
590 return "token_type_lbracket";
591 case token_type_rbracket:
592 return "token_type_rbracket";
593 case token_type_string:
594 return "token_type_string";
595 case token_type_named_op:
596 return "token_type_named_op";
597 case token_type_variable:
598 return "token_type_variable";
599 case token_type_symbol:
600 return "token_type_symbol";
601 case token_type_regex_op:
602 return "token_type_regex_op";
603 case token_type_dot:
604 return "token_type_dot";
605 case token_type_other:
606 return "token_type_other";
607 default:
608 return "unknown";
611 #endif
613 /* Free the memory pointed to by a 'struct token_ty'. */
614 static inline void
615 free_token (token_ty *tp)
617 switch (tp->type)
619 case token_type_named_op:
620 case token_type_string:
621 case token_type_symbol:
622 case token_type_variable:
623 free (tp->string);
624 break;
625 default:
626 break;
628 free (tp);
631 /* Pass 1 of extracting quotes: Find the end of the string, regardless
632 of the semantics of the construct. Return the complete string,
633 including the starting and the trailing delimiter, with backslashes
634 removed where appropriate. */
635 static char *
636 extract_quotelike_pass1 (int delim)
638 /* This function is called recursively. No way to allocate stuff
639 statically. Also alloca() is inappropriate due to limited stack
640 size on some platforms. So we use malloc(). */
641 int bufmax = 10;
642 char *buffer = (char *) xmalloc (bufmax);
643 int bufpos = 0;
644 bool nested = true;
645 int counter_delim;
647 buffer[bufpos++] = delim;
649 /* Find the closing delimiter. */
650 switch (delim)
652 case '(':
653 counter_delim = ')';
654 break;
655 case '{':
656 counter_delim = '}';
657 break;
658 case '[':
659 counter_delim = ']';
660 break;
661 case '<':
662 counter_delim = '>';
663 break;
664 default: /* "..." or '...' or |...| etc. */
665 nested = false;
666 counter_delim = delim;
667 break;
670 for (;;)
672 int c = phase1_getc ();
674 /* This round can produce 1 or 2 bytes. Ensure room for 2 bytes. */
675 if (bufpos + 2 > bufmax)
677 bufmax = 2 * bufmax + 10;
678 buffer = xrealloc (buffer, bufmax);
681 if (c == counter_delim || c == EOF)
683 buffer[bufpos++] = counter_delim; /* will be stripped off later */
684 buffer[bufpos++] = '\0';
685 #if DEBUG_PERL
686 fprintf (stderr, "PASS1: %s\n", buffer);
687 #endif
688 return buffer;
691 if (nested && c == delim)
693 char *inner = extract_quotelike_pass1 (delim);
694 size_t len = strlen (inner);
696 /* Ensure room for len + 1 bytes. */
697 if (bufpos + len >= bufmax)
700 bufmax = 2 * bufmax + 10;
701 while (bufpos + len >= bufmax);
702 buffer = xrealloc (buffer, bufmax);
704 strcpy (buffer + bufpos, inner);
705 free (inner);
706 bufpos += len;
708 else if (c == '\\')
710 c = phase1_getc ();
711 if (c == '\\')
713 buffer[bufpos++] = '\\';
714 buffer[bufpos++] = '\\';
716 else if (c == delim || c == counter_delim)
718 /* This is pass2 in Perl. */
719 buffer[bufpos++] = c;
721 else
723 buffer[bufpos++] = '\\';
724 phase1_ungetc (c);
727 else
729 buffer[bufpos++] = c;
734 /* Like extract_quotelike_pass1, but return the complete string in UTF-8
735 encoding. */
736 static char *
737 extract_quotelike_pass1_utf8 (int delim)
739 char *string = extract_quotelike_pass1 (delim);
740 char *utf8_string =
741 from_current_source_encoding (string, logical_file_name, line_number);
742 if (utf8_string != string)
743 free (string);
744 return utf8_string;
748 /* ========= Reading of tokens and commands. Extracting strings. ========= */
751 /* There is an ambiguity about '/': It can start a division operator ('/' or
752 '/=') or it can start a regular expression. The distinction is important
753 because inside regular expressions, '#' loses its special meaning.
754 The distinction is possible depending on the parsing state: After a
755 variable or simple expression, it's a division operator; at the beginning
756 of an expression, it's a regexp. */
757 static bool prefer_division_over_regexp;
759 /* Context lookup table. */
760 static flag_context_list_table_ty *flag_context_list_table;
763 /* Forward declaration of local functions. */
764 static void interpolate_keywords (message_list_ty *mlp, const char *string,
765 int lineno);
766 static token_ty *x_perl_lex (message_list_ty *mlp);
767 static void x_perl_unlex (token_ty *tp);
768 static bool extract_balanced (message_list_ty *mlp, int state,
769 token_type_ty delim,
770 flag_context_ty outer_context,
771 flag_context_list_iterator_ty context_iter,
772 int arg_sg, int arg_pl);
775 /* Extract an unsigned hexadecimal number from STRING, considering at
776 most LEN bytes and place the result in *RESULT. Returns a pointer
777 to the first character past the hexadecimal number. */
778 static const char *
779 extract_hex (const char *string, size_t len, unsigned int *result)
781 size_t i;
783 *result = 0;
785 for (i = 0; i < len; i++)
787 char c = string[i];
788 int number;
790 if (c >= 'A' && c <= 'F')
791 number = c - 'A' + 10;
792 else if (c >= 'a' && c <= 'f')
793 number = c - 'a' + 10;
794 else if (c >= '0' && c <= '9')
795 number = c - '0';
796 else
797 break;
799 *result <<= 4;
800 *result |= number;
803 return string + i;
806 /* Extract an unsigned octal number from STRING, considering at
807 most LEN bytes and place the result in *RESULT. Returns a pointer
808 to the first character past the octal number. */
809 static const char *
810 extract_oct (const char *string, size_t len, unsigned int *result)
812 size_t i;
814 *result = 0;
816 for (i = 0; i < len; i++)
818 char c = string[i];
819 int number;
821 if (c >= '0' && c <= '7')
822 number = c - '0';
823 else
824 break;
826 *result <<= 3;
827 *result |= number;
830 return string + i;
833 /* Extract the various quotelike constructs except for <<EOF. See the
834 section "Gory details of parsing quoted constructs" in perlop.pod.
835 Return the resulting token in *tp; tp->type == token_type_string. */
836 static void
837 extract_quotelike (token_ty *tp, int delim)
839 char *string = extract_quotelike_pass1_utf8 (delim);
840 size_t len = strlen (string);
842 tp->type = token_type_string;
843 /* Take the string without the delimiters at the start and at the end. */
844 if (!(len >= 2))
845 abort ();
846 string[len - 1] = '\0';
847 tp->string = xstrdup (string + 1);
848 free (string);
851 /* Extract the quotelike constructs with double delimiters, like
852 s/[SEARCH]/[REPLACE]/. This function does not eat up trailing
853 modifiers (left to the caller).
854 Return the resulting token in *tp; tp->type == token_type_regex_op. */
855 static void
856 extract_triple_quotelike (message_list_ty *mlp, token_ty *tp, int delim,
857 bool interpolate)
859 char *string;
861 tp->type = token_type_regex_op;
863 string = extract_quotelike_pass1_utf8 (delim);
864 if (interpolate)
865 interpolate_keywords (mlp, string, line_number);
866 free (string);
868 if (delim == '(' || delim == '<' || delim == '{' || delim == '[')
870 /* The delimiter for the second string can be different, e.g.
871 s{SEARCH}{REPLACE} or s{SEARCH}/REPLACE/. See "man perlrequick". */
872 delim = phase1_getc ();
873 while (is_whitespace (delim))
875 /* The hash-sign is not a valid delimiter after whitespace, ergo
876 use phase2_getc() and not phase1_getc() now. */
877 delim = phase2_getc ();
880 string = extract_quotelike_pass1_utf8 (delim);
881 if (interpolate)
882 interpolate_keywords (mlp, string, line_number);
883 free (string);
886 /* Perform pass 3 of quotelike extraction (interpolation).
887 *tp is a token of type token_type_string.
888 This function replaces tp->string. */
889 /* FIXME: Currently may writes null-bytes into the string. */
890 static void
891 extract_quotelike_pass3 (token_ty *tp, int error_level)
893 static char *buffer;
894 static int bufmax = 0;
895 int bufpos = 0;
896 const char *crs;
897 bool uppercase;
898 bool lowercase;
899 bool quotemeta;
901 #if DEBUG_PERL
902 switch (tp->sub_type)
904 case string_type_verbatim:
905 fprintf (stderr, "Interpolating string_type_verbatim:\n");
906 break;
907 case string_type_q:
908 fprintf (stderr, "Interpolating string_type_q:\n");
909 break;
910 case string_type_qq:
911 fprintf (stderr, "Interpolating string_type_qq:\n");
912 break;
913 case string_type_qr:
914 fprintf (stderr, "Interpolating string_type_qr:\n");
915 break;
917 fprintf (stderr, "%s\n", tp->string);
918 if (tp->sub_type == string_type_verbatim)
919 fprintf (stderr, "---> %s\n", tp->string);
920 #endif
922 if (tp->sub_type == string_type_verbatim)
923 return;
925 /* Loop over tp->string, accumulating the expansion in buffer. */
926 crs = tp->string;
927 uppercase = false;
928 lowercase = false;
929 quotemeta = false;
930 while (*crs)
932 bool backslashed;
934 /* Ensure room for 7 bytes, 6 (multi-)bytes plus a leading backslash
935 if \Q modifier is present. */
936 if (bufpos + 7 > bufmax)
938 bufmax = 2 * bufmax + 10;
939 buffer = xrealloc (buffer, bufmax);
942 if (tp->sub_type == string_type_q)
944 switch (*crs)
946 case '\\':
947 if (crs[1] == '\\')
949 crs += 2;
950 buffer[bufpos++] = '\\';
951 break;
953 /* FALLTHROUGH */
954 default:
955 buffer[bufpos++] = *crs++;
956 break;
958 continue;
961 /* We only get here for double-quoted strings or regular expressions.
962 Unescape escape sequences. */
963 if (*crs == '\\')
965 switch (crs[1])
967 case 't':
968 crs += 2;
969 buffer[bufpos++] = '\t';
970 continue;
971 case 'n':
972 crs += 2;
973 buffer[bufpos++] = '\n';
974 continue;
975 case 'r':
976 crs += 2;
977 buffer[bufpos++] = '\r';
978 continue;
979 case 'f':
980 crs += 2;
981 buffer[bufpos++] = '\f';
982 continue;
983 case 'b':
984 crs += 2;
985 buffer[bufpos++] = '\b';
986 continue;
987 case 'a':
988 crs += 2;
989 buffer[bufpos++] = '\a';
990 continue;
991 case 'e':
992 crs += 2;
993 buffer[bufpos++] = 0x1b;
994 continue;
995 case '0': case '1': case '2': case '3':
996 case '4': case '5': case '6': case '7':
998 unsigned int oct_number;
999 int length;
1001 crs = extract_oct (crs + 1, 3, &oct_number);
1003 /* FIXME: If one of the variables UPPERCASE or LOWERCASE is
1004 true, the character should be converted to its uppercase
1005 resp. lowercase equivalent. I don't know if the necessary
1006 facilities are already included in gettext. For US-Ascii
1007 the conversion can be already be done, however. */
1008 if (uppercase && oct_number >= 'a' && oct_number <= 'z')
1010 oct_number = oct_number - 'a' + 'A';
1012 else if (lowercase && oct_number >= 'A' && oct_number <= 'Z')
1014 oct_number = oct_number - 'A' + 'a';
1018 /* Yes, octal escape sequences in the range 0x100..0x1ff are
1019 valid. */
1020 length = u8_uctomb ((unsigned char *) (buffer + bufpos),
1021 oct_number, 2);
1022 if (length > 0)
1023 bufpos += length;
1025 continue;
1026 case 'x':
1028 unsigned int hex_number = 0;
1029 int length;
1031 crs += 2;
1032 if (*crs == '{')
1034 const char *end = strchr (crs, '}');
1035 if (end == NULL)
1037 error_with_progname = false;
1038 error (error_level, 0, _("\
1039 %s:%d: missing right brace on \\x{HEXNUMBER}"), real_file_name, line_number);
1040 error_with_progname = true;
1041 ++crs;
1042 continue;
1044 else
1046 ++crs;
1047 (void) extract_hex (crs, end - crs, &hex_number);
1048 crs = end + 1;
1051 else
1053 crs = extract_hex (crs, 2, &hex_number);
1056 /* FIXME: If one of the variables UPPERCASE or LOWERCASE is
1057 true, the character should be converted to its uppercase
1058 resp. lowercase equivalent. I don't know if the necessary
1059 facilities are already included in gettext. For US-Ascii
1060 the conversion can be already be done, however. */
1061 if (uppercase && hex_number >= 'a' && hex_number <= 'z')
1063 hex_number = hex_number - 'a' + 'A';
1065 else if (lowercase && hex_number >= 'A' && hex_number <= 'Z')
1067 hex_number = hex_number - 'A' + 'a';
1070 length = u8_uctomb ((unsigned char *) (buffer + bufpos),
1071 hex_number, 6);
1073 if (length > 0)
1074 bufpos += length;
1076 continue;
1077 case 'c':
1078 /* Perl's notion of control characters. */
1079 crs += 2;
1080 if (*crs)
1082 int the_char = (unsigned char) *crs;
1083 if (the_char >= 'a' || the_char <= 'z')
1084 the_char = the_char - 'a' + 'A';
1085 buffer[bufpos++] = the_char ^ 0x40;
1087 continue;
1088 case 'N':
1089 crs += 2;
1090 if (*crs == '{')
1092 const char *end = strchr (crs + 1, '}');
1093 if (end != NULL)
1095 char *name;
1096 unsigned int unicode;
1098 name = (char *) xmalloc (end - (crs + 1) + 1);
1099 memcpy (name, crs + 1, end - (crs + 1));
1100 name[end - (crs + 1)] = '\0';
1102 unicode = unicode_name_character (name);
1103 if (unicode != UNINAME_INVALID)
1105 /* FIXME: Convert to upper/lowercase if the
1106 corresponding flag is set to true. */
1107 int length =
1108 u8_uctomb ((unsigned char *) (buffer + bufpos),
1109 unicode, 6);
1110 if (length > 0)
1111 bufpos += length;
1114 free (name);
1116 crs = end + 1;
1119 continue;
1123 /* No escape sequence, go on. */
1124 if (*crs == '\\')
1126 ++crs;
1127 switch (*crs)
1129 case 'E':
1130 uppercase = false;
1131 lowercase = false;
1132 quotemeta = false;
1133 ++crs;
1134 continue;
1135 case 'L':
1136 uppercase = false;
1137 lowercase = true;
1138 ++crs;
1139 continue;
1140 case 'U':
1141 uppercase = true;
1142 lowercase = false;
1143 ++crs;
1144 continue;
1145 case 'Q':
1146 quotemeta = true;
1147 ++crs;
1148 continue;
1149 case 'l':
1150 ++crs;
1151 if (*crs >= 'A' && *crs <= 'Z')
1153 buffer[bufpos++] = *crs - 'A' + 'a';
1155 else if ((unsigned char) *crs >= 0x80)
1157 error_with_progname = false;
1158 error (error_level, 0, _("\
1159 %s:%d: invalid interpolation (\"\\l\") of 8bit character \"%c\""),
1160 real_file_name, line_number, *crs);
1161 error_with_progname = true;
1163 else
1165 buffer[bufpos++] = *crs;
1167 ++crs;
1168 continue;
1169 case 'u':
1170 ++crs;
1171 if (*crs >= 'a' && *crs <= 'z')
1173 buffer[bufpos++] = *crs - 'a' + 'A';
1175 else if ((unsigned char) *crs >= 0x80)
1177 error_with_progname = false;
1178 error (error_level, 0, _("\
1179 %s:%d: invalid interpolation (\"\\u\") of 8bit character \"%c\""),
1180 real_file_name, line_number, *crs);
1181 error_with_progname = true;
1183 else
1185 buffer[bufpos++] = *crs;
1187 ++crs;
1188 continue;
1189 case '\\':
1190 buffer[bufpos++] = *crs;
1191 ++crs;
1192 continue;
1193 default:
1194 backslashed = true;
1195 break;
1198 else
1199 backslashed = false;
1201 if (quotemeta
1202 && !((*crs >= 'A' && *crs <= 'Z') || (*crs >= 'A' && *crs <= 'z')
1203 || (*crs >= '0' && *crs <= '9') || *crs == '_'))
1205 buffer[bufpos++] = '\\';
1206 backslashed = true;
1209 if (!backslashed && !extract_all && (*crs == '$' || *crs == '@'))
1211 error_with_progname = false;
1212 error (error_level, 0, _("\
1213 %s:%d: invalid variable interpolation at \"%c\""),
1214 real_file_name, line_number, *crs);
1215 error_with_progname = true;
1216 ++crs;
1218 else if (lowercase)
1220 if (*crs >= 'A' && *crs <= 'Z')
1221 buffer[bufpos++] = *crs - 'A' + 'a';
1222 else if ((unsigned char) *crs >= 0x80)
1224 error_with_progname = false;
1225 error (error_level, 0, _("\
1226 %s:%d: invalid interpolation (\"\\L\") of 8bit character \"%c\""),
1227 real_file_name, line_number, *crs);
1228 error_with_progname = true;
1229 buffer[bufpos++] = *crs;
1231 else
1232 buffer[bufpos++] = *crs;
1233 ++crs;
1235 else if (uppercase)
1237 if (*crs >= 'a' && *crs <= 'z')
1238 buffer[bufpos++] = *crs - 'a' + 'A';
1239 else if ((unsigned char) *crs >= 0x80)
1241 error_with_progname = false;
1242 error (error_level, 0, _("\
1243 %s:%d: invalid interpolation (\"\\U\") of 8bit character \"%c\""),
1244 real_file_name, line_number, *crs);
1245 error_with_progname = true;
1246 buffer[bufpos++] = *crs;
1248 else
1249 buffer[bufpos++] = *crs;
1250 ++crs;
1252 else
1254 buffer[bufpos++] = *crs++;
1258 /* Ensure room for 1 more byte. */
1259 if (bufpos >= bufmax)
1261 bufmax = 2 * bufmax + 10;
1262 buffer = xrealloc (buffer, bufmax);
1265 buffer[bufpos++] = '\0';
1267 #if DEBUG_PERL
1268 fprintf (stderr, "---> %s\n", buffer);
1269 #endif
1271 /* Replace tp->string. */
1272 free (tp->string);
1273 tp->string = xstrdup (buffer);
1276 /* Parse a variable. This is done in several steps:
1277 1) Consume all leading occurencies of '$', '@', '%', and '*'.
1278 2) Determine the name of the variable from the following input.
1279 3) Parse possible following hash keys or array indexes.
1281 static void
1282 extract_variable (message_list_ty *mlp, token_ty *tp, int first)
1284 static char *buffer;
1285 static int bufmax = 0;
1286 int bufpos = 0;
1287 int c = first;
1288 size_t varbody_length = 0;
1289 bool maybe_hash_deref = false;
1290 bool maybe_hash_value = false;
1292 tp->type = token_type_variable;
1294 #if DEBUG_PERL
1295 fprintf (stderr, "%s:%d: extracting variable type '%c'\n",
1296 real_file_name, line_number, first);
1297 #endif
1300 * 1) Consume dollars and so on (not euros ...). Unconditionally
1301 * accepting the hash sign (#) will maybe lead to inaccurate
1302 * results. FIXME!
1304 while (c == '$' || c == '*' || c == '#' || c == '@' || c == '%')
1306 if (bufpos >= bufmax)
1308 bufmax = 2 * bufmax + 10;
1309 buffer = xrealloc (buffer, bufmax);
1311 buffer[bufpos++] = c;
1312 c = phase1_getc ();
1315 if (c == EOF)
1317 tp->type = token_type_eof;
1318 return;
1321 /* Hash references are treated in a special way, when looking for
1322 our keywords. */
1323 if (buffer[0] == '$')
1325 if (bufpos == 1)
1326 maybe_hash_value = true;
1327 else if (bufpos == 2 && buffer[1] == '$')
1329 if (!(c == '{'
1330 || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
1331 || (c >= '0' && c <= '9')
1332 || c == '_' || c == ':' || c == '\'' || c >= 0x80))
1334 /* Special variable $$ for pid. */
1335 if (bufpos >= bufmax)
1337 bufmax = 2 * bufmax + 10;
1338 buffer = xrealloc (buffer, bufmax);
1340 buffer[bufpos++] = '\0';
1341 tp->string = xstrdup (buffer);
1342 #if DEBUG_PERL
1343 fprintf (stderr, "%s:%d: is PID ($$)\n",
1344 real_file_name, line_number);
1345 #endif
1347 phase1_ungetc (c);
1348 return;
1351 maybe_hash_deref = true;
1352 bufpos = 1;
1357 * 2) Get the name of the variable. The first character is practically
1358 * arbitrary. Punctuation and numbers automagically put a variable
1359 * in the global namespace but that subtle difference is not interesting
1360 * for us.
1362 if (bufpos >= bufmax)
1364 bufmax = 2 * bufmax + 10;
1365 buffer = xrealloc (buffer, bufmax);
1367 if (c == '{')
1369 /* Yuck, we cannot accept ${gettext} as a keyword... Except for
1370 * debugging purposes it is also harmless, that we suppress the
1371 * real name of the variable.
1373 #if DEBUG_PERL
1374 fprintf (stderr, "%s:%d: braced {variable_name}\n",
1375 real_file_name, line_number);
1376 #endif
1378 if (extract_balanced (mlp, 0, token_type_rbrace,
1379 null_context, null_context_list_iterator, -1, -1))
1380 return;
1381 buffer[bufpos++] = c;
1383 else
1385 while ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
1386 || (c >= '0' && c <= '9')
1387 || c == '_' || c == ':' || c == '\'' || c >= 0x80)
1389 ++varbody_length;
1390 if (bufpos >= bufmax)
1392 bufmax = 2 * bufmax + 10;
1393 buffer = xrealloc (buffer, bufmax);
1395 buffer[bufpos++] = c;
1396 c = phase1_getc ();
1398 phase1_ungetc (c);
1401 /* Probably some strange Perl variable like $`. */
1402 if (varbody_length == 0)
1404 c = phase1_getc ();
1405 if (c == EOF || is_whitespace (c))
1406 phase1_ungetc (c); /* Loser. */
1407 else
1409 if (bufpos >= bufmax)
1411 bufmax = 2 * bufmax + 10;
1412 buffer = xrealloc (buffer, bufmax);
1414 buffer[bufpos++] = c;
1418 if (bufpos >= bufmax)
1420 bufmax = 2 * bufmax + 10;
1421 buffer = xrealloc (buffer, bufmax);
1423 buffer[bufpos++] = '\0';
1425 tp->string = xstrdup (buffer);
1427 #if DEBUG_PERL
1428 fprintf (stderr, "%s:%d: complete variable name: %s\n",
1429 real_file_name, line_number, tp->string);
1430 #endif
1432 prefer_division_over_regexp = true;
1435 * 3) If the following looks strange to you, this is valid Perl syntax:
1437 * $var = $$hashref # We can place a
1438 * # comment here and then ...
1439 * {key_into_hashref};
1441 * POD sections are not allowed but we leave complaints about
1442 * that to the compiler/interpreter.
1444 /* We only extract strings from the first hash key (if present). */
1446 if (maybe_hash_deref || maybe_hash_value)
1448 bool is_dereference = false;
1449 int c;
1452 c = phase2_getc ();
1453 while (is_whitespace (c));
1455 if (c == '-')
1457 int c2 = phase1_getc ();
1459 if (c2 == '>')
1461 is_dereference = true;
1464 c = phase2_getc ();
1465 while (is_whitespace (c));
1467 else if (c2 != '\n')
1469 /* Discarding the newline is harmless here. The only
1470 special character recognized after a minus is greater-than
1471 for dereference. However, the sequence "-\n>" that we
1472 treat incorrectly here, is a syntax error. */
1473 phase1_ungetc (c2);
1477 if (maybe_hash_value && is_dereference)
1479 #if DEBUG_PERL
1480 fprintf (stderr, "%s:%d: first keys preceded by \"->\"\n",
1481 real_file_name, line_number);
1482 #endif
1484 else if (maybe_hash_value)
1486 /* Fake it into a hash. */
1487 tp->string[0] = '%';
1490 /* Do NOT change that into else if (see above). */
1491 if ((maybe_hash_value || maybe_hash_deref) && c == '{')
1493 void *keyword_value;
1495 #if DEBUG_PERL
1496 fprintf (stderr, "%s:%d: first keys preceded by '{'\n",
1497 real_file_name, line_number);
1498 #endif
1500 if (find_entry (&keywords, tp->string, strlen (tp->string),
1501 &keyword_value) == 0)
1503 /* Extract a possible string from the key. Before proceeding
1504 we check whether the open curly is followed by a symbol and
1505 then by a right curly. */
1506 flag_context_list_iterator_ty context_iter =
1507 flag_context_list_iterator (
1508 flag_context_list_table_lookup (
1509 flag_context_list_table,
1510 tp->string, strlen (tp->string)));
1511 token_ty *t1 = x_perl_lex (mlp);
1513 #if DEBUG_PERL
1514 fprintf (stderr, "%s:%d: extracting string key\n",
1515 real_file_name, line_number);
1516 #endif
1518 if (t1->type == token_type_symbol
1519 || t1->type == token_type_named_op)
1521 token_ty *t2 = x_perl_lex (mlp);
1522 if (t2->type == token_type_rbrace)
1524 flag_context_ty context;
1525 lex_pos_ty pos;
1527 context =
1528 inherited_context (null_context,
1529 flag_context_list_iterator_advance (
1530 &context_iter));
1532 pos.line_number = line_number;
1533 pos.file_name = logical_file_name;
1535 xgettext_current_source_encoding = po_charset_utf8;
1536 remember_a_message (mlp, xstrdup (t1->string), context, &pos);
1537 xgettext_current_source_encoding = xgettext_global_source_encoding;
1538 free_token (t2);
1539 free_token (t1);
1541 else
1543 x_perl_unlex (t2);
1546 else
1548 x_perl_unlex (t1);
1549 if (extract_balanced (mlp, 1, token_type_rbrace,
1550 null_context, context_iter, 1, -1))
1551 return;
1554 else
1556 phase2_ungetc (c);
1559 else
1561 phase2_ungetc (c);
1565 /* Now consume "->", "[...]", and "{...}". */
1566 for (;;)
1568 int c = phase2_getc ();
1569 int c2;
1571 switch (c)
1573 case '{':
1574 #if DEBUG_PERL
1575 fprintf (stderr, "%s:%d: extracting balanced '{' after varname\n",
1576 real_file_name, line_number);
1577 #endif
1578 extract_balanced (mlp, 0, token_type_rbrace,
1579 null_context, null_context_list_iterator, -1, -1);
1580 break;
1582 case '[':
1583 #if DEBUG_PERL
1584 fprintf (stderr, "%s:%d: extracting balanced '[' after varname\n",
1585 real_file_name, line_number);
1586 #endif
1587 extract_balanced (mlp, 0, token_type_rbracket,
1588 null_context, null_context_list_iterator, -1, -1);
1589 break;
1591 case '-':
1592 c2 = phase1_getc ();
1593 if (c2 == '>')
1595 #if DEBUG_PERL
1596 fprintf (stderr, "%s:%d: another \"->\" after varname\n",
1597 real_file_name, line_number);
1598 #endif
1599 break;
1601 else if (c2 != '\n')
1603 /* Discarding the newline is harmless here. The only
1604 special character recognized after a minus is greater-than
1605 for dereference. However, the sequence "-\n>" that we
1606 treat incorrectly here, is a syntax error. */
1607 phase1_ungetc (c2);
1609 /* FALLTHROUGH */
1611 default:
1612 #if DEBUG_PERL
1613 fprintf (stderr, "%s:%d: variable finished\n",
1614 real_file_name, line_number);
1615 #endif
1616 phase2_ungetc (c);
1617 return;
1622 /* Actually a simplified version of extract_variable(). It searches for
1623 variables inside a double-quoted string that may interpolate to
1624 some keyword hash (reference). The string is UTF-8 encoded. */
1625 static void
1626 interpolate_keywords (message_list_ty *mlp, const char *string, int lineno)
1628 static char *buffer;
1629 static int bufmax = 0;
1630 int bufpos = 0;
1631 flag_context_ty context;
1632 int c;
1633 bool maybe_hash_deref = false;
1634 enum parser_state
1636 initial,
1637 one_dollar,
1638 two_dollars,
1639 identifier,
1640 minus,
1641 wait_lbrace,
1642 wait_quote,
1643 dquote,
1644 squote,
1645 barekey,
1646 wait_rbrace
1647 } state;
1648 token_ty token;
1650 lex_pos_ty pos;
1652 /* States are:
1654 * initial: initial
1655 * one_dollar: dollar sign seen in state INITIAL
1656 * two_dollars: another dollar-sign has been seen in state ONE_DOLLAR
1657 * identifier: a valid identifier character has been seen in state
1658 * ONE_DOLLAR or TWO_DOLLARS
1659 * minus: a minus-sign has been seen in state IDENTIFIER
1660 * wait_lbrace: a greater-than has been seen in state MINUS
1661 * wait_quote: a left brace has been seen in state IDENTIFIER or in
1662 * state WAIT_LBRACE
1663 * dquote: a double-quote has been seen in state WAIT_QUOTE
1664 * squote: a single-quote has been seen in state WAIT_QUOTE
1665 * barekey: an bareword character has been seen in state WAIT_QUOTE
1666 * wait_rbrace: closing quote has been seen in state DQUOTE or SQUOTE
1668 * In the states initial...identifier the context is null_context; in the
1669 * states minus...wait_rbrace the context is the one suitable for the first
1670 * argument of the last seen identifier.
1672 state = initial;
1673 context = null_context;
1675 token.type = token_type_string;
1676 token.sub_type = string_type_qq;
1677 token.line_number = line_number;
1678 pos.file_name = logical_file_name;
1679 pos.line_number = lineno;
1681 while ((c = (unsigned char) *string++) != '\0')
1683 void *keyword_value;
1685 if (state == initial)
1686 bufpos = 0;
1688 if (c == '\n')
1689 lineno++;
1691 if (bufpos + 1 >= bufmax)
1693 bufmax = 2 * bufmax + 10;
1694 buffer = xrealloc (buffer, bufmax);
1697 switch (state)
1699 case initial:
1700 switch (c)
1702 case '\\':
1703 c = (unsigned char) *string++;
1704 if (c == '\0')
1705 return;
1706 break;
1707 case '$':
1708 buffer[bufpos++] = '$';
1709 maybe_hash_deref = false;
1710 state = one_dollar;
1711 break;
1712 default:
1713 break;
1715 break;
1716 case one_dollar:
1717 switch (c)
1719 case '$':
1721 * This is enough to make us believe later that we dereference
1722 * a hash reference.
1724 maybe_hash_deref = true;
1725 state = two_dollars;
1726 break;
1727 default:
1728 if (c == '_' || c == ':' || c == '\'' || c >= 0x80
1729 || (c >= 'A' && c <= 'Z')
1730 || (c >= 'a' && c <= 'z')
1731 || (c >= '0' && c <= '9'))
1733 buffer[bufpos++] = c;
1734 state = identifier;
1736 else
1737 state = initial;
1738 break;
1740 break;
1741 case two_dollars:
1742 if (c == '_' || c == ':' || c == '\'' || c >= 0x80
1743 || (c >= 'A' && c <= 'Z')
1744 || (c >= 'a' && c <= 'z')
1745 || (c >= '0' && c <= '9'))
1747 buffer[bufpos++] = c;
1748 state = identifier;
1750 else
1751 state = initial;
1752 break;
1753 case identifier:
1754 switch (c)
1756 case '-':
1757 if (find_entry (&keywords, buffer, bufpos, &keyword_value) == 0)
1759 flag_context_list_iterator_ty context_iter =
1760 flag_context_list_iterator (
1761 flag_context_list_table_lookup (
1762 flag_context_list_table,
1763 buffer, bufpos));
1764 context =
1765 inherited_context (null_context,
1766 flag_context_list_iterator_advance (
1767 &context_iter));
1768 state = minus;
1770 else
1771 state = initial;
1772 break;
1773 case '{':
1774 if (!maybe_hash_deref)
1775 buffer[0] = '%';
1776 if (find_entry (&keywords, buffer, bufpos, &keyword_value) == 0)
1778 flag_context_list_iterator_ty context_iter =
1779 flag_context_list_iterator (
1780 flag_context_list_table_lookup (
1781 flag_context_list_table,
1782 buffer, bufpos));
1783 context =
1784 inherited_context (null_context,
1785 flag_context_list_iterator_advance (
1786 &context_iter));
1787 state = wait_quote;
1789 else
1790 state = initial;
1791 break;
1792 default:
1793 if (c == '_' || c == ':' || c == '\'' || c >= 0x80
1794 || (c >= 'A' && c <= 'Z')
1795 || (c >= 'a' && c <= 'z')
1796 || (c >= '0' && c <= '9'))
1798 buffer[bufpos++] = c;
1800 else
1801 state = initial;
1802 break;
1804 break;
1805 case minus:
1806 switch (c)
1808 case '>':
1809 state = wait_lbrace;
1810 break;
1811 default:
1812 context = null_context;
1813 state = initial;
1814 break;
1816 break;
1817 case wait_lbrace:
1818 switch (c)
1820 case '{':
1821 state = wait_quote;
1822 break;
1823 default:
1824 context = null_context;
1825 state = initial;
1826 break;
1828 break;
1829 case wait_quote:
1830 switch (c)
1832 case_whitespace:
1833 break;
1834 case '\'':
1835 pos.line_number = lineno;
1836 bufpos = 0;
1837 state = squote;
1838 break;
1839 case '"':
1840 pos.line_number = lineno;
1841 bufpos = 0;
1842 state = dquote;
1843 break;
1844 default:
1845 if (c == '_' || (c >= '0' && c <= '9') || c >= 0x80
1846 || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
1848 pos.line_number = lineno;
1849 bufpos = 0;
1850 buffer[bufpos++] = c;
1851 state = barekey;
1853 else
1855 context = null_context;
1856 state = initial;
1858 break;
1860 break;
1861 case dquote:
1862 switch (c)
1864 case '"':
1865 /* The resulting string has to be interpolated twice. */
1866 buffer[bufpos] = '\0';
1867 token.string = xstrdup (buffer);
1868 extract_quotelike_pass3 (&token, EXIT_FAILURE);
1869 /* The string can only shrink with interpolation (because
1870 we ignore \Q). */
1871 if (!(strlen (token.string) <= bufpos))
1872 abort ();
1873 strcpy (buffer, token.string);
1874 free (token.string);
1875 state = wait_rbrace;
1876 break;
1877 case '\\':
1878 if (string[0] == '\"')
1880 buffer[bufpos++] = string++[0];
1882 else if (string[0])
1884 buffer[bufpos++] = '\\';
1885 buffer[bufpos++] = string++[0];
1887 else
1889 context = null_context;
1890 state = initial;
1892 break;
1893 default:
1894 buffer[bufpos++] = c;
1895 break;
1897 break;
1898 case squote:
1899 switch (c)
1901 case '\'':
1902 state = wait_rbrace;
1903 break;
1904 case '\\':
1905 if (string[0] == '\'')
1907 buffer[bufpos++] = string++[0];
1909 else if (string[0])
1911 buffer[bufpos++] = '\\';
1912 buffer[bufpos++] = string++[0];
1914 else
1916 context = null_context;
1917 state = initial;
1919 break;
1920 default:
1921 buffer[bufpos++] = c;
1922 break;
1924 break;
1925 case barekey:
1926 if (c == '_' || (c >= '0' && c <= '9') || c >= 0x80
1927 || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
1929 buffer[bufpos++] = c;
1930 break;
1932 else if (is_whitespace (c))
1934 state = wait_rbrace;
1935 break;
1937 else if (c != '}')
1939 context = null_context;
1940 state = initial;
1941 break;
1943 /* Must be right brace. */
1944 /* FALLTHROUGH */
1945 case wait_rbrace:
1946 switch (c)
1948 case_whitespace:
1949 break;
1950 case '}':
1951 buffer[bufpos] = '\0';
1952 token.string = xstrdup (buffer);
1953 extract_quotelike_pass3 (&token, EXIT_FAILURE);
1954 xgettext_current_source_encoding = po_charset_utf8;
1955 remember_a_message (mlp, token.string, context, &pos);
1956 xgettext_current_source_encoding = xgettext_global_source_encoding;
1957 /* FALLTHROUGH */
1958 default:
1959 context = null_context;
1960 state = initial;
1961 break;
1963 break;
1968 /* The last token seen in the token stream. This is important for the
1969 interpretation of '?' and '/'. */
1970 static token_type_ty last_token;
1972 /* Combine characters into tokens. Discard whitespace. */
1974 static void
1975 x_perl_prelex (message_list_ty *mlp, token_ty *tp)
1977 static char *buffer;
1978 static int bufmax;
1979 int bufpos;
1980 int c;
1982 for (;;)
1984 c = phase2_getc ();
1985 tp->line_number = line_number;
1987 switch (c)
1989 case EOF:
1990 tp->type = token_type_eof;
1991 return;
1993 case '\n':
1994 if (last_non_comment_line > last_comment_line)
1995 xgettext_comment_reset ();
1996 /* FALLTHROUGH */
1997 case '\t':
1998 case ' ':
1999 /* Ignore whitespace. */
2000 continue;
2002 case '%':
2003 case '@':
2004 case '*':
2005 case '$':
2006 if (!extract_all)
2008 extract_variable (mlp, tp, c);
2009 prefer_division_over_regexp = true;
2010 return;
2012 break;
2015 last_non_comment_line = tp->line_number;
2017 switch (c)
2019 case '.':
2021 int c2 = phase1_getc ();
2022 phase1_ungetc (c2);
2023 if (c2 == '.')
2025 tp->type = token_type_other;
2026 prefer_division_over_regexp = false;
2027 return;
2029 else if (c2 >= '0' && c2 <= '9')
2031 prefer_division_over_regexp = false;
2033 else
2035 tp->type = token_type_dot;
2036 prefer_division_over_regexp = true;
2037 return;
2040 /* FALLTHROUGH */
2041 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2042 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
2043 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
2044 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
2045 case 'Y': case 'Z':
2046 case '_':
2047 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2048 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2049 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2050 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
2051 case 'y': case 'z':
2052 case '0': case '1': case '2': case '3': case '4':
2053 case '5': case '6': case '7': case '8': case '9':
2054 /* Symbol, or part of a number. */
2055 prefer_division_over_regexp = true;
2056 bufpos = 0;
2057 for (;;)
2059 if (bufpos >= bufmax)
2061 bufmax = 2 * bufmax + 10;
2062 buffer = xrealloc (buffer, bufmax);
2064 buffer[bufpos++] = c;
2065 c = phase1_getc ();
2066 switch (c)
2068 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2069 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
2070 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
2071 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
2072 case 'Y': case 'Z':
2073 case '_':
2074 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2075 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2076 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2077 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
2078 case 'y': case 'z':
2079 case '0': case '1': case '2': case '3': case '4':
2080 case '5': case '6': case '7': case '8': case '9':
2081 continue;
2083 default:
2084 phase1_ungetc (c);
2085 break;
2087 break;
2089 if (bufpos >= bufmax)
2091 bufmax = 2 * bufmax + 10;
2092 buffer = xrealloc (buffer, bufmax);
2094 buffer[bufpos] = '\0';
2096 if (strcmp (buffer, "__END__") == 0
2097 || strcmp (buffer, "__DATA__") == 0)
2099 end_of_file = true;
2100 tp->type = token_type_eof;
2101 return;
2103 else if (strcmp (buffer, "and") == 0
2104 || strcmp (buffer, "cmp") == 0
2105 || strcmp (buffer, "eq") == 0
2106 || strcmp (buffer, "if") == 0
2107 || strcmp (buffer, "ge") == 0
2108 || strcmp (buffer, "gt") == 0
2109 || strcmp (buffer, "le") == 0
2110 || strcmp (buffer, "lt") == 0
2111 || strcmp (buffer, "ne") == 0
2112 || strcmp (buffer, "not") == 0
2113 || strcmp (buffer, "or") == 0
2114 || strcmp (buffer, "unless") == 0
2115 || strcmp (buffer, "while") == 0
2116 || strcmp (buffer, "xor") == 0)
2118 tp->type = token_type_named_op;
2119 tp->string = xstrdup (buffer);
2120 prefer_division_over_regexp = false;
2121 return;
2123 else if (strcmp (buffer, "s") == 0
2124 || strcmp (buffer, "y") == 0
2125 || strcmp (buffer, "tr") == 0)
2127 int delim = phase1_getc ();
2129 while (is_whitespace (delim))
2130 delim = phase2_getc ();
2132 if (delim == EOF)
2134 tp->type = token_type_eof;
2135 return;
2137 if ((delim >= '0' && delim <= '9')
2138 || (delim >= 'A' && delim <= 'Z')
2139 || (delim >= 'a' && delim <= 'z'))
2141 /* False positive. */
2142 phase2_ungetc (delim);
2143 tp->type = token_type_symbol;
2144 tp->sub_type = symbol_type_none;
2145 tp->string = xstrdup (buffer);
2146 prefer_division_over_regexp = true;
2147 return;
2149 extract_triple_quotelike (mlp, tp, delim,
2150 buffer[0] == 's' && delim != '\'');
2152 /* Eat the following modifiers. */
2154 c = phase1_getc ();
2155 while (c >= 'a' && c <= 'z');
2156 phase1_ungetc (c);
2157 return;
2159 else if (strcmp (buffer, "m") == 0)
2161 int delim = phase1_getc ();
2163 while (is_whitespace (delim))
2164 delim = phase2_getc ();
2166 if (delim == EOF)
2168 tp->type = token_type_eof;
2169 return;
2171 if ((delim >= '0' && delim <= '9')
2172 || (delim >= 'A' && delim <= 'Z')
2173 || (delim >= 'a' && delim <= 'z'))
2175 /* False positive. */
2176 phase2_ungetc (delim);
2177 tp->type = token_type_symbol;
2178 tp->sub_type = symbol_type_none;
2179 tp->string = xstrdup (buffer);
2180 prefer_division_over_regexp = true;
2181 return;
2183 extract_quotelike (tp, delim);
2184 if (delim != '\'')
2185 interpolate_keywords (mlp, tp->string, line_number);
2186 free (tp->string);
2187 tp->type = token_type_regex_op;
2188 prefer_division_over_regexp = true;
2190 /* Eat the following modifiers. */
2192 c = phase1_getc ();
2193 while (c >= 'a' && c <= 'z');
2194 phase1_ungetc (c);
2195 return;
2197 else if (strcmp (buffer, "qq") == 0
2198 || strcmp (buffer, "q") == 0
2199 || strcmp (buffer, "qx") == 0
2200 || strcmp (buffer, "qw") == 0
2201 || strcmp (buffer, "qr") == 0)
2203 /* The qw (...) construct is not really a string but we
2204 can treat in the same manner and then pretend it is
2205 a symbol. Rationale: Saying "qw (foo bar)" is the
2206 same as "my @list = ('foo', 'bar'); @list;". */
2208 int delim = phase1_getc ();
2210 while (is_whitespace (delim))
2211 delim = phase2_getc ();
2213 if (delim == EOF)
2215 tp->type = token_type_eof;
2216 return;
2218 prefer_division_over_regexp = true;
2220 if ((delim >= '0' && delim <= '9')
2221 || (delim >= 'A' && delim <= 'Z')
2222 || (delim >= 'a' && delim <= 'z'))
2224 /* False positive. */
2225 phase2_ungetc (delim);
2226 tp->type = token_type_symbol;
2227 tp->sub_type = symbol_type_none;
2228 tp->string = xstrdup (buffer);
2229 prefer_division_over_regexp = true;
2230 return;
2233 extract_quotelike (tp, delim);
2235 switch (buffer[1])
2237 case 'q':
2238 case 'x':
2239 tp->type = token_type_string;
2240 tp->sub_type = string_type_qq;
2241 interpolate_keywords (mlp, tp->string, line_number);
2242 break;
2243 case 'r':
2244 tp->type = token_type_regex_op;
2245 break;
2246 case 'w':
2247 tp->type = token_type_symbol;
2248 tp->sub_type = symbol_type_none;
2249 break;
2250 case '\0':
2251 tp->type = token_type_string;
2252 tp->sub_type = string_type_q;
2253 break;
2254 default:
2255 abort ();
2257 return;
2259 else if (strcmp (buffer, "grep") == 0
2260 || strcmp (buffer, "split") == 0)
2262 prefer_division_over_regexp = false;
2264 tp->type = token_type_symbol;
2265 tp->sub_type = (strcmp (buffer, "sub") == 0
2266 ? symbol_type_sub
2267 : symbol_type_none);
2268 tp->string = xstrdup (buffer);
2269 return;
2271 case '"':
2272 prefer_division_over_regexp = true;
2273 extract_quotelike (tp, c);
2274 tp->sub_type = string_type_qq;
2275 interpolate_keywords (mlp, tp->string, line_number);
2276 return;
2278 case '`':
2279 prefer_division_over_regexp = true;
2280 extract_quotelike (tp, c);
2281 tp->sub_type = string_type_qq;
2282 interpolate_keywords (mlp, tp->string, line_number);
2283 return;
2285 case '\'':
2286 prefer_division_over_regexp = true;
2287 extract_quotelike (tp, c);
2288 tp->sub_type = string_type_q;
2289 return;
2291 case '(':
2292 c = phase2_getc ();
2293 if (c == ')')
2294 /* Ignore empty list. */
2295 continue;
2296 else
2297 phase2_ungetc (c);
2298 tp->type = token_type_lparen;
2299 prefer_division_over_regexp = false;
2300 return;
2302 case ')':
2303 tp->type = token_type_rparen;
2304 prefer_division_over_regexp = true;
2305 return;
2307 case '{':
2308 tp->type = token_type_lbrace;
2309 prefer_division_over_regexp = false;
2310 return;
2312 case '}':
2313 tp->type = token_type_rbrace;
2314 prefer_division_over_regexp = false;
2315 return;
2317 case '[':
2318 tp->type = token_type_lbracket;
2319 prefer_division_over_regexp = false;
2320 return;
2322 case ']':
2323 tp->type = token_type_rbracket;
2324 prefer_division_over_regexp = false;
2325 return;
2327 case ';':
2328 tp->type = token_type_semicolon;
2329 prefer_division_over_regexp = false;
2330 return;
2332 case ',':
2333 tp->type = token_type_comma;
2334 prefer_division_over_regexp = false;
2335 return;
2337 case '=':
2338 /* Check for fat comma. */
2339 c = phase1_getc ();
2340 if (c == '>')
2342 tp->type = token_type_fat_comma;
2343 return;
2345 else if (linepos == 2
2346 && (last_token == token_type_semicolon
2347 || last_token == token_type_rbrace)
2348 && ((c >= 'A' && c <='Z')
2349 || (c >= 'a' && c <= 'z')))
2351 #if DEBUG_PERL
2352 fprintf (stderr, "%s:%d: start pod section\n",
2353 real_file_name, line_number);
2354 #endif
2355 skip_pod ();
2356 #if DEBUG_PERL
2357 fprintf (stderr, "%s:%d: end pod section\n",
2358 real_file_name, line_number);
2359 #endif
2360 continue;
2362 phase1_ungetc (c);
2363 tp->type = token_type_other;
2364 prefer_division_over_regexp = false;
2365 return;
2367 case '<':
2368 /* Check for <<EOF and friends. */
2369 prefer_division_over_regexp = false;
2370 c = phase1_getc ();
2371 if (c == '<')
2373 c = phase1_getc ();
2374 if (c == '\'')
2376 char *string;
2377 extract_quotelike (tp, c);
2378 string = get_here_document (tp->string);
2379 free (tp->string);
2380 tp->string = string;
2381 tp->type = token_type_string;
2382 tp->sub_type = string_type_verbatim;
2383 tp->line_number = line_number + 1;
2384 return;
2386 else if (c == '"')
2388 char *string;
2389 extract_quotelike (tp, c);
2390 string = get_here_document (tp->string);
2391 free (tp->string);
2392 tp->string = string;
2393 tp->type = token_type_string;
2394 tp->sub_type = string_type_qq;
2395 tp->line_number = line_number + 1;
2396 interpolate_keywords (mlp, tp->string, line_number + 1);
2397 return;
2399 else if ((c >= 'A' && c <= 'Z')
2400 || (c >= 'a' && c <= 'z')
2401 || c == '_')
2403 bufpos = 0;
2404 while ((c >= 'A' && c <= 'Z')
2405 || (c >= 'a' && c <= 'z')
2406 || (c >= '0' && c <= '9')
2407 || c == '_' || c >= 0x80)
2409 if (bufpos >= bufmax)
2411 bufmax = 2 * bufmax + 10;
2412 buffer = xrealloc (buffer, bufmax);
2414 buffer[bufpos++] = c;
2415 c = phase1_getc ();
2417 if (c == EOF)
2419 tp->type = token_type_eof;
2420 return;
2422 else
2424 char *string;
2425 phase1_ungetc (c);
2426 if (bufpos >= bufmax)
2428 bufmax = 2 * bufmax + 10;
2429 buffer = xrealloc (buffer, bufmax);
2431 buffer[bufpos++] = '\0';
2432 string = get_here_document (buffer);
2433 tp->string = string;
2434 tp->type = token_type_string;
2435 tp->sub_type = string_type_qq;
2436 tp->line_number = line_number + 1;
2437 interpolate_keywords (mlp, tp->string, line_number + 1);
2438 return;
2441 else
2443 tp->type = token_type_other;
2444 return;
2447 else
2449 phase1_ungetc (c);
2450 tp->type = token_type_other;
2452 return; /* End of case '>'. */
2454 case '-':
2455 /* Check for dereferencing operator. */
2456 c = phase1_getc ();
2457 if (c == '>')
2459 tp->type = token_type_dereference;
2460 return;
2462 else if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
2464 /* One of the -X (filetest) functions. We play safe
2465 and accept all alphabetical characters here. */
2466 tp->type = token_type_other;
2467 return;
2469 phase1_ungetc (c);
2470 tp->type = token_type_other;
2471 prefer_division_over_regexp = false;
2472 return;
2474 case '/':
2475 case '?':
2476 if (!prefer_division_over_regexp)
2478 extract_quotelike (tp, c);
2479 interpolate_keywords (mlp, tp->string, line_number);
2480 free (tp->string);
2481 tp->type = token_type_other;
2482 prefer_division_over_regexp = true;
2483 /* Eat the following modifiers. */
2485 c = phase1_getc ();
2486 while (c >= 'a' && c <= 'z');
2487 phase1_ungetc (c);
2488 return;
2490 /* FALLTHROUGH */
2492 default:
2493 /* We could carefully recognize each of the 2 and 3 character
2494 operators, but it is not necessary, as we only need to recognize
2495 gettext invocations. Don't bother. */
2496 tp->type = token_type_other;
2497 prefer_division_over_regexp = false;
2498 return;
2504 /* A token stack used as a lookahead buffer. */
2506 typedef struct token_stack_ty token_stack_ty;
2507 struct token_stack_ty
2509 token_ty **items;
2510 size_t nitems;
2511 size_t nitems_max;
2514 static struct token_stack_ty token_stack;
2516 #if DEBUG_PERL
2517 /* Dumps all resources allocated by stack STACK. */
2518 static int
2519 token_stack_dump (token_stack_ty *stack)
2521 size_t i;
2523 fprintf (stderr, "BEGIN STACK DUMP\n");
2524 for (i = 0; i < stack->nitems; i++)
2526 token_ty *token = stack->items[i];
2527 fprintf (stderr, " [%s]\n", token2string (token));
2528 switch (token->type)
2530 case token_type_named_op:
2531 case token_type_string:
2532 case token_type_symbol:
2533 case token_type_variable:
2534 fprintf (stderr, " string: %s\n", token->string);
2535 break;
2538 fprintf (stderr, "END STACK DUMP\n");
2539 return 0;
2541 #endif
2543 /* Pushes the token TOKEN onto the stack STACK. */
2544 static inline void
2545 token_stack_push (token_stack_ty *stack, token_ty *token)
2547 if (stack->nitems >= stack->nitems_max)
2549 size_t nbytes;
2551 stack->nitems_max = 2 * stack->nitems_max + 4;
2552 nbytes = stack->nitems_max * sizeof (token_ty *);
2553 stack->items = xrealloc (stack->items, nbytes);
2555 stack->items[stack->nitems++] = token;
2558 /* Pops the most recently pushed token from the stack STACK and returns it.
2559 Returns NULL if the stack is empty. */
2560 static inline token_ty *
2561 token_stack_pop (token_stack_ty *stack)
2563 if (stack->nitems > 0)
2564 return stack->items[--(stack->nitems)];
2565 else
2566 return NULL;
2569 /* Return the top of the stack without removing it from the stack, or
2570 NULL if the stack is empty. */
2571 static inline token_ty *
2572 token_stack_peek (const token_stack_ty *stack)
2574 if (stack->nitems > 0)
2575 return stack->items[stack->nitems - 1];
2576 else
2577 return NULL;
2580 /* Frees all resources allocated by stack STACK. */
2581 static inline void
2582 token_stack_free (token_stack_ty *stack)
2584 size_t i;
2586 for (i = 0; i < stack->nitems; i++)
2587 free_token (stack->items[i]);
2588 free (stack->items);
2592 static token_ty *
2593 x_perl_lex (message_list_ty *mlp)
2595 #if DEBUG_PERL
2596 int dummy = token_stack_dump (&token_stack);
2597 #endif
2598 token_ty *tp = token_stack_pop (&token_stack);
2600 if (!tp)
2602 tp = (token_ty *) xmalloc (sizeof (token_ty));
2603 x_perl_prelex (mlp, tp);
2604 #if DEBUG_PERL
2605 fprintf (stderr, "%s:%d: x_perl_prelex returned %s\n",
2606 real_file_name, line_number, token2string (tp));
2607 #endif
2609 #if DEBUG_PERL
2610 else
2612 fprintf (stderr, "%s:%d: %s recycled from stack\n",
2613 real_file_name, line_number, token2string (tp));
2615 #endif
2617 /* A symbol followed by a fat comma is really a single-quoted string.
2618 Function definitions or forward declarations also need a special
2619 handling because the dollars and at signs inside the parentheses
2620 must not be interpreted as the beginning of a variable ')'. */
2621 if (tp->type == token_type_symbol || tp->type == token_type_named_op)
2623 token_ty *next = token_stack_peek (&token_stack);
2625 if (!next)
2627 #if DEBUG_PERL
2628 fprintf (stderr, "%s:%d: pre-fetching next token\n",
2629 real_file_name, line_number);
2630 #endif
2631 next = x_perl_lex (mlp);
2632 x_perl_unlex (next);
2633 #if DEBUG_PERL
2634 fprintf (stderr, "%s:%d: unshifted next token\n",
2635 real_file_name, line_number);
2636 #endif
2639 #if DEBUG_PERL
2640 fprintf (stderr, "%s:%d: next token is %s\n",
2641 real_file_name, line_number, token2string (next));
2642 #endif
2644 if (next->type == token_type_fat_comma)
2646 tp->type = token_type_string;
2647 tp->sub_type = string_type_q;
2648 #if DEBUG_PERL
2649 fprintf (stderr,
2650 "%s:%d: token %s mutated to token_type_string\n",
2651 real_file_name, line_number, token2string (tp));
2652 #endif
2654 else if (tp->type == token_type_symbol && tp->sub_type == symbol_type_sub
2655 && next->type == token_type_symbol)
2657 /* Start of a function declaration or definition. Mark this
2658 symbol as a function name, so that we can later eat up
2659 possible prototype information. */
2660 #if DEBUG_PERL
2661 fprintf (stderr, "%s:%d: subroutine declaration/definition '%s'\n",
2662 real_file_name, line_number, next->string);
2663 #endif
2664 next->sub_type = symbol_type_function;
2666 else if (tp->type == token_type_symbol
2667 && (tp->sub_type == symbol_type_sub
2668 || tp->sub_type == symbol_type_function)
2669 && next->type == token_type_lparen)
2671 /* For simplicity we simply consume everything up to the
2672 closing parenthesis. Actually only a limited set of
2673 characters is allowed inside parentheses but we leave
2674 complaints to the interpreter and are prepared for
2675 future extensions to the Perl syntax. */
2676 int c;
2678 #if DEBUG_PERL
2679 fprintf (stderr, "%s:%d: consuming prototype information\n",
2680 real_file_name, line_number);
2681 #endif
2685 c = phase1_getc ();
2686 #if DEBUG_PERL
2687 fprintf (stderr, " consuming character '%c'\n", c);
2688 #endif
2690 while (c != EOF && c != ')');
2691 phase1_ungetc (c);
2695 return tp;
2698 static void
2699 x_perl_unlex (token_ty *tp)
2701 token_stack_push (&token_stack, tp);
2705 /* ========================= Extracting strings. ========================== */
2707 /* Assuming TP is a string token, this function accumulates all subsequent
2708 . string2 . string3 ... to the string. (String concatenation.) */
2710 static char *
2711 collect_message (message_list_ty *mlp, token_ty *tp, int error_level)
2713 char *string;
2714 size_t len;
2716 extract_quotelike_pass3 (tp, error_level);
2717 string = xstrdup (tp->string);
2718 len = strlen (tp->string) + 1;
2720 for (;;)
2722 int c;
2725 c = phase2_getc ();
2726 while (is_whitespace (c));
2728 if (c != '.')
2730 phase2_ungetc (c);
2731 return string;
2735 c = phase2_getc ();
2736 while (is_whitespace (c));
2738 phase2_ungetc (c);
2740 if (c == '"' || c == '\'' || c == '`'
2741 || (!prefer_division_over_regexp && (c == '/' || c == '?'))
2742 || c == 'q')
2744 token_ty *qstring = x_perl_lex (mlp);
2745 if (qstring->type != token_type_string)
2747 /* assert (qstring->type == token_type_symbol) */
2748 x_perl_unlex (qstring);
2749 return string;
2752 extract_quotelike_pass3 (qstring, error_level);
2753 len += strlen (qstring->string);
2754 string = xrealloc (string, len);
2755 strcat (string, qstring->string);
2756 free_token (qstring);
2761 /* The file is broken into tokens. Scan the token stream, looking for
2762 a keyword, followed by a left paren, followed by a string. When we
2763 see this sequence, we have something to remember. We assume we are
2764 looking at a valid C or C++ program, and leave the complaints about
2765 the grammar to the compiler.
2767 Normal handling: Look for
2768 keyword ( ... msgid ... )
2769 Plural handling: Look for
2770 keyword ( ... msgid ... msgid_plural ... )
2772 We use recursion because the arguments before msgid or between msgid
2773 and msgid_plural can contain subexpressions of the same form. */
2775 /* Extract messages until the next balanced closing parenthesis.
2776 Extracted messages are added to MLP.
2778 When specific arguments shall be extracted, ARG_SG and ARG_PL are
2779 set to the corresponding argument number or -1 if not applicable.
2781 Returns true for EOF, false otherwise.
2783 States are:
2785 0 - initial state
2786 1 - keyword has been seen
2787 2 - extractable string has been seen
2788 3 - a dot operator after an extractable string has been seen
2790 States 2 and 3 are "fragile", the parser will remain in state 2
2791 as long as only opening parentheses are seen, a transition to
2792 state 3 is done on appearance of a dot operator, all other tokens
2793 will cause the parser to fall back to state 1 or 0, eventually
2794 with an error message about invalid intermixing of constant and
2795 non-constant strings.
2797 Likewise, state 3 is fragile. The parser will remain in state 3
2798 as long as only closing parentheses are seen, a transition to state
2799 2 is done on appearance of another (literal!) string, all other
2800 tokens will cause a warning. */
2802 static bool
2803 extract_balanced (message_list_ty *mlp, int state, token_type_ty delim,
2804 flag_context_ty outer_context,
2805 flag_context_list_iterator_ty context_iter,
2806 int arg_sg, int arg_pl)
2808 /* Remember the message containing the msgid, for msgid_plural. */
2809 message_ty *plural_mp = NULL;
2811 /* The current argument for a possibly extracted keyword. Counting
2812 starts with 1. */
2813 int arg_count = 1;
2815 /* Number of left parentheses seen. */
2816 int paren_seen = 0;
2818 /* Whether to implicitly assume the next tokens are arguments even without
2819 a '('. */
2820 bool next_is_argument = false;
2822 /* Context iterator that will be used if the next token is a '('. */
2823 flag_context_list_iterator_ty next_context_iter =
2824 passthrough_context_list_iterator;
2825 /* Current context. */
2826 flag_context_ty inner_context =
2827 inherited_context (outer_context,
2828 flag_context_list_iterator_advance (&context_iter));
2830 #if DEBUG_PERL
2831 static int nesting_level = 0;
2833 ++nesting_level;
2834 #endif
2836 last_token = token_type_semicolon; /* Safe assumption. */
2837 prefer_division_over_regexp = false;
2839 for (;;)
2841 int my_last_token = last_token;
2842 /* The current token. */
2843 token_ty *tp;
2845 tp = x_perl_lex (mlp);
2847 last_token = tp->type;
2849 if (delim == tp->type)
2851 #if DEBUG_PERL
2852 fprintf (stderr, "%s:%d: extract_balanced finished (%d)\n",
2853 logical_file_name, tp->line_number, --nesting_level);
2854 #endif
2855 free_token (tp);
2856 return false;
2859 if (next_is_argument && tp->type != token_type_lparen)
2861 /* An argument list starts, even though there is no '('. */
2862 context_iter = next_context_iter;
2863 outer_context = inner_context;
2864 inner_context =
2865 inherited_context (outer_context,
2866 flag_context_list_iterator_advance (
2867 &context_iter));
2870 switch (tp->type)
2872 case token_type_symbol:
2873 #if DEBUG_PERL
2874 fprintf (stderr, "%s:%d: type symbol (%d) \"%s\"\n",
2875 logical_file_name, tp->line_number, nesting_level,
2876 tp->string);
2877 #endif
2880 void *keyword_value;
2882 if (find_entry (&keywords, tp->string, strlen (tp->string),
2883 &keyword_value) == 0)
2885 last_token = token_type_keyword_symbol;
2887 arg_sg = (int) (long) keyword_value & ((1 << 10) - 1);
2888 arg_pl = (int) (long) keyword_value >> 10;
2889 arg_count = 1;
2891 state = 2;
2894 next_is_argument = true;
2895 next_context_iter =
2896 flag_context_list_iterator (
2897 flag_context_list_table_lookup (
2898 flag_context_list_table,
2899 tp->string, strlen (tp->string)));
2900 break;
2902 case token_type_variable:
2903 #if DEBUG_PERL
2904 fprintf (stderr, "%s:%d: type variable (%d) \"%s\"\n",
2905 logical_file_name, tp->line_number, nesting_level, tp->string);
2906 #endif
2907 prefer_division_over_regexp = true;
2908 next_is_argument = false;
2909 next_context_iter = null_context_list_iterator;
2910 break;
2912 case token_type_lparen:
2913 #if DEBUG_PERL
2914 fprintf (stderr, "%s:%d: type left parentheses (%d)\n",
2915 logical_file_name, tp->line_number, nesting_level);
2916 #endif
2917 ++paren_seen;
2919 if (extract_balanced (mlp, state, token_type_rparen,
2920 inner_context, next_context_iter,
2921 arg_sg - arg_count + 1, arg_pl - arg_count + 1))
2923 free_token (tp);
2924 return true;
2926 if (my_last_token == token_type_keyword_symbol)
2927 arg_sg = arg_pl = -1;
2928 next_is_argument = false;
2929 next_context_iter = null_context_list_iterator;
2930 break;
2932 case token_type_rparen:
2933 #if DEBUG_PERL
2934 fprintf (stderr, "%s:%d: type right parentheses(%d)\n",
2935 logical_file_name, tp->line_number, nesting_level);
2936 #endif
2937 --paren_seen;
2938 next_is_argument = false;
2939 next_context_iter = null_context_list_iterator;
2940 break;
2942 case token_type_comma:
2943 case token_type_fat_comma:
2944 #if DEBUG_PERL
2945 fprintf (stderr, "%s:%d: type comma (%d)\n",
2946 logical_file_name, tp->line_number, nesting_level);
2947 #endif
2948 ++arg_count;
2949 if (arg_count > arg_sg && arg_count > arg_pl)
2951 /* We have missed the argument. */
2952 arg_sg = arg_pl = -1;
2953 arg_count = 0;
2955 #if DEBUG_PERL
2956 fprintf (stderr, "%s:%d: arg_count: %d, arg_sg: %d, arg_pl: %d\n",
2957 real_file_name, tp->line_number,
2958 arg_count, arg_sg, arg_pl);
2959 #endif
2960 inner_context =
2961 inherited_context (outer_context,
2962 flag_context_list_iterator_advance (
2963 &context_iter));
2964 next_is_argument = false;
2965 next_context_iter = passthrough_context_list_iterator;
2966 break;
2968 case token_type_string:
2969 #if DEBUG_PERL
2970 fprintf (stderr, "%s:%d: type string (%d): \"%s\"\n",
2971 logical_file_name, tp->line_number, nesting_level,
2972 tp->string);
2973 #endif
2975 if (extract_all)
2977 lex_pos_ty pos;
2978 char *string;
2980 pos.file_name = logical_file_name;
2981 pos.line_number = tp->line_number;
2982 string = collect_message (mlp, tp, EXIT_SUCCESS);
2983 xgettext_current_source_encoding = po_charset_utf8;
2984 remember_a_message (mlp, string, inner_context, &pos);
2985 xgettext_current_source_encoding = xgettext_global_source_encoding;
2987 else if (state)
2989 lex_pos_ty pos;
2990 char *string;
2992 pos.file_name = logical_file_name;
2993 pos.line_number = tp->line_number;
2995 if (arg_count == arg_sg)
2997 string = collect_message (mlp, tp, EXIT_FAILURE);
2998 xgettext_current_source_encoding = po_charset_utf8;
2999 plural_mp = remember_a_message (mlp, string, inner_context, &pos);
3000 xgettext_current_source_encoding = xgettext_global_source_encoding;
3001 arg_sg = -1;
3003 else if (arg_count == arg_pl)
3005 if (plural_mp == NULL)
3006 error (EXIT_FAILURE, 0, _("\
3007 %s:%d: fatal: plural message seen before singular message\n"),
3008 real_file_name, tp->line_number);
3010 string = collect_message (mlp, tp, EXIT_FAILURE);
3011 xgettext_current_source_encoding = po_charset_utf8;
3012 remember_a_message_plural (plural_mp, string, inner_context, &pos);
3013 xgettext_current_source_encoding = xgettext_global_source_encoding;
3014 arg_pl = -1;
3018 if (arg_sg == -1 && arg_pl == -1)
3020 state = 0;
3021 plural_mp = NULL;
3024 next_is_argument = false;
3025 next_context_iter = null_context_list_iterator;
3026 break;
3028 case token_type_eof:
3029 #if DEBUG_PERL
3030 fprintf (stderr, "%s:%d: type EOF (%d)\n",
3031 logical_file_name, tp->line_number, nesting_level);
3032 #endif
3033 free_token (tp);
3034 return true;
3036 case token_type_lbrace:
3037 #if DEBUG_PERL
3038 fprintf (stderr, "%s:%d: type lbrace (%d)\n",
3039 logical_file_name, tp->line_number, nesting_level);
3040 #endif
3041 if (extract_balanced (mlp, 0, token_type_rbrace,
3042 null_context, null_context_list_iterator,
3043 -1, -1))
3045 free_token (tp);
3046 return true;
3048 next_is_argument = false;
3049 next_context_iter = null_context_list_iterator;
3050 break;
3052 case token_type_rbrace:
3053 #if DEBUG_PERL
3054 fprintf (stderr, "%s:%d: type rbrace (%d)\n",
3055 logical_file_name, tp->line_number, nesting_level);
3056 #endif
3057 next_is_argument = false;
3058 next_context_iter = null_context_list_iterator;
3059 state = 0;
3060 break;
3062 case token_type_lbracket:
3063 #if DEBUG_PERL
3064 fprintf (stderr, "%s:%d: type lbracket (%d)\n",
3065 logical_file_name, tp->line_number, nesting_level);
3066 #endif
3067 if (extract_balanced (mlp, 0, token_type_rbracket,
3068 null_context, null_context_list_iterator,
3069 -1, -1))
3071 free_token (tp);
3072 return true;
3074 next_is_argument = false;
3075 next_context_iter = null_context_list_iterator;
3076 break;
3078 case token_type_rbracket:
3079 #if DEBUG_PERL
3080 fprintf (stderr, "%s:%d: type rbracket (%d)\n",
3081 logical_file_name, tp->line_number, nesting_level);
3082 #endif
3083 next_is_argument = false;
3084 next_context_iter = null_context_list_iterator;
3085 state = 0;
3086 break;
3088 case token_type_semicolon:
3089 #if DEBUG_PERL
3090 fprintf (stderr, "%s:%d: type semicolon (%d)\n",
3091 logical_file_name, tp->line_number, nesting_level);
3092 #endif
3093 state = 0;
3095 /* The ultimate sign. */
3096 arg_sg = arg_pl = -1;
3098 /* FIXME: Instead of resetting outer_context here, it may be better
3099 to recurse in the next_is_argument handling above, waiting for
3100 the next semicolon or other statement terminator. */
3101 outer_context = null_context;
3102 context_iter = null_context_list_iterator;
3103 next_is_argument = false;
3104 next_context_iter = passthrough_context_list_iterator;
3105 inner_context =
3106 inherited_context (outer_context,
3107 flag_context_list_iterator_advance (
3108 &context_iter));
3109 break;
3111 case token_type_dereference:
3112 #if DEBUG_PERL
3113 fprintf (stderr, "%s:%d: type dereference (%d)\n",
3114 logical_file_name, tp->line_number, nesting_level);
3115 #endif
3116 next_is_argument = false;
3117 next_context_iter = null_context_list_iterator;
3118 break;
3120 case token_type_dot:
3121 #if DEBUG_PERL
3122 fprintf (stderr, "%s:%d: type dot (%d)\n",
3123 logical_file_name, tp->line_number, nesting_level);
3124 #endif
3125 next_is_argument = false;
3126 next_context_iter = null_context_list_iterator;
3127 state = 0;
3128 break;
3130 case token_type_named_op:
3131 #if DEBUG_PERL
3132 fprintf (stderr, "%s:%d: type named operator (%d): %s\n",
3133 logical_file_name, tp->line_number, nesting_level,
3134 tp->string);
3135 #endif
3136 next_is_argument = false;
3137 next_context_iter = null_context_list_iterator;
3138 state = 0;
3139 break;
3141 case token_type_regex_op:
3142 #if DEBUG_PERL
3143 fprintf (stderr, "%s:%d: type regex operator (%d)\n",
3144 logical_file_name, tp->line_number, nesting_level);
3145 #endif
3146 next_is_argument = false;
3147 next_context_iter = null_context_list_iterator;
3148 break;
3150 case token_type_other:
3151 #if DEBUG_PERL
3152 fprintf (stderr, "%s:%d: type other (%d)\n",
3153 logical_file_name, tp->line_number, nesting_level);
3154 #endif
3155 next_is_argument = false;
3156 next_context_iter = null_context_list_iterator;
3157 state = 0;
3158 break;
3160 default:
3161 fprintf (stderr, "%s:%d: unknown token type %d\n",
3162 real_file_name, tp->line_number, tp->type);
3163 abort ();
3166 free_token (tp);
3170 void
3171 extract_perl (FILE *f, const char *real_filename, const char *logical_filename,
3172 flag_context_list_table_ty *flag_table,
3173 msgdomain_list_ty *mdlp)
3175 message_list_ty *mlp = mdlp->item[0]->messages;
3177 fp = f;
3178 real_file_name = real_filename;
3179 logical_file_name = xstrdup (logical_filename);
3180 line_number = 0;
3182 last_comment_line = -1;
3183 last_non_comment_line = -1;
3185 flag_context_list_table = flag_table;
3187 init_keywords ();
3189 token_stack.items = NULL;
3190 token_stack.nitems = 0;
3191 token_stack.nitems_max = 0;
3192 linesize = 0;
3193 linepos = 0;
3194 here_eaten = 0;
3195 end_of_file = false;
3197 /* Eat tokens until eof is seen. When extract_balanced returns
3198 due to an unbalanced closing brace, just restart it. */
3199 while (!extract_balanced (mlp, 0, token_type_rbrace,
3200 null_context, null_context_list_iterator,
3201 -1, -1))
3204 fp = NULL;
3205 real_file_name = NULL;
3206 free (logical_file_name);
3207 logical_file_name = NULL;
3208 line_number = 0;
3209 last_token = token_type_semicolon;
3210 token_stack_free (&token_stack);
3211 here_eaten = 0;
3212 end_of_file = true;