1 /* xgettext awk backend.
2 Copyright (C) 2002-2003 Free Software Foundation, Inc.
4 This file was written by Bruno Haible <haible@clisp.cons.org>, 2002.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software Foundation,
18 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
34 #include "error-progname.h"
39 #define _(s) gettext(s)
42 /* The awk syntax is defined in the gawk manual page and documentation.
43 See also gawk/awkgram.y. */
46 /* ====================== Keyword set customization. ====================== */
48 /* If true extract all strings. */
49 static bool extract_all
= false;
51 static hash_table keywords
;
52 static bool default_keywords
= true;
63 x_awk_keyword (const char *name
)
66 default_keywords
= false;
74 if (keywords
.table
== NULL
)
75 init_hash (&keywords
, 100);
77 split_keywordspec (name
, &end
, &argnum1
, &argnum2
);
79 /* The characters between name and end should form a valid C identifier.
80 A colon means an invalid parse in split_keywordspec(). */
81 colon
= strchr (name
, ':');
82 if (colon
== NULL
|| colon
>= end
)
86 insert_entry (&keywords
, name
, end
- name
,
87 (void *) (long) (argnum1
+ (argnum2
<< 10)));
92 /* Finish initializing the keywords hash table.
93 Called after argument processing, before each file is processed. */
99 x_awk_keyword ("dcgettext");
100 x_awk_keyword ("dcngettext:1,2");
101 default_keywords
= false;
106 init_flag_table_awk ()
108 xgettext_record_flag ("dcgettext:1:pass-awk-format");
109 xgettext_record_flag ("dcngettext:1:pass-awk-format");
110 xgettext_record_flag ("dcngettext:2:pass-awk-format");
111 xgettext_record_flag ("printf:1:awk-format");
115 /* ======================== Reading of characters. ======================== */
117 /* Real filename, used in error messages about the input file. */
118 static const char *real_file_name
;
120 /* Logical filename and line number, used to label the extracted messages. */
121 static char *logical_file_name
;
122 static int line_number
;
124 /* The input file stream. */
127 /* These are for tracking whether comments count as immediately before
129 static int last_comment_line
;
130 static int last_non_comment_line
;
133 /* 1. line_number handling. */
143 error (EXIT_FAILURE
, errno
, _("error while reading \"%s\""),
154 /* Supports only one pushback character. */
156 phase1_ungetc (int c
)
168 /* 2. Replace each comment that is not inside a string literal or regular
169 expression with a newline character. We need to remember the comment
170 for later, because it may be attached to a keyword string. */
176 static size_t bufmax
;
185 lineno
= line_number
;
189 if (c
== '\n' || c
== EOF
)
191 /* We skip all leading white space, but not EOLs. */
192 if (!(buflen
== 0 && (c
== ' ' || c
== '\t')))
194 if (buflen
>= bufmax
)
196 bufmax
= 2 * bufmax
+ 10;
197 buffer
= xrealloc (buffer
, bufmax
);
199 buffer
[buflen
++] = c
;
202 if (buflen
>= bufmax
)
204 bufmax
= 2 * bufmax
+ 10;
205 buffer
= xrealloc (buffer
, bufmax
);
207 buffer
[buflen
] = '\0';
208 xgettext_comment_add (buffer
);
209 last_comment_line
= lineno
;
214 /* Supports only one pushback character. */
216 phase2_ungetc (int c
)
223 /* ========================== Reading of tokens. ========================== */
229 token_type_lparen
, /* ( */
230 token_type_rparen
, /* ) */
231 token_type_comma
, /* , */
232 token_type_string
, /* "abc" */
233 token_type_i18nstring
, /* _"abc" */
234 token_type_symbol
, /* symbol, number */
235 token_type_semicolon
, /* ; */
236 token_type_other
/* regexp, misc. operator */
238 typedef enum token_type_ty token_type_ty
;
240 typedef struct token_ty token_ty
;
244 char *string
; /* for token_type_{symbol,string,i18nstring} */
249 /* 7. Replace escape sequences within character strings with their
250 single character equivalents. */
252 #define P7_QUOTES (1000 + '"')
261 /* Use phase 1, because phase 2 elides comments. */
264 if (c
== EOF
|| c
== '\n')
290 case '0': case '1': case '2': case '3': case '4':
291 case '5': case '6': case '7':
298 if (c
>= '0' && c
<= '7')
300 n
= (n
<< 3) + (c
- '0');
304 if (c
>= '0' && c
<= '7')
305 n
= (n
<< 3) + (c
- '0');
313 return (unsigned char) n
;
324 else if (c
>= '0' && c
<= '9')
325 n
= (n
<< 4) + (c
- '0');
326 else if (c
>= 'A' && c
<= 'F')
327 n
= (n
<< 4) + (c
- 'A' + 10);
328 else if (c
>= 'a' && c
<= 'f')
329 n
= (n
<< 4) + (c
- 'a' + 10);
336 return (unsigned char) n
;
344 error_with_progname
= false;
345 error (0, 0, _("%s:%d: warning: unterminated string"), logical_file_name
,
347 error_with_progname
= true;
352 /* Free the memory pointed to by a 'struct token_ty'. */
354 free_token (token_ty
*tp
)
358 case token_type_string
:
359 case token_type_i18nstring
:
360 case token_type_symbol
:
369 /* Combine characters into tokens. Discard whitespace. */
371 /* There is an ambiguity about '/': It can start a division operator ('/' or
372 '/=') or it can start a regular expression. The distinction is important
373 because inside regular expressions, '#' and '"' lose their special meanings.
374 If you look at the awk grammar, you see that the operator is only allowed
375 right after a 'variable' or 'simp_exp' nonterminal, and these nonterminals
376 can only end in the NAME, LENGTH, YSTRING, YNUMBER, ')', ']' terminals.
377 So we prefer the division operator interpretation only right after
378 symbol, string, number, ')', ']', with whitespace but no newline allowed
380 static bool prefer_division_over_regexp
;
383 x_awk_lex (token_ty
*tp
)
392 tp
->line_number
= line_number
;
398 tp
->type
= token_type_eof
;
402 if (last_non_comment_line
> last_comment_line
)
403 xgettext_comment_reset ();
404 /* Newline is not allowed inside expressions. It usually
405 introduces a fresh statement.
406 FIXME: Newlines after any of ',' '{' '?' ':' '||' '&&' 'do' 'else'
407 do *not* introduce a fresh statement. */
408 prefer_division_over_regexp
= false;
412 /* Ignore whitespace and comments. */
416 /* Backslash ought to be immediately followed by a newline. */
420 last_non_comment_line
= tp
->line_number
;
426 int c2
= phase2_getc ();
428 if (!(c2
>= '0' && c2
<= '9'))
431 tp
->type
= token_type_other
;
432 prefer_division_over_regexp
= false;
437 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
438 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
439 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
440 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
443 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
444 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
445 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
446 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
448 case '0': case '1': case '2': case '3': case '4':
449 case '5': case '6': case '7': case '8': case '9':
450 /* Symbol, or part of a number. */
454 if (bufpos
>= bufmax
)
456 bufmax
= 2 * bufmax
+ 10;
457 buffer
= xrealloc (buffer
, bufmax
);
459 buffer
[bufpos
++] = c
;
463 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
464 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
465 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
466 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
469 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
470 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
471 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
472 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
474 case '0': case '1': case '2': case '3': case '4':
475 case '5': case '6': case '7': case '8': case '9':
478 if (bufpos
== 1 && buffer
[0] == '_' && c
== '"')
480 tp
->type
= token_type_i18nstring
;
488 if (bufpos
>= bufmax
)
490 bufmax
= 2 * bufmax
+ 10;
491 buffer
= xrealloc (buffer
, bufmax
);
493 buffer
[bufpos
] = '\0';
494 tp
->string
= xstrdup (buffer
);
495 tp
->type
= token_type_symbol
;
496 /* Most identifiers can be variable names; after them we must
497 interpret '/' as division operator. But for awk's builtin
498 keywords we have three cases:
499 (a) Must interpret '/' as division operator. "length".
500 (b) Must interpret '/' as start of a regular expression.
501 "do", "exit", "print", "printf", "return".
502 (c) '/' after this keyword is invalid anyway. All others.
503 I used the following script for the distinction.
504 for k in $awk_keywords; do
505 echo; echo $k; awk "function foo () { $k / 10 }" < /dev/null
508 if (strcmp (buffer
, "do") == 0
509 || strcmp (buffer
, "exit") == 0
510 || strcmp (buffer
, "print") == 0
511 || strcmp (buffer
, "printf") == 0
512 || strcmp (buffer
, "return") == 0)
513 prefer_division_over_regexp
= false;
515 prefer_division_over_regexp
= true;
519 tp
->type
= token_type_string
;
525 if (c
== EOF
|| c
== P7_QUOTES
)
527 if (bufpos
>= bufmax
)
529 bufmax
= 2 * bufmax
+ 10;
530 buffer
= xrealloc (buffer
, bufmax
);
532 buffer
[bufpos
++] = c
;
534 if (bufpos
>= bufmax
)
536 bufmax
= 2 * bufmax
+ 10;
537 buffer
= xrealloc (buffer
, bufmax
);
539 buffer
[bufpos
] = '\0';
540 tp
->string
= xstrdup (buffer
);
541 prefer_division_over_regexp
= true;
545 tp
->type
= token_type_lparen
;
546 prefer_division_over_regexp
= false;
550 tp
->type
= token_type_rparen
;
551 prefer_division_over_regexp
= true;
555 tp
->type
= token_type_comma
;
556 prefer_division_over_regexp
= false;
560 tp
->type
= token_type_semicolon
;
561 prefer_division_over_regexp
= false;
565 tp
->type
= token_type_other
;
566 prefer_division_over_regexp
= true;
570 if (!prefer_division_over_regexp
)
572 /* Regular expression.
573 Counting brackets is non-trivial. [[] is balanced, and so is
574 [\]]. Also, /[/]/ is balanced and ends at the third slash.
575 Do not count [ or ] if either one is preceded by a \.
576 A '[' should be counted if
577 a) it is the first one so far (brackets == 0), or
578 b) it is the '[' in '[:'.
579 A ']' should be counted if not preceded by a \.
580 According to POSIX, []] is how you put a ] into a set.
581 Try to handle that too.
584 bool pos0
= true; /* true at start of regexp */
585 bool pos1_open
= false; /* true after [ at start of regexp */
586 bool pos2_open_not
= false; /* true after [^ at start of regexp */
592 if (c
== EOF
|| c
== '\n')
595 error_with_progname
= false;
596 error (0, 0, _("%s:%d: warning: unterminated regular expression"),
597 logical_file_name
, line_number
);
598 error_with_progname
= true;
621 if (!(pos1_open
|| pos2_open_not
))
629 pos2_open_not
= true;
636 /* Backslash-newline is valid and ignored. */
646 pos2_open_not
= false;
649 tp
->type
= token_type_other
;
650 prefer_division_over_regexp
= false;
656 /* We could carefully recognize each of the 2 and 3 character
657 operators, but it is not necessary, as we only need to recognize
658 gettext invocations. Don't bother. */
659 tp
->type
= token_type_other
;
660 prefer_division_over_regexp
= false;
667 /* ========================= Extracting strings. ========================== */
670 /* Context lookup table. */
671 static flag_context_list_table_ty
*flag_context_list_table
;
674 /* The file is broken into tokens. Scan the token stream, looking for
675 a keyword, followed by a left paren, followed by a string. When we
676 see this sequence, we have something to remember. We assume we are
677 looking at a valid awk program, and leave the complaints about
678 the grammar to the awk interpreter.
680 Normal handling: Look for
681 keyword ( ... msgid ... )
682 Plural handling: Look for
683 keyword ( ... msgid ... msgid_plural ... )
685 We use recursion because the arguments before msgid or between msgid
686 and msgid_plural can contain subexpressions of the same form. */
689 /* Extract messages until the next balanced closing parenthesis.
690 Extracted messages are added to MLP.
691 When a specific argument shall be extracted, COMMAS_TO_SKIP >= 0 and,
692 if also a plural argument shall be extracted, PLURAL_COMMAS > 0,
693 otherwise PLURAL_COMMAS = 0.
694 When no specific argument shall be extracted, COMMAS_TO_SKIP < 0.
695 Return true upon eof, false upon closing parenthesis. */
697 extract_parenthesized (message_list_ty
*mlp
,
698 flag_context_ty outer_context
,
699 flag_context_list_iterator_ty context_iter
,
700 int commas_to_skip
, int plural_commas
)
702 /* Remember the message containing the msgid, for msgid_plural. */
703 message_ty
*plural_mp
= NULL
;
705 /* 0 when no keyword has been seen. 1 right after a keyword is seen. */
707 /* Parameters of the keyword just seen. Defined only in state 1. */
708 int next_commas_to_skip
= -1;
709 int next_plural_commas
= 0;
710 /* Whether to implicitly assume the next tokens are arguments even without
712 bool next_is_argument
= false;
713 /* Context iterator that will be used if the next token is a '('. */
714 flag_context_list_iterator_ty next_context_iter
=
715 passthrough_context_list_iterator
;
716 /* Current context. */
717 flag_context_ty inner_context
=
718 inherited_context (outer_context
,
719 flag_context_list_iterator_advance (&context_iter
));
721 /* Start state is 0. */
730 if (next_is_argument
&& token
.type
!= token_type_lparen
)
732 /* An argument list starts, even though there is no '('. */
733 context_iter
= next_context_iter
;
734 outer_context
= inner_context
;
736 inherited_context (outer_context
,
737 flag_context_list_iterator_advance (
743 case token_type_symbol
:
747 if (find_entry (&keywords
, token
.string
, strlen (token
.string
),
751 int argnum1
= (int) (long) keyword_value
& ((1 << 10) - 1);
752 int argnum2
= (int) (long) keyword_value
>> 10;
754 next_commas_to_skip
= argnum1
- 1;
755 next_plural_commas
= (argnum2
> argnum1
? argnum2
- argnum1
: 0);
762 (strcmp (token
.string
, "print") == 0
763 || strcmp (token
.string
, "printf") == 0);
765 flag_context_list_iterator (
766 flag_context_list_table_lookup (
767 flag_context_list_table
,
768 token
.string
, strlen (token
.string
)));
772 case token_type_lparen
:
773 if (extract_parenthesized (mlp
, inner_context
, next_context_iter
,
774 state
? next_commas_to_skip
: -1,
775 state
? next_plural_commas
: 0))
777 next_is_argument
= false;
778 next_context_iter
= null_context_list_iterator
;
782 case token_type_rparen
:
785 case token_type_comma
:
786 if (commas_to_skip
>= 0)
788 if (commas_to_skip
> 0)
791 if (plural_mp
!= NULL
&& plural_commas
> 0)
793 commas_to_skip
= plural_commas
- 1;
800 inherited_context (outer_context
,
801 flag_context_list_iterator_advance (
803 next_is_argument
= false;
804 next_context_iter
= passthrough_context_list_iterator
;
808 case token_type_string
:
811 pos
.file_name
= logical_file_name
;
812 pos
.line_number
= token
.line_number
;
815 remember_a_message (mlp
, token
.string
, inner_context
, &pos
);
818 if (commas_to_skip
== 0)
820 if (plural_mp
== NULL
)
824 remember_a_message (mlp
, token
.string
,
825 inner_context
, &pos
);
826 if (plural_commas
> 0)
831 /* Seen an msgid_plural. */
832 remember_a_message_plural (plural_mp
, token
.string
,
833 inner_context
, &pos
);
841 next_is_argument
= false;
842 next_context_iter
= null_context_list_iterator
;
846 case token_type_i18nstring
:
849 pos
.file_name
= logical_file_name
;
850 pos
.line_number
= token
.line_number
;
852 remember_a_message (mlp
, token
.string
, inner_context
, &pos
);
854 next_is_argument
= false;
855 next_context_iter
= null_context_list_iterator
;
859 case token_type_semicolon
:
860 /* An argument list ends, and a new statement begins. */
861 /* FIXME: Should handle newline that acts as statement separator
863 /* FIXME: Instead of resetting outer_context here, it may be better
864 to recurse in the next_is_argument handling above, waiting for
865 the next semicolon or other statement terminator. */
866 outer_context
= null_context
;
867 context_iter
= null_context_list_iterator
;
868 next_is_argument
= false;
869 next_context_iter
= passthrough_context_list_iterator
;
871 inherited_context (outer_context
,
872 flag_context_list_iterator_advance (
880 case token_type_other
:
881 next_is_argument
= false;
882 next_context_iter
= null_context_list_iterator
;
894 extract_awk (FILE *f
,
895 const char *real_filename
, const char *logical_filename
,
896 flag_context_list_table_ty
*flag_table
,
897 msgdomain_list_ty
*mdlp
)
899 message_list_ty
*mlp
= mdlp
->item
[0]->messages
;
902 real_file_name
= real_filename
;
903 logical_file_name
= xstrdup (logical_filename
);
906 last_comment_line
= -1;
907 last_non_comment_line
= -1;
909 prefer_division_over_regexp
= false;
911 flag_context_list_table
= flag_table
;
915 /* Eat tokens until eof is seen. When extract_parenthesized returns
916 due to an unbalanced closing parenthesis, just restart it. */
917 while (!extract_parenthesized (mlp
, null_context
, null_context_list_iterator
,
922 real_file_name
= NULL
;
923 logical_file_name
= NULL
;