Sync usage with man page.
[netbsd-mini2440.git] / gnu / dist / gettext / gettext-tools / src / x-awk.c
blobe9e6061132e435340722338db799b8dd631d34f9
/* xgettext awk backend.
   Copyright (C) 2002-2003 Free Software Foundation, Inc.

   This file was written by Bruno Haible <haible@clisp.cons.org>, 2002.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software Foundation,
   Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
20 #ifdef HAVE_CONFIG_H
21 # include "config.h"
22 #endif
24 #include <errno.h>
25 #include <stdbool.h>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
30 #include "message.h"
31 #include "xgettext.h"
32 #include "x-awk.h"
33 #include "error.h"
34 #include "error-progname.h"
35 #include "xalloc.h"
36 #include "exit.h"
37 #include "gettext.h"
39 #define _(s) gettext(s)
/* The awk syntax is defined in the gawk manual page and documentation.
   See also gawk/awkgram.y.  */
46 /* ====================== Keyword set customization. ====================== */
48 /* If true extract all strings. */
49 static bool extract_all = false;
51 static hash_table keywords;
52 static bool default_keywords = true;
55 void
56 x_awk_extract_all ()
58 extract_all = true;
62 void
63 x_awk_keyword (const char *name)
65 if (name == NULL)
66 default_keywords = false;
67 else
69 const char *end;
70 int argnum1;
71 int argnum2;
72 const char *colon;
74 if (keywords.table == NULL)
75 init_hash (&keywords, 100);
77 split_keywordspec (name, &end, &argnum1, &argnum2);
79 /* The characters between name and end should form a valid C identifier.
80 A colon means an invalid parse in split_keywordspec(). */
81 colon = strchr (name, ':');
82 if (colon == NULL || colon >= end)
84 if (argnum1 == 0)
85 argnum1 = 1;
86 insert_entry (&keywords, name, end - name,
87 (void *) (long) (argnum1 + (argnum2 << 10)));
92 /* Finish initializing the keywords hash table.
93 Called after argument processing, before each file is processed. */
94 static void
95 init_keywords ()
97 if (default_keywords)
99 x_awk_keyword ("dcgettext");
100 x_awk_keyword ("dcngettext:1,2");
101 default_keywords = false;
/* Record the format-string flags of the awk functions known to this
   backend.  */
void
init_flag_table_awk (void)
{
  xgettext_record_flag ("dcgettext:1:pass-awk-format");
  xgettext_record_flag ("dcngettext:1:pass-awk-format");
  xgettext_record_flag ("dcngettext:2:pass-awk-format");
  xgettext_record_flag ("printf:1:awk-format");
}
/* ======================== Reading of characters.  ======================== */

/* Real filename, used in error messages about the input file.  */
static const char *real_file_name;

/* Logical filename and line number, used to label the extracted messages.  */
static char *logical_file_name;
static int line_number;

/* The input file stream.  */
static FILE *fp;

/* These are for tracking whether comments count as immediately before
   keyword.  */
static int last_comment_line;
static int last_non_comment_line;
133 /* 1. line_number handling. */
135 static int
136 phase1_getc ()
138 int c = getc (fp);
140 if (c == EOF)
142 if (ferror (fp))
143 error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
144 real_file_name);
145 return EOF;
148 if (c == '\n')
149 line_number++;
151 return c;
154 /* Supports only one pushback character. */
155 static void
156 phase1_ungetc (int c)
158 if (c != EOF)
160 if (c == '\n')
161 --line_number;
163 ungetc (c, fp);
168 /* 2. Replace each comment that is not inside a string literal or regular
169 expression with a newline character. We need to remember the comment
170 for later, because it may be attached to a keyword string. */
172 static int
173 phase2_getc ()
175 static char *buffer;
176 static size_t bufmax;
177 size_t buflen;
178 int lineno;
179 int c;
181 c = phase1_getc ();
182 if (c == '#')
184 buflen = 0;
185 lineno = line_number;
186 for (;;)
188 c = phase1_getc ();
189 if (c == '\n' || c == EOF)
190 break;
191 /* We skip all leading white space, but not EOLs. */
192 if (!(buflen == 0 && (c == ' ' || c == '\t')))
194 if (buflen >= bufmax)
196 bufmax = 2 * bufmax + 10;
197 buffer = xrealloc (buffer, bufmax);
199 buffer[buflen++] = c;
202 if (buflen >= bufmax)
204 bufmax = 2 * bufmax + 10;
205 buffer = xrealloc (buffer, bufmax);
207 buffer[buflen] = '\0';
208 xgettext_comment_add (buffer);
209 last_comment_line = lineno;
211 return c;
/* Push C back through phase 1; EOF is ignored.
   Supports only one pushback character.  */
static void
phase2_ungetc (int c)
{
  if (c != EOF)
    phase1_ungetc (c);
}
/* ========================== Reading of tokens.  ========================== */


/* Kinds of token produced by the lexer.  */
enum token_type_ty
{
  token_type_eof,
  token_type_lparen,            /* ( */
  token_type_rparen,            /* ) */
  token_type_comma,             /* , */
  token_type_string,            /* "abc" */
  token_type_i18nstring,        /* _"abc" */
  token_type_symbol,            /* symbol, number */
  token_type_semicolon,         /* ; */
  token_type_other              /* regexp, misc. operator */
};
typedef enum token_type_ty token_type_ty;
240 typedef struct token_ty token_ty;
241 struct token_ty
243 token_type_ty type;
244 char *string; /* for token_type_{symbol,string,i18nstring} */
245 int line_number;
249 /* 7. Replace escape sequences within character strings with their
250 single character equivalents. */
252 #define P7_QUOTES (1000 + '"')
254 static int
255 phase7_getc ()
257 int c;
259 for (;;)
261 /* Use phase 1, because phase 2 elides comments. */
262 c = phase1_getc ();
264 if (c == EOF || c == '\n')
265 break;
266 if (c == '"')
267 return P7_QUOTES;
268 if (c != '\\')
269 return c;
270 c = phase1_getc ();
271 if (c == EOF)
272 break;
273 if (c != '\n')
274 switch (c)
276 case 'a':
277 return '\a';
278 case 'b':
279 return '\b';
280 case 'f':
281 return '\f';
282 case 'n':
283 return '\n';
284 case 'r':
285 return '\r';
286 case 't':
287 return '\t';
288 case 'v':
289 return '\v';
290 case '0': case '1': case '2': case '3': case '4':
291 case '5': case '6': case '7':
293 int n = c - '0';
295 c = phase1_getc ();
296 if (c != EOF)
298 if (c >= '0' && c <= '7')
300 n = (n << 3) + (c - '0');
301 c = phase1_getc ();
302 if (c != EOF)
304 if (c >= '0' && c <= '7')
305 n = (n << 3) + (c - '0');
306 else
307 phase1_ungetc (c);
310 else
311 phase1_ungetc (c);
313 return (unsigned char) n;
315 case 'x':
317 int n = 0;
319 for (;;)
321 c = phase1_getc ();
322 if (c == EOF)
323 break;
324 else if (c >= '0' && c <= '9')
325 n = (n << 4) + (c - '0');
326 else if (c >= 'A' && c <= 'F')
327 n = (n << 4) + (c - 'A' + 10);
328 else if (c >= 'a' && c <= 'f')
329 n = (n << 4) + (c - 'a' + 10);
330 else
332 phase1_ungetc (c);
333 break;
336 return (unsigned char) n;
338 default:
339 return c;
343 phase1_ungetc (c);
344 error_with_progname = false;
345 error (0, 0, _("%s:%d: warning: unterminated string"), logical_file_name,
346 line_number);
347 error_with_progname = true;
348 return P7_QUOTES;
352 /* Free the memory pointed to by a 'struct token_ty'. */
353 static inline void
354 free_token (token_ty *tp)
356 switch (tp->type)
358 case token_type_string:
359 case token_type_i18nstring:
360 case token_type_symbol:
361 free (tp->string);
362 break;
363 default:
364 break;
/* Combine characters into tokens.  Discard whitespace.  */

/* There is an ambiguity about '/': It can start a division operator ('/' or
   '/=') or it can start a regular expression.  The distinction is important
   because inside regular expressions, '#' and '"' lose their special
   meanings.
   If you look at the awk grammar, you see that the operator is only allowed
   right after a 'variable' or 'simp_exp' nonterminal, and these nonterminals
   can only end in the NAME, LENGTH, YSTRING, YNUMBER, ')', ']' terminals.
   So we prefer the division operator interpretation only right after
   symbol, string, number, ')', ']', with whitespace but no newline allowed
   in between.  */
static bool prefer_division_over_regexp;
382 static void
383 x_awk_lex (token_ty *tp)
385 static char *buffer;
386 static int bufmax;
387 int bufpos;
388 int c;
390 for (;;)
392 tp->line_number = line_number;
393 c = phase2_getc ();
395 switch (c)
397 case EOF:
398 tp->type = token_type_eof;
399 return;
401 case '\n':
402 if (last_non_comment_line > last_comment_line)
403 xgettext_comment_reset ();
404 /* Newline is not allowed inside expressions. It usually
405 introduces a fresh statement.
406 FIXME: Newlines after any of ',' '{' '?' ':' '||' '&&' 'do' 'else'
407 does *not* introduce a fresh statement. */
408 prefer_division_over_regexp = false;
409 /* FALLTHROUGH */
410 case '\t':
411 case ' ':
412 /* Ignore whitespace and comments. */
413 continue;
415 case '\\':
416 /* Backslash ought to be immediately followed by a newline. */
417 continue;
420 last_non_comment_line = tp->line_number;
422 switch (c)
424 case '.':
426 int c2 = phase2_getc ();
427 phase2_ungetc (c2);
428 if (!(c2 >= '0' && c2 <= '9'))
431 tp->type = token_type_other;
432 prefer_division_over_regexp = false;
433 return;
436 /* FALLTHROUGH */
437 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
438 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
439 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
440 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
441 case 'Y': case 'Z':
442 case '_':
443 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
444 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
445 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
446 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
447 case 'y': case 'z':
448 case '0': case '1': case '2': case '3': case '4':
449 case '5': case '6': case '7': case '8': case '9':
450 /* Symbol, or part of a number. */
451 bufpos = 0;
452 for (;;)
454 if (bufpos >= bufmax)
456 bufmax = 2 * bufmax + 10;
457 buffer = xrealloc (buffer, bufmax);
459 buffer[bufpos++] = c;
460 c = phase2_getc ();
461 switch (c)
463 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
464 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
465 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
466 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
467 case 'Y': case 'Z':
468 case '_':
469 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
470 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
471 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
472 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
473 case 'y': case 'z':
474 case '0': case '1': case '2': case '3': case '4':
475 case '5': case '6': case '7': case '8': case '9':
476 continue;
477 default:
478 if (bufpos == 1 && buffer[0] == '_' && c == '"')
480 tp->type = token_type_i18nstring;
481 goto case_string;
483 phase2_ungetc (c);
484 break;
486 break;
488 if (bufpos >= bufmax)
490 bufmax = 2 * bufmax + 10;
491 buffer = xrealloc (buffer, bufmax);
493 buffer[bufpos] = '\0';
494 tp->string = xstrdup (buffer);
495 tp->type = token_type_symbol;
496 /* Most identifiers can be variable names; after them we must
497 interpret '/' as division operator. But for awk's builtin
498 keywords we have three cases:
499 (a) Must interpret '/' as division operator. "length".
500 (b) Must interpret '/' as start of a regular expression.
501 "do", "exit", "print", "printf", "return".
502 (c) '/' after this keyword in invalid anyway. All others.
503 I used the following script for the distinction.
504 for k in $awk_keywords; do
505 echo; echo $k; awk "function foo () { $k / 10 }" < /dev/null
506 done
508 if (strcmp (buffer, "do") == 0
509 || strcmp (buffer, "exit") == 0
510 || strcmp (buffer, "print") == 0
511 || strcmp (buffer, "printf") == 0
512 || strcmp (buffer, "return") == 0)
513 prefer_division_over_regexp = false;
514 else
515 prefer_division_over_regexp = true;
516 return;
518 case '"':
519 tp->type = token_type_string;
520 case_string:
521 bufpos = 0;
522 for (;;)
524 c = phase7_getc ();
525 if (c == EOF || c == P7_QUOTES)
526 break;
527 if (bufpos >= bufmax)
529 bufmax = 2 * bufmax + 10;
530 buffer = xrealloc (buffer, bufmax);
532 buffer[bufpos++] = c;
534 if (bufpos >= bufmax)
536 bufmax = 2 * bufmax + 10;
537 buffer = xrealloc (buffer, bufmax);
539 buffer[bufpos] = '\0';
540 tp->string = xstrdup (buffer);
541 prefer_division_over_regexp = true;
542 return;
544 case '(':
545 tp->type = token_type_lparen;
546 prefer_division_over_regexp = false;
547 return;
549 case ')':
550 tp->type = token_type_rparen;
551 prefer_division_over_regexp = true;
552 return;
554 case ',':
555 tp->type = token_type_comma;
556 prefer_division_over_regexp = false;
557 return;
559 case ';':
560 tp->type = token_type_semicolon;
561 prefer_division_over_regexp = false;
562 return;
564 case ']':
565 tp->type = token_type_other;
566 prefer_division_over_regexp = true;
567 return;
569 case '/':
570 if (!prefer_division_over_regexp)
572 /* Regular expression.
573 Counting brackets is non-trivial. [[] is balanced, and so is
574 [\]]. Also, /[/]/ is balanced and ends at the third slash.
575 Do not count [ or ] if either one is preceded by a \.
576 A '[' should be counted if
577 a) it is the first one so far (brackets == 0), or
578 b) it is the '[' in '[:'.
579 A ']' should be counted if not preceded by a \.
580 According to POSIX, []] is how you put a ] into a set.
581 Try to handle that too.
583 int brackets = 0;
584 bool pos0 = true; /* true at start of regexp */
585 bool pos1_open = false; /* true after [ at start of regexp */
586 bool pos2_open_not = false; /* true after [^ at start of regexp */
588 for (;;)
590 c = phase1_getc ();
592 if (c == EOF || c == '\n')
594 phase1_ungetc (c);
595 error_with_progname = false;
596 error (0, 0, _("%s:%d: warning: unterminated regular expression"),
597 logical_file_name, line_number);
598 error_with_progname = true;
599 break;
601 else if (c == '[')
603 if (brackets == 0)
604 brackets++;
605 else
607 c = phase1_getc ();
608 if (c == ':')
609 brackets++;
610 phase1_ungetc (c);
612 if (pos0)
614 pos0 = false;
615 pos1_open = true;
616 continue;
619 else if (c == ']')
621 if (!(pos1_open || pos2_open_not))
622 brackets--;
624 else if (c == '^')
626 if (pos1_open)
628 pos1_open = false;
629 pos2_open_not = true;
630 continue;
633 else if (c == '\\')
635 c = phase1_getc ();
636 /* Backslash-newline is valid and ignored. */
638 else if (c == '/')
640 if (brackets <= 0)
641 break;
644 pos0 = false;
645 pos1_open = false;
646 pos2_open_not = false;
649 tp->type = token_type_other;
650 prefer_division_over_regexp = false;
651 return;
653 /* FALLTHROUGH */
655 default:
656 /* We could carefully recognize each of the 2 and 3 character
657 operators, but it is not necessary, as we only need to recognize
658 gettext invocations. Don't bother. */
659 tp->type = token_type_other;
660 prefer_division_over_regexp = false;
661 return;
667 /* ========================= Extracting strings. ========================== */
670 /* Context lookup table. */
671 static flag_context_list_table_ty *flag_context_list_table;
/* The file is broken into tokens.  Scan the token stream, looking for
   a keyword, followed by a left paren, followed by a string.  When we
   see this sequence, we have something to remember.  We assume we are
   looking at a valid awk program, and leave the complaints about
   the grammar to the awk interpreter.

     Normal handling: Look for
       keyword ( ... msgid ... )
     Plural handling: Look for
       keyword ( ... msgid ... msgid_plural ... )

   We use recursion because the arguments before msgid or between msgid
   and msgid_plural can contain subexpressions of the same form.  */
689 /* Extract messages until the next balanced closing parenthesis.
690 Extracted messages are added to MLP.
691 When a specific argument shall be extracted, COMMAS_TO_SKIP >= 0 and,
692 if also a plural argument shall be extracted, PLURAL_COMMAS > 0,
693 otherwise PLURAL_COMMAS = 0.
694 When no specific argument shall be extracted, COMMAS_TO_SKIP < 0.
695 Return true upon eof, false upon closing parenthesis. */
696 static bool
697 extract_parenthesized (message_list_ty *mlp,
698 flag_context_ty outer_context,
699 flag_context_list_iterator_ty context_iter,
700 int commas_to_skip, int plural_commas)
702 /* Remember the message containing the msgid, for msgid_plural. */
703 message_ty *plural_mp = NULL;
705 /* 0 when no keyword has been seen. 1 right after a keyword is seen. */
706 int state;
707 /* Parameters of the keyword just seen. Defined only in state 1. */
708 int next_commas_to_skip = -1;
709 int next_plural_commas = 0;
710 /* Whether to implicitly assume the next tokens are arguments even without
711 a '('. */
712 bool next_is_argument = false;
713 /* Context iterator that will be used if the next token is a '('. */
714 flag_context_list_iterator_ty next_context_iter =
715 passthrough_context_list_iterator;
716 /* Current context. */
717 flag_context_ty inner_context =
718 inherited_context (outer_context,
719 flag_context_list_iterator_advance (&context_iter));
721 /* Start state is 0. */
722 state = 0;
724 for (;;)
726 token_ty token;
728 x_awk_lex (&token);
730 if (next_is_argument && token.type != token_type_lparen)
732 /* An argument list starts, even though there is no '('. */
733 context_iter = next_context_iter;
734 outer_context = inner_context;
735 inner_context =
736 inherited_context (outer_context,
737 flag_context_list_iterator_advance (
738 &context_iter));
741 switch (token.type)
743 case token_type_symbol:
745 void *keyword_value;
747 if (find_entry (&keywords, token.string, strlen (token.string),
748 &keyword_value)
749 == 0)
751 int argnum1 = (int) (long) keyword_value & ((1 << 10) - 1);
752 int argnum2 = (int) (long) keyword_value >> 10;
754 next_commas_to_skip = argnum1 - 1;
755 next_plural_commas = (argnum2 > argnum1 ? argnum2 - argnum1 : 0);
756 state = 1;
758 else
759 state = 0;
761 next_is_argument =
762 (strcmp (token.string, "print") == 0
763 || strcmp (token.string, "printf") == 0);
764 next_context_iter =
765 flag_context_list_iterator (
766 flag_context_list_table_lookup (
767 flag_context_list_table,
768 token.string, strlen (token.string)));
769 free (token.string);
770 continue;
772 case token_type_lparen:
773 if (extract_parenthesized (mlp, inner_context, next_context_iter,
774 state ? next_commas_to_skip : -1,
775 state ? next_plural_commas : 0))
776 return true;
777 next_is_argument = false;
778 next_context_iter = null_context_list_iterator;
779 state = 0;
780 continue;
782 case token_type_rparen:
783 return false;
785 case token_type_comma:
786 if (commas_to_skip >= 0)
788 if (commas_to_skip > 0)
789 commas_to_skip--;
790 else
791 if (plural_mp != NULL && plural_commas > 0)
793 commas_to_skip = plural_commas - 1;
794 plural_commas = 0;
796 else
797 commas_to_skip = -1;
799 inner_context =
800 inherited_context (outer_context,
801 flag_context_list_iterator_advance (
802 &context_iter));
803 next_is_argument = false;
804 next_context_iter = passthrough_context_list_iterator;
805 state = 0;
806 continue;
808 case token_type_string:
810 lex_pos_ty pos;
811 pos.file_name = logical_file_name;
812 pos.line_number = token.line_number;
814 if (extract_all)
815 remember_a_message (mlp, token.string, inner_context, &pos);
816 else
818 if (commas_to_skip == 0)
820 if (plural_mp == NULL)
822 /* Seen an msgid. */
823 message_ty *mp =
824 remember_a_message (mlp, token.string,
825 inner_context, &pos);
826 if (plural_commas > 0)
827 plural_mp = mp;
829 else
831 /* Seen an msgid_plural. */
832 remember_a_message_plural (plural_mp, token.string,
833 inner_context, &pos);
834 plural_mp = NULL;
837 else
838 free (token.string);
841 next_is_argument = false;
842 next_context_iter = null_context_list_iterator;
843 state = 0;
844 continue;
846 case token_type_i18nstring:
848 lex_pos_ty pos;
849 pos.file_name = logical_file_name;
850 pos.line_number = token.line_number;
852 remember_a_message (mlp, token.string, inner_context, &pos);
854 next_is_argument = false;
855 next_context_iter = null_context_list_iterator;
856 state = 0;
857 continue;
859 case token_type_semicolon:
860 /* An argument list ends, and a new statement begins. */
861 /* FIXME: Should handle newline that acts as statement separator
862 in the same way. */
863 /* FIXME: Instead of resetting outer_context here, it may be better
864 to recurse in the next_is_argument handling above, waiting for
865 the next semicolon or other statement terminator. */
866 outer_context = null_context;
867 context_iter = null_context_list_iterator;
868 next_is_argument = false;
869 next_context_iter = passthrough_context_list_iterator;
870 inner_context =
871 inherited_context (outer_context,
872 flag_context_list_iterator_advance (
873 &context_iter));
874 state = 0;
875 continue;
877 case token_type_eof:
878 return true;
880 case token_type_other:
881 next_is_argument = false;
882 next_context_iter = null_context_list_iterator;
883 state = 0;
884 continue;
886 default:
887 abort ();
893 void
894 extract_awk (FILE *f,
895 const char *real_filename, const char *logical_filename,
896 flag_context_list_table_ty *flag_table,
897 msgdomain_list_ty *mdlp)
899 message_list_ty *mlp = mdlp->item[0]->messages;
901 fp = f;
902 real_file_name = real_filename;
903 logical_file_name = xstrdup (logical_filename);
904 line_number = 1;
906 last_comment_line = -1;
907 last_non_comment_line = -1;
909 prefer_division_over_regexp = false;
911 flag_context_list_table = flag_table;
913 init_keywords ();
915 /* Eat tokens until eof is seen. When extract_parenthesized returns
916 due to an unbalanced closing parenthesis, just restart it. */
917 while (!extract_parenthesized (mlp, null_context, null_context_list_iterator,
918 -1, 0))
921 fp = NULL;
922 real_file_name = NULL;
923 logical_file_name = NULL;
924 line_number = 0;