Sync usage with man page.
[netbsd-mini2440.git] / gnu / dist / gettext / gettext-tools / src / x-tcl.c
blobb895a63be49d6f687b524222fab750d842bb5aae
1 /* xgettext Tcl backend.
2 Copyright (C) 2002-2003, 2005 Free Software Foundation, Inc.
4 This file was written by Bruno Haible <haible@clisp.cons.org>, 2002.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software Foundation,
18 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
20 #ifdef HAVE_CONFIG_H
21 # include "config.h"
22 #endif
24 #include <assert.h>
25 #include <errno.h>
26 #include <limits.h>
27 #include <stdbool.h>
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
32 #include "message.h"
33 #include "xgettext.h"
34 #include "x-tcl.h"
35 #include "error.h"
36 #include "xalloc.h"
37 #include "exit.h"
38 #include "hash.h"
39 #include "c-ctype.h"
40 #include "po-charset.h"
41 #include "ucs4-utf8.h"
42 #include "gettext.h"
44 #define _(s) gettext(s)
46 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
49 /* The Tcl syntax is defined in the Tcl.n manual page.
50 Summary of Tcl syntax:
51 Like sh syntax, except that `...` is replaced with [...]. In detail:
52 - In a preprocessing pass, backslash-newline-anywhitespace is replaced
53 with single space.
54 - Input is broken into words, which are then subject to command
55 substitution [...] , variable substitution $var, backslash substitution
56 \escape.
57 - Strings are enclosed in "..."; command substitution, variable
58 substitution and backslash substitutions are performed here as well.
59 - {...} is a string without substitutions.
60 - The list of resulting words is split into commands by semicolon and
61 newline.
62 - '#' at the beginning of a command introduces a comment until end of line.
63 The parser is implemented in tcl8.3.3/generic/tclParse.c. */
66 /* ====================== Keyword set customization. ====================== */
68 /* If true extract all strings. */
69 static bool extract_all = false;
71 static hash_table keywords;
72 static bool default_keywords = true;
75 void
76 x_tcl_extract_all ()
78 extract_all = true;
82 void
83 x_tcl_keyword (const char *name)
85 if (name == NULL)
86 default_keywords = false;
87 else
89 const char *end;
90 int argnum1;
91 int argnum2;
93 if (keywords.table == NULL)
94 init_hash (&keywords, 100);
96 split_keywordspec (name, &end, &argnum1, &argnum2);
98 /* The characters between name and end should form a valid Tcl
99 function name. A leading "::" is redundant. */
100 if (end - name >= 2 && name[0] == ':' && name[1] == ':')
101 name += 2;
103 if (argnum1 == 0)
104 argnum1 = 1;
105 insert_entry (&keywords, name, end - name,
106 (void *) (long) (argnum1 + (argnum2 << 10)));
110 /* Finish initializing the keywords hash table.
111 Called after argument processing, before each file is processed. */
112 static void
113 init_keywords ()
115 if (default_keywords)
117 x_tcl_keyword ("::msgcat::mc");
118 default_keywords = false;
122 void
123 init_flag_table_tcl ()
125 xgettext_record_flag ("::msgcat::mc:1:pass-tcl-format");
126 xgettext_record_flag ("format:1:tcl-format");
130 /* ======================== Reading of characters. ======================== */
132 /* Real filename, used in error messages about the input file. */
133 static const char *real_file_name;
135 /* Logical filename and line number, used to label the extracted messages. */
136 static char *logical_file_name;
137 static int line_number;
139 /* The input file stream. */
140 static FILE *fp;
143 /* Fetch the next character from the input file. */
144 static int
145 do_getc ()
147 int c = getc (fp);
149 if (c == EOF)
151 if (ferror (fp))
152 error (EXIT_FAILURE, errno, _("\
153 error while reading \"%s\""), real_file_name);
155 else if (c == '\n')
156 line_number++;
158 return c;
161 /* Put back the last fetched character, not EOF. */
162 static void
163 do_ungetc (int c)
165 if (c == '\n')
166 line_number--;
167 ungetc (c, fp);
171 /* Combine backslash followed by newline and additional whitespace to
172 a single space. */
174 /* An int that becomes a space when casted to 'unsigned char'. */
175 #define BS_NL (UCHAR_MAX + 1 + ' ')
177 static int phase1_pushback[1];
178 static int phase1_pushback_length;
180 static int
181 phase1_getc ()
183 int c;
185 if (phase1_pushback_length)
187 c = phase1_pushback[--phase1_pushback_length];
188 if (c == '\n' || c == BS_NL)
189 ++line_number;
190 return c;
192 c = do_getc ();
193 if (c != '\\')
194 return c;
195 c = do_getc ();
196 if (c != '\n')
198 if (c != EOF)
199 do_ungetc (c);
200 return '\\';
202 for (;;)
204 c = do_getc ();
205 if (!(c == ' ' || c == '\t'))
206 break;
208 if (c != EOF)
209 do_ungetc (c);
210 return BS_NL;
213 /* Supports only one pushback character. */
214 static void
215 phase1_ungetc (int c)
217 switch (c)
219 case EOF:
220 break;
222 case '\n':
223 case BS_NL:
224 --line_number;
225 /* FALLTHROUGH */
227 default:
228 if (phase1_pushback_length == SIZEOF (phase1_pushback))
229 abort ();
230 phase1_pushback[phase1_pushback_length++] = c;
231 break;
236 /* Keep track of brace nesting depth.
237 When a word starts with an opening brace, a character group begins that
238 ends with the corresponding closing brace. In theory these character
239 groups are string literals, but they are used by so many Tcl primitives
240 (proc, if, ...) as representing command lists, that we treat them as
241 command lists. */
243 /* An int that becomes a closing brace when casted to 'unsigned char'. */
244 #define CL_BRACE (UCHAR_MAX + 1 + '}')
246 static int phase2_pushback[2];
247 static int phase2_pushback_length;
249 /* Brace nesting depth inside the current character group. */
250 static int brace_depth;
252 static int
253 phase2_push ()
255 int previous_depth = brace_depth;
256 brace_depth = 1;
257 return previous_depth;
260 static void
261 phase2_pop (int previous_depth)
263 brace_depth = previous_depth;
266 static int
267 phase2_getc ()
269 int c;
271 if (phase2_pushback_length)
273 c = phase2_pushback[--phase2_pushback_length];
274 if (c == '\n' || c == BS_NL)
275 ++line_number;
276 else if (c == '{')
277 ++brace_depth;
278 else if (c == '}')
279 --brace_depth;
280 return c;
282 c = phase1_getc ();
283 if (c == '{')
284 ++brace_depth;
285 else if (c == '}')
287 if (--brace_depth == 0)
288 c = CL_BRACE;
290 return c;
293 /* Supports 2 characters of pushback. */
294 static void
295 phase2_ungetc (int c)
297 if (c != EOF)
299 switch (c)
301 case '\n':
302 case BS_NL:
303 --line_number;
304 break;
306 case '{':
307 --brace_depth;
308 break;
310 case '}':
311 ++brace_depth;
312 break;
314 if (phase2_pushback_length == SIZEOF (phase2_pushback))
315 abort ();
316 phase2_pushback[phase2_pushback_length++] = c;
321 /* ========================== Reading of tokens. ========================== */
324 /* A token consists of a sequence of characters. */
325 struct token
327 int allocated; /* number of allocated 'token_char's */
328 int charcount; /* number of used 'token_char's */
329 char *chars; /* the token's constituents */
332 /* Initialize a 'struct token'. */
333 static inline void
334 init_token (struct token *tp)
336 tp->allocated = 10;
337 tp->chars = (char *) xmalloc (tp->allocated * sizeof (char));
338 tp->charcount = 0;
341 /* Free the memory pointed to by a 'struct token'. */
342 static inline void
343 free_token (struct token *tp)
345 free (tp->chars);
348 /* Ensure there is enough room in the token for one more character. */
349 static inline void
350 grow_token (struct token *tp)
352 if (tp->charcount == tp->allocated)
354 tp->allocated *= 2;
355 tp->chars = (char *) xrealloc (tp->chars, tp->allocated * sizeof (char));
360 /* ========================= Accumulating comments ========================= */
363 static char *buffer;
364 static size_t bufmax;
365 static size_t buflen;
367 static inline void
368 comment_start ()
370 buflen = 0;
373 static inline void
374 comment_add (int c)
376 if (buflen >= bufmax)
378 bufmax = 2 * bufmax + 10;
379 buffer = xrealloc (buffer, bufmax);
381 buffer[buflen++] = c;
384 static inline void
385 comment_line_end ()
387 while (buflen >= 1
388 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
389 --buflen;
390 if (buflen >= bufmax)
392 bufmax = 2 * bufmax + 10;
393 buffer = xrealloc (buffer, bufmax);
395 buffer[buflen] = '\0';
396 xgettext_comment_add (buffer);
400 /* These are for tracking whether comments count as immediately before
401 keyword. */
402 static int last_comment_line;
403 static int last_non_comment_line;
406 /* ========================= Accumulating messages ========================= */
409 static message_list_ty *mlp;
412 /* ========================== Reading of commands ========================== */
415 /* We are only interested in constant strings (e.g. "msgcat::mc" or other
416 string literals). Other words need not to be represented precisely. */
417 enum word_type
419 t_string, /* constant string */
420 t_other, /* other string */
421 t_separator, /* command separator: semicolon or newline */
422 t_bracket, /* ']' pseudo word */
423 t_brace, /* '}' pseudo word */
424 t_eof /* EOF marker */
427 struct word
429 enum word_type type;
430 struct token *token; /* for t_string */
431 int line_number_at_start; /* for t_string */
434 /* Free the memory pointed to by a 'struct word'. */
435 static inline void
436 free_word (struct word *wp)
438 if (wp->type == t_string)
440 free_token (wp->token);
441 free (wp->token);
445 /* Convert a t_string token to a char*. */
446 static char *
447 string_of_word (const struct word *wp)
449 char *str;
450 int n;
452 if (!(wp->type == t_string))
453 abort ();
454 n = wp->token->charcount;
455 str = (char *) xmalloc (n + 1);
456 memcpy (str, wp->token->chars, n);
457 str[n] = '\0';
458 return str;
462 /* Context lookup table. */
463 static flag_context_list_table_ty *flag_context_list_table;
466 /* Read an escape sequence. The value is an ISO-8859-1 character (in the
467 range 0x00..0xff) or a Unicode character (in the range 0x0000..0xffff). */
468 static int
469 do_getc_escaped ()
471 int c;
473 c = phase1_getc ();
474 switch (c)
476 case EOF:
477 return '\\';
478 case 'a':
479 return '\a';
480 case 'b':
481 return '\b';
482 case 'f':
483 return '\f';
484 case 'n':
485 return '\n';
486 case 'r':
487 return '\r';
488 case 't':
489 return '\t';
490 case 'v':
491 return '\v';
492 case 'x':
494 int n = 0;
495 unsigned int i;
497 for (i = 0;; i++)
499 c = phase1_getc ();
500 if (c == EOF || !c_isxdigit ((unsigned char) c))
501 break;
503 if (c >= '0' && c <= '9')
504 n = (n << 4) + (c - '0');
505 else if (c >= 'A' && c <= 'F')
506 n = (n << 4) + (c - 'A' + 10);
507 else if (c >= 'a' && c <= 'f')
508 n = (n << 4) + (c - 'a' + 10);
510 phase1_ungetc (c);
511 return (i > 0 ? (unsigned char) n : 'x');
513 case 'u':
515 int n = 0;
516 unsigned int i;
518 for (i = 0; i < 4; i++)
520 c = phase1_getc ();
521 if (c == EOF || !c_isxdigit ((unsigned char) c))
522 break;
524 if (c >= '0' && c <= '9')
525 n = (n << 4) + (c - '0');
526 else if (c >= 'A' && c <= 'F')
527 n = (n << 4) + (c - 'A' + 10);
528 else if (c >= 'a' && c <= 'f')
529 n = (n << 4) + (c - 'a' + 10);
531 phase1_ungetc (c);
532 return (i > 0 ? n : 'u');
534 case '0': case '1': case '2': case '3': case '4':
535 case '5': case '6': case '7':
537 int n = c - '0';
539 c = phase1_getc ();
540 if (c != EOF)
542 if (c >= '0' && c <= '7')
544 n = (n << 3) + (c - '0');
545 c = phase1_getc ();
546 if (c != EOF)
548 if (c >= '0' && c <= '7')
549 n = (n << 3) + (c - '0');
550 else
551 phase1_ungetc (c);
554 else
555 phase1_ungetc (c);
557 return (unsigned char) n;
559 default:
560 /* Note: If c is non-ASCII, Tcl's behaviour is undefined here. */
561 return (unsigned char) c;
566 enum terminator
568 te_space_separator, /* looking for space semicolon newline */
569 te_space_separator_bracket, /* looking for space semicolon newline ']' */
570 te_paren, /* looking for ')' */
571 te_quote /* looking for '"' */
574 /* Forward declaration of local functions. */
575 static enum word_type read_command_list (int looking_for,
576 flag_context_ty outer_context);
578 /* Accumulate tokens into the given word.
579 'looking_for' denotes a parse terminator combination.
580 Return the first character past the token. */
581 static int
582 accumulate_word (struct word *wp, enum terminator looking_for,
583 flag_context_ty context)
585 int c;
587 for (;;)
589 c = phase2_getc ();
591 if (c == EOF || c == CL_BRACE)
592 return c;
593 if ((looking_for == te_space_separator
594 || looking_for == te_space_separator_bracket)
595 && (c == ' ' || c == BS_NL
596 || c == '\t' || c == '\v' || c == '\f' || c == '\r'
597 || c == ';' || c == '\n'))
598 return c;
599 if (looking_for == te_space_separator_bracket && c == ']')
600 return c;
601 if (looking_for == te_paren && c == ')')
602 return c;
603 if (looking_for == te_quote && c == '"')
604 return c;
606 if (c == '$')
608 /* Distinguish $varname, ${varname} and lone $. */
609 c = phase2_getc ();
610 if (c == '{')
612 /* ${varname} */
614 c = phase2_getc ();
615 while (c != EOF && c != '}');
616 wp->type = t_other;
618 else
620 bool nonempty = false;
622 for (; c != EOF && c != CL_BRACE; c = phase2_getc ())
624 if (c_isalnum ((unsigned char) c) || (c == '_'))
626 nonempty = true;
627 continue;
629 if (c == ':')
631 c = phase2_getc ();
632 if (c == ':')
635 c = phase2_getc ();
636 while (c == ':');
638 phase2_ungetc (c);
639 nonempty = true;
640 continue;
642 phase2_ungetc (c);
643 c = ':';
645 break;
647 if (c == '(')
649 /* $varname(index) */
650 struct word index_word;
652 index_word.type = t_other;
653 c = accumulate_word (&index_word, te_paren, null_context);
654 if (c != EOF && c != ')')
655 phase2_ungetc (c);
656 wp->type = t_other;
658 else
660 phase2_ungetc (c);
661 if (nonempty)
663 /* $varname */
664 wp->type = t_other;
666 else
668 /* lone $ */
669 if (wp->type == t_string)
671 grow_token (wp->token);
672 wp->token->chars[wp->token->charcount++] = '$';
678 else if (c == '[')
680 read_command_list (']', context);
681 wp->type = t_other;
683 else if (c == '\\')
685 unsigned int uc;
686 unsigned char utf8buf[6];
687 int count;
688 int i;
690 uc = do_getc_escaped ();
691 assert (uc < 0x10000);
692 count = u8_uctomb (utf8buf, uc, 6);
693 assert (count > 0);
694 if (wp->type == t_string)
695 for (i = 0; i < count; i++)
697 grow_token (wp->token);
698 wp->token->chars[wp->token->charcount++] = utf8buf[i];
701 else
703 if (wp->type == t_string)
705 grow_token (wp->token);
706 wp->token->chars[wp->token->charcount++] = (unsigned char) c;
713 /* Read the next word.
714 'looking_for' denotes a parse terminator, either ']' or '\0'. */
715 static void
716 read_word (struct word *wp, int looking_for, flag_context_ty context)
718 int c;
721 c = phase2_getc ();
722 while (c == ' ' || c == BS_NL
723 || c == '\t' || c == '\v' || c == '\f' || c == '\r');
725 if (c == EOF)
727 wp->type = t_eof;
728 return;
731 if (c == CL_BRACE)
733 wp->type = t_brace;
734 last_non_comment_line = line_number;
735 return;
738 if (c == '\n')
740 /* Comments assumed to be grouped with a message must immediately
741 precede it, with no non-whitespace token on a line between both. */
742 if (last_non_comment_line > last_comment_line)
743 xgettext_comment_reset ();
744 wp->type = t_separator;
745 return;
748 if (c == ';')
750 wp->type = t_separator;
751 last_non_comment_line = line_number;
752 return;
755 if (looking_for == ']' && c == ']')
757 wp->type = t_bracket;
758 last_non_comment_line = line_number;
759 return;
762 if (c == '{')
764 int previous_depth;
765 enum word_type terminator;
767 /* Start a new nested character group, which lasts until the next
768 balanced '}' (ignoring \} things). */
769 previous_depth = phase2_push () - 1;
771 /* Interpret it as a command list. */
772 terminator = read_command_list ('\0', null_context);
774 if (terminator == t_brace)
775 phase2_pop (previous_depth);
777 wp->type = t_other;
778 last_non_comment_line = line_number;
779 return;
782 wp->type = t_string;
783 wp->token = (struct token *) xmalloc (sizeof (struct token));
784 init_token (wp->token);
785 wp->line_number_at_start = line_number;
787 if (c == '"')
789 c = accumulate_word (wp, te_quote, context);
790 if (c != EOF && c != '"')
791 phase2_ungetc (c);
793 else
795 phase2_ungetc (c);
796 c = accumulate_word (wp,
797 looking_for == ']'
798 ? te_space_separator_bracket
799 : te_space_separator,
800 context);
801 if (c != EOF)
802 phase2_ungetc (c);
805 if (wp->type != t_string)
807 free_token (wp->token);
808 free (wp->token);
810 last_non_comment_line = line_number;
814 /* Read the next command.
815 'looking_for' denotes a parse terminator, either ']' or '\0'.
816 Returns the type of the word that terminated the command: t_separator or
817 t_bracket (only if looking_for is ']') or t_brace or t_eof. */
818 static enum word_type
819 read_command (int looking_for, flag_context_ty outer_context)
821 int c;
823 /* Skip whitespace and comments. */
824 for (;;)
826 c = phase2_getc ();
828 if (c == ' ' || c == BS_NL
829 || c == '\t' || c == '\v' || c == '\f' || c == '\r')
830 continue;
831 if (c == '#')
833 /* Skip a comment up to end of line. */
834 last_comment_line = line_number;
835 comment_start ();
836 for (;;)
838 c = phase2_getc ();
839 if (c == EOF || c == CL_BRACE || c == '\n')
840 break;
841 /* We skip all leading white space, but not EOLs. */
842 if (!(buflen == 0 && (c == ' ' || c == '\t')))
843 comment_add (c);
845 comment_line_end ();
846 continue;
848 break;
850 phase2_ungetc (c);
852 /* Read the words that make up the command. */
854 int arg = 0; /* Current argument number. */
855 flag_context_list_iterator_ty context_iter;
856 int argnum1 = 0; /* First string position. */
857 int argnum2 = 0; /* Plural string position. */
858 message_ty *plural_mp = NULL; /* Remember the msgid. */
860 for (;; arg++)
862 struct word inner;
863 flag_context_ty inner_context;
865 if (arg == 0)
866 inner_context = null_context;
867 else
868 inner_context =
869 inherited_context (outer_context,
870 flag_context_list_iterator_advance (
871 &context_iter));
873 read_word (&inner, looking_for, inner_context);
875 /* Recognize end of command. */
876 if (inner.type == t_separator || inner.type == t_bracket
877 || inner.type == t_brace || inner.type == t_eof)
878 return inner.type;
880 if (extract_all)
882 if (inner.type == t_string)
884 lex_pos_ty pos;
886 pos.file_name = logical_file_name;
887 pos.line_number = inner.line_number_at_start;
888 remember_a_message (mlp, string_of_word (&inner),
889 inner_context, &pos);
893 if (arg == 0)
895 /* This is the function position. */
896 if (inner.type == t_string)
898 char *function_name = string_of_word (&inner);
899 char *stripped_name;
900 void *keyword_value;
902 /* A leading "::" is redundant. */
903 stripped_name = function_name;
904 if (function_name[0] == ':' && function_name[1] == ':')
905 stripped_name += 2;
907 if (find_entry (&keywords,
908 stripped_name, strlen (stripped_name),
909 &keyword_value)
910 == 0)
912 argnum1 = (int) (long) keyword_value & ((1 << 10) - 1);
913 argnum2 = (int) (long) keyword_value >> 10;
916 context_iter =
917 flag_context_list_iterator (
918 flag_context_list_table_lookup (
919 flag_context_list_table,
920 stripped_name, strlen (stripped_name)));
922 free (function_name);
924 else
925 context_iter = null_context_list_iterator;
927 else
929 /* These are the argument positions.
930 Extract a string if we have reached the right
931 argument position. */
932 if (arg == argnum1)
934 if (inner.type == t_string)
936 lex_pos_ty pos;
937 message_ty *mp;
939 pos.file_name = logical_file_name;
940 pos.line_number = inner.line_number_at_start;
941 mp = remember_a_message (mlp, string_of_word (&inner),
942 inner_context, &pos);
943 if (argnum2 > 0)
944 plural_mp = mp;
947 else if (arg == argnum2)
949 if (inner.type == t_string && plural_mp != NULL)
951 lex_pos_ty pos;
953 pos.file_name = logical_file_name;
954 pos.line_number = inner.line_number_at_start;
955 remember_a_message_plural (plural_mp, string_of_word (&inner),
956 inner_context, &pos);
961 free_word (&inner);
967 /* Read a list of commands.
968 'looking_for' denotes a parse terminator, either ']' or '\0'.
969 Returns the type of the word that terminated the command list:
970 t_bracket (only if looking_for is ']') or t_brace or t_eof. */
971 static enum word_type
972 read_command_list (int looking_for, flag_context_ty outer_context)
974 for (;;)
976 enum word_type terminator;
978 terminator = read_command (looking_for, outer_context);
979 if (terminator != t_separator)
980 return terminator;
985 void
986 extract_tcl (FILE *f,
987 const char *real_filename, const char *logical_filename,
988 flag_context_list_table_ty *flag_table,
989 msgdomain_list_ty *mdlp)
991 mlp = mdlp->item[0]->messages;
993 /* We convert our strings to UTF-8 encoding. */
994 xgettext_current_source_encoding = po_charset_utf8;
996 fp = f;
997 real_file_name = real_filename;
998 logical_file_name = xstrdup (logical_filename);
999 line_number = 1;
1001 /* Initially, no brace is open. */
1002 brace_depth = 1000000;
1004 last_comment_line = -1;
1005 last_non_comment_line = -1;
1007 flag_context_list_table = flag_table;
1009 init_keywords ();
1011 /* Eat tokens until eof is seen. */
1012 read_command_list ('\0', null_context);
1014 fp = NULL;
1015 real_file_name = NULL;
1016 logical_file_name = NULL;
1017 line_number = 0;