1 /* xgettext Tcl backend.
2 Copyright (C) 2002-2003, 2005 Free Software Foundation, Inc.
4 This file was written by Bruno Haible <haible@clisp.cons.org>, 2002.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software Foundation,
18 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
40 #include "po-charset.h"
41 #include "ucs4-utf8.h"
44 #define _(s) gettext(s)
46 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
49 /* The Tcl syntax is defined in the Tcl.n manual page.
50 Summary of Tcl syntax:
51 Like sh syntax, except that `...` is replaced with [...]. In detail:
52 - In a preprocessing pass, backslash-newline-anywhitespace is replaced
54 - Input is broken into words, which are then subject to command
55 substitution [...] , variable substitution $var, backslash substitution
57 - Strings are enclosed in "..."; command substitution, variable
58 substitution and backslash substitutions are performed here as well.
59 - {...} is a string without substitutions.
60 - The list of resulting words is split into commands by semicolon and
62 - '#' at the beginning of a command introduces a comment until end of line.
63 The parser is implemented in tcl8.3.3/generic/tclParse.c. */
66 /* ====================== Keyword set customization. ====================== */
68 /* If true extract all strings. */
69 static bool extract_all
= false;
71 static hash_table keywords
;
72 static bool default_keywords
= true;
83 x_tcl_keyword (const char *name
)
86 default_keywords
= false;
93 if (keywords
.table
== NULL
)
94 init_hash (&keywords
, 100);
96 split_keywordspec (name
, &end
, &argnum1
, &argnum2
);
98 /* The characters between name and end should form a valid Tcl
99 function name. A leading "::" is redundant. */
100 if (end
- name
>= 2 && name
[0] == ':' && name
[1] == ':')
105 insert_entry (&keywords
, name
, end
- name
,
106 (void *) (long) (argnum1
+ (argnum2
<< 10)));
110 /* Finish initializing the keywords hash table.
111 Called after argument processing, before each file is processed. */
115 if (default_keywords
)
117 x_tcl_keyword ("::msgcat::mc");
118 default_keywords
= false;
123 init_flag_table_tcl ()
125 xgettext_record_flag ("::msgcat::mc:1:pass-tcl-format");
126 xgettext_record_flag ("format:1:tcl-format");
130 /* ======================== Reading of characters. ======================== */
132 /* Real filename, used in error messages about the input file. */
133 static const char *real_file_name
;
135 /* Logical filename and line number, used to label the extracted messages. */
136 static char *logical_file_name
;
137 static int line_number
;
139 /* The input file stream. */
143 /* Fetch the next character from the input file. */
152 error (EXIT_FAILURE
, errno
, _("\
153 error while reading \"%s\""), real_file_name
);
161 /* Put back the last fetched character, not EOF. */
171 /* Combine backslash followed by newline and additional whitespace to
174 /* An int that becomes a space when casted to 'unsigned char'. */
175 #define BS_NL (UCHAR_MAX + 1 + ' ')
177 static int phase1_pushback
[1];
178 static int phase1_pushback_length
;
185 if (phase1_pushback_length
)
187 c
= phase1_pushback
[--phase1_pushback_length
];
188 if (c
== '\n' || c
== BS_NL
)
205 if (!(c
== ' ' || c
== '\t'))
213 /* Supports only one pushback character. */
215 phase1_ungetc (int c
)
228 if (phase1_pushback_length
== SIZEOF (phase1_pushback
))
230 phase1_pushback
[phase1_pushback_length
++] = c
;
236 /* Keep track of brace nesting depth.
237 When a word starts with an opening brace, a character group begins that
238 ends with the corresponding closing brace. In theory these character
239 groups are string literals, but they are used by so many Tcl primitives
240 (proc, if, ...) as representing command lists, that we treat them as
243 /* An int that becomes a closing brace when casted to 'unsigned char'. */
244 #define CL_BRACE (UCHAR_MAX + 1 + '}')
246 static int phase2_pushback
[2];
247 static int phase2_pushback_length
;
249 /* Brace nesting depth inside the current character group. */
250 static int brace_depth
;
255 int previous_depth
= brace_depth
;
257 return previous_depth
;
261 phase2_pop (int previous_depth
)
263 brace_depth
= previous_depth
;
271 if (phase2_pushback_length
)
273 c
= phase2_pushback
[--phase2_pushback_length
];
274 if (c
== '\n' || c
== BS_NL
)
287 if (--brace_depth
== 0)
293 /* Supports 2 characters of pushback. */
295 phase2_ungetc (int c
)
314 if (phase2_pushback_length
== SIZEOF (phase2_pushback
))
316 phase2_pushback
[phase2_pushback_length
++] = c
;
321 /* ========================== Reading of tokens. ========================== */
324 /* A token consists of a sequence of characters. */
327 int allocated
; /* number of allocated 'token_char's */
328 int charcount
; /* number of used 'token_char's */
329 char *chars
; /* the token's constituents */
332 /* Initialize a 'struct token'. */
334 init_token (struct token
*tp
)
337 tp
->chars
= (char *) xmalloc (tp
->allocated
* sizeof (char));
341 /* Free the memory pointed to by a 'struct token'. */
343 free_token (struct token
*tp
)
348 /* Ensure there is enough room in the token for one more character. */
350 grow_token (struct token
*tp
)
352 if (tp
->charcount
== tp
->allocated
)
355 tp
->chars
= (char *) xrealloc (tp
->chars
, tp
->allocated
* sizeof (char));
360 /* ========================= Accumulating comments ========================= */
364 static size_t bufmax
;
365 static size_t buflen
;
376 if (buflen
>= bufmax
)
378 bufmax
= 2 * bufmax
+ 10;
379 buffer
= xrealloc (buffer
, bufmax
);
381 buffer
[buflen
++] = c
;
388 && (buffer
[buflen
- 1] == ' ' || buffer
[buflen
- 1] == '\t'))
390 if (buflen
>= bufmax
)
392 bufmax
= 2 * bufmax
+ 10;
393 buffer
= xrealloc (buffer
, bufmax
);
395 buffer
[buflen
] = '\0';
396 xgettext_comment_add (buffer
);
400 /* These are for tracking whether comments count as immediately before
402 static int last_comment_line
;
403 static int last_non_comment_line
;
406 /* ========================= Accumulating messages ========================= */
409 static message_list_ty
*mlp
;
412 /* ========================== Reading of commands ========================== */
415 /* We are only interested in constant strings (e.g. "msgcat::mc" or other
416 string literals). Other words need not to be represented precisely. */
419 t_string
, /* constant string */
420 t_other
, /* other string */
421 t_separator
, /* command separator: semicolon or newline */
422 t_bracket
, /* ']' pseudo word */
423 t_brace
, /* '}' pseudo word */
424 t_eof
/* EOF marker */
430 struct token
*token
; /* for t_string */
431 int line_number_at_start
; /* for t_string */
434 /* Free the memory pointed to by a 'struct word'. */
436 free_word (struct word
*wp
)
438 if (wp
->type
== t_string
)
440 free_token (wp
->token
);
445 /* Convert a t_string token to a char*. */
447 string_of_word (const struct word
*wp
)
452 if (!(wp
->type
== t_string
))
454 n
= wp
->token
->charcount
;
455 str
= (char *) xmalloc (n
+ 1);
456 memcpy (str
, wp
->token
->chars
, n
);
462 /* Context lookup table. */
463 static flag_context_list_table_ty
*flag_context_list_table
;
466 /* Read an escape sequence. The value is an ISO-8859-1 character (in the
467 range 0x00..0xff) or a Unicode character (in the range 0x0000..0xffff). */
500 if (c
== EOF
|| !c_isxdigit ((unsigned char) c
))
503 if (c
>= '0' && c
<= '9')
504 n
= (n
<< 4) + (c
- '0');
505 else if (c
>= 'A' && c
<= 'F')
506 n
= (n
<< 4) + (c
- 'A' + 10);
507 else if (c
>= 'a' && c
<= 'f')
508 n
= (n
<< 4) + (c
- 'a' + 10);
511 return (i
> 0 ? (unsigned char) n
: 'x');
518 for (i
= 0; i
< 4; i
++)
521 if (c
== EOF
|| !c_isxdigit ((unsigned char) c
))
524 if (c
>= '0' && c
<= '9')
525 n
= (n
<< 4) + (c
- '0');
526 else if (c
>= 'A' && c
<= 'F')
527 n
= (n
<< 4) + (c
- 'A' + 10);
528 else if (c
>= 'a' && c
<= 'f')
529 n
= (n
<< 4) + (c
- 'a' + 10);
532 return (i
> 0 ? n
: 'u');
534 case '0': case '1': case '2': case '3': case '4':
535 case '5': case '6': case '7':
542 if (c
>= '0' && c
<= '7')
544 n
= (n
<< 3) + (c
- '0');
548 if (c
>= '0' && c
<= '7')
549 n
= (n
<< 3) + (c
- '0');
557 return (unsigned char) n
;
560 /* Note: If c is non-ASCII, Tcl's behaviour is undefined here. */
561 return (unsigned char) c
;
568 te_space_separator
, /* looking for space semicolon newline */
569 te_space_separator_bracket
, /* looking for space semicolon newline ']' */
570 te_paren
, /* looking for ')' */
571 te_quote
/* looking for '"' */
574 /* Forward declaration of local functions. */
575 static enum word_type
read_command_list (int looking_for
,
576 flag_context_ty outer_context
);
578 /* Accumulate tokens into the given word.
579 'looking_for' denotes a parse terminator combination.
580 Return the first character past the token. */
582 accumulate_word (struct word
*wp
, enum terminator looking_for
,
583 flag_context_ty context
)
591 if (c
== EOF
|| c
== CL_BRACE
)
593 if ((looking_for
== te_space_separator
594 || looking_for
== te_space_separator_bracket
)
595 && (c
== ' ' || c
== BS_NL
596 || c
== '\t' || c
== '\v' || c
== '\f' || c
== '\r'
597 || c
== ';' || c
== '\n'))
599 if (looking_for
== te_space_separator_bracket
&& c
== ']')
601 if (looking_for
== te_paren
&& c
== ')')
603 if (looking_for
== te_quote
&& c
== '"')
608 /* Distinguish $varname, ${varname} and lone $. */
615 while (c
!= EOF
&& c
!= '}');
620 bool nonempty
= false;
622 for (; c
!= EOF
&& c
!= CL_BRACE
; c
= phase2_getc ())
624 if (c_isalnum ((unsigned char) c
) || (c
== '_'))
649 /* $varname(index) */
650 struct word index_word
;
652 index_word
.type
= t_other
;
653 c
= accumulate_word (&index_word
, te_paren
, null_context
);
654 if (c
!= EOF
&& c
!= ')')
669 if (wp
->type
== t_string
)
671 grow_token (wp
->token
);
672 wp
->token
->chars
[wp
->token
->charcount
++] = '$';
680 read_command_list (']', context
);
686 unsigned char utf8buf
[6];
690 uc
= do_getc_escaped ();
691 assert (uc
< 0x10000);
692 count
= u8_uctomb (utf8buf
, uc
, 6);
694 if (wp
->type
== t_string
)
695 for (i
= 0; i
< count
; i
++)
697 grow_token (wp
->token
);
698 wp
->token
->chars
[wp
->token
->charcount
++] = utf8buf
[i
];
703 if (wp
->type
== t_string
)
705 grow_token (wp
->token
);
706 wp
->token
->chars
[wp
->token
->charcount
++] = (unsigned char) c
;
713 /* Read the next word.
714 'looking_for' denotes a parse terminator, either ']' or '\0'. */
716 read_word (struct word
*wp
, int looking_for
, flag_context_ty context
)
722 while (c
== ' ' || c
== BS_NL
723 || c
== '\t' || c
== '\v' || c
== '\f' || c
== '\r');
734 last_non_comment_line
= line_number
;
740 /* Comments assumed to be grouped with a message must immediately
741 precede it, with no non-whitespace token on a line between both. */
742 if (last_non_comment_line
> last_comment_line
)
743 xgettext_comment_reset ();
744 wp
->type
= t_separator
;
750 wp
->type
= t_separator
;
751 last_non_comment_line
= line_number
;
755 if (looking_for
== ']' && c
== ']')
757 wp
->type
= t_bracket
;
758 last_non_comment_line
= line_number
;
765 enum word_type terminator
;
767 /* Start a new nested character group, which lasts until the next
768 balanced '}' (ignoring \} things). */
769 previous_depth
= phase2_push () - 1;
771 /* Interpret it as a command list. */
772 terminator
= read_command_list ('\0', null_context
);
774 if (terminator
== t_brace
)
775 phase2_pop (previous_depth
);
778 last_non_comment_line
= line_number
;
783 wp
->token
= (struct token
*) xmalloc (sizeof (struct token
));
784 init_token (wp
->token
);
785 wp
->line_number_at_start
= line_number
;
789 c
= accumulate_word (wp
, te_quote
, context
);
790 if (c
!= EOF
&& c
!= '"')
796 c
= accumulate_word (wp
,
798 ? te_space_separator_bracket
799 : te_space_separator
,
805 if (wp
->type
!= t_string
)
807 free_token (wp
->token
);
810 last_non_comment_line
= line_number
;
814 /* Read the next command.
815 'looking_for' denotes a parse terminator, either ']' or '\0'.
816 Returns the type of the word that terminated the command: t_separator or
817 t_bracket (only if looking_for is ']') or t_brace or t_eof. */
818 static enum word_type
819 read_command (int looking_for
, flag_context_ty outer_context
)
823 /* Skip whitespace and comments. */
828 if (c
== ' ' || c
== BS_NL
829 || c
== '\t' || c
== '\v' || c
== '\f' || c
== '\r')
833 /* Skip a comment up to end of line. */
834 last_comment_line
= line_number
;
839 if (c
== EOF
|| c
== CL_BRACE
|| c
== '\n')
841 /* We skip all leading white space, but not EOLs. */
842 if (!(buflen
== 0 && (c
== ' ' || c
== '\t')))
852 /* Read the words that make up the command. */
854 int arg
= 0; /* Current argument number. */
855 flag_context_list_iterator_ty context_iter
;
856 int argnum1
= 0; /* First string position. */
857 int argnum2
= 0; /* Plural string position. */
858 message_ty
*plural_mp
= NULL
; /* Remember the msgid. */
863 flag_context_ty inner_context
;
866 inner_context
= null_context
;
869 inherited_context (outer_context
,
870 flag_context_list_iterator_advance (
873 read_word (&inner
, looking_for
, inner_context
);
875 /* Recognize end of command. */
876 if (inner
.type
== t_separator
|| inner
.type
== t_bracket
877 || inner
.type
== t_brace
|| inner
.type
== t_eof
)
882 if (inner
.type
== t_string
)
886 pos
.file_name
= logical_file_name
;
887 pos
.line_number
= inner
.line_number_at_start
;
888 remember_a_message (mlp
, string_of_word (&inner
),
889 inner_context
, &pos
);
895 /* This is the function position. */
896 if (inner
.type
== t_string
)
898 char *function_name
= string_of_word (&inner
);
902 /* A leading "::" is redundant. */
903 stripped_name
= function_name
;
904 if (function_name
[0] == ':' && function_name
[1] == ':')
907 if (find_entry (&keywords
,
908 stripped_name
, strlen (stripped_name
),
912 argnum1
= (int) (long) keyword_value
& ((1 << 10) - 1);
913 argnum2
= (int) (long) keyword_value
>> 10;
917 flag_context_list_iterator (
918 flag_context_list_table_lookup (
919 flag_context_list_table
,
920 stripped_name
, strlen (stripped_name
)));
922 free (function_name
);
925 context_iter
= null_context_list_iterator
;
929 /* These are the argument positions.
930 Extract a string if we have reached the right
931 argument position. */
934 if (inner
.type
== t_string
)
939 pos
.file_name
= logical_file_name
;
940 pos
.line_number
= inner
.line_number_at_start
;
941 mp
= remember_a_message (mlp
, string_of_word (&inner
),
942 inner_context
, &pos
);
947 else if (arg
== argnum2
)
949 if (inner
.type
== t_string
&& plural_mp
!= NULL
)
953 pos
.file_name
= logical_file_name
;
954 pos
.line_number
= inner
.line_number_at_start
;
955 remember_a_message_plural (plural_mp
, string_of_word (&inner
),
956 inner_context
, &pos
);
967 /* Read a list of commands.
968 'looking_for' denotes a parse terminator, either ']' or '\0'.
969 Returns the type of the word that terminated the command list:
970 t_bracket (only if looking_for is ']') or t_brace or t_eof. */
971 static enum word_type
972 read_command_list (int looking_for
, flag_context_ty outer_context
)
976 enum word_type terminator
;
978 terminator
= read_command (looking_for
, outer_context
);
979 if (terminator
!= t_separator
)
986 extract_tcl (FILE *f
,
987 const char *real_filename
, const char *logical_filename
,
988 flag_context_list_table_ty
*flag_table
,
989 msgdomain_list_ty
*mdlp
)
991 mlp
= mdlp
->item
[0]->messages
;
993 /* We convert our strings to UTF-8 encoding. */
994 xgettext_current_source_encoding
= po_charset_utf8
;
997 real_file_name
= real_filename
;
998 logical_file_name
= xstrdup (logical_filename
);
1001 /* Initially, no brace is open. */
1002 brace_depth
= 1000000;
1004 last_comment_line
= -1;
1005 last_non_comment_line
= -1;
1007 flag_context_list_table
= flag_table
;
1011 /* Eat tokens until eof is seen. */
1012 read_command_list ('\0', null_context
);
1015 real_file_name
= NULL
;
1016 logical_file_name
= NULL
;