1 /* xgettext Scheme backend.
2 Copyright (C) 2004-2005 Free Software Foundation, Inc.
4 This file was written by Bruno Haible <bruno@clisp.org>, 2004-2005.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software Foundation,
18 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
39 #define _(s) gettext(s)
42 /* The Scheme syntax is described in R5RS. It is implemented in
43 guile-1.6.4/libguile/read.c.
44 Since we are interested only in strings and in forms similar to
46 or (ngettext msgid msgid_plural ...)
47 we make the following simplifications:
49 - Assume the keywords and strings are in an ASCII compatible encoding.
50 This means we can read the input file one byte at a time, instead of
51 one character at a time. No need to worry about multibyte characters:
52 If they occur as part of identifiers, they most probably act as
53 constituent characters, and the byte based approach will do the same.
55 - Assume the read-hash-procedures is in the default state.
56 Non-standard reader extensions are mostly used to read data, not programs.
58 The remaining syntax rules are:
60 - The syntax code assigned to each character, and how tokens are built
61 up from characters (single escape, multiple escape etc.).
63 - Comment syntax: ';' and '#! ... \n!#\n'.
65 - String syntax: "..." with single escapes.
67 - Read macros and dispatch macro character '#'. Needed to be able to
68 tell which is the n-th argument of a function call.
73 /* ====================== Keyword set customization. ====================== */
75 /* If true extract all strings. */
76 static bool extract_all
= false;
78 static hash_table keywords
;
79 static bool default_keywords
= true;
83 x_scheme_extract_all ()
90 x_scheme_keyword (const char *name
)
93 default_keywords
= false;
101 if (keywords
.table
== NULL
)
102 init_hash (&keywords
, 100);
104 split_keywordspec (name
, &end
, &argnum1
, &argnum2
);
106 /* The characters between name and end should form a valid Lisp symbol.
107 Extract the symbol name part. */
108 colon
= strchr (name
, ':');
109 if (colon
!= NULL
&& colon
< end
)
112 if (name
< end
&& *name
== ':')
114 colon
= strchr (name
, ':');
115 if (colon
!= NULL
&& colon
< end
)
121 insert_entry (&keywords
, name
, end
- name
,
122 (void *) (long) (argnum1
+ (argnum2
<< 10)));
126 /* Finish initializing the keywords hash table.
127 Called after argument processing, before each file is processed. */
131 if (default_keywords
)
133 x_scheme_keyword ("gettext"); /* libguile/i18n.c */
134 x_scheme_keyword ("ngettext:1,2"); /* libguile/i18n.c */
135 x_scheme_keyword ("gettext-noop");
136 default_keywords
= false;
141 init_flag_table_scheme ()
143 xgettext_record_flag ("gettext:1:pass-scheme-format");
144 xgettext_record_flag ("ngettext:1:pass-scheme-format");
145 xgettext_record_flag ("ngettext:2:pass-scheme-format");
146 xgettext_record_flag ("gettext-noop:1:pass-scheme-format");
147 xgettext_record_flag ("format:2:scheme-format");
151 /* ======================== Reading of characters. ======================== */
153 /* Real filename, used in error messages about the input file. */
154 static const char *real_file_name
;
156 /* Logical filename and line number, used to label the extracted messages. */
157 static char *logical_file_name
;
158 static int line_number
;
160 /* The input file stream. */
164 /* Fetch the next character from the input file. */
173 error (EXIT_FAILURE
, errno
, _("\
174 error while reading \"%s\""), real_file_name
);
182 /* Put back the last fetched character, not EOF. */
192 /* ========================== Reading of tokens. ========================== */
195 /* A token consists of a sequence of characters. */
198 int allocated
; /* number of allocated 'token_char's */
199 int charcount
; /* number of used 'token_char's */
200 char *chars
; /* the token's constituents */
203 /* Initialize a 'struct token'. */
205 init_token (struct token
*tp
)
208 tp
->chars
= (char *) xmalloc (tp
->allocated
* sizeof (char));
212 /* Free the memory pointed to by a 'struct token'. */
214 free_token (struct token
*tp
)
219 /* Ensure there is enough room in the token for one more character. */
221 grow_token (struct token
*tp
)
223 if (tp
->charcount
== tp
->allocated
)
226 tp
->chars
= (char *) xrealloc (tp
->chars
, tp
->allocated
* sizeof (char));
230 /* Read the next token. 'first' is the first character, which has already
233 read_token (struct token
*tp
, int first
)
238 tp
->chars
[tp
->charcount
++] = first
;
246 if (c
== ' ' || c
== '\r' || c
== '\f' || c
== '\t' || c
== '\n'
247 || c
== '"' || c
== '(' || c
== ')' || c
== ';')
253 tp
->chars
[tp
->charcount
++] = c
;
257 /* Tests if a token represents an integer.
258 Taken from guile-1.6.4/libguile/numbers.c:scm_istr2int(). */
260 is_integer_syntax (const char *str
, int len
, int radix
)
263 const char *p_end
= str
+ len
;
265 /* The accepted syntax is
267 where DIGIT is a hexadecimal digit whose value is below radix. */
271 if (*p
== '+' || *p
== '-')
281 if (c
>= '0' && c
<= '9')
283 else if (c
>= 'A' && c
<= 'F')
285 else if (c
>= 'a' && c
<= 'f')
296 /* Tests if a token represents a rational, floating-point or complex number.
297 If unconstrained is false, only real numbers are accepted; otherwise,
298 complex numbers are accepted as well.
299 Taken from guile-1.6.4/libguile/numbers.c:scm_istr2flo(). */
301 is_other_number_syntax (const char *str
, int len
, int radix
, bool unconstrained
)
304 const char *p_end
= str
+ len
;
308 /* The accepted syntaxes are:
309 for a floating-point number:
310 ['+'|'-'] DIGIT+ [EXPONENT]
311 ['+'|'-'] DIGIT* '.' DIGIT+ [EXPONENT]
312 where EXPONENT ::= ['d'|'e'|'f'|'l'|'s'] DIGIT+
313 (Dot and exponent are allowed only if radix is 10.)
314 for a rational number:
315 ['+'|'-'] DIGIT+ '/' DIGIT+
316 for a complex number:
317 REAL-NUMBER {'+'|'-'} REAL-NUMBER-WITHOUT-SIGN 'i'
318 REAL-NUMBER {'+'|'-'} 'i'
319 {'+'|'-'} REAL-NUMBER-WITHOUT-SIGN 'i'
321 REAL-NUMBER '@' REAL-NUMBER
325 /* Parse leading sign. */
327 if (*p
== '+' || *p
== '-')
333 /* Recognize complex number syntax: {'+'|'-'} 'i' */
334 if (unconstrained
&& (*p
== 'I' || *p
== 'i') && p
+ 1 == p_end
)
337 /* Parse digits before dot or exponent or slash. */
343 if (c
>= '0' && c
<= '9')
345 else if (c
>= 'A' && c
<= 'F')
347 if (c
>= 'D' && radix
== 10) /* exponent? */
351 else if (c
>= 'a' && c
<= 'f')
353 if (c
>= 'd' && radix
== 10) /* exponent? */
365 /* If p == p_end, we know that seen_digits = true, and the number is an
366 integer without exponent. */
369 /* If we have no digits so far, we need a decimal point later. */
370 if (!seen_digits
&& !(*p
== '.' && radix
== 10))
372 /* Trailing '#' signs are equivalent to zeroes. */
373 while (p
< p_end
&& *p
== '#')
379 /* Parse digits after the slash. */
380 bool all_zeroes
= true;
382 for (; p
< p_end
; p
++)
386 if (c
>= '0' && c
<= '9')
388 else if (c
>= 'A' && c
<= 'F')
390 else if (c
>= 'a' && c
<= 'f')
399 /* A zero denominator is not allowed. */
402 /* Trailing '#' signs are equivalent to zeroes. */
403 while (p
< p_end
&& *p
== '#')
410 /* Decimal point notation. */
413 /* Parse digits after the decimal point. */
415 for (; p
< p_end
; p
++)
419 if (c
>= '0' && c
<= '9')
424 /* Digits are required before or after the decimal point. */
427 /* Trailing '#' signs are equivalent to zeroes. */
428 while (p
< p_end
&& *p
== '#')
433 /* Parse exponent. */
446 if (*p
== '+' || *p
== '-')
452 if (!(*p
>= '0' && *p
<= '9'))
459 if (!(*p
>= '0' && *p
<= '9'))
472 /* Recognize complex number syntax. */
475 /* Recognize the syntax {'+'|'-'} REAL-NUMBER-WITHOUT-SIGN 'i' */
476 if (seen_sign
&& (*p
== 'I' || *p
== 'i') && p
+ 1 == p_end
)
478 /* Recognize the syntaxes
479 REAL-NUMBER {'+'|'-'} REAL-NUMBER-WITHOUT-SIGN 'i'
480 REAL-NUMBER {'+'|'-'} 'i'
482 if (*p
== '+' || *p
== '-')
483 return (p_end
[-1] == 'I' || p_end
[-1] == 'i')
484 && (p
+ 1 == p_end
- 1
485 || is_other_number_syntax (p
, p_end
- 1 - p
, radix
, false));
486 /* Recognize the syntax REAL-NUMBER '@' REAL-NUMBER */
490 return is_other_number_syntax (p
, p_end
- p
, radix
, false);
496 /* Tests if a token represents a number.
497 Taken from guile-1.6.4/libguile/numbers.c:scm_istring2number(). */
499 is_number (const struct token
*tp
)
501 const char *str
= tp
->chars
;
502 int len
= tp
->charcount
;
504 enum { unknown
, exact
, inexact
} exactness
= unknown
;
505 bool seen_radix_prefix
= false;
506 bool seen_exactness_prefix
= false;
509 if (*str
== '+' || *str
== '-')
511 while (len
>= 2 && *str
== '#')
516 if (seen_radix_prefix
)
519 seen_radix_prefix
= true;
522 if (seen_radix_prefix
)
525 seen_radix_prefix
= true;
528 if (seen_radix_prefix
)
531 seen_radix_prefix
= true;
534 if (seen_radix_prefix
)
537 seen_radix_prefix
= true;
540 if (seen_exactness_prefix
)
543 seen_exactness_prefix
= true;
546 if (seen_exactness_prefix
)
549 seen_exactness_prefix
= true;
557 if (exactness
!= inexact
)
559 /* Try to parse an integer. */
560 if (is_integer_syntax (str
, len
, 10))
562 /* FIXME: Other Scheme implementations support exact rational numbers
563 or exact complex numbers. */
565 if (exactness
!= exact
)
567 /* Try to parse a rational, floating-point or complex number. */
568 if (is_other_number_syntax (str
, len
, 10, true))
575 /* ========================= Accumulating comments ========================= */
579 static size_t bufmax
;
580 static size_t buflen
;
591 if (buflen
>= bufmax
)
593 bufmax
= 2 * bufmax
+ 10;
594 buffer
= xrealloc (buffer
, bufmax
);
596 buffer
[buflen
++] = c
;
600 comment_line_end (size_t chars_to_remove
)
602 buflen
-= chars_to_remove
;
604 && (buffer
[buflen
- 1] == ' ' || buffer
[buflen
- 1] == '\t'))
606 if (chars_to_remove
== 0 && buflen
>= bufmax
)
608 bufmax
= 2 * bufmax
+ 10;
609 buffer
= xrealloc (buffer
, bufmax
);
611 buffer
[buflen
] = '\0';
612 xgettext_comment_add (buffer
);
616 /* These are for tracking whether comments count as immediately before
618 static int last_comment_line
;
619 static int last_non_comment_line
;
622 /* ========================= Accumulating messages ========================= */
625 static message_list_ty
*mlp
;
628 /* ========================== Reading of objects. ========================= */
631 /* We are only interested in symbols (e.g. gettext or ngettext) and strings.
632 Other objects need not to be represented precisely. */
635 t_symbol
, /* symbol */
636 t_string
, /* string */
637 t_other
, /* other kind of real object */
638 t_dot
, /* '.' pseudo object */
639 t_close
, /* ')' pseudo object */
640 t_eof
/* EOF marker */
645 enum object_type type
;
646 struct token
*token
; /* for t_symbol and t_string */
647 int line_number_at_start
; /* for t_string */
650 /* Free the memory pointed to by a 'struct object'. */
652 free_object (struct object
*op
)
654 if (op
->type
== t_symbol
|| op
->type
== t_string
)
656 free_token (op
->token
);
661 /* Convert a t_symbol/t_string token to a char*. */
663 string_of_object (const struct object
*op
)
668 if (!(op
->type
== t_symbol
|| op
->type
== t_string
))
670 n
= op
->token
->charcount
;
671 str
= (char *) xmalloc (n
+ 1);
672 memcpy (str
, op
->token
->chars
, n
);
677 /* Context lookup table. */
678 static flag_context_list_table_ty
*flag_context_list_table
;
680 /* Read the next object. */
682 read_object (struct object
*op
, flag_context_ty outer_context
)
694 case ' ': case '\r': case '\f': case '\t':
698 /* Comments assumed to be grouped with a message must immediately
699 precede it, with no non-whitespace token on a line between
701 if (last_non_comment_line
> last_comment_line
)
702 xgettext_comment_reset ();
707 bool all_semicolons
= true;
709 last_comment_line
= line_number
;
714 if (c
== EOF
|| c
== '\n')
717 all_semicolons
= false;
720 /* We skip all leading white space, but not EOLs. */
721 if (!(buflen
== 0 && (c
== ' ' || c
== '\t')))
725 comment_line_end (0);
731 int arg
= 0; /* Current argument number. */
732 flag_context_list_iterator_ty context_iter
;
733 int argnum1
= 0; /* First string position. */
734 int argnum2
= 0; /* Plural string position. */
735 message_ty
*plural_mp
= NULL
; /* Remember the msgid. */
740 flag_context_ty inner_context
;
743 inner_context
= null_context
;
746 inherited_context (outer_context
,
747 flag_context_list_iterator_advance (
750 read_object (&inner
, inner_context
);
752 /* Recognize end of list. */
753 if (inner
.type
== t_close
)
756 last_non_comment_line
= line_number
;
760 /* Dots are not allowed in every position.
763 /* EOF inside list is illegal.
765 if (inner
.type
== t_eof
)
770 /* This is the function position. */
771 if (inner
.type
== t_symbol
)
773 char *symbol_name
= string_of_object (&inner
);
776 if (find_entry (&keywords
,
777 symbol_name
, strlen (symbol_name
),
781 argnum1
= (int) (long) keyword_value
& ((1 << 10) - 1);
782 argnum2
= (int) (long) keyword_value
>> 10;
786 flag_context_list_iterator (
787 flag_context_list_table_lookup (
788 flag_context_list_table
,
789 symbol_name
, strlen (symbol_name
)));
794 context_iter
= null_context_list_iterator
;
798 /* These are the argument positions.
799 Extract a string if we have reached the right
800 argument position. */
803 if (inner
.type
== t_string
)
808 pos
.file_name
= logical_file_name
;
809 pos
.line_number
= inner
.line_number_at_start
;
810 mp
= remember_a_message (mlp
, string_of_object (&inner
),
811 inner_context
, &pos
);
816 else if (arg
== argnum2
)
818 if (inner
.type
== t_string
&& plural_mp
!= NULL
)
822 pos
.file_name
= logical_file_name
;
823 pos
.line_number
= inner
.line_number_at_start
;
824 remember_a_message_plural (plural_mp
, string_of_object (&inner
),
825 inner_context
, &pos
);
830 free_object (&inner
);
834 last_non_comment_line
= line_number
;
838 /* Tell the caller about the end of list.
839 Unmatched closing parenthesis is illegal.
842 last_non_comment_line
= line_number
;
848 /* The ,@ handling inside lists is wrong anyway, because
849 ,@form expands to an unknown number of elements. */
850 if (c
!= EOF
&& c
!= '@')
859 read_object (&inner
, null_context
);
861 /* Dots and EOF are not allowed here. But be tolerant. */
863 free_object (&inner
);
866 last_non_comment_line
= line_number
;
871 /* Dispatch macro handling. */
875 /* Invalid input. Be tolerant, no error message. */
883 case '(': /* Vector */
887 read_object (&inner
, null_context
);
888 /* Dots and EOF are not allowed here.
890 free_object (&inner
);
892 last_non_comment_line
= line_number
;
896 case 'T': case 't': /* Boolean true */
897 case 'F': case 'f': /* Boolean false */
899 last_non_comment_line
= line_number
;
911 read_token (&token
, '#');
912 if (is_number (&token
))
917 last_non_comment_line
= line_number
;
922 if (token
.charcount
== 2
923 && (token
.chars
[1] == 'e' || token
.chars
[1] == 'i'))
929 /* Homogenous vector syntax, see arrays.scm. */
930 case 'a': /* Vectors of char */
931 case 'c': /* Vectors of complex */
932 /*case 'e':*/ /* Vectors of long */
933 case 'h': /* Vectors of short */
934 /*case 'i':*/ /* Vectors of double-float */
935 case 'l': /* Vectors of long long */
936 case 's': /* Vectors of single-float */
937 case 'u': /* Vectors of unsigned long */
938 case 'y': /* Vectors of byte */
941 read_object (&inner
, null_context
);
942 /* Dots and EOF are not allowed here.
945 free_object (&inner
);
947 last_non_comment_line
= line_number
;
951 /* Unknown # object. But be tolerant. */
954 last_non_comment_line
= line_number
;
960 /* Block comment '#! ... \n!#\n'. We don't extract it
961 because it's only used to introduce scripts on Unix. */
971 /* EOF is not allowed here. But be tolerant. */
973 if (last3
== '\n' && last2
== '!' && last1
== '#'
987 read_token (&token
, c
);
988 /* The token should consists only of '0' and '1', except
989 for the initial '*'. But be tolerant. */
992 last_non_comment_line
= line_number
;
997 /* Symbol with multiple escapes: #{...}# */
999 op
->token
= (struct token
*) xmalloc (sizeof (struct token
));
1001 init_token (op
->token
);
1024 grow_token (op
->token
);
1025 op
->token
->chars
[op
->token
->charcount
++] = c
;
1028 op
->type
= t_symbol
;
1029 last_non_comment_line
= line_number
;
1040 read_token (&token
, c
);
1041 free_token (&token
);
1044 last_non_comment_line
= line_number
;
1048 case ':': /* Keyword. */
1049 case '&': /* Deprecated keyword, installed in optargs.scm. */
1052 read_token (&token
, '-');
1053 free_token (&token
);
1055 last_non_comment_line
= line_number
;
1059 /* The following are installed through read-hash-extend. */
1062 case '0': case '1': case '2': case '3': case '4':
1063 case '5': case '6': case '7': case '8': case '9':
1064 /* Multidimensional array syntax: #nx(...) where
1066 x ::= {'a'|'b'|'c'|'e'|'i'|'s'|'u'}
1070 while (c
>= '0' && c
<= '9');
1071 /* c should be one of {'a'|'b'|'c'|'e'|'i'|'s'|'u'}.
1074 case '\'': /* boot-9.scm */
1075 case '.': /* boot-9.scm */
1076 case ',': /* srfi-10.scm */
1078 struct object inner
;
1079 read_object (&inner
, null_context
);
1080 /* Dots and EOF are not allowed here.
1082 free_object (&inner
);
1084 last_non_comment_line
= line_number
;
1091 last_non_comment_line
= line_number
;
1100 op
->token
= (struct token
*) xmalloc (sizeof (struct token
));
1101 init_token (op
->token
);
1102 op
->line_number_at_start
= line_number
;
1107 /* Invalid input. Be tolerant, no error message. */
1115 /* Invalid input. Be tolerant, no error message. */
1146 grow_token (op
->token
);
1147 op
->token
->chars
[op
->token
->charcount
++] = c
;
1149 op
->type
= t_string
;
1155 pos
.file_name
= logical_file_name
;
1156 pos
.line_number
= op
->line_number_at_start
;
1157 remember_a_message (mlp
, string_of_object (op
),
1158 null_context
, &pos
);
1160 last_non_comment_line
= line_number
;
1164 case '0': case '1': case '2': case '3': case '4':
1165 case '5': case '6': case '7': case '8': case '9':
1166 case '+': case '-': case '.':
1167 /* Read a number or symbol token. */
1168 op
->token
= (struct token
*) xmalloc (sizeof (struct token
));
1169 read_token (op
->token
, c
);
1170 if (op
->token
->charcount
== 1 && op
->token
->chars
[0] == '.')
1172 free_token (op
->token
);
1176 else if (is_number (op
->token
))
1179 free_token (op
->token
);
1186 op
->type
= t_symbol
;
1188 last_non_comment_line
= line_number
;
1193 /* Read a symbol token. */
1194 op
->token
= (struct token
*) xmalloc (sizeof (struct token
));
1195 read_token (op
->token
, c
);
1196 op
->type
= t_symbol
;
1197 last_non_comment_line
= line_number
;
1205 extract_scheme (FILE *f
,
1206 const char *real_filename
, const char *logical_filename
,
1207 flag_context_list_table_ty
*flag_table
,
1208 msgdomain_list_ty
*mdlp
)
1210 mlp
= mdlp
->item
[0]->messages
;
1213 real_file_name
= real_filename
;
1214 logical_file_name
= xstrdup (logical_filename
);
1217 last_comment_line
= -1;
1218 last_non_comment_line
= -1;
1220 flag_context_list_table
= flag_table
;
1224 /* Eat tokens until eof is seen. When read_object returns
1225 due to an unbalanced closing parenthesis, just restart it. */
1228 struct object toplevel_object
;
1230 read_object (&toplevel_object
, null_context
);
1232 if (toplevel_object
.type
== t_eof
)
1235 free_object (&toplevel_object
);
1239 /* Close scanner. */
1241 real_file_name
= NULL
;
1242 logical_file_name
= NULL
;