1 /* xgettext librep backend.
2 Copyright (C) 2001-2003 Free Software Foundation, Inc.
4 This file was written by Bruno Haible <haible@clisp.cons.org>, 2001.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software Foundation,
18 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
40 #define _(s) gettext(s)
43 /* Summary of librep syntax:
44 - ';' starts a comment until end of line.
45 - Block comments start with '#|' and end with '|#'.
46 - Numbers consist of an optional prefix (#b, #B for binary,
47 #o, #O for octal, #d, #D for decimal, #x, #X for hexadecimal,
48 #e, #E for exact, #i, #I for inexact), an optional sign (+ or -), and
50 - Characters are written as '?' followed by the character, possibly
51 with an escape sequence, for example '?a', '?\n', '?\177'.
52 - Strings are delimited by double quotes. Backslash introduces an escape
53 sequence. The following are understood: '\n', '\r', '\f', '\t', '\a',
54 '\\', '\^C', '\012' (octal), '\x12' (hexadecimal).
55 - Symbols: can contain meta-characters - whitespace or any from ()[]'";|\' -
56 if preceded by backslash or enclosed in |...|.
57 - Keywords: written as #:SYMBOL.
60 The reader is implemented in librep-0.14/src/lisp.c. */
63 /* ====================== Keyword set customization. ====================== */
65 /* If true extract all strings. */
66 static bool extract_all
= false;
68 static hash_table keywords
;
69 static bool default_keywords
= true;
73 x_librep_extract_all ()
80 x_librep_keyword (const char *name
)
83 default_keywords
= false;
91 if (keywords
.table
== NULL
)
92 init_hash (&keywords
, 100);
94 split_keywordspec (name
, &end
, &argnum1
, &argnum2
);
96 /* The characters between name and end should form a valid Lisp
98 colon
= strchr (name
, ':');
99 if (colon
== NULL
|| colon
>= end
)
103 insert_entry (&keywords
, name
, end
- name
,
104 (void *) (long) (argnum1
+ (argnum2
<< 10)));
109 /* Finish initializing the keywords hash table.
110 Called after argument processing, before each file is processed. */
114 if (default_keywords
)
116 x_librep_keyword ("_");
117 default_keywords
= false;
122 init_flag_table_librep ()
124 xgettext_record_flag ("_:1:pass-librep-format");
125 xgettext_record_flag ("format:2:librep-format");
129 /* ======================== Reading of characters. ======================== */
131 /* Real filename, used in error messages about the input file. */
132 static const char *real_file_name
;
134 /* Logical filename and line number, used to label the extracted messages. */
135 static char *logical_file_name
;
136 static int line_number
;
138 /* The input file stream. */
142 /* Fetch the next character from the input file. */
151 error (EXIT_FAILURE
, errno
, _("\
152 error while reading \"%s\""), real_file_name
);
160 /* Put back the last fetched character, not EOF. */
170 /* ========================== Reading of tokens. ========================== */
173 /* A token consists of a sequence of characters. */
176 int allocated
; /* number of allocated 'token_char's */
177 int charcount
; /* number of used 'token_char's */
178 char *chars
; /* the token's constituents */
181 /* Initialize a 'struct token'. */
183 init_token (struct token
*tp
)
186 tp
->chars
= (char *) xmalloc (tp
->allocated
* sizeof (char));
190 /* Free the memory pointed to by a 'struct token'. */
192 free_token (struct token
*tp
)
197 /* Ensure there is enough room in the token for one more character. */
199 grow_token (struct token
*tp
)
201 if (tp
->charcount
== tp
->allocated
)
204 tp
->chars
= (char *) xrealloc (tp
->chars
, tp
->allocated
* sizeof (char));
208 /* Read the next token. If 'first' is given, it points to the first
209 character, which has already been read. Returns true for a symbol,
210 false for a number. */
212 read_token (struct token
*tp
, const int *first
)
215 /* Variables for speculative number parsing: */
219 bool rational
= false;
220 bool exponent
= false;
221 bool had_sign
= false;
222 bool expecting_prefix
= false;
231 for (;; c
= do_getc ())
238 case ' ': case '\t': case '\n': case '\f': case '\r':
239 case '(': case ')': case '[': case ']':
240 case '\'': case '"': case ';': case ',': case '`':
247 /* Invalid, but be tolerant. */
250 tp
->chars
[tp
->charcount
++] = c
;
258 if (c
== EOF
|| c
== '|')
261 tp
->chars
[tp
->charcount
++] = c
;
268 if (expecting_prefix
)
291 expecting_prefix
= false;
292 nfirst
= tp
->charcount
+ 1;
294 else if (tp
->charcount
== nfirst
295 && (c
== '+' || c
== '-' || c
== '#'))
302 expecting_prefix
= true;
306 nfirst
= tp
->charcount
+ 1;
318 else if (!(c
>= '0' && c
<= '9'))
331 nfirst
= tp
->charcount
+ 1;
333 case '0': case '1': case '2': case '3': case '4':
334 case '5': case '6': case '7':
336 nfirst
= tp
->charcount
;
338 case '.': case 'E': case 'e':
356 if (exact
&& radix
== 10 && !rational
)
362 if (exact
&& !rational
)
370 if (!rational
&& !exponent
)
381 if (exponent
&& (c
== '+' || c
== '-'))
384 && !(c
>= '0' && c
<= '0' + radix
- 1))
385 || (radix
== 16 && !isxdigit (c
)))
399 tp
->chars
[tp
->charcount
++] = c
;
405 if (radix
> 0 && nfirst
< tp
->charcount
)
406 return false; /* number */
408 return true; /* symbol */
412 /* ========================= Accumulating comments ========================= */
416 static size_t bufmax
;
417 static size_t buflen
;
428 if (buflen
>= bufmax
)
430 bufmax
= 2 * bufmax
+ 10;
431 buffer
= xrealloc (buffer
, bufmax
);
433 buffer
[buflen
++] = c
;
437 comment_line_end (size_t chars_to_remove
)
439 buflen
-= chars_to_remove
;
441 && (buffer
[buflen
- 1] == ' ' || buffer
[buflen
- 1] == '\t'))
443 if (chars_to_remove
== 0 && buflen
>= bufmax
)
445 bufmax
= 2 * bufmax
+ 10;
446 buffer
= xrealloc (buffer
, bufmax
);
448 buffer
[buflen
] = '\0';
449 xgettext_comment_add (buffer
);
453 /* These are for tracking whether comments count as immediately before
455 static int last_comment_line
;
456 static int last_non_comment_line
;
459 /* ========================= Accumulating messages ========================= */
462 static message_list_ty
*mlp
;
465 /* ============== Reading of objects. See CLHS 2 "Syntax". ============== */
468 /* We are only interested in symbols (e.g. GETTEXT or NGETTEXT) and strings.
469 Other objects need not be represented precisely. */
472 t_symbol
, /* symbol */
473 t_string
, /* string */
474 t_other
, /* other kind of real object */
475 t_dot
, /* '.' pseudo object */
476 t_close
, /* ')' or ']' pseudo object */
477 t_eof
/* EOF marker */
482 enum object_type type
;
483 struct token
*token
; /* for t_symbol and t_string */
484 int line_number_at_start
; /* for t_string */
487 /* Free the memory pointed to by a 'struct object'. */
489 free_object (struct object
*op
)
491 if (op
->type
== t_symbol
|| op
->type
== t_string
)
493 free_token (op
->token
);
498 /* Convert a t_symbol/t_string token to a char*. */
500 string_of_object (const struct object
*op
)
505 if (!(op
->type
== t_symbol
|| op
->type
== t_string
))
507 n
= op
->token
->charcount
;
508 str
= (char *) xmalloc (n
+ 1);
509 memcpy (str
, op
->token
->chars
, n
);
514 /* Context lookup table. */
515 static flag_context_list_table_ty
*flag_context_list_table
;
517 /* Returns the character represented by an escape sequence. */
519 do_getc_escaped (int c
)
540 case '0': case '1': case '2': case '3': case '4':
541 case '5': case '6': case '7':
548 if (c
>= '0' && c
<= '7')
550 n
= (n
<< 3) + (c
- '0');
554 if (c
>= '0' && c
<= '7')
555 n
= (n
<< 3) + (c
- '0');
563 return (unsigned char) n
;
574 else if (c
>= '0' && c
<= '9')
575 n
= (n
<< 4) + (c
- '0');
576 else if (c
>= 'A' && c
<= 'F')
577 n
= (n
<< 4) + (c
- 'A' + 10);
578 else if (c
>= 'a' && c
<= 'f')
579 n
= (n
<< 4) + (c
- 'a' + 10);
586 return (unsigned char) n
;
593 /* Read the next object. */
595 read_object (struct object
*op
, flag_context_ty outer_context
)
610 /* Comments assumed to be grouped with a message must immediately
611 precede it, with no non-whitespace token on a line between
613 if (last_non_comment_line
> last_comment_line
)
614 xgettext_comment_reset ();
617 case ' ': case '\t': case '\f': case '\r':
622 int arg
= 0; /* Current argument number. */
623 flag_context_list_iterator_ty context_iter
;
624 int argnum1
= 0; /* First string position. */
625 int argnum2
= 0; /* Plural string position. */
626 message_ty
*plural_mp
= NULL
; /* Remember the msgid. */
631 flag_context_ty inner_context
;
634 inner_context
= null_context
;
637 inherited_context (outer_context
,
638 flag_context_list_iterator_advance (
641 read_object (&inner
, inner_context
);
643 /* Recognize end of list. */
644 if (inner
.type
== t_close
)
647 /* Don't bother converting "()" to "NIL". */
648 last_non_comment_line
= line_number
;
652 /* Dots are not allowed in every position.
655 /* EOF inside list is illegal. But be tolerant. */
656 if (inner
.type
== t_eof
)
661 /* This is the function position. */
662 if (inner
.type
== t_symbol
)
664 char *symbol_name
= string_of_object (&inner
);
667 if (find_entry (&keywords
,
668 symbol_name
, strlen (symbol_name
),
672 argnum1
= (int) (long) keyword_value
& ((1 << 10) - 1);
673 argnum2
= (int) (long) keyword_value
>> 10;
677 flag_context_list_iterator (
678 flag_context_list_table_lookup (
679 flag_context_list_table
,
680 symbol_name
, strlen (symbol_name
)));
685 context_iter
= null_context_list_iterator
;
689 /* These are the argument positions.
690 Extract a string if we have reached the right
691 argument position. */
694 if (inner
.type
== t_string
)
699 pos
.file_name
= logical_file_name
;
700 pos
.line_number
= inner
.line_number_at_start
;
701 mp
= remember_a_message (mlp
, string_of_object (&inner
),
702 inner_context
, &pos
);
707 else if (arg
== argnum2
)
709 if (inner
.type
== t_string
&& plural_mp
!= NULL
)
713 pos
.file_name
= logical_file_name
;
714 pos
.line_number
= inner
.line_number_at_start
;
715 remember_a_message_plural (plural_mp
, string_of_object (&inner
),
716 inner_context
, &pos
);
721 free_object (&inner
);
725 last_non_comment_line
= line_number
;
734 read_object (&inner
, null_context
);
736 /* Recognize end of vector. */
737 if (inner
.type
== t_close
)
740 last_non_comment_line
= line_number
;
744 /* Dots are not allowed. But be tolerant. */
746 /* EOF inside vector is illegal. But be tolerant. */
747 if (inner
.type
== t_eof
)
750 free_object (&inner
);
754 last_non_comment_line
= line_number
;
758 /* Tell the caller about the end of list or vector.
759 Unmatched closing parenthesis is illegal. But be tolerant. */
761 last_non_comment_line
= line_number
;
767 /* The ,@ handling inside lists is wrong anyway, because
768 ,@form expands to an unknown number of elements. */
769 if (c
!= EOF
&& c
!= '@')
778 read_object (&inner
, null_context
);
780 /* Dots and EOF are not allowed here. But be tolerant. */
782 free_object (&inner
);
785 last_non_comment_line
= line_number
;
791 bool all_semicolons
= true;
793 last_comment_line
= line_number
;
798 if (c
== EOF
|| c
== '\n' || c
== '\f' || c
== '\r')
801 all_semicolons
= false;
804 /* We skip all leading white space, but not EOLs. */
805 if (!(buflen
== 0 && (c
== ' ' || c
== '\t')))
809 comment_line_end (0);
815 op
->token
= (struct token
*) xmalloc (sizeof (struct token
));
816 init_token (op
->token
);
817 op
->line_number_at_start
= line_number
;
822 /* Invalid input. Be tolerant, no error message. */
830 /* Invalid input. Be tolerant, no error message. */
833 /* Ignore escaped newline. */
837 c
= do_getc_escaped (c
);
839 /* Invalid input. Be tolerant, no error message. */
841 grow_token (op
->token
);
842 op
->token
->chars
[op
->token
->charcount
++] = c
;
847 grow_token (op
->token
);
848 op
->token
->chars
[op
->token
->charcount
++] = c
;
857 pos
.file_name
= logical_file_name
;
858 pos
.line_number
= op
->line_number_at_start
;
859 remember_a_message (mlp
, string_of_object (op
),
862 last_non_comment_line
= line_number
;
869 /* Invalid input. Be tolerant, no error message. */
875 /* Invalid input. Be tolerant, no error message. */
879 c
= do_getc_escaped (c
);
881 /* Invalid input. Be tolerant, no error message. */
886 last_non_comment_line
= line_number
;
890 /* Dispatch macro handling. */
893 /* Invalid input. Be tolerant, no error message. */
903 /* Skip comment until !# */
913 if (c
== EOF
|| c
== '#')
921 /* EOF not allowed here. But be tolerant. */
932 read_object (&inner
, null_context
);
933 /* Dots and EOF are not allowed here.
935 free_object (&inner
);
937 last_non_comment_line
= line_number
;
946 read_object (&inner
, null_context
);
947 /* Dots and EOF are not allowed here.
949 free_object (&inner
);
951 last_non_comment_line
= line_number
;
974 comment_line_end (0);
1000 /* We skip all leading white space. */
1001 if (!(buflen
== 0 && (c
== ' ' || c
== '\t')))
1005 comment_line_end (1);
1013 /* EOF not allowed here. But be tolerant. */
1017 last_comment_line
= line_number
;
1025 read_token (&token
, &first
);
1026 free_token (&token
);
1028 last_non_comment_line
= line_number
;
1035 last_non_comment_line
= line_number
;
1048 read_token (&token
, &c
);
1049 free_token (&token
);
1051 last_non_comment_line
= line_number
;
1056 /* Invalid input. Be tolerant, no error message. */
1058 last_non_comment_line
= line_number
;
1070 op
->token
= (struct token
*) xmalloc (sizeof (struct token
));
1071 symbol
= read_token (op
->token
, &c
);
1072 if (op
->token
->charcount
== 1 && op
->token
->chars
[0] == '.')
1074 free_token (op
->token
);
1077 last_non_comment_line
= line_number
;
1082 free_token (op
->token
);
1085 last_non_comment_line
= line_number
;
1088 /* Distinguish between "foo" and "foo#bar". */
1092 struct token second_token
;
1094 free_token (op
->token
);
1096 read_token (&second_token
, NULL
);
1097 free_token (&second_token
);
1099 last_non_comment_line
= line_number
;
1106 op
->type
= t_symbol
;
1107 last_non_comment_line
= line_number
;
1117 extract_librep (FILE *f
,
1118 const char *real_filename
, const char *logical_filename
,
1119 flag_context_list_table_ty
*flag_table
,
1120 msgdomain_list_ty
*mdlp
)
1122 mlp
= mdlp
->item
[0]->messages
;
1125 real_file_name
= real_filename
;
1126 logical_file_name
= xstrdup (logical_filename
);
1129 last_comment_line
= -1;
1130 last_non_comment_line
= -1;
1132 flag_context_list_table
= flag_table
;
1136 /* Eat tokens until eof is seen. When read_object returns
1137 due to an unbalanced closing parenthesis, just restart it. */
1140 struct object toplevel_object
;
1142 read_object (&toplevel_object
, null_context
);
1144 if (toplevel_object
.type
== t_eof
)
1147 free_object (&toplevel_object
);
1151 /* Close scanner. */
1153 real_file_name
= NULL
;
1154 logical_file_name
= NULL
;