1 /* xgettext Emacs Lisp backend.
2 Copyright (C) 2001-2003 Free Software Foundation, Inc.
4 This file was written by Bruno Haible <haible@clisp.cons.org>, 2001-2002.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software Foundation,
18 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
40 #define _(s) gettext(s)
43 /* Summary of Emacs Lisp syntax:
44 - ';' starts a comment until end of line.
45 - '#@nn' starts a comment of nn bytes.
46 - Integers are constituted of an optional prefix (#b, #B for binary,
47 #o, #O for octal, #x, #X for hexadecimal, #nnr, #nnR for any radix),
48 an optional sign (+ or -), the digits, and an optional trailing dot.
49 - Characters are written as '?' followed by the character, possibly
50 with an escape sequence, for examples '?a', '?\n', '?\177'.
51 - Strings are delimited by double quotes. Backslash introduces an escape
52 sequence. The following are understood: '\n', '\r', '\f', '\t', '\a',
53 '\\', '\^C', '\012' (octal), '\x12' (hexadecimal).
54 - Symbols: can contain meta-characters if preceded by backslash.
55 - Uninterned symbols: written as #:SYMBOL.
58 The reader is implemented in emacs-21.1/src/lread.c. */
61 /* ====================== Keyword set customization. ====================== */
63 /* If true extract all strings. */
64 static bool extract_all
= false;
66 static hash_table keywords
;
67 static bool default_keywords
= true;
71 x_elisp_extract_all ()
78 x_elisp_keyword (const char *name
)
81 default_keywords
= false;
89 if (keywords
.table
== NULL
)
90 init_hash (&keywords
, 100);
92 split_keywordspec (name
, &end
, &argnum1
, &argnum2
);
94 /* The characters between name and end should form a valid Lisp
96 colon
= strchr (name
, ':');
97 if (colon
== NULL
|| colon
>= end
)
101 insert_entry (&keywords
, name
, end
- name
,
102 (void *) (long) (argnum1
+ (argnum2
<< 10)));
107 /* Finish initializing the keywords hash table.
108 Called after argument processing, before each file is processed. */
112 if (default_keywords
)
114 x_elisp_keyword ("_");
115 default_keywords
= false;
120 init_flag_table_elisp ()
122 xgettext_record_flag ("_:1:pass-elisp-format");
123 xgettext_record_flag ("format:1:elisp-format");
127 /* ======================== Reading of characters. ======================== */
129 /* Real filename, used in error messages about the input file. */
130 static const char *real_file_name
;
132 /* Logical filename and line number, used to label the extracted messages. */
133 static char *logical_file_name
;
134 static int line_number
;
136 /* The input file stream. */
140 /* Fetch the next character from the input file. */
149 error (EXIT_FAILURE
, errno
, _("\
150 error while reading \"%s\""), real_file_name
);
158 /* Put back the last fetched character, not EOF. */
168 /* ========================== Reading of tokens. ========================== */
171 /* A token consists of a sequence of characters. */
174 int allocated
; /* number of allocated 'token_char's */
175 int charcount
; /* number of used 'token_char's */
176 char *chars
; /* the token's constituents */
179 /* Initialize a 'struct token'. */
181 init_token (struct token
*tp
)
184 tp
->chars
= (char *) xmalloc (tp
->allocated
* sizeof (char));
188 /* Free the memory pointed to by a 'struct token'. */
190 free_token (struct token
*tp
)
195 /* Ensure there is enough room in the token for one more character. */
197 grow_token (struct token
*tp
)
199 if (tp
->charcount
== tp
->allocated
)
202 tp
->chars
= (char *) xrealloc (tp
->chars
, tp
->allocated
* sizeof (char));
206 /* Test whether a token has integer syntax. */
208 is_integer (const char *p
)
210 /* NB: Yes, '+.' and '-.' both designate the integer 0. */
211 const char *p_start
= p
;
213 if (*p
== '+' || *p
== '-')
217 while (*p
>= '0' && *p
<= '9')
219 if (p
> p_start
&& *p
== '.')
224 /* Test whether a token has float syntax. */
226 is_float (const char *p
)
228 enum { LEAD_INT
= 1, DOT_CHAR
= 2, TRAIL_INT
= 4, E_CHAR
= 8, EXP_INT
= 16 };
232 if (*p
== '+' || *p
== '-')
234 if (*p
>= '0' && *p
<= '9')
239 while (*p
>= '0' && *p
<= '9');
246 if (*p
>= '0' && *p
<= '9')
251 while (*p
>= '0' && *p
<= '9');
253 if (*p
== 'e' || *p
== 'E')
257 if (*p
== '+' || *p
== '-')
259 if (*p
>= '0' && *p
<= '9')
264 while (*p
>= '0' && *p
<= '9');
266 else if (p
[-1] == '+'
267 && ((p
[0] == 'I' && p
[1] == 'N' && p
[2] == 'F')
268 || (p
[0] == 'N' && p
[1] == 'a' && p
[2] == 'N')))
275 && (state
== (LEAD_INT
| DOT_CHAR
| TRAIL_INT
)
276 || state
== (DOT_CHAR
| TRAIL_INT
)
277 || state
== (LEAD_INT
| E_CHAR
| EXP_INT
)
278 || state
== (LEAD_INT
| DOT_CHAR
| TRAIL_INT
| E_CHAR
| EXP_INT
)
279 || state
== (DOT_CHAR
| TRAIL_INT
| E_CHAR
| EXP_INT
));
282 /* Read the next token. 'first' is the first character, which has already
283 been read. Returns true for a symbol, false for a number. */
285 read_token (struct token
*tp
, int first
)
294 for (;; c
= do_getc ())
298 if (c
<= ' ') /* FIXME: Assumes ASCII compatible encoding */
300 if (c
== '\"' || c
== '\'' || c
== ';' || c
== '(' || c
== ')'
301 || c
== '[' || c
== ']' || c
== '#')
308 /* Invalid, but be tolerant. */
312 tp
->chars
[tp
->charcount
++] = c
;
318 return true; /* symbol */
320 /* Add a NUL byte at the end, for is_integer and is_float. */
322 tp
->chars
[tp
->charcount
] = '\0';
324 if (is_integer (tp
->chars
) || is_float (tp
->chars
))
325 return false; /* number */
327 return true; /* symbol */
331 /* ========================= Accumulating comments ========================= */
335 static size_t bufmax
;
336 static size_t buflen
;
347 if (buflen
>= bufmax
)
349 bufmax
= 2 * bufmax
+ 10;
350 buffer
= xrealloc (buffer
, bufmax
);
352 buffer
[buflen
++] = c
;
356 comment_line_end (size_t chars_to_remove
)
358 buflen
-= chars_to_remove
;
360 && (buffer
[buflen
- 1] == ' ' || buffer
[buflen
- 1] == '\t'))
362 if (chars_to_remove
== 0 && buflen
>= bufmax
)
364 bufmax
= 2 * bufmax
+ 10;
365 buffer
= xrealloc (buffer
, bufmax
);
367 buffer
[buflen
] = '\0';
368 xgettext_comment_add (buffer
);
372 /* These are for tracking whether comments count as immediately before
374 static int last_comment_line
;
375 static int last_non_comment_line
;
378 /* ========================= Accumulating messages ========================= */
381 static message_list_ty
*mlp
;
384 /* ============== Reading of objects. See CLHS 2 "Syntax". ============== */
387 /* We are only interested in symbols (e.g. GETTEXT or NGETTEXT) and strings.
388 Other objects need not to be represented precisely. */
391 t_symbol
, /* symbol */
392 t_string
, /* string */
393 t_other
, /* other kind of real object */
394 t_dot
, /* '.' pseudo object */
395 t_listclose
, /* ')' pseudo object */
396 t_vectorclose
,/* ']' pseudo object */
397 t_eof
/* EOF marker */
402 enum object_type type
;
403 struct token
*token
; /* for t_symbol and t_string */
404 int line_number_at_start
; /* for t_string */
407 /* Free the memory pointed to by a 'struct object'. */
409 free_object (struct object
*op
)
411 if (op
->type
== t_symbol
|| op
->type
== t_string
)
413 free_token (op
->token
);
418 /* Convert a t_symbol/t_string token to a char*. */
420 string_of_object (const struct object
*op
)
425 if (!(op
->type
== t_symbol
|| op
->type
== t_string
))
427 n
= op
->token
->charcount
;
428 str
= (char *) xmalloc (n
+ 1);
429 memcpy (str
, op
->token
->chars
, n
);
434 /* Context lookup table. */
435 static flag_context_list_table_ty
*flag_context_list_table
;
437 /* Returns the character represented by an escape sequence. */
438 #define IGNORABLE_ESCAPE (EOF - 1)
440 do_getc_escaped (int c
, bool in_string
)
464 return IGNORABLE_ESCAPE
;
467 return (in_string
? IGNORABLE_ESCAPE
: ' ');
474 /* Invalid input. But be tolerant. */
484 c
= do_getc_escaped (c
, false);
488 case 'S': /* shift */
493 /* Invalid input. But be tolerant. */
503 c
= do_getc_escaped (c
, false);
505 return (c
>= 'a' && c
<= 'z' ? c
- 'a' + 'A' : c
);
507 case 'H': /* hyper */
509 case 's': /* super */
514 /* Invalid input. But be tolerant. */
524 c
= do_getc_escaped (c
, false);
533 /* Invalid input. But be tolerant. */
545 c
= do_getc_escaped (c
, false);
549 if ((c
& 0x5F) >= 0x41 && (c
& 0x5F) <= 0x5A)
551 if ((c
& 0x7F) >= 0x40 && (c
& 0x7F) <= 0x5F)
553 #if 0 /* We cannot handle NUL bytes in strings. */
559 case '0': case '1': case '2': case '3': case '4':
560 case '5': case '6': case '7':
561 /* An octal escape, as in ANSI C. */
568 if (c
>= '0' && c
<= '7')
570 n
= (n
<< 3) + (c
- '0');
574 if (c
>= '0' && c
<= '7')
575 n
= (n
<< 3) + (c
- '0');
583 return (unsigned char) n
;
587 /* A hexadecimal escape, as in ANSI C. */
596 else if (c
>= '0' && c
<= '9')
597 n
= (n
<< 4) + (c
- '0');
598 else if (c
>= 'A' && c
<= 'F')
599 n
= (n
<< 4) + (c
- 'A' + 10);
600 else if (c
>= 'a' && c
<= 'f')
601 n
= (n
<< 4) + (c
- 'a' + 10);
608 return (unsigned char) n
;
612 /* Ignore Emacs multibyte character stuff. All the strings we are
613 interested in are ASCII strings. */
618 /* Read the next object.
619 'first_in_list' and 'new_backquote_flag' are used for reading old
620 backquote syntax and new backquote syntax. */
622 read_object (struct object
*op
, bool first_in_list
, bool new_backquote_flag
,
623 flag_context_ty outer_context
)
638 /* Comments assumed to be grouped with a message must immediately
639 precede it, with no non-whitespace token on a line between
641 if (last_non_comment_line
> last_comment_line
)
642 xgettext_comment_reset ();
647 int arg
= 0; /* Current argument number. */
648 flag_context_list_iterator_ty context_iter
;
649 int argnum1
= 0; /* First string position. */
650 int argnum2
= 0; /* Plural string position. */
651 message_ty
*plural_mp
= NULL
; /* Remember the msgid. */
656 flag_context_ty inner_context
;
659 inner_context
= null_context
;
662 inherited_context (outer_context
,
663 flag_context_list_iterator_advance (
666 read_object (&inner
, arg
== 0, new_backquote_flag
,
669 /* Recognize end of list. */
670 if (inner
.type
== t_listclose
)
673 /* Don't bother converting "()" to "NIL". */
674 last_non_comment_line
= line_number
;
678 /* Dots are not allowed in every position. ']' is not allowed.
681 /* EOF inside list is illegal. But be tolerant. */
682 if (inner
.type
== t_eof
)
687 /* This is the function position. */
688 if (inner
.type
== t_symbol
)
690 char *symbol_name
= string_of_object (&inner
);
693 if (find_entry (&keywords
,
694 symbol_name
, strlen (symbol_name
),
698 argnum1
= (int) (long) keyword_value
& ((1 << 10) - 1);
699 argnum2
= (int) (long) keyword_value
>> 10;
703 flag_context_list_iterator (
704 flag_context_list_table_lookup (
705 flag_context_list_table
,
706 symbol_name
, strlen (symbol_name
)));
711 context_iter
= null_context_list_iterator
;
715 /* These are the argument positions.
716 Extract a string if we have reached the right
717 argument position. */
720 if (inner
.type
== t_string
)
725 pos
.file_name
= logical_file_name
;
726 pos
.line_number
= inner
.line_number_at_start
;
727 mp
= remember_a_message (mlp
, string_of_object (&inner
),
728 inner_context
, &pos
);
733 else if (arg
== argnum2
)
735 if (inner
.type
== t_string
&& plural_mp
!= NULL
)
739 pos
.file_name
= logical_file_name
;
740 pos
.line_number
= inner
.line_number_at_start
;
741 remember_a_message_plural (plural_mp
, string_of_object (&inner
),
742 inner_context
, &pos
);
747 free_object (&inner
);
751 last_non_comment_line
= line_number
;
755 /* Tell the caller about the end of list.
756 Unmatched closing parenthesis is illegal. But be tolerant. */
757 op
->type
= t_listclose
;
758 last_non_comment_line
= line_number
;
767 read_object (&inner
, false, new_backquote_flag
, null_context
);
769 /* Recognize end of vector. */
770 if (inner
.type
== t_vectorclose
)
773 last_non_comment_line
= line_number
;
777 /* Dots and ')' are not allowed. But be tolerant. */
779 /* EOF inside vector is illegal. But be tolerant. */
780 if (inner
.type
== t_eof
)
783 free_object (&inner
);
787 last_non_comment_line
= line_number
;
791 /* Tell the caller about the end of vector.
792 Unmatched closing bracket is illegal. But be tolerant. */
793 op
->type
= t_vectorclose
;
794 last_non_comment_line
= line_number
;
801 read_object (&inner
, false, new_backquote_flag
, null_context
);
803 /* Dots and EOF are not allowed here. But be tolerant. */
805 free_object (&inner
);
808 last_non_comment_line
= line_number
;
818 read_object (&inner
, false, true, null_context
);
820 /* Dots and EOF are not allowed here. But be tolerant. */
822 free_object (&inner
);
825 last_non_comment_line
= line_number
;
830 if (!new_backquote_flag
)
834 /* The ,@ handling inside lists is wrong anyway, because
835 ,@form expands to an unknown number of elements. */
836 if (c
!= EOF
&& c
!= '@' && c
!= '.')
842 read_object (&inner
, false, false, null_context
);
844 /* Dots and EOF are not allowed here. But be tolerant. */
846 free_object (&inner
);
849 last_non_comment_line
= line_number
;
855 bool all_semicolons
= true;
857 last_comment_line
= line_number
;
862 if (c
== EOF
|| c
== '\n')
865 all_semicolons
= false;
868 /* We skip all leading white space, but not EOLs. */
869 if (!(buflen
== 0 && (c
== ' ' || c
== '\t')))
873 comment_line_end (0);
879 op
->token
= (struct token
*) xmalloc (sizeof (struct token
));
880 init_token (op
->token
);
881 op
->line_number_at_start
= line_number
;
886 /* Invalid input. Be tolerant, no error message. */
894 /* Invalid input. Be tolerant, no error message. */
896 c
= do_getc_escaped (c
, true);
898 /* Invalid input. Be tolerant, no error message. */
900 if (c
== IGNORABLE_ESCAPE
)
901 /* Ignore escaped newline and escaped space. */
905 grow_token (op
->token
);
906 op
->token
->chars
[op
->token
->charcount
++] = c
;
911 grow_token (op
->token
);
912 op
->token
->chars
[op
->token
->charcount
++] = c
;
921 pos
.file_name
= logical_file_name
;
922 pos
.line_number
= op
->line_number_at_start
;
923 remember_a_message (mlp
, string_of_object (op
),
926 last_non_comment_line
= line_number
;
933 /* Invalid input. Be tolerant, no error message. */
939 /* Invalid input. Be tolerant, no error message. */
943 c
= do_getc_escaped (c
, false);
945 /* Invalid input. Be tolerant, no error message. */
949 /* Impossible to deal with Emacs multibyte character stuff here. */
951 last_non_comment_line
= line_number
;
955 /* Dispatch macro handling. */
958 /* Invalid input. Be tolerant, no error message. */
972 /* Read a char table, same syntax as a vector. */
977 read_object (&inner
, false, new_backquote_flag
,
980 /* Recognize end of vector. */
981 if (inner
.type
== t_vectorclose
)
984 last_non_comment_line
= line_number
;
988 /* Dots and ')' are not allowed. But be tolerant. */
990 /* EOF inside vector is illegal. But be tolerant. */
991 if (inner
.type
== t_eof
)
994 free_object (&inner
);
997 last_non_comment_line
= line_number
;
1001 /* Invalid input. Be tolerant, no error message. */
1005 last_non_comment_line
= line_number
;
1010 /* Read a bit vector. */
1012 struct object length
;
1013 read_object (&length
, first_in_list
, new_backquote_flag
,
1015 /* Dots and EOF are not allowed here.
1017 free_object (&length
);
1022 struct object string
;
1023 read_object (&string
, first_in_list
, new_backquote_flag
,
1025 free_object (&string
);
1028 /* Invalid input. Be tolerant, no error message. */
1031 last_non_comment_line
= line_number
;
1035 /* Read a compiled function, same syntax as a vector. */
1037 /* Read a string with properties, same syntax as a list. */
1039 struct object inner
;
1041 read_object (&inner
, false, new_backquote_flag
, null_context
);
1042 /* Dots and EOF are not allowed here.
1044 free_object (&inner
);
1046 last_non_comment_line
= line_number
;
1051 /* Read a comment consisting of a given number of bytes. */
1053 unsigned int nskip
= 0;
1058 if (!(c
>= '0' && c
<= '9'))
1060 nskip
= 10 * nskip
+ (c
- '0');
1065 for (; nskip
> 0; nskip
--)
1066 if (do_getc () == EOF
)
1074 last_non_comment_line
= line_number
;
1079 case 'S': case 's': /* XEmacs only */
1081 struct object inner
;
1082 read_object (&inner
, false, new_backquote_flag
, null_context
);
1083 /* Dots and EOF are not allowed here.
1085 free_object (&inner
);
1087 last_non_comment_line
= line_number
;
1091 case '0': case '1': case '2': case '3': case '4':
1092 case '5': case '6': case '7': case '8': case '9':
1093 /* Read Common Lisp style #n# or #n=. */
1097 if (!(c
>= '0' && c
<= '9'))
1101 /* Invalid input. Be tolerant, no error message. */
1108 read_object (op
, false, new_backquote_flag
, outer_context
);
1109 last_non_comment_line
= line_number
;
1115 last_non_comment_line
= line_number
;
1118 if (c
== 'R' || c
== 'r')
1120 /* Read an integer. */
1122 if (c
== '+' || c
== '-')
1124 for (; c
!= EOF
; c
= do_getc ())
1131 last_non_comment_line
= line_number
;
1134 /* Invalid input. Be tolerant, no error message. */
1136 last_non_comment_line
= line_number
;
1143 /* Read an integer. */
1145 if (c
== '+' || c
== '-')
1147 for (; c
!= EOF
; c
= do_getc ())
1154 last_non_comment_line
= line_number
;
1158 case '*': /* XEmacs only */
1160 /* Read a bit-vector. */
1163 while (c
== '0' || c
== '1');
1167 last_non_comment_line
= line_number
;
1171 case '+': /* XEmacs only */
1172 case '-': /* XEmacs only */
1173 /* Simply assume every feature expression is true. */
1175 struct object inner
;
1176 read_object (&inner
, false, new_backquote_flag
, null_context
);
1177 /* Dots and EOF are not allowed here.
1179 free_object (&inner
);
1184 /* Invalid input. Be tolerant, no error message. */
1186 last_non_comment_line
= line_number
;
1198 if (c
<= ' ' /* FIXME: Assumes ASCII compatible encoding */
1199 || strchr ("\"'`,(", c
) != NULL
)
1202 last_non_comment_line
= line_number
;
1210 if (c
<= ' ') /* FIXME: Assumes ASCII compatible encoding */
1216 op
->token
= (struct token
*) xmalloc (sizeof (struct token
));
1217 symbol
= read_token (op
->token
, c
);
1220 op
->type
= t_symbol
;
1221 last_non_comment_line
= line_number
;
1226 free_token (op
->token
);
1229 last_non_comment_line
= line_number
;
1239 extract_elisp (FILE *f
,
1240 const char *real_filename
, const char *logical_filename
,
1241 flag_context_list_table_ty
*flag_table
,
1242 msgdomain_list_ty
*mdlp
)
1244 mlp
= mdlp
->item
[0]->messages
;
1247 real_file_name
= real_filename
;
1248 logical_file_name
= xstrdup (logical_filename
);
1251 last_comment_line
= -1;
1252 last_non_comment_line
= -1;
1254 flag_context_list_table
= flag_table
;
1258 /* Eat tokens until eof is seen. When read_object returns
1259 due to an unbalanced closing parenthesis, just restart it. */
1262 struct object toplevel_object
;
1264 read_object (&toplevel_object
, false, false, null_context
);
1266 if (toplevel_object
.type
== t_eof
)
1269 free_object (&toplevel_object
);
1273 /* Close scanner. */
1275 real_file_name
= NULL
;
1276 logical_file_name
= NULL
;