Sync usage with man page.
[netbsd-mini2440.git] / gnu / dist / gettext / gettext-tools / src / x-lisp.c
blobe3a78859d838d0bc2fc4d5a63f21d60ad9a6c08c
1 /* xgettext Lisp backend.
2 Copyright (C) 2001-2003, 2005 Free Software Foundation, Inc.
4 This file was written by Bruno Haible <haible@clisp.cons.org>, 2001.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software Foundation,
18 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
20 #ifdef HAVE_CONFIG_H
21 # include "config.h"
22 #endif
24 #include <errno.h>
25 #include <stdbool.h>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
30 #include "message.h"
31 #include "xgettext.h"
32 #include "x-lisp.h"
33 #include "error.h"
34 #include "xalloc.h"
35 #include "exit.h"
36 #include "hash.h"
37 #include "gettext.h"
39 #define _(s) gettext(s)
42 /* The Common Lisp syntax is described in the Common Lisp HyperSpec, chapter 2.
43 Since we are interested only in strings and in forms similar to
44 (gettext msgid ...)
45 or (ngettext msgid msgid_plural ...)
46 we make the following simplifications:
48 - Assume the keywords and strings are in an ASCII compatible encoding.
49 This means we can read the input file one byte at a time, instead of
50 one character at a time. No need to worry about multibyte characters:
51 If they occur as part of identifiers, they most probably act as
52 constituent characters, and the byte based approach will do the same.
54 - Assume the read table is the standard Common Lisp read table.
55 Non-standard read tables are mostly used to read data, not programs.
57 - Assume the read table case is :UPCASE, and *READ-BASE* is 10.
59 - Don't interpret #n= and #n#, they usually don't appear in programs.
61 - Don't interpret #+, #-, they are unlikely to appear in a gettext form.
63 The remaining syntax rules are:
65 - The syntax code assigned to each character, and how tokens are built
66 up from characters (single escape, multiple escape etc.).
68 - Comment syntax: ';' and '#| ... |#'.
70 - String syntax: "..." with single escapes.
72 - Read macros and dispatch macro character '#'. Needed to be able to
73 tell which is the n-th argument of a function call.
78 /* ========================= Lexer customization. ========================= */
80 /* 'readtable_case' is the case conversion that is applied to non-escaped
81 parts of symbol tokens. In Common Lisp: (readtable-case *readtable*). */
83 enum rtcase
85 case_upcase,
86 case_downcase,
87 case_preserve,
88 case_invert
91 static enum rtcase readtable_case = case_upcase;
93 /* 'read_base' is the assumed radix of integers and rational numbers.
94 In Common Lisp: *read-base*. */
95 static int read_base = 10;
97 /* 'read_preserve_whitespace' specifies whether a whitespace character
98 that terminates a token must be pushed back on the input stream.
99 We set it to true, because the special newline side effect in read_object()
100 requires that read_object() sees every newline not inside a token. */
101 static bool read_preserve_whitespace = true;
104 /* ====================== Keyword set customization. ====================== */
106 /* If true extract all strings. */
107 static bool extract_all = false;
109 static hash_table keywords;
110 static bool default_keywords = true;
113 void
114 x_lisp_extract_all ()
116 extract_all = true;
120 void
121 x_lisp_keyword (const char *name)
123 if (name == NULL)
124 default_keywords = false;
125 else
127 const char *end;
128 int argnum1;
129 int argnum2;
130 const char *colon;
131 size_t len;
132 char *symname;
133 size_t i;
135 if (keywords.table == NULL)
136 init_hash (&keywords, 100);
138 split_keywordspec (name, &end, &argnum1, &argnum2);
140 /* The characters between name and end should form a valid Lisp symbol.
141 Extract the symbol name part. */
142 colon = strchr (name, ':');
143 if (colon != NULL && colon < end)
145 name = colon + 1;
146 if (name < end && *name == ':')
147 name++;
148 colon = strchr (name, ':');
149 if (colon != NULL && colon < end)
150 return;
153 /* Uppercase it. */
154 len = end - name;
155 symname = (char *) xmalloc (len);
156 for (i = 0; i < len; i++)
157 symname[i] =
158 (name[i] >= 'a' && name[i] <= 'z' ? name[i] - 'a' + 'A' : name[i]);
160 if (argnum1 == 0)
161 argnum1 = 1;
162 insert_entry (&keywords, symname, len,
163 (void *) (long) (argnum1 + (argnum2 << 10)));
167 /* Finish initializing the keywords hash table.
168 Called after argument processing, before each file is processed. */
169 static void
170 init_keywords ()
172 if (default_keywords)
174 x_lisp_keyword ("gettext"); /* I18N:GETTEXT */
175 x_lisp_keyword ("ngettext:1,2"); /* I18N:NGETTEXT */
176 x_lisp_keyword ("gettext-noop");
177 default_keywords = false;
181 void
182 init_flag_table_lisp ()
184 xgettext_record_flag ("gettext:1:pass-lisp-format");
185 xgettext_record_flag ("ngettext:1:pass-lisp-format");
186 xgettext_record_flag ("ngettext:2:pass-lisp-format");
187 xgettext_record_flag ("gettext-noop:1:pass-lisp-format");
188 xgettext_record_flag ("format:2:lisp-format");
192 /* ======================== Reading of characters. ======================== */
194 /* Real filename, used in error messages about the input file. */
195 static const char *real_file_name;
197 /* Logical filename and line number, used to label the extracted messages. */
198 static char *logical_file_name;
199 static int line_number;
201 /* The input file stream. */
202 static FILE *fp;
205 /* Fetch the next character from the input file. */
206 static int
207 do_getc ()
209 int c = getc (fp);
211 if (c == EOF)
213 if (ferror (fp))
214 error (EXIT_FAILURE, errno, _("\
215 error while reading \"%s\""), real_file_name);
217 else if (c == '\n')
218 line_number++;
220 return c;
223 /* Put back the last fetched character, not EOF. */
224 static void
225 do_ungetc (int c)
227 if (c == '\n')
228 line_number--;
229 ungetc (c, fp);
233 /* ========= Reading of tokens. See CLHS 2.2 "Reader Algorithm". ========= */
236 /* Syntax code. See CLHS 2.1.4 "Character Syntax Types". */
238 enum syntax_code
240 syntax_illegal, /* non-printable, except whitespace */
241 syntax_single_esc, /* '\' (single escape) */
242 syntax_multi_esc, /* '|' (multiple escape) */
243 syntax_constituent, /* everything else (constituent) */
244 syntax_whitespace, /* TAB,LF,FF,CR,' ' (whitespace) */
245 syntax_eof, /* EOF */
246 syntax_t_macro, /* '()'"' (terminating macro) */
247 syntax_nt_macro /* '#' (non-terminating macro) */
250 /* Returns the syntax code of a character. */
251 static enum syntax_code
252 syntax_code_of (unsigned char c)
254 switch (c)
256 case '\\':
257 return syntax_single_esc;
258 case '|':
259 return syntax_multi_esc;
260 case '\t': case '\n': case '\f': case '\r': case ' ':
261 return syntax_whitespace;
262 case '(': case ')': case '\'': case '"': case ',': case ';': case '`':
263 return syntax_t_macro;
264 case '#':
265 return syntax_nt_macro;
266 default:
267 if (c < ' ' && c != '\b')
268 return syntax_illegal;
269 else
270 return syntax_constituent;
274 struct char_syntax
276 int ch; /* character */
277 enum syntax_code scode; /* syntax code */
280 /* Returns the next character and its syntax code. */
281 static void
282 read_char_syntax (struct char_syntax *p)
284 int c = do_getc ();
286 p->ch = c;
287 p->scode = (c == EOF ? syntax_eof : syntax_code_of (c));
290 /* Every character in a token has an attribute assigned. The attributes
291 help during interpretation of the token. See
292 CLHS 2.3 "Interpretation of Tokens" for the possible interpretations,
293 and CLHS 2.1.4.2 "Constituent Traits". */
295 enum attribute
297 a_illg, /* invalid constituent */
298 a_pack_m, /* ':' package marker */
299 a_alpha, /* normal alphabetic */
300 a_escaped, /* alphabetic but not subject to case conversion */
301 a_ratio, /* '/' */
302 a_dot, /* '.' */
303 a_sign, /* '+-' */
304 a_extens, /* '_^' extension characters */
305 a_digit, /* '0123456789' */
306 a_letterdigit,/* 'A'-'Z','a'-'z' below base, except 'esfdlESFDL' */
307 a_expodigit, /* 'esfdlESFDL' below base */
308 a_letter, /* 'A'-'Z','a'-'z', except 'esfdlESFDL' */
309 a_expo /* 'esfdlESFDL' */
312 #define is_letter_attribute(a) ((a) >= a_letter)
313 #define is_number_attribute(a) ((a) >= a_ratio)
315 /* Returns the attribute of a character, assuming base 10. */
316 static enum attribute
317 attribute_of (unsigned char c)
319 switch (c)
321 case ':':
322 return a_pack_m;
323 case '/':
324 return a_ratio;
325 case '.':
326 return a_dot;
327 case '+': case '-':
328 return a_sign;
329 case '_': case '^':
330 return a_extens;
331 case '0': case '1': case '2': case '3': case '4':
332 case '5': case '6': case '7': case '8': case '9':
333 return a_digit;
334 case 'a': case 'b': case 'c': case 'g': case 'h': case 'i': case 'j':
335 case 'k': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
336 case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z':
337 case 'A': case 'B': case 'C': case 'G': case 'H': case 'I': case 'J':
338 case 'K': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
339 case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z':
340 return a_letter;
341 case 'e': case 's': case 'd': case 'f': case 'l':
342 case 'E': case 'S': case 'D': case 'F': case 'L':
343 return a_expo;
344 default:
345 /* Treat everything as valid. Never return a_illg. */
346 return a_alpha;
350 struct token_char
352 unsigned char ch; /* character */
353 unsigned char attribute; /* attribute */
356 /* A token consists of a sequence of characters with associated attribute. */
357 struct token
359 int allocated; /* number of allocated 'token_char's */
360 int charcount; /* number of used 'token_char's */
361 struct token_char *chars; /* the token's constituents */
362 bool with_escape; /* whether single-escape or multiple escape occurs */
365 /* Initialize a 'struct token'. */
366 static inline void
367 init_token (struct token *tp)
369 tp->allocated = 10;
370 tp->chars =
371 (struct token_char *) xmalloc (tp->allocated * sizeof (struct token_char));
372 tp->charcount = 0;
375 /* Free the memory pointed to by a 'struct token'. */
376 static inline void
377 free_token (struct token *tp)
379 free (tp->chars);
382 /* Ensure there is enough room in the token for one more character. */
383 static inline void
384 grow_token (struct token *tp)
386 if (tp->charcount == tp->allocated)
388 tp->allocated *= 2;
389 tp->chars = (struct token_char *) xrealloc (tp->chars, tp->allocated * sizeof (struct token_char));
393 /* Read the next token. If 'first' is given, it points to the first
394 character, which has already been read.
395 The algorithm follows CLHS 2.2 "Reader Algorithm". */
396 static void
397 read_token (struct token *tp, const struct char_syntax *first)
399 bool multiple_escape_flag;
400 struct char_syntax curr;
402 init_token (tp);
403 tp->with_escape = false;
405 multiple_escape_flag = false;
406 if (first)
407 curr = *first;
408 else
409 read_char_syntax (&curr);
411 for (;; read_char_syntax (&curr))
413 switch (curr.scode)
415 case syntax_illegal:
416 /* Invalid input. Be tolerant, no error message. */
417 do_ungetc (curr.ch);
418 return;
420 case syntax_single_esc:
421 tp->with_escape = true;
422 read_char_syntax (&curr);
423 if (curr.scode == syntax_eof)
424 /* Invalid input. Be tolerant, no error message. */
425 return;
426 grow_token (tp);
427 tp->chars[tp->charcount].ch = curr.ch;
428 tp->chars[tp->charcount].attribute = a_escaped;
429 tp->charcount++;
430 break;
432 case syntax_multi_esc:
433 multiple_escape_flag = !multiple_escape_flag;
434 tp->with_escape = true;
435 break;
437 case syntax_constituent:
438 case syntax_nt_macro:
439 grow_token (tp);
440 if (multiple_escape_flag)
442 tp->chars[tp->charcount].ch = curr.ch;
443 tp->chars[tp->charcount].attribute = a_escaped;
444 tp->charcount++;
446 else
448 tp->chars[tp->charcount].ch = curr.ch;
449 tp->chars[tp->charcount].attribute = attribute_of (curr.ch);
450 tp->charcount++;
452 break;
454 case syntax_whitespace:
455 case syntax_t_macro:
456 if (multiple_escape_flag)
458 grow_token (tp);
459 tp->chars[tp->charcount].ch = curr.ch;
460 tp->chars[tp->charcount].attribute = a_escaped;
461 tp->charcount++;
463 else
465 if (curr.scode != syntax_whitespace || read_preserve_whitespace)
466 do_ungetc (curr.ch);
467 return;
469 break;
471 case syntax_eof:
472 if (multiple_escape_flag)
473 /* Invalid input. Be tolerant, no error message. */
475 return;
480 /* A potential number is a token which
481 1. consists only of digits, '+','-','/','^','_','.' and number markers.
482 The base for digits is context dependent, but always 10 if a dot '.'
483 occurs. A number marker is a non-digit letter which is not adjacent
484 to a non-digit letter.
485 2. has at least one digit.
486 3. starts with a digit, '+','-','.','^' or '_'.
487 4. does not end with '+' or '-'.
488 See CLHS 2.3.1.1 "Potential Numbers as Tokens".
491 static inline bool
492 has_a_dot (const struct token *tp)
494 int n = tp->charcount;
495 int i;
497 for (i = 0; i < n; i++)
498 if (tp->chars[i].attribute == a_dot)
499 return true;
500 return false;
503 static inline bool
504 all_a_number (const struct token *tp)
506 int n = tp->charcount;
507 int i;
509 for (i = 0; i < n; i++)
510 if (!is_number_attribute (tp->chars[i].attribute))
511 return false;
512 return true;
515 static inline void
516 a_letter_to_digit (const struct token *tp, int base)
518 int n = tp->charcount;
519 int i;
521 for (i = 0; i < n; i++)
522 if (is_letter_attribute (tp->chars[i].attribute))
524 int c = tp->chars[i].ch;
526 if (c >= 'a')
527 c -= 'a' - 'A';
528 if (c - 'A' + 10 < base)
529 tp->chars[i].attribute -= 2; /* a_letter -> a_letterdigit,
530 a_expo -> a_expodigit */
534 static inline bool
535 has_a_digit (const struct token *tp)
537 int n = tp->charcount;
538 int i;
540 for (i = 0; i < n; i++)
541 if (tp->chars[i].attribute == a_digit
542 || tp->chars[i].attribute == a_letterdigit
543 || tp->chars[i].attribute == a_expodigit)
544 return true;
545 return false;
548 static inline bool
549 has_adjacent_letters (const struct token *tp)
551 int n = tp->charcount;
552 int i;
554 for (i = 1; i < n; i++)
555 if (is_letter_attribute (tp->chars[i-1].attribute)
556 && is_letter_attribute (tp->chars[i].attribute))
557 return true;
558 return false;
561 static bool
562 is_potential_number (const struct token *tp, int *basep)
564 /* CLHS 2.3.1.1.1:
565 "A potential number cannot contain any escape characters." */
566 if (tp->with_escape)
567 return false;
569 if (has_a_dot (tp))
570 *basep = 10;
572 if (!all_a_number (tp))
573 return false;
575 a_letter_to_digit (tp, *basep);
577 if (!has_a_digit (tp))
578 return false;
580 if (has_adjacent_letters (tp))
581 return false;
583 if (!(tp->chars[0].attribute >= a_dot
584 && tp->chars[0].attribute <= a_expodigit))
585 return false;
587 if (tp->chars[tp->charcount - 1].attribute == a_sign)
588 return false;
590 return true;
593 /* A number is one of integer, ratio, float. Each has a particular syntax.
594 See CLHS 2.3.1 "Numbers as Tokens".
595 But note a mistake: The exponent rule should read:
596 exponent ::= exponent-marker [sign] {decimal-digit}+
597 (see 22.1.3.1.3 "Printing Floats"). */
599 enum number_type
601 n_none,
602 n_integer,
603 n_ratio,
604 n_float
607 static enum number_type
608 is_number (const struct token *tp, int *basep)
610 struct token_char *ptr_limit;
611 struct token_char *ptr1;
613 if (!is_potential_number (tp, basep))
614 return n_none;
616 /* is_potential_number guarantees
617 - all attributes are >= a_ratio,
618 - there is at least one a_digit or a_letterdigit or a_expodigit, and
619 - if there is an a_dot, then *basep = 10. */
621 ptr1 = &tp->chars[0];
622 ptr_limit = &tp->chars[tp->charcount];
624 if (ptr1->attribute == a_sign)
625 ptr1++;
627 /* Test for syntax
628 * { a_sign | }
629 * { a_digit < base }+ { a_ratio { a_digit < base }+ | }
632 bool seen_a_ratio = false;
633 bool seen_a_digit = false; /* seen a digit in last digit block? */
634 struct token_char *ptr;
636 for (ptr = ptr1;; ptr++)
638 if (ptr >= ptr_limit)
640 if (!seen_a_digit)
641 break;
642 if (seen_a_ratio)
643 return n_ratio;
644 else
645 return n_integer;
647 if (ptr->attribute == a_digit
648 || ptr->attribute == a_letterdigit
649 || ptr->attribute == a_expodigit)
651 int c = ptr->ch;
653 c = (c < 'A' ? c - '0' : c < 'a' ? c - 'A' + 10 : c - 'a' + 10);
654 if (c >= *basep)
655 break;
656 seen_a_digit = true;
658 else if (ptr->attribute == a_ratio)
660 if (seen_a_ratio || !seen_a_digit)
661 break;
662 seen_a_ratio = true;
663 seen_a_digit = false;
665 else
666 break;
670 /* Test for syntax
671 * { a_sign | }
672 * { a_digit }* { a_dot { a_digit }* | }
673 * { a_expo { a_sign | } { a_digit }+ | }
675 * If there is an exponent part, there must be digits before the dot or
676 * after the dot. The result is a float.
677 * If there is no exponen:
678 * If there is no dot, it would an integer in base 10, but is has already
679 * been verified to not be an integer in the current base.
680 * If there is a dot:
681 * If there are digits after the dot, it's a float.
682 * Otherwise, if there are digits before the dot, it's an integer.
684 *basep = 10;
686 bool seen_a_dot = false;
687 bool seen_a_dot_with_leading_digits = false;
688 bool seen_a_digit = false; /* seen a digit in last digit block? */
689 struct token_char *ptr;
691 for (ptr = ptr1;; ptr++)
693 if (ptr >= ptr_limit)
695 /* no exponent */
696 if (!seen_a_dot)
697 return n_none;
698 if (seen_a_digit)
699 return n_float;
700 if (seen_a_dot_with_leading_digits)
701 return n_integer;
702 else
703 return n_none;
705 if (ptr->attribute == a_digit)
707 seen_a_digit = true;
709 else if (ptr->attribute == a_dot)
711 if (seen_a_dot)
712 return n_none;
713 seen_a_dot = true;
714 if (seen_a_digit)
715 seen_a_dot_with_leading_digits = true;
716 seen_a_digit = false;
718 else if (ptr->attribute == a_expo || ptr->attribute == a_expodigit)
719 break;
720 else
721 return n_none;
723 ptr++;
724 if (!seen_a_dot_with_leading_digits || !seen_a_digit)
725 return n_none;
726 if (ptr >= ptr_limit)
727 return n_none;
728 if (ptr->attribute == a_sign)
729 ptr++;
730 seen_a_digit = false;
731 for (;; ptr++)
733 if (ptr >= ptr_limit)
734 break;
735 if (ptr->attribute != a_digit)
736 return n_none;
737 seen_a_digit = true;
739 if (!seen_a_digit)
740 return n_none;
741 return n_float;
745 /* A token representing a symbol must be case converted.
746 For portability, we convert only ASCII characters here. */
748 static void
749 upcase_token (struct token *tp)
751 int n = tp->charcount;
752 int i;
754 for (i = 0; i < n; i++)
755 if (tp->chars[i].attribute != a_escaped)
757 unsigned char c = tp->chars[i].ch;
758 if (c >= 'a' && c <= 'z')
759 tp->chars[i].ch = c - 'a' + 'A';
763 static void
764 downcase_token (struct token *tp)
766 int n = tp->charcount;
767 int i;
769 for (i = 0; i < n; i++)
770 if (tp->chars[i].attribute != a_escaped)
772 unsigned char c = tp->chars[i].ch;
773 if (c >= 'A' && c <= 'Z')
774 tp->chars[i].ch = c - 'A' + 'a';
778 static void
779 case_convert_token (struct token *tp)
781 int n = tp->charcount;
782 int i;
784 switch (readtable_case)
786 case case_upcase:
787 upcase_token (tp);
788 break;
790 case case_downcase:
791 downcase_token (tp);
792 break;
794 case case_preserve:
795 break;
797 case case_invert:
799 bool seen_uppercase = false;
800 bool seen_lowercase = false;
801 for (i = 0; i < n; i++)
802 if (tp->chars[i].attribute != a_escaped)
804 unsigned char c = tp->chars[i].ch;
805 if (c >= 'a' && c <= 'z')
806 seen_lowercase = true;
807 if (c >= 'A' && c <= 'Z')
808 seen_uppercase = true;
810 if (seen_uppercase)
812 if (!seen_lowercase)
813 downcase_token (tp);
815 else
817 if (seen_lowercase)
818 upcase_token (tp);
821 break;
826 /* ========================= Accumulating comments ========================= */
829 static char *buffer;
830 static size_t bufmax;
831 static size_t buflen;
833 static inline void
834 comment_start ()
836 buflen = 0;
839 static inline void
840 comment_add (int c)
842 if (buflen >= bufmax)
844 bufmax = 2 * bufmax + 10;
845 buffer = xrealloc (buffer, bufmax);
847 buffer[buflen++] = c;
850 static inline void
851 comment_line_end (size_t chars_to_remove)
853 buflen -= chars_to_remove;
854 while (buflen >= 1
855 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
856 --buflen;
857 if (chars_to_remove == 0 && buflen >= bufmax)
859 bufmax = 2 * bufmax + 10;
860 buffer = xrealloc (buffer, bufmax);
862 buffer[buflen] = '\0';
863 xgettext_comment_add (buffer);
867 /* These are for tracking whether comments count as immediately before
868 keyword. */
869 static int last_comment_line;
870 static int last_non_comment_line;
873 /* ========================= Accumulating messages ========================= */
876 static message_list_ty *mlp;
879 /* ============== Reading of objects. See CLHS 2 "Syntax". ============== */
882 /* We are only interested in symbols (e.g. GETTEXT or NGETTEXT) and strings.
883 Other objects need not to be represented precisely. */
884 enum object_type
886 t_symbol, /* symbol */
887 t_string, /* string */
888 t_other, /* other kind of real object */
889 t_dot, /* '.' pseudo object */
890 t_close, /* ')' pseudo object */
891 t_eof /* EOF marker */
894 struct object
896 enum object_type type;
897 struct token *token; /* for t_symbol and t_string */
898 int line_number_at_start; /* for t_string */
901 /* Free the memory pointed to by a 'struct object'. */
902 static inline void
903 free_object (struct object *op)
905 if (op->type == t_symbol || op->type == t_string)
907 free_token (op->token);
908 free (op->token);
912 /* Convert a t_symbol/t_string token to a char*. */
913 static char *
914 string_of_object (const struct object *op)
916 char *str;
917 const struct token_char *p;
918 char *q;
919 int n;
921 if (!(op->type == t_symbol || op->type == t_string))
922 abort ();
923 n = op->token->charcount;
924 str = (char *) xmalloc (n + 1);
925 q = str;
926 for (p = op->token->chars; n > 0; p++, n--)
927 *q++ = p->ch;
928 *q = '\0';
929 return str;
932 /* Context lookup table. */
933 static flag_context_list_table_ty *flag_context_list_table;
935 /* Read the next object. */
936 static void
937 read_object (struct object *op, flag_context_ty outer_context)
939 for (;;)
941 struct char_syntax curr;
943 read_char_syntax (&curr);
945 switch (curr.scode)
947 case syntax_eof:
948 op->type = t_eof;
949 return;
951 case syntax_whitespace:
952 if (curr.ch == '\n')
953 /* Comments assumed to be grouped with a message must immediately
954 precede it, with no non-whitespace token on a line between
955 both. */
956 if (last_non_comment_line > last_comment_line)
957 xgettext_comment_reset ();
958 continue;
960 case syntax_illegal:
961 op->type = t_other;
962 return;
964 case syntax_single_esc:
965 case syntax_multi_esc:
966 case syntax_constituent:
967 /* Start reading a token. */
968 op->token = (struct token *) xmalloc (sizeof (struct token));
969 read_token (op->token, &curr);
970 last_non_comment_line = line_number;
972 /* Interpret the token. */
974 /* Dots. */
975 if (!op->token->with_escape
976 && op->token->charcount == 1
977 && op->token->chars[0].attribute == a_dot)
979 free_token (op->token);
980 free (op->token);
981 op->type = t_dot;
982 return;
984 /* Tokens consisting entirely of dots are illegal, but be tolerant
985 here. */
987 /* Number. */
989 int base = read_base;
991 if (is_number (op->token, &base) != n_none)
993 free_token (op->token);
994 free (op->token);
995 op->type = t_other;
996 return;
1000 /* We interpret all other tokens as symbols (including 'reserved
1001 tokens', i.e. potential numbers which are not numbers). */
1002 case_convert_token (op->token);
1003 op->type = t_symbol;
1004 return;
1006 case syntax_t_macro:
1007 case syntax_nt_macro:
1008 /* Read a macro. */
1009 switch (curr.ch)
1011 case '(':
1013 int arg = 0; /* Current argument number. */
1014 flag_context_list_iterator_ty context_iter;
1015 int argnum1 = 0; /* First string position. */
1016 int argnum2 = 0; /* Plural string position. */
1017 message_ty *plural_mp = NULL; /* Remember the msgid. */
1019 for (;; arg++)
1021 struct object inner;
1022 flag_context_ty inner_context;
1024 if (arg == 0)
1025 inner_context = null_context;
1026 else
1027 inner_context =
1028 inherited_context (outer_context,
1029 flag_context_list_iterator_advance (
1030 &context_iter));
1032 read_object (&inner, inner_context);
1034 /* Recognize end of list. */
1035 if (inner.type == t_close)
1037 op->type = t_other;
1038 /* Don't bother converting "()" to "NIL". */
1039 last_non_comment_line = line_number;
1040 return;
1043 /* Dots are not allowed in every position.
1044 But be tolerant. */
1046 /* EOF inside list is illegal.
1047 But be tolerant. */
1048 if (inner.type == t_eof)
1049 break;
1051 if (arg == 0)
1053 /* This is the function position. */
1054 if (inner.type == t_symbol)
1056 char *symbol_name = string_of_object (&inner);
1057 int i;
1058 int prefix_len;
1059 void *keyword_value;
1061 /* Omit any package name. */
1062 i = inner.token->charcount;
1063 while (i > 0
1064 && inner.token->chars[i-1].attribute != a_pack_m)
1065 i--;
1066 prefix_len = i;
1068 if (find_entry (&keywords,
1069 symbol_name + prefix_len,
1070 strlen (symbol_name + prefix_len),
1071 &keyword_value)
1072 == 0)
1074 argnum1 = (int) (long) keyword_value & ((1 << 10) - 1);
1075 argnum2 = (int) (long) keyword_value >> 10;
1078 context_iter =
1079 flag_context_list_iterator (
1080 flag_context_list_table_lookup (
1081 flag_context_list_table,
1082 symbol_name, strlen (symbol_name)));
1084 free (symbol_name);
1086 else
1087 context_iter = null_context_list_iterator;
1089 else
1091 /* These are the argument positions.
1092 Extract a string if we have reached the right
1093 argument position. */
1094 if (arg == argnum1)
1096 if (inner.type == t_string)
1098 lex_pos_ty pos;
1099 message_ty *mp;
1101 pos.file_name = logical_file_name;
1102 pos.line_number = inner.line_number_at_start;
1103 mp = remember_a_message (mlp, string_of_object (&inner),
1104 inner_context, &pos);
1105 if (argnum2 > 0)
1106 plural_mp = mp;
1109 else if (arg == argnum2)
1111 if (inner.type == t_string && plural_mp != NULL)
1113 lex_pos_ty pos;
1115 pos.file_name = logical_file_name;
1116 pos.line_number = inner.line_number_at_start;
1117 remember_a_message_plural (plural_mp, string_of_object (&inner),
1118 inner_context, &pos);
1123 free_object (&inner);
1126 op->type = t_other;
1127 last_non_comment_line = line_number;
1128 return;
1130 case ')':
1131 /* Tell the caller about the end of list.
1132 Unmatched closing parenthesis is illegal.
1133 But be tolerant. */
1134 op->type = t_close;
1135 last_non_comment_line = line_number;
1136 return;
1138 case ',':
1140 int c = do_getc ();
1141 /* The ,@ handling inside lists is wrong anyway, because
1142 ,@form expands to an unknown number of elements. */
1143 if (c != EOF && c != '@' && c != '.')
1144 do_ungetc (c);
1146 /*FALLTHROUGH*/
1147 case '\'':
1148 case '`':
1150 struct object inner;
1152 read_object (&inner, null_context);
1154 /* Dots and EOF are not allowed here. But be tolerant. */
1156 free_object (&inner);
1158 op->type = t_other;
1159 last_non_comment_line = line_number;
1160 return;
1163 case ';':
1165 bool all_semicolons = true;
1167 last_comment_line = line_number;
1168 comment_start ();
1169 for (;;)
1171 int c = do_getc ();
1172 if (c == EOF || c == '\n')
1173 break;
1174 if (c != ';')
1175 all_semicolons = false;
1176 if (!all_semicolons)
1178 /* We skip all leading white space, but not EOLs. */
1179 if (!(buflen == 0 && (c == ' ' || c == '\t')))
1180 comment_add (c);
1183 comment_line_end (0);
1184 continue;
1187 case '"':
1189 op->token = (struct token *) xmalloc (sizeof (struct token));
1190 init_token (op->token);
1191 op->line_number_at_start = line_number;
1192 for (;;)
1194 int c = do_getc ();
1195 if (c == EOF)
1196 /* Invalid input. Be tolerant, no error message. */
1197 break;
1198 if (c == '"')
1199 break;
1200 if (c == '\\') /* syntax_single_esc */
1202 c = do_getc ();
1203 if (c == EOF)
1204 /* Invalid input. Be tolerant, no error message. */
1205 break;
1207 grow_token (op->token);
1208 op->token->chars[op->token->charcount++].ch = c;
1210 op->type = t_string;
1212 if (extract_all)
1214 lex_pos_ty pos;
1216 pos.file_name = logical_file_name;
1217 pos.line_number = op->line_number_at_start;
1218 remember_a_message (mlp, string_of_object (op),
1219 null_context, &pos);
1221 last_non_comment_line = line_number;
1222 return;
1225 case '#':
1226 /* Dispatch macro handling. */
1228 int c;
1230 for (;;)
1232 c = do_getc ();
1233 if (c == EOF)
1234 /* Invalid input. Be tolerant, no error message. */
1236 op->type = t_other;
1237 return;
1239 if (!(c >= '0' && c <= '9'))
1240 break;
1243 switch (c)
1245 case '(':
1246 case '"':
1247 do_ungetc (c);
1248 /*FALLTHROUGH*/
1249 case '\'':
1250 case ':':
1251 case '.':
1252 case ',':
1253 case 'A': case 'a':
1254 case 'C': case 'c':
1255 case 'P': case 'p':
1256 case 'S': case 's':
1258 struct object inner;
1259 read_object (&inner, null_context);
1260 /* Dots and EOF are not allowed here.
1261 But be tolerant. */
1262 free_object (&inner);
1263 op->type = t_other;
1264 last_non_comment_line = line_number;
1265 return;
1268 case '|':
1270 int depth = 0;
1271 int c;
1273 comment_start ();
1274 c = do_getc ();
1275 for (;;)
1277 if (c == EOF)
1278 break;
1279 if (c == '|')
1281 c = do_getc ();
1282 if (c == EOF)
1283 break;
1284 if (c == '#')
1286 if (depth == 0)
1288 comment_line_end (0);
1289 break;
1291 depth--;
1292 comment_add ('|');
1293 comment_add ('#');
1294 c = do_getc ();
1296 else
1297 comment_add ('|');
1299 else if (c == '#')
1301 c = do_getc ();
1302 if (c == EOF)
1303 break;
1304 comment_add ('#');
1305 if (c == '|')
1307 depth++;
1308 comment_add ('|');
1309 c = do_getc ();
1312 else
1314 /* We skip all leading white space. */
1315 if (!(buflen == 0 && (c == ' ' || c == '\t')))
1316 comment_add (c);
1317 if (c == '\n')
1319 comment_line_end (1);
1320 comment_start ();
1322 c = do_getc ();
1325 if (c == EOF)
1327 /* EOF not allowed here. But be tolerant. */
1328 op->type = t_eof;
1329 return;
1331 last_comment_line = line_number;
1332 continue;
1335 case '\\':
1337 struct token token;
1338 struct char_syntax first;
1339 first.ch = '\\';
1340 first.scode = syntax_single_esc;
1341 read_token (&token, &first);
1342 free_token (&token);
1343 op->type = t_other;
1344 last_non_comment_line = line_number;
1345 return;
1348 case 'B': case 'b':
1349 case 'O': case 'o':
1350 case 'X': case 'x':
1351 case 'R': case 'r':
1352 case '*':
1354 struct token token;
1355 read_token (&token, NULL);
1356 free_token (&token);
1357 op->type = t_other;
1358 last_non_comment_line = line_number;
1359 return;
1362 case '=':
1363 /* Ignore read labels. */
1364 continue;
1366 case '#':
1367 /* Don't bother looking up the corresponding object. */
1368 op->type = t_other;
1369 last_non_comment_line = line_number;
1370 return;
1372 case '+':
1373 case '-':
1374 /* Simply assume every feature expression is true. */
1376 struct object inner;
1377 read_object (&inner, null_context);
1378 /* Dots and EOF are not allowed here.
1379 But be tolerant. */
1380 free_object (&inner);
1381 continue;
1384 default:
1385 op->type = t_other;
1386 last_non_comment_line = line_number;
1387 return;
1389 /*NOTREACHED*/
1390 abort ();
1393 default:
1394 /*NOTREACHED*/
1395 abort ();
1398 default:
1399 /*NOTREACHED*/
1400 abort ();
1406 void
1407 extract_lisp (FILE *f,
1408 const char *real_filename, const char *logical_filename,
1409 flag_context_list_table_ty *flag_table,
1410 msgdomain_list_ty *mdlp)
1412 mlp = mdlp->item[0]->messages;
1414 fp = f;
1415 real_file_name = real_filename;
1416 logical_file_name = xstrdup (logical_filename);
1417 line_number = 1;
1419 last_comment_line = -1;
1420 last_non_comment_line = -1;
1422 flag_context_list_table = flag_table;
1424 init_keywords ();
1426 /* Eat tokens until eof is seen. When read_object returns
1427 due to an unbalanced closing parenthesis, just restart it. */
1430 struct object toplevel_object;
1432 read_object (&toplevel_object, null_context);
1434 if (toplevel_object.type == t_eof)
1435 break;
1437 free_object (&toplevel_object);
1439 while (!feof (fp));
1441 /* Close scanner. */
1442 fp = NULL;
1443 real_file_name = NULL;
1444 logical_file_name = NULL;
1445 line_number = 0;