Sync usage with man page.
[netbsd-mini2440.git] / gnu / dist / gettext / gettext-tools / src / po-lex.c
blob8de21a85550e2e41caf7c8a6c4b4250b13e618f2
1 /* GNU gettext - internationalization aids
2 Copyright (C) 1995-1999, 2000-2004 Free Software Foundation, Inc.
4 This file was written by Peter Miller <millerp@canb.auug.org.au>.
5 Multibyte character handling by Bruno Haible <haible@clisp.cons.org>.
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software Foundation,
19 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
22 #ifdef HAVE_CONFIG_H
23 # include "config.h"
24 #endif
26 /* Specification. */
27 #include "po-lex.h"
29 #include <errno.h>
30 #include <limits.h>
31 #include <stdio.h>
32 #include <stdlib.h>
33 #include <string.h>
34 #include <stdarg.h>
36 #if HAVE_ICONV
37 # include <iconv.h>
38 #endif
40 #include "c-ctype.h"
41 #include "linebreak.h"
42 #include "vasprintf.h"
43 #include "gettext.h"
44 #include "po-charset.h"
45 #include "xalloc.h"
46 #include "exit.h"
47 #include "error.h"
48 #include "error-progname.h"
49 #include "pos.h"
50 #include "str-list.h"
51 #include "po-gram-gen2.h"
53 #define _(str) gettext(str)
55 #if HAVE_ICONV
56 # include "utf8-ucs4.h"
57 #endif
59 #if HAVE_DECL_GETC_UNLOCKED
60 # undef getc
61 # define getc getc_unlocked
62 #endif
65 /* Current position within the PO file. */
66 lex_pos_ty gram_pos;
67 int gram_pos_column;
70 /* Error handling during the parsing of a PO file.
71 These functions can access gram_pos and gram_pos_column. */
73 #if !(__STDC__ && \
74 ((defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L && !defined __DECC) \
75 || (defined __GNUC__ && __GNUC__ >= 2 && !defined __APPLE_CC__)))
77 /* CAUTION: If you change this function, you must also make identical
78 changes to the macro of the same name in src/po-lex.h */
80 /* VARARGS1 */
81 void
82 po_gram_error (const char *fmt, ...)
84 va_list ap;
85 char *buffer;
87 va_start (ap, fmt);
88 if (vasprintf (&buffer, fmt, ap) < 0)
89 error (EXIT_FAILURE, 0, _("memory exhausted"));
90 va_end (ap);
91 error_with_progname = false;
92 po_error (0, 0, "%s:%lu:%d: %s", gram_pos.file_name,
93 (unsigned long) gram_pos.line_number, gram_pos_column + 1, buffer);
94 error_with_progname = true;
95 free (buffer);
97 /* Some messages need more than one line. Continuation lines are
98 indicated by using "..." at the start of the string. We don't
99 increment the error counter for these continuation lines. */
100 if (*fmt == '.')
101 --error_message_count;
102 else if (error_message_count >= gram_max_allowed_errors)
103 po_error (EXIT_FAILURE, 0, _("too many errors, aborting"));
106 /* CAUTION: If you change this function, you must also make identical
107 changes to the macro of the same name in src/po-lex.h */
109 /* VARARGS2 */
110 void
111 po_gram_error_at_line (const lex_pos_ty *pp, const char *fmt, ...)
113 va_list ap;
114 char *buffer;
116 va_start (ap, fmt);
117 if (vasprintf (&buffer, fmt, ap) < 0)
118 error (EXIT_FAILURE, 0, _("memory exhausted"));
119 va_end (ap);
120 error_with_progname = false;
121 po_error_at_line (0, 0, pp->file_name, pp->line_number, "%s", buffer);
122 error_with_progname = true;
123 free (buffer);
125 /* Some messages need more than one line, or more than one location.
126 Continuation lines are indicated by using "..." at the start of the
127 string. We don't increment the error counter for these
128 continuation lines. */
129 if (*fmt == '.')
130 --error_message_count;
131 else if (error_message_count >= gram_max_allowed_errors)
132 po_error (EXIT_FAILURE, 0, _("too many errors, aborting"));
135 #endif
/* The lowest level of PO file parsing converts bytes to multibyte characters.
   This is needed
   1. for C compatibility: ISO C 99 section 5.1.1.2 says that the first
      translation phase maps bytes to characters.
   2. to keep track of the current column, for the sake of precise error
      location.  Emacs compile.el interprets the column in error messages
      by default as a screen column number, not as a character number.
   3. to avoid skipping backslash-newline in the midst of a multibyte
      character.  If XY is a multibyte character, X \ newline Y is invalid.
 */
/* Multibyte character data type.
   How a character gets decoded depends on po_lex_charset and
   po_lex_iconv, which are set while the file is being parsed.  */

/* Capacity, in bytes, of the byte buffer of a multibyte character.  */
#define MBCHAR_BUF_SIZE 24

struct mbchar
{
  size_t bytes;                 /* byte length of the character, > 0;
                                   0 is reserved for EOF */
#if HAVE_ICONV
  bool uc_valid;                /* whether uc holds a valid Unicode
                                   character */
  unsigned int uc;              /* the character itself, meaningful only
                                   when uc_valid is true */
#endif
  char buf[MBCHAR_BUF_SIZE];    /* the raw bytes of the character */
};

/* A one-element array type: an mbchar_t argument decays to a pointer,
   so multibyte characters are passed by reference automatically.  */
typedef struct mbchar mbchar_t[1];
/* Copy N bytes from SRC to DST, front to back.
   A stand-in for memcpy tuned for the case N <= 1: nothing is touched
   when N is 0, and a single byte is copied without entering a loop.  */
static inline void
memcpy_small (void *dst, const void *src, size_t n)
{
  const char *from = (const char *) src;
  char *to = (char *) dst;

  if (n == 0)
    return;

  *to = *from;
  while (--n > 0)
    *++to = *++from;
}
184 /* EOF (not a real character) is represented with bytes = 0 and
185 uc_valid = false. */
186 static inline bool
187 mb_iseof (const mbchar_t mbc)
189 return (mbc->bytes == 0);
192 /* Access the current character. */
193 static inline const char *
194 mb_ptr (const mbchar_t mbc)
196 return mbc->buf;
198 static inline size_t
199 mb_len (const mbchar_t mbc)
201 return mbc->bytes;
204 /* Comparison of characters. */
206 static inline bool
207 mb_iseq (const mbchar_t mbc, char sc)
209 /* Note: It is wrong to compare only mbc->uc, because when the encoding is
210 SHIFT_JIS, mbc->buf[0] == '\\' corresponds to mbc->uc == 0x00A5, but we
211 want to treat it as an escape character, although it looks like a Yen
212 sign. */
213 #if HAVE_ICONV && 0
214 if (mbc->uc_valid)
215 return (mbc->uc == sc); /* wrong! */
216 else
217 #endif
218 return (mbc->bytes == 1 && mbc->buf[0] == sc);
221 static inline bool
222 mb_isnul (const mbchar_t mbc)
224 #if HAVE_ICONV
225 if (mbc->uc_valid)
226 return (mbc->uc == 0);
227 else
228 #endif
229 return (mbc->bytes == 1 && mbc->buf[0] == 0);
232 static inline int
233 mb_cmp (const mbchar_t mbc1, const mbchar_t mbc2)
235 #if HAVE_ICONV
236 if (mbc1->uc_valid && mbc2->uc_valid)
237 return (int) mbc1->uc - (int) mbc2->uc;
238 else
239 #endif
240 return (mbc1->bytes == mbc2->bytes
241 ? memcmp (mbc1->buf, mbc2->buf, mbc1->bytes)
242 : mbc1->bytes < mbc2->bytes
243 ? (memcmp (mbc1->buf, mbc2->buf, mbc1->bytes) > 0 ? 1 : -1)
244 : (memcmp (mbc1->buf, mbc2->buf, mbc2->bytes) >= 0 ? 1 : -1));
247 static inline bool
248 mb_equal (const mbchar_t mbc1, const mbchar_t mbc2)
250 #if HAVE_ICONV
251 if (mbc1->uc_valid && mbc2->uc_valid)
252 return mbc1->uc == mbc2->uc;
253 else
254 #endif
255 return (mbc1->bytes == mbc2->bytes
256 && memcmp (mbc1->buf, mbc2->buf, mbc1->bytes) == 0);
259 /* <ctype.h>, <wctype.h> classification. */
261 static inline bool
262 mb_isascii (const mbchar_t mbc)
264 #if HAVE_ICONV
265 if (mbc->uc_valid)
266 return (mbc->uc >= 0x0000 && mbc->uc <= 0x007F);
267 else
268 #endif
269 return mbc->bytes == 1 && (mbc->buf[0] & 0x80) == 0;
272 /* Extra <wchar.h> function. */
274 /* Unprintable characters appear as a small box of width 1. */
275 #define MB_UNPRINTABLE_WIDTH 1
277 static int
278 mb_width (const mbchar_t mbc)
280 #if HAVE_ICONV
281 if (mbc->uc_valid)
283 unsigned int uc = mbc->uc;
284 const char *encoding =
285 (po_lex_iconv != (iconv_t)(-1) ? po_lex_charset : "");
286 int w = uc_width (uc, encoding);
287 /* For unprintable characters, arbitrarily return 0 for control
288 characters (except tab) and MB_UNPRINTABLE_WIDTH otherwise. */
289 if (w >= 0)
290 return w;
291 if (uc >= 0x0000 && uc <= 0x001F)
293 if (uc == 0x0009)
294 return 8 - (gram_pos_column & 7);
295 return 0;
297 if ((uc >= 0x007F && uc <= 0x009F) || (uc >= 0x2028 && uc <= 0x2029))
298 return 0;
299 return MB_UNPRINTABLE_WIDTH;
301 else
302 #endif
304 if (mbc->bytes == 1)
306 if ((mbc->buf[0] & 0x80) == 0 && mbc->buf[0] <= 0x1F)
308 if (mbc->buf[0] == 0x09)
309 return 8 - (gram_pos_column & 7);
310 return 0;
312 if (mbc->buf[0] == 0x7F)
313 return 0;
315 return MB_UNPRINTABLE_WIDTH;
319 /* Output. */
320 static inline void
321 mb_putc (const mbchar_t mbc, FILE *stream)
323 fwrite (mbc->buf, 1, mbc->bytes, stream);
326 /* Assignment. */
327 static inline void
328 mb_setascii (mbchar_t mbc, char sc)
330 mbc->bytes = 1;
331 #if HAVE_ICONV
332 mbc->uc_valid = 1;
333 mbc->uc = sc;
334 #endif
335 mbc->buf[0] = sc;
338 /* Copying a character. */
339 static inline void
340 mb_copy (mbchar_t new, const mbchar_t old)
342 memcpy_small (&new->buf[0], &old->buf[0], old->bytes);
343 new->bytes = old->bytes;
344 #if HAVE_ICONV
345 if ((new->uc_valid = old->uc_valid))
346 new->uc = old->uc;
347 #endif
351 /* Multibyte character input. */
353 /* Number of characters that can be pushed back.
354 We need 1 for lex_getc, plus 1 for lex_ungetc. */
355 #define NPUSHBACK 2
357 /* Data type of a multibyte character input stream. */
358 struct mbfile
360 FILE *fp;
361 bool eof_seen;
362 int have_pushback;
363 unsigned int bufcount;
364 char buf[MBCHAR_BUF_SIZE];
365 struct mbchar pushback[NPUSHBACK];
368 /* We want to pass multibyte streams by reference automatically,
369 therefore we use an array type. */
370 typedef struct mbfile mbfile_t[1];
/* True if invalid multibyte sequences in the input shall be reported
   through po_gram_error, false if they shall be silently tolerated.  */
static bool signal_eilseq;
376 static inline void
377 mbfile_init (mbfile_t mbf, FILE *stream)
379 mbf->fp = stream;
380 mbf->eof_seen = false;
381 mbf->have_pushback = 0;
382 mbf->bufcount = 0;
385 /* Read the next multibyte character from mbf and put it into mbc.
386 If a read error occurs, errno is set and ferror (mbf->fp) becomes true. */
387 static void
388 mbfile_getc (mbchar_t mbc, mbfile_t mbf)
390 size_t bytes;
392 /* If EOF has already been seen, don't use getc. This matters if
393 mbf->fp is connected to an interactive tty. */
394 if (mbf->eof_seen)
395 goto eof;
397 /* Return character pushed back, if there is one. */
398 if (mbf->have_pushback > 0)
400 mbf->have_pushback--;
401 mb_copy (mbc, &mbf->pushback[mbf->have_pushback]);
402 return;
405 /* Before using iconv, we need at least one byte. */
406 if (mbf->bufcount == 0)
408 int c = getc (mbf->fp);
409 if (c == EOF)
411 mbf->eof_seen = true;
412 goto eof;
414 mbf->buf[0] = (unsigned char) c;
415 mbf->bufcount++;
418 #if HAVE_ICONV
419 if (po_lex_iconv != (iconv_t)(-1))
421 /* Use iconv on an increasing number of bytes. Read only as many
422 bytes from mbf->fp as needed. This is needed to give reasonable
423 interactive behaviour when mbf->fp is connected to an interactive
424 tty. */
425 for (;;)
427 unsigned char scratchbuf[64];
428 const char *inptr = &mbf->buf[0];
429 size_t insize = mbf->bufcount;
430 char *outptr = (char *) &scratchbuf[0];
431 size_t outsize = sizeof (scratchbuf);
433 size_t res = iconv (po_lex_iconv,
434 (ICONV_CONST char **) &inptr, &insize,
435 &outptr, &outsize);
436 /* We expect that a character has been produced if and only if
437 some input bytes have been consumed. */
438 if ((insize < mbf->bufcount) != (outsize < sizeof (scratchbuf)))
439 abort ();
440 if (outsize == sizeof (scratchbuf))
442 /* No character has been produced. Must be an error. */
443 if (res != (size_t)(-1))
444 abort ();
446 if (errno == EILSEQ)
448 /* An invalid multibyte sequence was encountered. */
449 /* Return a single byte. */
450 if (signal_eilseq)
451 po_gram_error (_("invalid multibyte sequence"));
452 bytes = 1;
453 mbc->uc_valid = false;
454 break;
456 else if (errno == EINVAL)
458 /* An incomplete multibyte character. */
459 int c;
461 if (mbf->bufcount == MBCHAR_BUF_SIZE)
463 /* An overlong incomplete multibyte sequence was
464 encountered. */
465 /* Return a single byte. */
466 bytes = 1;
467 mbc->uc_valid = false;
468 break;
471 /* Read one more byte and retry iconv. */
472 c = getc (mbf->fp);
473 if (c == EOF)
475 mbf->eof_seen = true;
476 if (ferror (mbf->fp))
477 goto eof;
478 if (signal_eilseq)
479 po_gram_error (_("\
480 incomplete multibyte sequence at end of file"));
481 bytes = mbf->bufcount;
482 mbc->uc_valid = false;
483 break;
485 mbf->buf[mbf->bufcount++] = (unsigned char) c;
486 if (c == '\n')
488 if (signal_eilseq)
489 po_gram_error (_("\
490 incomplete multibyte sequence at end of line"));
491 bytes = mbf->bufcount - 1;
492 mbc->uc_valid = false;
493 break;
496 else
497 po_error (EXIT_FAILURE, errno, _("iconv failure"));
499 else
501 size_t outbytes = sizeof (scratchbuf) - outsize;
502 bytes = mbf->bufcount - insize;
504 /* We expect that one character has been produced. */
505 if (bytes == 0)
506 abort ();
507 if (outbytes == 0)
508 abort ();
509 /* Convert it from UTF-8 to UCS-4. */
510 if (u8_mbtouc (&mbc->uc, scratchbuf, outbytes) < outbytes)
512 /* scratchbuf contains an out-of-range Unicode character
513 (> 0x10ffff). */
514 if (signal_eilseq)
515 po_gram_error (_("invalid multibyte sequence"));
516 mbc->uc_valid = false;
517 break;
519 mbc->uc_valid = true;
520 break;
524 else
525 #endif
527 if (po_lex_weird_cjk
528 /* Special handling of encodings with CJK structure. */
529 && (unsigned char) mbf->buf[0] >= 0x80)
531 if (mbf->bufcount == 1)
533 /* Read one more byte. */
534 int c = getc (mbf->fp);
535 if (c == EOF)
537 if (ferror (mbf->fp))
539 mbf->eof_seen = true;
540 goto eof;
543 else
545 mbf->buf[1] = (unsigned char) c;
546 mbf->bufcount++;
549 if (mbf->bufcount >= 2 && (unsigned char) mbf->buf[1] >= 0x30)
550 /* Return a double byte. */
551 bytes = 2;
552 else
553 /* Return a single byte. */
554 bytes = 1;
556 else
558 /* Return a single byte. */
559 bytes = 1;
561 #if HAVE_ICONV
562 mbc->uc_valid = false;
563 #endif
566 /* Return the multibyte sequence mbf->buf[0..bytes-1]. */
567 memcpy_small (&mbc->buf[0], &mbf->buf[0], bytes);
568 mbc->bytes = bytes;
570 mbf->bufcount -= bytes;
571 if (mbf->bufcount > 0)
573 /* It's not worth calling memmove() for so few bytes. */
574 unsigned int count = mbf->bufcount;
575 char *p = &mbf->buf[0];
579 *p = *(p + bytes);
580 p++;
582 while (--count > 0);
584 return;
586 eof:
587 /* An mbchar_t with bytes == 0 is used to indicate EOF. */
588 mbc->bytes = 0;
589 #if HAVE_ICONV
590 mbc->uc_valid = false;
591 #endif
592 return;
595 static void
596 mbfile_ungetc (const mbchar_t mbc, mbfile_t mbf)
598 if (mbf->have_pushback >= NPUSHBACK)
599 abort ();
600 mb_copy (&mbf->pushback[mbf->have_pushback], mbc);
601 mbf->have_pushback++;
605 /* Lexer variables. */
607 static mbfile_t mbf;
608 unsigned int gram_max_allowed_errors = 20;
609 static bool po_lex_obsolete;
610 static bool pass_comments = false;
611 bool pass_obsolete_entries = false;
614 /* Prepare lexical analysis. */
615 void
616 lex_start (FILE *fp, const char *real_filename, const char *logical_filename)
618 /* Ignore the logical_filename, because PO file entries already have
619 their file names attached. But use real_filename for error messages. */
620 gram_pos.file_name = xstrdup (real_filename);
622 mbfile_init (mbf, fp);
624 gram_pos.line_number = 1;
625 gram_pos_column = 0;
626 signal_eilseq = true;
627 po_lex_obsolete = false;
628 po_lex_charset_init ();
631 /* Terminate lexical analysis. */
632 void
633 lex_end ()
635 mbf->fp = NULL;
636 gram_pos.file_name = NULL;
637 gram_pos.line_number = 0;
638 gram_pos_column = 0;
639 signal_eilseq = false;
640 po_lex_obsolete = false;
641 po_lex_charset_close ();
645 /* Read a single character, dealing with backslash-newline.
646 Also keep track of the current line number and column number. */
647 static void
648 lex_getc (mbchar_t mbc)
650 for (;;)
652 mbfile_getc (mbc, mbf);
654 if (mb_iseof (mbc))
656 if (ferror (mbf->fp))
658 bomb:
659 po_error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
660 gram_pos.file_name);
662 break;
665 if (mb_iseq (mbc, '\n'))
667 gram_pos.line_number++;
668 gram_pos_column = 0;
669 break;
672 gram_pos_column += mb_width (mbc);
674 if (mb_iseq (mbc, '\\'))
676 mbchar_t mbc2;
678 mbfile_getc (mbc2, mbf);
680 if (mb_iseof (mbc2))
682 if (ferror (mbf->fp))
683 goto bomb;
684 break;
687 if (!mb_iseq (mbc2, '\n'))
689 mbfile_ungetc (mbc2, mbf);
690 break;
693 gram_pos.line_number++;
694 gram_pos_column = 0;
696 else
697 break;
702 static void
703 lex_ungetc (const mbchar_t mbc)
705 if (!mb_iseof (mbc))
707 if (mb_iseq (mbc, '\n'))
708 /* Decrement the line number, but don't care about the column. */
709 gram_pos.line_number--;
710 else
711 /* Decrement the column number. Also works well enough for tabs. */
712 gram_pos_column -= mb_width (mbc);
714 mbfile_ungetc (mbc, mbf);
719 static int
720 keyword_p (const char *s)
722 if (!strcmp (s, "domain"))
723 return DOMAIN;
724 if (!strcmp (s, "msgid"))
725 return MSGID;
726 if (!strcmp (s, "msgid_plural"))
727 return MSGID_PLURAL;
728 if (!strcmp (s, "msgstr"))
729 return MSGSTR;
730 po_gram_error_at_line (&gram_pos, _("keyword \"%s\" unknown"), s);
731 return NAME;
735 static int
736 control_sequence ()
738 mbchar_t mbc;
739 int val;
740 int max;
742 lex_getc (mbc);
743 if (mb_len (mbc) == 1)
744 switch (mb_ptr (mbc) [0])
746 case 'n':
747 return '\n';
749 case 't':
750 return '\t';
752 case 'b':
753 return '\b';
755 case 'r':
756 return '\r';
758 case 'f':
759 return '\f';
761 case 'v':
762 return '\v';
764 case 'a':
765 return '\a';
767 case '\\':
768 case '"':
769 return mb_ptr (mbc) [0];
771 case '0': case '1': case '2': case '3':
772 case '4': case '5': case '6': case '7':
773 val = 0;
774 max = 0;
775 for (;;)
777 char c = mb_ptr (mbc) [0];
778 /* Warning: not portable, can't depend on '0'..'7' ordering. */
779 val = val * 8 + (c - '0');
780 if (++max == 3)
781 break;
782 lex_getc (mbc);
783 if (mb_len (mbc) == 1)
784 switch (mb_ptr (mbc) [0])
786 case '0': case '1': case '2': case '3':
787 case '4': case '5': case '6': case '7':
788 continue;
790 default:
791 break;
793 lex_ungetc (mbc);
794 break;
796 return val;
798 case 'x':
799 lex_getc (mbc);
800 if (mb_iseof (mbc) || mb_len (mbc) != 1
801 || !c_isxdigit (mb_ptr (mbc) [0]))
802 break;
804 val = 0;
805 for (;;)
807 char c = mb_ptr (mbc) [0];
808 val *= 16;
809 if (c_isdigit (c))
810 /* Warning: not portable, can't depend on '0'..'9' ordering */
811 val += c - '0';
812 else if (c_isupper (c))
813 /* Warning: not portable, can't depend on 'A'..'F' ordering */
814 val += c - 'A' + 10;
815 else
816 /* Warning: not portable, can't depend on 'a'..'f' ordering */
817 val += c - 'a' + 10;
819 lex_getc (mbc);
820 if (mb_len (mbc) == 1)
821 switch (mb_ptr (mbc) [0])
823 case '0': case '1': case '2': case '3': case '4':
824 case '5': case '6': case '7': case '8': case '9':
825 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
826 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
827 continue;
829 default:
830 break;
832 lex_ungetc (mbc);
833 break;
835 return val;
837 /* FIXME: \u and \U are not handled. */
839 lex_ungetc (mbc);
840 po_gram_error (_("invalid control sequence"));
841 return ' ';
845 /* Return the next token in the PO file. The return codes are defined
846 in "po-gram-gen2.h". Associated data is put in 'po_gram_lval'. */
848 po_gram_lex ()
850 static char *buf;
851 static size_t bufmax;
852 mbchar_t mbc;
853 size_t bufpos;
855 for (;;)
857 lex_getc (mbc);
859 if (mb_iseof (mbc))
860 /* Yacc want this for end of file. */
861 return 0;
863 if (mb_len (mbc) == 1)
864 switch (mb_ptr (mbc) [0])
866 case '\n':
867 po_lex_obsolete = false;
868 /* Ignore whitespace, not relevant for the grammar. */
869 break;
871 case ' ':
872 case '\t':
873 case '\r':
874 case '\f':
875 case '\v':
876 /* Ignore whitespace, not relevant for the grammar. */
877 break;
879 case '#':
880 lex_getc (mbc);
881 if (mb_iseq (mbc, '~'))
882 /* A pseudo-comment beginning with #~ is found. This is
883 not a comment. It is the format for obsolete entries.
884 We simply discard the "#~" prefix. The following
885 characters are expected to be well formed. */
887 po_lex_obsolete = true;
888 break;
891 /* Accumulate comments into a buffer. If we have been asked
892 to pass comments, generate a COMMENT token, otherwise
893 discard it. */
894 signal_eilseq = false;
895 if (pass_comments)
897 bufpos = 0;
898 for (;;)
900 while (bufpos + mb_len (mbc) >= bufmax)
902 bufmax += 100;
903 buf = xrealloc (buf, bufmax);
905 if (mb_iseof (mbc) || mb_iseq (mbc, '\n'))
906 break;
908 memcpy_small (&buf[bufpos], mb_ptr (mbc), mb_len (mbc));
909 bufpos += mb_len (mbc);
911 lex_getc (mbc);
913 buf[bufpos] = '\0';
915 po_gram_lval.string.string = buf;
916 po_gram_lval.string.pos = gram_pos;
917 po_gram_lval.string.obsolete = po_lex_obsolete;
918 po_lex_obsolete = false;
919 signal_eilseq = true;
920 return COMMENT;
922 else
924 /* We do this in separate loop because collecting large
925 comments while they get not passed to the upper layers
926 is not very effective. */
927 while (!mb_iseof (mbc) && !mb_iseq (mbc, '\n'))
928 lex_getc (mbc);
929 po_lex_obsolete = false;
930 signal_eilseq = true;
932 break;
934 case '"':
935 /* Accumulate a string. */
936 bufpos = 0;
937 for (;;)
939 lex_getc (mbc);
940 while (bufpos + mb_len (mbc) >= bufmax)
942 bufmax += 100;
943 buf = xrealloc (buf, bufmax);
945 if (mb_iseof (mbc))
947 po_gram_error_at_line (&gram_pos,
948 _("end-of-file within string"));
949 break;
951 if (mb_iseq (mbc, '\n'))
953 po_gram_error_at_line (&gram_pos,
954 _("end-of-line within string"));
955 break;
957 if (mb_iseq (mbc, '"'))
958 break;
959 if (mb_iseq (mbc, '\\'))
961 buf[bufpos++] = control_sequence ();
962 continue;
965 /* Add mbc to the accumulator. */
966 memcpy_small (&buf[bufpos], mb_ptr (mbc), mb_len (mbc));
967 bufpos += mb_len (mbc);
969 buf[bufpos] = '\0';
971 /* FIXME: Treatment of embedded \000 chars is incorrect. */
972 po_gram_lval.string.string = xstrdup (buf);
973 po_gram_lval.string.pos = gram_pos;
974 po_gram_lval.string.obsolete = po_lex_obsolete;
975 return STRING;
977 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
978 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
979 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
980 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
981 case 'y': case 'z':
982 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
983 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
984 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
985 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
986 case 'Y': case 'Z':
987 case '_': case '$':
988 bufpos = 0;
989 for (;;)
991 char c = mb_ptr (mbc) [0];
992 if (bufpos + 1 >= bufmax)
994 bufmax += 100;
995 buf = xrealloc (buf, bufmax);
997 buf[bufpos++] = c;
998 lex_getc (mbc);
999 if (mb_len (mbc) == 1)
1000 switch (mb_ptr (mbc) [0])
1002 default:
1003 break;
1004 case 'a': case 'b': case 'c': case 'd': case 'e':
1005 case 'f': case 'g': case 'h': case 'i': case 'j':
1006 case 'k': case 'l': case 'm': case 'n': case 'o':
1007 case 'p': case 'q': case 'r': case 's': case 't':
1008 case 'u': case 'v': case 'w': case 'x': case 'y':
1009 case 'z':
1010 case 'A': case 'B': case 'C': case 'D': case 'E':
1011 case 'F': case 'G': case 'H': case 'I': case 'J':
1012 case 'K': case 'L': case 'M': case 'N': case 'O':
1013 case 'P': case 'Q': case 'R': case 'S': case 'T':
1014 case 'U': case 'V': case 'W': case 'X': case 'Y':
1015 case 'Z':
1016 case '_': case '$':
1017 case '0': case '1': case '2': case '3': case '4':
1018 case '5': case '6': case '7': case '8': case '9':
1019 continue;
1021 break;
1023 lex_ungetc (mbc);
1025 buf[bufpos] = '\0';
1028 int k = keyword_p (buf);
1029 if (k == NAME)
1031 po_gram_lval.string.string = xstrdup (buf);
1032 po_gram_lval.string.pos = gram_pos;
1033 po_gram_lval.string.obsolete = po_lex_obsolete;
1035 else
1037 po_gram_lval.pos.pos = gram_pos;
1038 po_gram_lval.pos.obsolete = po_lex_obsolete;
1040 return k;
1043 case '0': case '1': case '2': case '3': case '4':
1044 case '5': case '6': case '7': case '8': case '9':
1045 bufpos = 0;
1046 for (;;)
1048 char c = mb_ptr (mbc) [0];
1049 if (bufpos + 1 >= bufmax)
1051 bufmax += 100;
1052 buf = xrealloc (buf, bufmax + 1);
1054 buf[bufpos++] = c;
1055 lex_getc (mbc);
1056 if (mb_len (mbc) == 1)
1057 switch (mb_ptr (mbc) [0])
1059 default:
1060 break;
1062 case '0': case '1': case '2': case '3': case '4':
1063 case '5': case '6': case '7': case '8': case '9':
1064 continue;
1066 break;
1068 lex_ungetc (mbc);
1070 buf[bufpos] = '\0';
1072 po_gram_lval.number.number = atol (buf);
1073 po_gram_lval.number.pos = gram_pos;
1074 po_gram_lval.number.obsolete = po_lex_obsolete;
1075 return NUMBER;
1077 case '[':
1078 po_gram_lval.pos.pos = gram_pos;
1079 po_gram_lval.pos.obsolete = po_lex_obsolete;
1080 return '[';
1082 case ']':
1083 po_gram_lval.pos.pos = gram_pos;
1084 po_gram_lval.pos.obsolete = po_lex_obsolete;
1085 return ']';
1087 default:
1088 /* This will cause a syntax error. */
1089 return JUNK;
1091 else
1092 /* This will cause a syntax error. */
1093 return JUNK;
1098 /* po_gram_lex() can return comments as COMMENT. Switch this on or off. */
1099 void
1100 po_lex_pass_comments (bool flag)
1102 pass_comments = flag;
1106 /* po_gram_lex() can return obsolete entries as if they were normal entries.
1107 Switch this on or off. */
1108 void
1109 po_lex_pass_obsolete_entries (bool flag)
1111 pass_obsolete_entries = flag;