1 /* GNU gettext - internationalization aids
2 Copyright (C) 1995-1999, 2000-2004 Free Software Foundation, Inc.
4 This file was written by Peter Miller <millerp@canb.auug.org.au>.
5 Multibyte character handling by Bruno Haible <haible@clisp.cons.org>.
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software Foundation,
19 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
41 #include "linebreak.h"
42 #include "vasprintf.h"
44 #include "po-charset.h"
48 #include "error-progname.h"
51 #include "po-gram-gen2.h"
53 #define _(str) gettext(str)
56 # include "utf8-ucs4.h"
59 #if HAVE_DECL_GETC_UNLOCKED
61 # define getc getc_unlocked
65 /* Current position within the PO file. */
70 /* Error handling during the parsing of a PO file.
71 These functions can access gram_pos and gram_pos_column. */
74 ((defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L && !defined __DECC) \
75 || (defined __GNUC__ && __GNUC__ >= 2 && !defined __APPLE_CC__)))
77 /* CAUTION: If you change this function, you must also make identical
78 changes to the macro of the same name in src/po-lex.h */
82 po_gram_error (const char *fmt
, ...)
88 if (vasprintf (&buffer
, fmt
, ap
) < 0)
89 error (EXIT_FAILURE
, 0, _("memory exhausted"));
91 error_with_progname
= false;
92 po_error (0, 0, "%s:%lu:%d: %s", gram_pos
.file_name
,
93 (unsigned long) gram_pos
.line_number
, gram_pos_column
+ 1, buffer
);
94 error_with_progname
= true;
97 /* Some messages need more than one line. Continuation lines are
98 indicated by using "..." at the start of the string. We don't
99 increment the error counter for these continuation lines. */
101 --error_message_count
;
102 else if (error_message_count
>= gram_max_allowed_errors
)
103 po_error (EXIT_FAILURE
, 0, _("too many errors, aborting"));
106 /* CAUTION: If you change this function, you must also make identical
107 changes to the macro of the same name in src/po-lex.h */
111 po_gram_error_at_line (const lex_pos_ty
*pp
, const char *fmt
, ...)
117 if (vasprintf (&buffer
, fmt
, ap
) < 0)
118 error (EXIT_FAILURE
, 0, _("memory exhausted"));
120 error_with_progname
= false;
121 po_error_at_line (0, 0, pp
->file_name
, pp
->line_number
, "%s", buffer
);
122 error_with_progname
= true;
125 /* Some messages need more than one line, or more than one location.
126 Continuation lines are indicated by using "..." at the start of the
127 string. We don't increment the error counter for these
128 continuation lines. */
130 --error_message_count
;
131 else if (error_message_count
>= gram_max_allowed_errors
)
132 po_error (EXIT_FAILURE
, 0, _("too many errors, aborting"));
138 /* The lowest level of PO file parsing converts bytes to multibyte characters.
139 This is needed
140 1. for C compatibility: ISO C 99 section 5.1.1.2 says that the first
141 translation phase maps bytes to characters.
142 2. to keep track of the current column, for the sake of precise error
143 location. Emacs compile.el interprets the column in error messages
144 by default as a screen column number, not as character number.
145 3. to avoid skipping backslash-newline in the midst of a multibyte
146 character. If XY is a multibyte character, X \ newline Y is invalid.
147 */
149 /* Multibyte character data type. */
150 /* Note this depends on po_lex_charset and po_lex_iconv, which get set
151 while the file is being parsed. */
153 #define MBCHAR_BUF_SIZE 24
157 size_t bytes
; /* number of bytes of current character, > 0 */
159 bool uc_valid
; /* true if uc is a valid Unicode character */
160 unsigned int uc
; /* if uc_valid: the current character */
162 char buf
[MBCHAR_BUF_SIZE
]; /* room for the bytes */
165 /* We want to pass multibyte characters by reference automatically,
166 therefore we use an array type. */
167 typedef struct mbchar mbchar_t
[1];
169 /* A version of memcpy optimized for the case n <= 1. */
171 memcpy_small (void *dst
, const void *src
, size_t n
)
175 char *q
= (char *) dst
;
176 const char *p
= (const char *) src
;
180 do *++q
= *++p
; while (--n
> 0);
184 /* EOF (not a real character) is represented with bytes = 0 and
185 uc_valid = false. */
187 mb_iseof (const mbchar_t mbc
)
189 return (mbc
->bytes
== 0);
192 /* Access the current character. */
193 static inline const char *
194 mb_ptr (const mbchar_t mbc
)
199 mb_len (const mbchar_t mbc
)
204 /* Comparison of characters. */
207 mb_iseq (const mbchar_t mbc
, char sc
)
209 /* Note: It is wrong to compare only mbc->uc, because when the encoding is
210 SHIFT_JIS, mbc->buf[0] == '\\' corresponds to mbc->uc == 0x00A5, but we
211 want to treat it as an escape character, although it looks like a Yen
212 sign. */
215 return (mbc
->uc
== sc
); /* wrong! */
218 return (mbc
->bytes
== 1 && mbc
->buf
[0] == sc
);
222 mb_isnul (const mbchar_t mbc
)
226 return (mbc
->uc
== 0);
229 return (mbc
->bytes
== 1 && mbc
->buf
[0] == 0);
233 mb_cmp (const mbchar_t mbc1
, const mbchar_t mbc2
)
236 if (mbc1
->uc_valid
&& mbc2
->uc_valid
)
237 return (int) mbc1
->uc
- (int) mbc2
->uc
;
240 return (mbc1
->bytes
== mbc2
->bytes
241 ? memcmp (mbc1
->buf
, mbc2
->buf
, mbc1
->bytes
)
242 : mbc1
->bytes
< mbc2
->bytes
243 ? (memcmp (mbc1
->buf
, mbc2
->buf
, mbc1
->bytes
) > 0 ? 1 : -1)
244 : (memcmp (mbc1
->buf
, mbc2
->buf
, mbc2
->bytes
) >= 0 ? 1 : -1));
248 mb_equal (const mbchar_t mbc1
, const mbchar_t mbc2
)
251 if (mbc1
->uc_valid
&& mbc2
->uc_valid
)
252 return mbc1
->uc
== mbc2
->uc
;
255 return (mbc1
->bytes
== mbc2
->bytes
256 && memcmp (mbc1
->buf
, mbc2
->buf
, mbc1
->bytes
) == 0);
259 /* <ctype.h>, <wctype.h> classification. */
262 mb_isascii (const mbchar_t mbc
)
266 return (mbc
->uc
>= 0x0000 && mbc
->uc
<= 0x007F);
269 return mbc
->bytes
== 1 && (mbc
->buf
[0] & 0x80) == 0;
272 /* Extra <wchar.h> function. */
274 /* Unprintable characters appear as a small box of width 1. */
275 #define MB_UNPRINTABLE_WIDTH 1
278 mb_width (const mbchar_t mbc
)
283 unsigned int uc
= mbc
->uc
;
284 const char *encoding
=
285 (po_lex_iconv
!= (iconv_t
)(-1) ? po_lex_charset
: "");
286 int w
= uc_width (uc
, encoding
);
287 /* For unprintable characters, arbitrarily return 0 for control
288 characters (except tab) and MB_UNPRINTABLE_WIDTH otherwise. */
291 if (uc
>= 0x0000 && uc
<= 0x001F)
294 return 8 - (gram_pos_column
& 7);
297 if ((uc
>= 0x007F && uc
<= 0x009F) || (uc
>= 0x2028 && uc
<= 0x2029))
299 return MB_UNPRINTABLE_WIDTH
;
306 if ((mbc
->buf
[0] & 0x80) == 0 && mbc
->buf
[0] <= 0x1F)
308 if (mbc
->buf
[0] == 0x09)
309 return 8 - (gram_pos_column
& 7);
312 if (mbc
->buf
[0] == 0x7F)
315 return MB_UNPRINTABLE_WIDTH
;
321 mb_putc (const mbchar_t mbc
, FILE *stream
)
323 fwrite (mbc
->buf
, 1, mbc
->bytes
, stream
);
328 mb_setascii (mbchar_t mbc
, char sc
)
338 /* Copying a character. */
340 mb_copy (mbchar_t
new, const mbchar_t old
)
342 memcpy_small (&new->buf
[0], &old
->buf
[0], old
->bytes
);
343 new->bytes
= old
->bytes
;
345 if ((new->uc_valid
= old
->uc_valid
))
351 /* Multibyte character input. */
353 /* Number of characters that can be pushed back.
354 We need 1 for lex_getc, plus 1 for lex_ungetc. */
357 /* Data type of a multibyte character input stream. */
363 unsigned int bufcount
;
364 char buf
[MBCHAR_BUF_SIZE
];
365 struct mbchar pushback
[NPUSHBACK
];
368 /* We want to pass multibyte streams by reference automatically,
369 therefore we use an array type. */
370 typedef struct mbfile mbfile_t
[1];
372 /* Whether invalid multibyte sequences in the input shall be signalled
373 or silently tolerated. */
374 static bool signal_eilseq
;
377 mbfile_init (mbfile_t mbf
, FILE *stream
)
380 mbf
->eof_seen
= false;
381 mbf
->have_pushback
= 0;
385 /* Read the next multibyte character from mbf and put it into mbc.
386 If a read error occurs, errno is set and ferror (mbf->fp) becomes true. */
388 mbfile_getc (mbchar_t mbc
, mbfile_t mbf
)
392 /* If EOF has already been seen, don't use getc. This matters if
393 mbf->fp is connected to an interactive tty. */
397 /* Return character pushed back, if there is one. */
398 if (mbf
->have_pushback
> 0)
400 mbf
->have_pushback
--;
401 mb_copy (mbc
, &mbf
->pushback
[mbf
->have_pushback
]);
405 /* Before using iconv, we need at least one byte. */
406 if (mbf
->bufcount
== 0)
408 int c
= getc (mbf
->fp
);
411 mbf
->eof_seen
= true;
414 mbf
->buf
[0] = (unsigned char) c
;
419 if (po_lex_iconv
!= (iconv_t
)(-1))
421 /* Use iconv on an increasing number of bytes. Read only as many
422 bytes from mbf->fp as needed. This is needed to give reasonable
423 interactive behaviour when mbf->fp is connected to an interactive
424 tty. */
427 unsigned char scratchbuf
[64];
428 const char *inptr
= &mbf
->buf
[0];
429 size_t insize
= mbf
->bufcount
;
430 char *outptr
= (char *) &scratchbuf
[0];
431 size_t outsize
= sizeof (scratchbuf
);
433 size_t res
= iconv (po_lex_iconv
,
434 (ICONV_CONST
char **) &inptr
, &insize
,
436 /* We expect that a character has been produced if and only if
437 some input bytes have been consumed. */
438 if ((insize
< mbf
->bufcount
) != (outsize
< sizeof (scratchbuf
)))
440 if (outsize
== sizeof (scratchbuf
))
442 /* No character has been produced. Must be an error. */
443 if (res
!= (size_t)(-1))
448 /* An invalid multibyte sequence was encountered. */
449 /* Return a single byte. */
451 po_gram_error (_("invalid multibyte sequence"));
453 mbc
->uc_valid
= false;
456 else if (errno
== EINVAL
)
458 /* An incomplete multibyte character. */
461 if (mbf
->bufcount
== MBCHAR_BUF_SIZE
)
463 /* An overlong incomplete multibyte sequence was
464 encountered. */
465 /* Return a single byte. */
467 mbc
->uc_valid
= false;
471 /* Read one more byte and retry iconv. */
475 mbf
->eof_seen
= true;
476 if (ferror (mbf
->fp
))
480 incomplete multibyte sequence at end of file"));
481 bytes
= mbf
->bufcount
;
482 mbc
->uc_valid
= false;
485 mbf
->buf
[mbf
->bufcount
++] = (unsigned char) c
;
490 incomplete multibyte sequence at end of line"));
491 bytes
= mbf
->bufcount
- 1;
492 mbc
->uc_valid
= false;
497 po_error (EXIT_FAILURE
, errno
, _("iconv failure"));
501 size_t outbytes
= sizeof (scratchbuf
) - outsize
;
502 bytes
= mbf
->bufcount
- insize
;
504 /* We expect that one character has been produced. */
509 /* Convert it from UTF-8 to UCS-4. */
510 if (u8_mbtouc (&mbc
->uc
, scratchbuf
, outbytes
) < outbytes
)
512 /* scratchbuf contains an out-of-range Unicode character
513 (> 0x10ffff). */
515 po_gram_error (_("invalid multibyte sequence"));
516 mbc
->uc_valid
= false;
519 mbc
->uc_valid
= true;
528 /* Special handling of encodings with CJK structure. */
529 && (unsigned char) mbf
->buf
[0] >= 0x80)
531 if (mbf
->bufcount
== 1)
533 /* Read one more byte. */
534 int c
= getc (mbf
->fp
);
537 if (ferror (mbf
->fp
))
539 mbf
->eof_seen
= true;
545 mbf
->buf
[1] = (unsigned char) c
;
549 if (mbf
->bufcount
>= 2 && (unsigned char) mbf
->buf
[1] >= 0x30)
550 /* Return a double byte. */
553 /* Return a single byte. */
558 /* Return a single byte. */
562 mbc
->uc_valid
= false;
566 /* Return the multibyte sequence mbf->buf[0..bytes-1]. */
567 memcpy_small (&mbc
->buf
[0], &mbf
->buf
[0], bytes
);
570 mbf
->bufcount
-= bytes
;
571 if (mbf
->bufcount
> 0)
573 /* It's not worth calling memmove() for so few bytes. */
574 unsigned int count
= mbf
->bufcount
;
575 char *p
= &mbf
->buf
[0];
587 /* An mbchar_t with bytes == 0 is used to indicate EOF. */
590 mbc
->uc_valid
= false;
596 mbfile_ungetc (const mbchar_t mbc
, mbfile_t mbf
)
598 if (mbf
->have_pushback
>= NPUSHBACK
)
600 mb_copy (&mbf
->pushback
[mbf
->have_pushback
], mbc
);
601 mbf
->have_pushback
++;
605 /* Lexer variables. */
608 unsigned int gram_max_allowed_errors
= 20;
609 static bool po_lex_obsolete
;
610 static bool pass_comments
= false;
611 bool pass_obsolete_entries
= false;
614 /* Prepare lexical analysis. */
616 lex_start (FILE *fp
, const char *real_filename
, const char *logical_filename
)
618 /* Ignore the logical_filename, because PO file entries already have
619 their file names attached. But use real_filename for error messages. */
620 gram_pos
.file_name
= xstrdup (real_filename
);
622 mbfile_init (mbf
, fp
);
624 gram_pos
.line_number
= 1;
626 signal_eilseq
= true;
627 po_lex_obsolete
= false;
628 po_lex_charset_init ();
631 /* Terminate lexical analysis. */
636 gram_pos
.file_name
= NULL
;
637 gram_pos
.line_number
= 0;
639 signal_eilseq
= false;
640 po_lex_obsolete
= false;
641 po_lex_charset_close ();
645 /* Read a single character, dealing with backslash-newline.
646 Also keep track of the current line number and column number. */
648 lex_getc (mbchar_t mbc
)
652 mbfile_getc (mbc
, mbf
);
656 if (ferror (mbf
->fp
))
659 po_error (EXIT_FAILURE
, errno
, _("error while reading \"%s\""),
665 if (mb_iseq (mbc
, '\n'))
667 gram_pos
.line_number
++;
672 gram_pos_column
+= mb_width (mbc
);
674 if (mb_iseq (mbc
, '\\'))
678 mbfile_getc (mbc2
, mbf
);
682 if (ferror (mbf
->fp
))
687 if (!mb_iseq (mbc2
, '\n'))
689 mbfile_ungetc (mbc2
, mbf
);
693 gram_pos
.line_number
++;
703 lex_ungetc (const mbchar_t mbc
)
707 if (mb_iseq (mbc
, '\n'))
708 /* Decrement the line number, but don't care about the column. */
709 gram_pos
.line_number
--;
711 /* Decrement the column number. Also works well enough for tabs. */
712 gram_pos_column
-= mb_width (mbc
);
714 mbfile_ungetc (mbc
, mbf
);
720 keyword_p (const char *s
)
722 if (!strcmp (s
, "domain"))
724 if (!strcmp (s
, "msgid"))
726 if (!strcmp (s
, "msgid_plural"))
728 if (!strcmp (s
, "msgstr"))
730 po_gram_error_at_line (&gram_pos
, _("keyword \"%s\" unknown"), s
);
743 if (mb_len (mbc
) == 1)
744 switch (mb_ptr (mbc
) [0])
769 return mb_ptr (mbc
) [0];
771 case '0': case '1': case '2': case '3':
772 case '4': case '5': case '6': case '7':
777 char c
= mb_ptr (mbc
) [0];
778 /* Warning: not portable, can't depend on '0'..'7' ordering. */
779 val
= val
* 8 + (c
- '0');
783 if (mb_len (mbc
) == 1)
784 switch (mb_ptr (mbc
) [0])
786 case '0': case '1': case '2': case '3':
787 case '4': case '5': case '6': case '7':
800 if (mb_iseof (mbc
) || mb_len (mbc
) != 1
801 || !c_isxdigit (mb_ptr (mbc
) [0]))
807 char c
= mb_ptr (mbc
) [0];
810 /* Warning: not portable, can't depend on '0'..'9' ordering */
812 else if (c_isupper (c
))
813 /* Warning: not portable, can't depend on 'A'..'F' ordering */
816 /* Warning: not portable, can't depend on 'a'..'f' ordering */
820 if (mb_len (mbc
) == 1)
821 switch (mb_ptr (mbc
) [0])
823 case '0': case '1': case '2': case '3': case '4':
824 case '5': case '6': case '7': case '8': case '9':
825 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
826 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
837 /* FIXME: \u and \U are not handled. */
840 po_gram_error (_("invalid control sequence"));
845 /* Return the next token in the PO file. The return codes are defined
846 in "po-gram-gen2.h". Associated data is put in 'po_gram_lval'. */
851 static size_t bufmax
;
860 /* Yacc wants this for end of file. */
863 if (mb_len (mbc
) == 1)
864 switch (mb_ptr (mbc
) [0])
867 po_lex_obsolete
= false;
868 /* Ignore whitespace, not relevant for the grammar. */
876 /* Ignore whitespace, not relevant for the grammar. */
881 if (mb_iseq (mbc
, '~'))
882 /* A pseudo-comment beginning with #~ is found. This is
883 not a comment. It is the format for obsolete entries.
884 We simply discard the "#~" prefix. The following
885 characters are expected to be well formed. */
887 po_lex_obsolete
= true;
891 /* Accumulate comments into a buffer. If we have been asked
892 to pass comments, generate a COMMENT token, otherwise
893 discard it. */
894 signal_eilseq
= false;
900 while (bufpos
+ mb_len (mbc
) >= bufmax
)
903 buf
= xrealloc (buf
, bufmax
);
905 if (mb_iseof (mbc
) || mb_iseq (mbc
, '\n'))
908 memcpy_small (&buf
[bufpos
], mb_ptr (mbc
), mb_len (mbc
));
909 bufpos
+= mb_len (mbc
);
915 po_gram_lval
.string
.string
= buf
;
916 po_gram_lval
.string
.pos
= gram_pos
;
917 po_gram_lval
.string
.obsolete
= po_lex_obsolete
;
918 po_lex_obsolete
= false;
919 signal_eilseq
= true;
924 /* We do this in a separate loop because accumulating large
925 comments that are never passed to the upper layers would be
926 needlessly inefficient. */
927 while (!mb_iseof (mbc
) && !mb_iseq (mbc
, '\n'))
929 po_lex_obsolete
= false;
930 signal_eilseq
= true;
935 /* Accumulate a string. */
940 while (bufpos
+ mb_len (mbc
) >= bufmax
)
943 buf
= xrealloc (buf
, bufmax
);
947 po_gram_error_at_line (&gram_pos
,
948 _("end-of-file within string"));
951 if (mb_iseq (mbc
, '\n'))
953 po_gram_error_at_line (&gram_pos
,
954 _("end-of-line within string"));
957 if (mb_iseq (mbc
, '"'))
959 if (mb_iseq (mbc
, '\\'))
961 buf
[bufpos
++] = control_sequence ();
965 /* Add mbc to the accumulator. */
966 memcpy_small (&buf
[bufpos
], mb_ptr (mbc
), mb_len (mbc
));
967 bufpos
+= mb_len (mbc
);
971 /* FIXME: Treatment of embedded \000 chars is incorrect. */
972 po_gram_lval
.string
.string
= xstrdup (buf
);
973 po_gram_lval
.string
.pos
= gram_pos
;
974 po_gram_lval
.string
.obsolete
= po_lex_obsolete
;
977 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
978 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
979 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
980 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
982 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
983 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
984 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
985 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
991 char c
= mb_ptr (mbc
) [0];
992 if (bufpos
+ 1 >= bufmax
)
995 buf
= xrealloc (buf
, bufmax
);
999 if (mb_len (mbc
) == 1)
1000 switch (mb_ptr (mbc
) [0])
1004 case 'a': case 'b': case 'c': case 'd': case 'e':
1005 case 'f': case 'g': case 'h': case 'i': case 'j':
1006 case 'k': case 'l': case 'm': case 'n': case 'o':
1007 case 'p': case 'q': case 'r': case 's': case 't':
1008 case 'u': case 'v': case 'w': case 'x': case 'y':
1010 case 'A': case 'B': case 'C': case 'D': case 'E':
1011 case 'F': case 'G': case 'H': case 'I': case 'J':
1012 case 'K': case 'L': case 'M': case 'N': case 'O':
1013 case 'P': case 'Q': case 'R': case 'S': case 'T':
1014 case 'U': case 'V': case 'W': case 'X': case 'Y':
1017 case '0': case '1': case '2': case '3': case '4':
1018 case '5': case '6': case '7': case '8': case '9':
1028 int k
= keyword_p (buf
);
1031 po_gram_lval
.string
.string
= xstrdup (buf
);
1032 po_gram_lval
.string
.pos
= gram_pos
;
1033 po_gram_lval
.string
.obsolete
= po_lex_obsolete
;
1037 po_gram_lval
.pos
.pos
= gram_pos
;
1038 po_gram_lval
.pos
.obsolete
= po_lex_obsolete
;
1043 case '0': case '1': case '2': case '3': case '4':
1044 case '5': case '6': case '7': case '8': case '9':
1048 char c
= mb_ptr (mbc
) [0];
1049 if (bufpos
+ 1 >= bufmax
)
1052 buf
= xrealloc (buf
, bufmax
+ 1);
1056 if (mb_len (mbc
) == 1)
1057 switch (mb_ptr (mbc
) [0])
1062 case '0': case '1': case '2': case '3': case '4':
1063 case '5': case '6': case '7': case '8': case '9':
1072 po_gram_lval
.number
.number
= atol (buf
);
1073 po_gram_lval
.number
.pos
= gram_pos
;
1074 po_gram_lval
.number
.obsolete
= po_lex_obsolete
;
1078 po_gram_lval
.pos
.pos
= gram_pos
;
1079 po_gram_lval
.pos
.obsolete
= po_lex_obsolete
;
1083 po_gram_lval
.pos
.pos
= gram_pos
;
1084 po_gram_lval
.pos
.obsolete
= po_lex_obsolete
;
1088 /* This will cause a syntax error. */
1092 /* This will cause a syntax error. */
1098 /* po_gram_lex() can return comments as COMMENT. Switch this on or off. */
1100 po_lex_pass_comments (bool flag
)
1102 pass_comments
= flag
;
1106 /* po_gram_lex() can return obsolete entries as if they were normal entries.
1107 Switch this on or off. */
1109 po_lex_pass_obsolete_entries (bool flag
)
1111 pass_obsolete_entries
= flag
;