Sync usage with man page.
[netbsd-mini2440.git] / gnu / dist / gettext / gettext-tools / src / read-stringtable.c
blobb96d036b75e576965dd348ef28bbab58b76af355
1 /* Reading NeXTstep/GNUstep .strings files.
2 Copyright (C) 2003, 2005 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2003.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
8 any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software Foundation,
17 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
19 #ifdef HAVE_CONFIG_H
20 # include <config.h>
21 #endif
23 /* Specification. */
24 #include "read-stringtable.h"
26 #include <assert.h>
27 #include <errno.h>
28 #include <stdbool.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
33 #include "error.h"
34 #include "error-progname.h"
35 #include "read-po-abstract.h"
36 #include "xalloc.h"
37 #include "exit.h"
38 #include "utf8-ucs4.h"
39 #include "ucs4-utf8.h"
40 #include "gettext.h"
42 #define _(str) gettext (str)
/* The format of NeXTstep/GNUstep .strings files is documented in
     gnustep-base-1.8.0/Tools/make_strings/Using.txt
   and in the comments of method propertyListFromStringsFileFormat in
     gnustep-base-1.8.0/Source/NSString.m
   In summary, it's an Objective-C-like file with pseudo-assignments of the
   form
     "key" = "value";
   where the key is the msgid and the value is the msgstr.

   The implementation of the parser of .strings files is in
     gnustep-base-1.8.0/Source/NSString.m
     function GSPropertyListFromStringsFormat
     (indirectly called from NSBundle's method localizedStringForKey).

   A test case is in
     gnustep-base-1.8.0/Testing/English.lproj/NXStringTable.example
 */

/* Handling of comments: We copy all comments from the .strings file to
   the PO file.  This is not really needed; it's a service for translators
   who don't like PO files and prefer to maintain the .strings file.  */
66 /* Real filename, used in error messages about the input file. */
67 static const char *real_file_name;
69 /* File name and line number. */
70 extern lex_pos_ty gram_pos;
72 /* The input file stream. */
73 static FILE *fp;
76 /* Phase 1: Read a byte.
77 Max. 4 pushback characters. */
79 static unsigned char phase1_pushback[4];
80 static int phase1_pushback_length;
82 static int
83 phase1_getc ()
85 int c;
87 if (phase1_pushback_length)
88 return phase1_pushback[--phase1_pushback_length];
90 c = getc (fp);
92 if (c == EOF)
94 if (ferror (fp))
95 error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
96 real_file_name);
97 return EOF;
100 return c;
103 static void
104 phase1_ungetc (int c)
106 if (c != EOF)
107 phase1_pushback[phase1_pushback_length++] = c;
/* Phase 2: Read a UCS-4 character.
   Max. 2 pushback characters (the array below has room for 4).  */

/* End-of-file indicator for functions returning a UCS-4 character.  */
#define UEOF -1

static int phase2_pushback[4];
static int phase2_pushback_length;

/* The input file can be in Unicode encoding (UCS-2BE, UCS-2LE, UTF-8, each
   with a BOM!), or otherwise the locale-dependent default encoding is used.
   Since we don't want to depend on the locale here, we use ISO-8859-1
   instead.  */
enum enc
{
  enc_undetermined,     /* Nothing has been read yet.  */
  enc_ucs2be,           /* File started with the bytes FE FF.  */
  enc_ucs2le,           /* File started with the bytes FF FE.  */
  enc_utf8,             /* File started with the bytes EF BB BF.  */
  enc_iso8859_1         /* No BOM recognized: one byte per character.  */
};

/* Encoding of the file currently being read.  */
static enum enc encoding;
134 static int
135 phase2_getc ()
137 if (phase2_pushback_length)
138 return phase2_pushback[--phase2_pushback_length];
140 if (encoding == enc_undetermined)
142 /* Determine the input file's encoding. */
143 int c0, c1;
145 c0 = phase1_getc ();
146 if (c0 == EOF)
147 return UEOF;
148 c1 = phase1_getc ();
149 if (c1 == EOF)
151 phase1_ungetc (c0);
152 encoding = enc_iso8859_1;
154 else if (c0 == 0xfe && c1 == 0xff)
155 encoding = enc_ucs2be;
156 else if (c0 == 0xff && c1 == 0xfe)
157 encoding = enc_ucs2le;
158 else
160 int c2;
162 c2 = phase1_getc ();
163 if (c2 == EOF)
165 phase1_ungetc (c1);
166 phase1_ungetc (c0);
167 encoding = enc_iso8859_1;
169 else if (c0 == 0xef && c1 == 0xbb && c2 == 0xbf)
170 encoding = enc_utf8;
171 else
173 phase1_ungetc (c2);
174 phase1_ungetc (c1);
175 phase1_ungetc (c0);
176 encoding = enc_iso8859_1;
181 switch (encoding)
183 case enc_ucs2be:
184 /* Read an UCS-2BE encoded character. */
186 int c0, c1;
188 c0 = phase1_getc ();
189 if (c0 == EOF)
190 return UEOF;
191 c1 = phase1_getc ();
192 if (c1 == EOF)
193 return UEOF;
194 return (c0 << 8) + c1;
197 case enc_ucs2le:
198 /* Read an UCS-2LE encoded character. */
200 int c0, c1;
202 c0 = phase1_getc ();
203 if (c0 == EOF)
204 return UEOF;
205 c1 = phase1_getc ();
206 if (c1 == EOF)
207 return UEOF;
208 return c0 + (c1 << 8);
211 case enc_utf8:
212 /* Read an UTF-8 encoded character. */
214 unsigned char buf[6];
215 unsigned int count;
216 int c;
217 unsigned int uc;
219 c = phase1_getc ();
220 if (c == EOF)
221 return UEOF;
222 buf[0] = c;
223 count = 1;
225 if (buf[0] >= 0xc0)
227 c = phase1_getc ();
228 if (c == EOF)
229 return UEOF;
230 buf[1] = c;
231 count = 2;
234 if (buf[0] >= 0xe0
235 && ((buf[1] ^ 0x80) < 0x40))
237 c = phase1_getc ();
238 if (c == EOF)
239 return UEOF;
240 buf[2] = c;
241 count = 3;
244 if (buf[0] >= 0xf0
245 && ((buf[1] ^ 0x80) < 0x40)
246 && ((buf[2] ^ 0x80) < 0x40))
248 c = phase1_getc ();
249 if (c == EOF)
250 return UEOF;
251 buf[3] = c;
252 count = 4;
255 if (buf[0] >= 0xf8
256 && ((buf[1] ^ 0x80) < 0x40)
257 && ((buf[2] ^ 0x80) < 0x40)
258 && ((buf[3] ^ 0x80) < 0x40))
260 c = phase1_getc ();
261 if (c == EOF)
262 return UEOF;
263 buf[4] = c;
264 count = 5;
267 if (buf[0] >= 0xfc
268 && ((buf[1] ^ 0x80) < 0x40)
269 && ((buf[2] ^ 0x80) < 0x40)
270 && ((buf[3] ^ 0x80) < 0x40)
271 && ((buf[4] ^ 0x80) < 0x40))
273 c = phase1_getc ();
274 if (c == EOF)
275 return UEOF;
276 buf[5] = c;
277 count = 6;
280 u8_mbtouc (&uc, buf, count);
281 return uc;
284 case enc_iso8859_1:
285 /* Read an ISO-8859-1 encoded character. */
287 int c = phase1_getc ();
289 if (c == EOF)
290 return UEOF;
291 return c;
294 default:
295 abort ();
299 static void
300 phase2_ungetc (int c)
302 if (c != UEOF)
303 phase2_pushback[phase2_pushback_length++] = c;
307 /* Phase 3: Read an UCS-4 character, with line number handling. */
309 static int
310 phase3_getc ()
312 int c = phase2_getc ();
314 if (c == '\n')
315 gram_pos.line_number++;
317 return c;
320 static void
321 phase3_ungetc (int c)
323 if (c == '\n')
324 --gram_pos.line_number;
325 phase2_ungetc (c);
/* Convert the BUFLEN UCS-4 characters starting at BUFFER to UTF-8.
   Return the result as a NUL-terminated string allocated with xmalloc.  */
static char *
conv_from_ucs4 (const int *buffer, size_t buflen)
{
  /* Each UCS-4 word needs 6 bytes at worst; one more for the NUL.  */
  unsigned char *result = (unsigned char *) xmalloc (6 * buflen + 1);
  unsigned char *out = result;
  size_t i;

  for (i = 0; i < buflen; i++)
    {
      unsigned int uc = buffer[i];
      int nbytes = u8_uctomb (out, uc, 6);

      assert (nbytes > 0);
      out += nbytes;
    }
  *out = '\0';
  assert (out - result <= 6 * buflen);

  return (char *) result;
}
/* Parse a string enclosed in double-quotes.  STRING points to LENGTH
   UCS-4 characters.  Backslash escapes are decoded: up to three octal
   digits, \u or \U followed by up to four hexadecimal digits, and the
   single-letter escapes a b t r n v f; any other escaped character stands
   for itself.
   Return the string in UTF-8 encoding (freshly allocated), or NULL if the
   input does not start with a double-quote or ends before the closing
   double-quote.  Characters after the closing double-quote are not
   examined.  */
static char *
parse_escaped_string (const int *string, size_t length)
{
  /* Scratch buffer, kept and reused across calls.  */
  static int *buffer;
  static size_t bufmax;
  static size_t buflen;
  const int *const end = string + length;
  const int *p = string;
  int c;

  if (p == end || *p++ != '"')
    return NULL;

  buflen = 0;
  while (p != end)
    {
      c = *p++;
      if (c == '"')
        return conv_from_ucs4 (buffer, buflen);

      if (c == '\\')
        {
          if (p == end)
            return NULL;
          c = *p++;

          if (c >= '0' && c <= '7')
            {
              /* Octal escape: this digit plus at most two more.  */
              unsigned int value = c - '0';
              int ndigits;

              for (ndigits = 1;
                   ndigits < 3 && p != end && *p >= '0' && *p <= '7';
                   ndigits++)
                value = value * 8 + (*p++ - '0');
              c = value;
            }
          else if (c == 'u' || c == 'U')
            {
              /* Hexadecimal escape: at most four digits.  */
              unsigned int value = 0;
              int ndigits;

              for (ndigits = 0; ndigits < 4 && p != end; ndigits++, p++)
                {
                  int digit = *p;

                  if (digit >= '0' && digit <= '9')
                    value = value * 16 + (digit - '0');
                  else if (digit >= 'A' && digit <= 'F')
                    value = value * 16 + (digit - 'A' + 10);
                  else if (digit >= 'a' && digit <= 'f')
                    value = value * 16 + (digit - 'a' + 10);
                  else
                    break;
                }
              c = value;
            }
          else
            switch (c)
              {
              case 'a': c = '\a'; break;
              case 'b': c = '\b'; break;
              case 't': c = '\t'; break;
              case 'r': c = '\r'; break;
              case 'n': c = '\n'; break;
              case 'v': c = '\v'; break;
              case 'f': c = '\f'; break;
              }
        }

      if (buflen >= bufmax)
        {
          bufmax = 2 * bufmax + 10;
          buffer = xrealloc (buffer, bufmax * sizeof (int));
        }
      buffer[buflen++] = c;
    }

  /* Ran out of input before the closing double-quote.  */
  return NULL;
}
/* Accumulating flag comments.  */

/* Comma-separated list of the flags seen so far, or NULL if none.  */
static char *special_comment;

/* Discard the flags accumulated so far.  */
static inline void
special_comment_reset ()
{
  /* free (NULL) is a no-op, so no test is needed.  */
  free (special_comment);
  special_comment = NULL;
}
462 static void
463 special_comment_add (const char *flag)
465 if (special_comment == NULL)
466 special_comment = xstrdup (flag);
467 else
469 size_t total_len = strlen (special_comment) + 2 + strlen (flag) + 1;
470 special_comment = xrealloc (special_comment, total_len);
471 strcat (special_comment, ", ");
472 strcat (special_comment, flag);
476 static inline void
477 special_comment_finish ()
479 if (special_comment != NULL)
481 po_callback_comment_special (special_comment);
482 free (special_comment);
483 special_comment = NULL;
/* Accumulating comments.  */

/* The current comment line, as UCS-4 characters: storage, allocated size
   and used size (both counted in characters).  */
static int *buffer;
static size_t bufmax;
static size_t buflen;
/* Set by a "Flag: unmatched" comment line.  */
static bool next_is_obsolete;
/* Set by a "Flag: untranslated" comment line.  */
static bool next_is_fuzzy;
/* Tentative msgstr found in a comment of the form  = "..."  or NULL.  */
static char *fuzzy_msgstr;
/* Whether a C resp. C++ style comment at the current position may carry
   the tentative msgstr.  */
static bool expect_fuzzy_msgstr_as_c_comment;
static bool expect_fuzzy_msgstr_as_cxx_comment;

/* Begin accumulating a new comment line.  */
static inline void
comment_start ()
{
  buflen = 0;
}
505 static inline void
506 comment_add (int c)
508 if (buflen >= bufmax)
510 bufmax = 2 * bufmax + 10;
511 buffer = xrealloc (buffer, bufmax * sizeof (int));
513 buffer[buflen++] = c;
516 static inline void
517 comment_line_end (size_t chars_to_remove, bool test_for_fuzzy_msgstr)
519 char *line;
521 buflen -= chars_to_remove;
522 /* Drop trailing white space, but not EOLs. */
523 while (buflen >= 1
524 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
525 --buflen;
527 /* At special positions we interpret a comment of the form
528 = "escaped string"
529 with an optional trailing semicolon as being the fuzzy msgstr, not a
530 regular comment. */
531 if (test_for_fuzzy_msgstr
532 && buflen > 2 && buffer[0] == '=' && buffer[1] == ' '
533 && (fuzzy_msgstr =
534 parse_escaped_string (buffer + 2,
535 buflen - (buffer[buflen - 1] == ';') - 2)))
536 return;
538 line = conv_from_ucs4 (buffer, buflen);
540 if (strcmp (line, "Flag: untranslated") == 0)
542 special_comment_add ("fuzzy");
543 next_is_fuzzy = true;
545 else if (strcmp (line, "Flag: unmatched") == 0)
546 next_is_obsolete = true;
547 else if (strlen (line) >= 6 && memcmp (line, "Flag: ", 6) == 0)
548 special_comment_add (line + 6);
549 else if (strlen (line) >= 9 && memcmp (line, "Comment: ", 9) == 0)
550 /* A comment extracted from the source. */
551 po_callback_comment_dot (line + 9);
552 else
554 char *last_colon;
555 unsigned long number;
556 char *endp;
558 if (strlen (line) >= 6 && memcmp (line, "File: ", 6) == 0
559 && (last_colon = strrchr (line + 6, ':')) != NULL
560 && *(last_colon + 1) != '\0'
561 && (number = strtoul (last_colon + 1, &endp, 10), *endp == '\0'))
563 /* A "File: <filename>:<number>" type comment. */
564 *last_colon = '\0';
565 po_callback_comment_filepos (line + 6, number);
567 else
568 po_callback_comment (line);
573 /* Phase 4: Replace each comment that is not inside a string with a space
574 character. */
576 static int
577 phase4_getc ()
579 int c;
581 c = phase3_getc ();
582 if (c != '/')
583 return c;
584 c = phase3_getc ();
585 switch (c)
587 default:
588 phase3_ungetc (c);
589 return '/';
591 case '*':
592 /* C style comment. */
594 bool last_was_star;
595 size_t trailing_stars;
596 bool seen_newline;
598 comment_start ();
599 last_was_star = false;
600 trailing_stars = 0;
601 seen_newline = false;
602 /* Drop additional stars at the beginning of the comment. */
603 for (;;)
605 c = phase3_getc ();
606 if (c != '*')
607 break;
608 last_was_star = true;
610 phase3_ungetc (c);
611 for (;;)
613 c = phase3_getc ();
614 if (c == UEOF)
615 break;
616 /* We skip all leading white space, but not EOLs. */
617 if (!(buflen == 0 && (c == ' ' || c == '\t')))
618 comment_add (c);
619 switch (c)
621 case '\n':
622 seen_newline = true;
623 comment_line_end (1, false);
624 comment_start ();
625 last_was_star = false;
626 trailing_stars = 0;
627 continue;
629 case '*':
630 last_was_star = true;
631 trailing_stars++;
632 continue;
634 case '/':
635 if (last_was_star)
637 /* Drop additional stars at the end of the comment. */
638 comment_line_end (trailing_stars + 1,
639 expect_fuzzy_msgstr_as_c_comment
640 && !seen_newline);
641 break;
643 /* FALLTHROUGH */
645 default:
646 last_was_star = false;
647 trailing_stars = 0;
648 continue;
650 break;
652 return ' ';
655 case '/':
656 /* C++ style comment. */
657 comment_start ();
658 for (;;)
660 c = phase3_getc ();
661 if (c == '\n' || c == UEOF)
662 break;
663 /* We skip all leading white space, but not EOLs. */
664 if (!(buflen == 0 && (c == ' ' || c == '\t')))
665 comment_add (c);
667 comment_line_end (0, expect_fuzzy_msgstr_as_cxx_comment);
668 return '\n';
/* Push the character C back.  Phase 4 has no pushback storage of its own;
   the character is handed to phase 3.  */
static inline void
phase4_ungetc (int c)
{
  phase3_ungetc (c);
}
/* Return true if the character C is considered as whitespace: space, tab,
   carriage return, newline, form feed or backspace.  */
static bool
is_whitespace (int c)
{
  switch (c)
    {
    case ' ':
    case '\t':
    case '\r':
    case '\n':
    case '\f':
    case '\b':
      return true;
    default:
      return false;
    }
}
/* Return true if the character C needs quoting, i.e. cannot be used in
   unquoted tokens.  Only ASCII letters, ASCII digits and the punctuation
   characters listed below can be used without quotes.  */
static bool
is_quotable (int c)
{
  static const char unquoted_punctuation[] = "!#$%&*+-./:?@|~_^";

  /* Reject everything outside 1..127 first.  This also keeps strchr from
     matching the terminating NUL or a value truncated to char.  */
  if (c <= 0 || c > 0x7f)
    return true;

  if (c >= '0' && c <= '9')
    return false;
  if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
    return false;

  return strchr (unquoted_punctuation, c) == NULL;
}
707 /* Read a key or value string.
708 Return the string in UTF-8 encoding, or NULL if no string is seen.
709 Return the start position of the string in *pos. */
710 static char *
711 read_string (lex_pos_ty *pos)
713 static int *buffer;
714 static size_t bufmax;
715 static size_t buflen;
716 int c;
718 /* Skip whitespace before the string. */
720 c = phase4_getc ();
721 while (is_whitespace (c));
723 if (c == UEOF)
724 /* No more string. */
725 return NULL;
727 *pos = gram_pos;
728 buflen = 0;
729 if (c == '"')
731 /* Read a string enclosed in double-quotes. */
732 for (;;)
734 c = phase3_getc ();
735 if (c == UEOF || c == '"')
736 break;
737 if (c == '\\')
739 c = phase3_getc ();
740 if (c == UEOF)
741 break;
742 if (c >= '0' && c <= '7')
744 unsigned int n = 0;
745 int j = 0;
746 for (;;)
748 n = n * 8 + (c - '0');
749 if (++j == 3)
750 break;
751 c = phase3_getc ();
752 if (!(c >= '0' && c <= '7'))
754 phase3_ungetc (c);
755 break;
758 c = n;
760 else if (c == 'u' || c == 'U')
762 unsigned int n = 0;
763 int j;
764 for (j = 0; j < 4; j++)
766 c = phase3_getc ();
767 if (c >= '0' && c <= '9')
768 n = n * 16 + (c - '0');
769 else if (c >= 'A' && c <= 'F')
770 n = n * 16 + (c - 'A' + 10);
771 else if (c >= 'a' && c <= 'f')
772 n = n * 16 + (c - 'a' + 10);
773 else
775 phase3_ungetc (c);
776 break;
779 c = n;
781 else
782 switch (c)
784 case 'a': c = '\a'; break;
785 case 'b': c = '\b'; break;
786 case 't': c = '\t'; break;
787 case 'r': c = '\r'; break;
788 case 'n': c = '\n'; break;
789 case 'v': c = '\v'; break;
790 case 'f': c = '\f'; break;
793 if (buflen >= bufmax)
795 bufmax = 2 * bufmax + 10;
796 buffer = xrealloc (buffer, bufmax * sizeof (int));
798 buffer[buflen++] = c;
800 if (c == UEOF)
802 error_with_progname = false;
803 error (0, 0, _("%s:%lu: warning: unterminated string"),
804 real_file_name, (unsigned long) gram_pos.line_number);
805 error_with_progname = true;
808 else
810 /* Read a token outside quotes. */
811 if (is_quotable (c))
813 error_with_progname = false;
814 error (0, 0, _("%s:%lu: warning: syntax error"),
815 real_file_name, (unsigned long) gram_pos.line_number);
816 error_with_progname = true;
818 for (; c != UEOF && !is_quotable (c); c = phase4_getc ())
820 if (buflen >= bufmax)
822 bufmax = 2 * bufmax + 10;
823 buffer = xrealloc (buffer, bufmax * sizeof (int));
825 buffer[buflen++] = c;
829 return conv_from_ucs4 (buffer, buflen);
833 /* Read a .strings file from a stream, and dispatch to the various
834 abstract_po_reader_class_ty methods. */
835 void
836 stringtable_parse (abstract_po_reader_ty *pop, FILE *file,
837 const char *real_filename, const char *logical_filename)
839 fp = file;
840 real_file_name = real_filename;
841 gram_pos.file_name = xstrdup (real_file_name);
842 gram_pos.line_number = 1;
843 encoding = enc_undetermined;
844 expect_fuzzy_msgstr_as_c_comment = false;
845 expect_fuzzy_msgstr_as_cxx_comment = false;
847 for (;;)
849 char *msgid;
850 lex_pos_ty msgid_pos;
851 char *msgstr;
852 lex_pos_ty msgstr_pos;
853 int c;
855 /* Prepare for next msgid/msgstr pair. */
856 special_comment_reset ();
857 next_is_obsolete = false;
858 next_is_fuzzy = false;
859 fuzzy_msgstr = NULL;
861 /* Read the key and all the comments preceding it. */
862 msgid = read_string (&msgid_pos);
863 if (msgid == NULL)
864 break;
866 special_comment_finish ();
868 /* Skip whitespace. */
870 c = phase4_getc ();
871 while (is_whitespace (c));
873 /* Expect a '=' or ';'. */
874 if (c == UEOF)
876 error_with_progname = false;
877 error (0, 0, _("%s:%lu: warning: unterminated key/value pair"),
878 real_file_name, (unsigned long) gram_pos.line_number);
879 error_with_progname = true;
880 break;
882 if (c == ';')
884 /* "key"; is an abbreviation for "key"=""; and does not
885 necessarily designate an untranslated entry. */
886 msgstr = "";
887 msgstr_pos = msgid_pos;
888 po_callback_message (msgid, &msgid_pos, NULL,
889 msgstr, strlen (msgstr) + 1, &msgstr_pos,
890 false, next_is_obsolete);
892 else if (c == '=')
894 /* Read the value. */
895 msgstr = read_string (&msgstr_pos);
896 if (msgstr == NULL)
898 error_with_progname = false;
899 error (0, 0, _("%s:%lu: warning: unterminated key/value pair"),
900 real_file_name, (unsigned long) gram_pos.line_number);
901 error_with_progname = true;
902 break;
905 /* Skip whitespace. But for fuzzy key/value pairs, look for the
906 tentative msgstr in the form of a C style comment. */
907 expect_fuzzy_msgstr_as_c_comment = next_is_fuzzy;
910 c = phase4_getc ();
911 if (fuzzy_msgstr != NULL)
912 expect_fuzzy_msgstr_as_c_comment = false;
914 while (is_whitespace (c));
915 expect_fuzzy_msgstr_as_c_comment = false;
917 /* Expect a ';'. */
918 if (c == ';')
920 /* But for fuzzy key/value pairs, look for the tentative msgstr
921 in the form of a C++ style comment. */
922 if (fuzzy_msgstr == NULL && next_is_fuzzy)
925 c = phase3_getc ();
926 while (c == ' ');
927 phase3_ungetc (c);
929 expect_fuzzy_msgstr_as_cxx_comment = true;
930 c = phase4_getc ();
931 phase4_ungetc (c);
932 expect_fuzzy_msgstr_as_cxx_comment = false;
934 if (fuzzy_msgstr != NULL && strcmp (msgstr, msgid) == 0)
935 msgstr = fuzzy_msgstr;
937 /* A key/value pair. */
938 po_callback_message (msgid, &msgid_pos, NULL,
939 msgstr, strlen (msgstr) + 1, &msgstr_pos,
940 false, next_is_obsolete);
942 else
944 error_with_progname = false;
945 error (0, 0, _("\
946 %s:%lu: warning: syntax error, expected ';' after string"),
947 real_file_name, (unsigned long) gram_pos.line_number);
948 error_with_progname = true;
949 break;
952 else
954 error_with_progname = false;
955 error (0, 0, _("\
956 %s:%lu: warning: syntax error, expected '=' or ';' after string"),
957 real_file_name, (unsigned long) gram_pos.line_number);
958 error_with_progname = true;
959 break;
963 fp = NULL;
964 real_file_name = NULL;
965 gram_pos.line_number = 0;