1 /* Reading NeXTstep/GNUstep .strings files.
2 Copyright (C) 2003, 2005 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2003.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software Foundation,
17 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
24 #include "read-stringtable.h"
34 #include "error-progname.h"
35 #include "read-po-abstract.h"
38 #include "utf8-ucs4.h"
39 #include "ucs4-utf8.h"
42 #define _(str) gettext (str)
44 /* The format of NeXTstep/GNUstep .strings files is documented in
45 gnustep-base-1.8.0/Tools/make_strings/Using.txt
46 and in the comments of method propertyListFromStringsFileFormat in
47 gnustep-base-1.8.0/Source/NSString.m
48 In summary, it's an Objective-C-like file with pseudo-assignments of the form
49 "key" = "value";
50 where the key is the msgid and the value is the msgstr.
52 The implementation of the parser of .strings files is in
53 gnustep-base-1.8.0/Source/NSString.m
54 function GSPropertyListFromStringsFormat
55 (indirectly called from NSBundle's method localizedStringForKey).
58 gnustep-base-1.8.0/Testing/English.lproj/NXStringTable.example
61 /* Handling of comments: We copy all comments from the .strings file to
62 the PO file. This is not really needed; it's a service for translators
63 who don't like PO files and prefer to maintain the .strings file. */
66 /* Real filename, used in error messages about the input file. */
67 static const char *real_file_name
;
69 /* File name and line number. */
70 extern lex_pos_ty gram_pos
;
72 /* The input file stream. */
76 /* Phase 1: Read a byte.
77 Max. 4 pushback characters. */
/* Pushback storage for phase 1.  phase1_ungetc stores a byte at index
   phase1_pushback_length and then increments it; the phase 1 read path
   returns phase1_pushback[--phase1_pushback_length] while the length is
   nonzero, so pushed-back bytes come back in last-in, first-out order.  */
79 static unsigned char phase1_pushback
[4];
/* Number of bytes currently held in phase1_pushback.  */
80 static int phase1_pushback_length
;
87 if (phase1_pushback_length
)
88 return phase1_pushback
[--phase1_pushback_length
];
95 error (EXIT_FAILURE
, errno
, _("error while reading \"%s\""),
104 phase1_ungetc (int c
)
107 phase1_pushback
[phase1_pushback_length
++] = c
;
111 /* Phase 2: Read a UCS-4 character.
112 Max. 2 pushback characters. */
114 /* End-of-file indicator for functions returning a UCS-4 character. */
117 static int phase2_pushback
[4];
118 static int phase2_pushback_length
;
120 /* The input file can be in Unicode encoding (UCS-2BE, UCS-2LE, UTF-8, each
121 with a BOM!), or otherwise the locale-dependent default encoding is used.
122 Since we don't want to depend on the locale here, we use ISO-8859-1
132 static enum enc encoding
;
137 if (phase2_pushback_length
)
138 return phase2_pushback
[--phase2_pushback_length
];
140 if (encoding
== enc_undetermined
)
142 /* Determine the input file's encoding. */
152 encoding
= enc_iso8859_1
;
154 else if (c0
== 0xfe && c1
== 0xff)
155 encoding
= enc_ucs2be
;
156 else if (c0
== 0xff && c1
== 0xfe)
157 encoding
= enc_ucs2le
;
167 encoding
= enc_iso8859_1
;
169 else if (c0
== 0xef && c1
== 0xbb && c2
== 0xbf)
176 encoding
= enc_iso8859_1
;
184 /* Read a UCS-2BE encoded character. */
194 return (c0
<< 8) + c1
;
198 /* Read a UCS-2LE encoded character. */
208 return c0
+ (c1
<< 8);
212 /* Read a UTF-8 encoded character. */
214 unsigned char buf
[6];
235 && ((buf
[1] ^ 0x80) < 0x40))
245 && ((buf
[1] ^ 0x80) < 0x40)
246 && ((buf
[2] ^ 0x80) < 0x40))
256 && ((buf
[1] ^ 0x80) < 0x40)
257 && ((buf
[2] ^ 0x80) < 0x40)
258 && ((buf
[3] ^ 0x80) < 0x40))
268 && ((buf
[1] ^ 0x80) < 0x40)
269 && ((buf
[2] ^ 0x80) < 0x40)
270 && ((buf
[3] ^ 0x80) < 0x40)
271 && ((buf
[4] ^ 0x80) < 0x40))
280 u8_mbtouc (&uc
, buf
, count
);
285 /* Read an ISO-8859-1 encoded character. */
287 int c
= phase1_getc ();
300 phase2_ungetc (int c
)
303 phase2_pushback
[phase2_pushback_length
++] = c
;
307 /* Phase 3: Read a UCS-4 character, with line number handling. */
312 int c
= phase2_getc ();
315 gram_pos
.line_number
++;
321 phase3_ungetc (int c
)
324 --gram_pos
.line_number
;
329 /* Convert from UCS-4 to UTF-8. */
331 conv_from_ucs4 (const int *buffer
, size_t buflen
)
333 unsigned char *utf8_string
;
337 /* Each UCS-4 word needs 6 bytes at worst. */
338 utf8_string
= (unsigned char *) xmalloc (6 * buflen
+ 1);
340 for (pos
= 0, q
= utf8_string
; pos
< buflen
; )
346 n
= u8_uctomb (q
, uc
, 6);
351 assert (q
- utf8_string
<= 6 * buflen
);
353 return (char *) utf8_string
;
357 /* Parse a string enclosed in double-quotes. Input is UCS-4 encoded.
358 Return the string in UTF-8 encoding, or NULL if the input doesn't represent
359 a valid string enclosed in double-quotes. */
361 parse_escaped_string (const int *string
, size_t length
)
364 static size_t bufmax
;
365 static size_t buflen
;
366 const int *string_limit
= string
+ length
;
369 if (string
== string_limit
)
377 if (string
== string_limit
)
384 if (string
== string_limit
)
387 if (c
>= '0' && c
<= '7')
393 n
= n
* 8 + (c
- '0');
396 if (string
== string_limit
)
399 if (!(c
>= '0' && c
<= '7'))
405 else if (c
== 'u' || c
== 'U')
409 for (j
= 0; j
< 4; j
++)
411 if (string
== string_limit
)
414 if (c
>= '0' && c
<= '9')
415 n
= n
* 16 + (c
- '0');
416 else if (c
>= 'A' && c
<= 'F')
417 n
= n
* 16 + (c
- 'A' + 10);
418 else if (c
>= 'a' && c
<= 'f')
419 n
= n
* 16 + (c
- 'a' + 10);
429 case 'a': c
= '\a'; break;
430 case 'b': c
= '\b'; break;
431 case 't': c
= '\t'; break;
432 case 'r': c
= '\r'; break;
433 case 'n': c
= '\n'; break;
434 case 'v': c
= '\v'; break;
435 case 'f': c
= '\f'; break;
438 if (buflen
>= bufmax
)
440 bufmax
= 2 * bufmax
+ 10;
441 buffer
= xrealloc (buffer
, bufmax
* sizeof (int));
443 buffer
[buflen
++] = c
;
446 return conv_from_ucs4 (buffer
, buflen
);
450 /* Accumulating flag comments. */
452 static char *special_comment
;
455 special_comment_reset ()
457 if (special_comment
!= NULL
)
458 free (special_comment
);
459 special_comment
= NULL
;
463 special_comment_add (const char *flag
)
465 if (special_comment
== NULL
)
466 special_comment
= xstrdup (flag
);
469 size_t total_len
= strlen (special_comment
) + 2 + strlen (flag
) + 1;
470 special_comment
= xrealloc (special_comment
, total_len
);
471 strcat (special_comment
, ", ");
472 strcat (special_comment
, flag
);
477 special_comment_finish ()
479 if (special_comment
!= NULL
)
481 po_callback_comment_special (special_comment
);
482 free (special_comment
);
483 special_comment
= NULL
;
488 /* Accumulating comments. */
/* bufmax is the allocated capacity and buflen the number of used elements
   of the comment accumulation buffer, both counted in int-sized (UCS-4)
   elements: the buffer is grown with xrealloc (buffer, bufmax * sizeof (int))
   whenever buflen >= bufmax, and characters are appended at buffer[buflen++].  */
491 static size_t bufmax
;
492 static size_t buflen
;
/* Set to true when a comment line reads exactly "Flag: unmatched"; cleared
   before each msgid/msgstr pair and passed as the last argument of
   po_callback_message.  */
493 static bool next_is_obsolete
;
/* Set to true when a comment line reads exactly "Flag: untranslated" (which
   also adds the "fuzzy" flag to the special comment); cleared before each
   msgid/msgstr pair.  While true, the parser looks for a tentative msgstr
   in a comment following the value.  */
494 static bool next_is_fuzzy
;
/* Tentative msgstr taken from a comment, or NULL.  When it is non-NULL and
   the parsed msgstr is identical to the msgid, it replaces the msgstr.
   NOTE(review): the assignment site is not visible in this excerpt;
   presumably it receives the result of parse_escaped_string in
   comment_line_end -- confirm.  */
495 static char *fuzzy_msgstr
;
/* Passed as the test_for_fuzzy_msgstr argument of comment_line_end at the
   end of a C style comment; set from next_is_fuzzy after the value has been
   read and reset to false afterwards.  */
496 static bool expect_fuzzy_msgstr_as_c_comment
;
/* Passed as the test_for_fuzzy_msgstr argument of comment_line_end at the
   end of a C++ style comment; set to true only while looking for the
   tentative msgstr of a fuzzy pair, then reset to false.  */
497 static bool expect_fuzzy_msgstr_as_cxx_comment
;
508 if (buflen
>= bufmax
)
510 bufmax
= 2 * bufmax
+ 10;
511 buffer
= xrealloc (buffer
, bufmax
* sizeof (int));
513 buffer
[buflen
++] = c
;
517 comment_line_end (size_t chars_to_remove
, bool test_for_fuzzy_msgstr
)
521 buflen
-= chars_to_remove
;
522 /* Drop trailing white space, but not EOLs. */
524 && (buffer
[buflen
- 1] == ' ' || buffer
[buflen
- 1] == '\t'))
527 /* At special positions we interpret a comment of the form
528 = "escaped string"
529 with an optional trailing semicolon as being the fuzzy msgstr, not a
530 regular comment. */
531 if (test_for_fuzzy_msgstr
532 && buflen
> 2 && buffer
[0] == '=' && buffer
[1] == ' '
534 parse_escaped_string (buffer
+ 2,
535 buflen
- (buffer
[buflen
- 1] == ';') - 2)))
538 line
= conv_from_ucs4 (buffer
, buflen
);
540 if (strcmp (line
, "Flag: untranslated") == 0)
542 special_comment_add ("fuzzy");
543 next_is_fuzzy
= true;
545 else if (strcmp (line
, "Flag: unmatched") == 0)
546 next_is_obsolete
= true;
547 else if (strlen (line
) >= 6 && memcmp (line
, "Flag: ", 6) == 0)
548 special_comment_add (line
+ 6);
549 else if (strlen (line
) >= 9 && memcmp (line
, "Comment: ", 9) == 0)
550 /* A comment extracted from the source. */
551 po_callback_comment_dot (line
+ 9);
555 unsigned long number
;
558 if (strlen (line
) >= 6 && memcmp (line
, "File: ", 6) == 0
559 && (last_colon
= strrchr (line
+ 6, ':')) != NULL
560 && *(last_colon
+ 1) != '\0'
561 && (number
= strtoul (last_colon
+ 1, &endp
, 10), *endp
== '\0'))
563 /* A "File: <filename>:<number>" type comment. */
565 po_callback_comment_filepos (line
+ 6, number
);
568 po_callback_comment (line
);
573 /* Phase 4: Replace each comment that is not inside a string with a space
592 /* C style comment. */
595 size_t trailing_stars
;
599 last_was_star
= false;
601 seen_newline
= false;
602 /* Drop additional stars at the beginning of the comment. */
608 last_was_star
= true;
616 /* We skip all leading white space, but not EOLs. */
617 if (!(buflen
== 0 && (c
== ' ' || c
== '\t')))
623 comment_line_end (1, false);
625 last_was_star
= false;
630 last_was_star
= true;
637 /* Drop additional stars at the end of the comment. */
638 comment_line_end (trailing_stars
+ 1,
639 expect_fuzzy_msgstr_as_c_comment
646 last_was_star
= false;
656 /* C++ style comment. */
661 if (c
== '\n' || c
== UEOF
)
663 /* We skip all leading white space, but not EOLs. */
664 if (!(buflen
== 0 && (c
== ' ' || c
== '\t')))
667 comment_line_end (0, expect_fuzzy_msgstr_as_cxx_comment
);
673 phase4_ungetc (int c
)
679 /* Return true if a character is considered as whitespace. */
681 is_whitespace (int c
)
683 return (c
== ' ' || c
== '\t' || c
== '\r' || c
== '\n' || c
== '\f'
687 /* Return true if a character needs quoting, i.e. cannot be used in unquoted
692 if ((c
>= '0' && c
<= '9')
693 || (c
>= 'A' && c
<= 'Z') || (c
>= 'a' && c
<= 'z'))
697 case '!': case '#': case '$': case '%': case '&': case '*':
698 case '+': case '-': case '.': case '/': case ':': case '?':
699 case '@': case '|': case '~': case '_': case '^':
707 /* Read a key or value string.
708 Return the string in UTF-8 encoding, or NULL if no string is seen.
709 Return the start position of the string in *pos. */
711 read_string (lex_pos_ty
*pos
)
714 static size_t bufmax
;
715 static size_t buflen
;
718 /* Skip whitespace before the string. */
721 while (is_whitespace (c
));
724 /* No more string. */
731 /* Read a string enclosed in double-quotes. */
735 if (c
== UEOF
|| c
== '"')
742 if (c
>= '0' && c
<= '7')
748 n
= n
* 8 + (c
- '0');
752 if (!(c
>= '0' && c
<= '7'))
760 else if (c
== 'u' || c
== 'U')
764 for (j
= 0; j
< 4; j
++)
767 if (c
>= '0' && c
<= '9')
768 n
= n
* 16 + (c
- '0');
769 else if (c
>= 'A' && c
<= 'F')
770 n
= n
* 16 + (c
- 'A' + 10);
771 else if (c
>= 'a' && c
<= 'f')
772 n
= n
* 16 + (c
- 'a' + 10);
784 case 'a': c
= '\a'; break;
785 case 'b': c
= '\b'; break;
786 case 't': c
= '\t'; break;
787 case 'r': c
= '\r'; break;
788 case 'n': c
= '\n'; break;
789 case 'v': c
= '\v'; break;
790 case 'f': c
= '\f'; break;
793 if (buflen
>= bufmax
)
795 bufmax
= 2 * bufmax
+ 10;
796 buffer
= xrealloc (buffer
, bufmax
* sizeof (int));
798 buffer
[buflen
++] = c
;
802 error_with_progname
= false;
803 error (0, 0, _("%s:%lu: warning: unterminated string"),
804 real_file_name
, (unsigned long) gram_pos
.line_number
);
805 error_with_progname
= true;
810 /* Read a token outside quotes. */
813 error_with_progname
= false;
814 error (0, 0, _("%s:%lu: warning: syntax error"),
815 real_file_name
, (unsigned long) gram_pos
.line_number
);
816 error_with_progname
= true;
818 for (; c
!= UEOF
&& !is_quotable (c
); c
= phase4_getc ())
820 if (buflen
>= bufmax
)
822 bufmax
= 2 * bufmax
+ 10;
823 buffer
= xrealloc (buffer
, bufmax
* sizeof (int));
825 buffer
[buflen
++] = c
;
829 return conv_from_ucs4 (buffer
, buflen
);
833 /* Read a .strings file from a stream, and dispatch to the various
834 abstract_po_reader_class_ty methods. */
836 stringtable_parse (abstract_po_reader_ty
*pop
, FILE *file
,
837 const char *real_filename
, const char *logical_filename
)
840 real_file_name
= real_filename
;
841 gram_pos
.file_name
= xstrdup (real_file_name
);
842 gram_pos
.line_number
= 1;
843 encoding
= enc_undetermined
;
844 expect_fuzzy_msgstr_as_c_comment
= false;
845 expect_fuzzy_msgstr_as_cxx_comment
= false;
850 lex_pos_ty msgid_pos
;
852 lex_pos_ty msgstr_pos
;
855 /* Prepare for next msgid/msgstr pair. */
856 special_comment_reset ();
857 next_is_obsolete
= false;
858 next_is_fuzzy
= false;
861 /* Read the key and all the comments preceding it. */
862 msgid
= read_string (&msgid_pos
);
866 special_comment_finish ();
868 /* Skip whitespace. */
871 while (is_whitespace (c
));
873 /* Expect a '=' or ';'. */
876 error_with_progname
= false;
877 error (0, 0, _("%s:%lu: warning: unterminated key/value pair"),
878 real_file_name
, (unsigned long) gram_pos
.line_number
);
879 error_with_progname
= true;
884 /* "key"; is an abbreviation for "key"=""; and does not
885 necessarily designate an untranslated entry. */
887 msgstr_pos
= msgid_pos
;
888 po_callback_message (msgid
, &msgid_pos
, NULL
,
889 msgstr
, strlen (msgstr
) + 1, &msgstr_pos
,
890 false, next_is_obsolete
);
894 /* Read the value. */
895 msgstr
= read_string (&msgstr_pos
);
898 error_with_progname
= false;
899 error (0, 0, _("%s:%lu: warning: unterminated key/value pair"),
900 real_file_name
, (unsigned long) gram_pos
.line_number
);
901 error_with_progname
= true;
905 /* Skip whitespace. But for fuzzy key/value pairs, look for the
906 tentative msgstr in the form of a C style comment. */
907 expect_fuzzy_msgstr_as_c_comment
= next_is_fuzzy
;
911 if (fuzzy_msgstr
!= NULL
)
912 expect_fuzzy_msgstr_as_c_comment
= false;
914 while (is_whitespace (c
));
915 expect_fuzzy_msgstr_as_c_comment
= false;
920 /* But for fuzzy key/value pairs, look for the tentative msgstr
921 in the form of a C++ style comment. */
922 if (fuzzy_msgstr
== NULL
&& next_is_fuzzy
)
929 expect_fuzzy_msgstr_as_cxx_comment
= true;
932 expect_fuzzy_msgstr_as_cxx_comment
= false;
934 if (fuzzy_msgstr
!= NULL
&& strcmp (msgstr
, msgid
) == 0)
935 msgstr
= fuzzy_msgstr
;
937 /* A key/value pair. */
938 po_callback_message (msgid
, &msgid_pos
, NULL
,
939 msgstr
, strlen (msgstr
) + 1, &msgstr_pos
,
940 false, next_is_obsolete
);
944 error_with_progname
= false;
946 %s:%lu: warning: syntax error, expected ';' after string"),
947 real_file_name
, (unsigned long) gram_pos
.line_number
);
948 error_with_progname
= true;
954 error_with_progname
= false;
956 %s:%lu: warning: syntax error, expected '=' or ';' after string"),
957 real_file_name
, (unsigned long) gram_pos
.line_number
);
958 error_with_progname
= true;
964 real_file_name
= NULL
;
965 gram_pos
.line_number
= 0;