1 /* Reading Java .properties files.
2 Copyright (C) 2003 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2003.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software Foundation,
17 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
24 #include "read-properties.h"
34 #include "error-progname.h"
36 #include "read-po-abstract.h"
39 #include "msgl-ascii.h"
40 #include "utf16-ucs4.h"
41 #include "ucs4-utf8.h"
44 #define _(str) gettext (str)
46 /* The format of the Java .properties files is documented in the JDK
47 documentation for class java.util.Properties. In the case of .properties
48 files for PropertyResourceBundle, each non-comment line contains a
49 key/value pair in the form "key = value" or "key : value" or "key value",
50 where the key is the msgid and the value is the msgstr. Messages with
51 plurals are not supported in this format. */
53 /* Handling of comments: We copy all comments from the .properties file to
54 the PO file. This is not really needed; it's a service for translators
55 who don't like PO files and prefer to maintain the .properties file. */
57 /* Real filename, used in error messages about the input file. */
58 static const char *real_file_name
;
60 /* File name and line number. */
61 extern lex_pos_ty gram_pos
;
63 /* The input file stream. */
67 /* Phase 1: Read an ISO-8859-1 character.
68 Max. 1 pushback character. */
80 error (EXIT_FAILURE
, errno
, _("error while reading \"%s\""),
96 /* Phase 2: Read an ISO-8859-1 character, treating CR/LF like a single LF.
97 Max. 2 pushback characters. */
99 static unsigned char phase2_pushback
[2];
100 static int phase2_pushback_length
;
107 if (phase2_pushback_length
)
108 c
= phase2_pushback
[--phase2_pushback_length
];
115 int c2
= phase1_getc ();
124 gram_pos
.line_number
++;
130 phase2_ungetc (int c
)
133 --gram_pos
.line_number
;
135 phase2_pushback
[phase2_pushback_length
++] = c
;
139 /* Phase 3: Read an ISO-8859-1 character, treating CR/LF like a single LF,
140 with handling of continuation lines.
141 Max. 1 pushback character. */
146 int c
= phase2_getc ();
160 /* Skip the backslash-newline and all whitespace that follows it. */
163 while (c
== ' ' || c
== '\t' || c
== '\r' || c
== '\f');
168 phase3_ungetc (int c
)
174 /* Phase 4: Read a UTF-16 codepoint, treating CR/LF like a single LF,
175 with handling of continuation lines and of \uxxxx sequences. */
180 int c
= phase3_getc ();
186 int c2
= phase3_getc ();
201 for (i
= 0; i
< 4; i
++)
203 int c1
= phase3_getc ();
205 if (c1
>= '0' && c1
<= '9')
206 n
= (n
<< 4) + (c1
- '0');
207 else if (c1
>= 'A' && c1
<= 'F')
208 n
= (n
<< 4) + (c1
- 'A' + 10);
209 else if (c1
>= 'a' && c1
<= 'f')
210 n
= (n
<< 4) + (c1
- 'a' + 10);
214 error_with_progname
= false;
215 error (0, 0, _("%s:%lu: warning: invalid \\uxxxx syntax for Unicode character"),
216 real_file_name
, (unsigned long) gram_pos
.line_number
);
217 error_with_progname
= true;
231 /* Converts a string from ISO-8859-1 encoding to UTF-8 encoding. */
233 conv_from_iso_8859_1 (char *string
)
235 if (is_ascii_string (string
))
239 size_t length
= strlen (string
);
240 /* Each ISO-8859-1 character needs 2 bytes at worst. */
241 unsigned char *utf8_string
= (unsigned char *) xmalloc (2 * length
+ 1);
242 unsigned char *q
= utf8_string
;
243 const char *str
= string
;
244 const char *str_limit
= str
+ length
;
246 while (str
< str_limit
)
248 unsigned int uc
= (unsigned char) *str
++;
249 int n
= u8_uctomb (q
, uc
, 6);
254 assert (q
- utf8_string
<= 2 * length
);
256 return (char *) utf8_string
;
261 /* Converts a string from JAVA encoding (with \uxxxx sequences) to UTF-8
262 encoding. May destructively modify the argument string. */
264 conv_from_java (char *string
)
266 /* This conversion can only shrink the string, never increase its size.
267 So there is no need to xmalloc the result freshly. */
268 const char *p
= string
;
269 unsigned char *q
= (unsigned char *) string
;
273 if (p
[0] == '\\' && p
[1] == 'u')
278 for (i
= 0; i
< 4; i
++)
280 int c1
= (unsigned char) p
[2 + i
];
282 if (c1
>= '0' && c1
<= '9')
283 n
= (n
<< 4) + (c1
- '0');
284 else if (c1
>= 'A' && c1
<= 'F')
285 n
= (n
<< 4) + (c1
- 'A' + 10);
286 else if (c1
>= 'a' && c1
<= 'f')
287 n
= (n
<< 4) + (c1
- 'a' + 10);
296 if (n
>= 0xd800 && n
< 0xdc00)
298 if (p
[6] == '\\' && p
[7] == 'u')
302 for (i
= 0; i
< 4; i
++)
304 int c1
= (unsigned char) p
[8 + i
];
306 if (c1
>= '0' && c1
<= '9')
307 m
= (m
<< 4) + (c1
- '0');
308 else if (c1
>= 'A' && c1
<= 'F')
309 m
= (m
<< 4) + (c1
- 'A' + 10);
310 else if (c1
>= 'a' && c1
<= 'f')
311 m
= (m
<< 4) + (c1
- 'a' + 10);
316 if (i
== 4 && (m
>= 0xdc00 && m
< 0xe000))
318 /* Combine two UTF-16 words to a character. */
319 uc
= 0x10000 + ((n
- 0xd800) << 10) + (m
- 0xdc00);
334 q
+= u8_uctomb (q
, uc
, 6);
339 *q
++ = (unsigned char) *p
++;
346 /* Reads a key or value string.
347 Returns the string in UTF-8 encoding, or NULL if the end of the logical
348 line is reached.
349 Parsing ends:
350 - when returning NULL, after the end of the logical line,
351 - otherwise, if in_key is true, after the whitespace and possibly the
352 separator that follows after the string,
353 - otherwise, if in_key is false, after the end of the logical line. */
356 read_escaped_string (bool in_key
)
358 static unsigned short *buffer
;
359 static size_t bufmax
;
360 static size_t buflen
;
363 /* Skip whitespace before the string. */
366 while (c
== ' ' || c
== '\t' || c
== '\r' || c
== '\f');
368 if (c
== EOF
|| c
== '\n')
372 /* Start accumulating the string. We store the string in UTF-16 before
373 converting it to UTF-8. Why not convert every character directly to
374 UTF-8? Because a string can contain surrogates like \uD800\uDF00, and
375 we must combine them into a single UTF-8 character. */
379 if (in_key
&& (c
== '=' || c
== ':'
380 || c
== ' ' || c
== '\t' || c
== '\r' || c
== '\f'))
382 /* Skip whitespace after the string. */
383 while (c
== ' ' || c
== '\t' || c
== '\r' || c
== '\f')
385 /* Skip '=' or ':' separator. */
386 if (!(c
== '=' || c
== ':'))
393 /* Read the next UTF-16 codepoint. */
397 /* Append it to the buffer. */
398 if (buflen
>= bufmax
)
401 buffer
= xrealloc (buffer
, bufmax
* sizeof (unsigned short));
403 buffer
[buflen
++] = c
;
406 if (c
== EOF
|| c
== '\n')
414 /* Now convert from UTF-16 to UTF-8. */
417 unsigned char *utf8_string
;
420 /* Each UTF-16 word needs 3 bytes at worst. */
421 utf8_string
= (unsigned char *) xmalloc (3 * buflen
+ 1);
422 for (pos
= 0, q
= utf8_string
; pos
< buflen
; )
427 pos
+= u16_mbtouc (&uc
, buffer
+ pos
, buflen
- pos
);
428 n
= u8_uctomb (q
, uc
, 6);
433 assert (q
- utf8_string
<= 3 * buflen
);
435 return (char *) utf8_string
;
440 /* Read a .properties file from a stream, and dispatch to the various
441 abstract_po_reader_class_ty methods. */
443 properties_parse (abstract_po_reader_ty
*this, FILE *file
,
444 const char *real_filename
, const char *logical_filename
)
447 real_file_name
= real_filename
;
448 gram_pos
.file_name
= xstrdup (real_file_name
);
449 gram_pos
.line_number
= 1;
468 /* For compatibility with write-properties.c, we treat '!' not
469 followed by space as a fuzzy or untranslated message. */
470 int c2
= phase2_getc ();
471 if (c2
== ' ' || c2
== '\n' || c2
== EOF
)
482 /* A comment line. */
484 static size_t bufmax
;
485 static size_t buflen
;
492 if (buflen
>= bufmax
)
495 buffer
= xrealloc (buffer
, bufmax
);
498 if (c
== EOF
|| c
== '\n')
501 buffer
[buflen
++] = c
;
503 buffer
[buflen
] = '\0';
505 po_callback_comment_dispatcher (conv_from_java (conv_from_iso_8859_1 (buffer
)));
509 /* A key/value pair. */
511 lex_pos_ty msgid_pos
;
513 msgid_pos
= gram_pos
;
514 msgid
= read_escaped_string (true);
516 /* Skip blank line. */
521 lex_pos_ty msgstr_pos
;
524 msgstr_pos
= gram_pos
;
525 msgstr
= read_escaped_string (false);
527 msgstr
= xstrdup ("");
529 /* Be sure to make the message fuzzy if it was commented out
530 and if it is not already header/fuzzy/untranslated. */
531 force_fuzzy
= (hidden
&& msgid
[0] != '\0' && msgstr
[0] != '\0');
533 po_callback_message (msgid
, &msgid_pos
, NULL
,
534 msgstr
, strlen (msgstr
) + 1, &msgstr_pos
,
541 real_file_name
= NULL
;
542 gram_pos
.line_number
= 0;