Sync usage with man page.
[netbsd-mini2440.git] / gnu / dist / gettext / gettext-tools / src / read-properties.c
blobfb90f0c5fb723069420df56321c4bc61102453b1
1 /* Reading Java .properties files.
2 Copyright (C) 2003 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2003.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
8 any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software Foundation,
17 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
19 #ifdef HAVE_CONFIG_H
20 # include <config.h>
21 #endif
23 /* Specification. */
24 #include "read-properties.h"
26 #include <assert.h>
27 #include <errno.h>
28 #include <stdbool.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
33 #include "error.h"
34 #include "error-progname.h"
35 #include "message.h"
36 #include "read-po-abstract.h"
37 #include "xalloc.h"
38 #include "exit.h"
39 #include "msgl-ascii.h"
40 #include "utf16-ucs4.h"
41 #include "ucs4-utf8.h"
42 #include "gettext.h"
44 #define _(str) gettext (str)
46 /* The format of the Java .properties files is documented in the JDK
47 documentation for class java.util.Properties. In the case of .properties
48 files for PropertyResourceBundle, each non-comment line contains a
49 key/value pair in the form "key = value" or "key : value" or "key value",
50 where the key is the msgid and the value is the msgstr. Messages with
51 plurals are not supported in this format. */
53 /* Handling of comments: We copy all comments from the .properties file to
54 the PO file. This is not really needed; it's a service for translators
55 who don't like PO files and prefer to maintain the .properties file. */
57 /* Real filename, used in error messages about the input file. */
58 static const char *real_file_name;
60 /* File name and line number. */
61 extern lex_pos_ty gram_pos;
63 /* The input file stream. */
64 static FILE *fp;
67 /* Phase 1: Read an ISO-8859-1 character.
68 Max. 1 pushback character. */
70 static int
71 phase1_getc ()
73 int c;
75 c = getc (fp);
77 if (c == EOF)
79 if (ferror (fp))
80 error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
81 real_file_name);
82 return EOF;
85 return c;
88 static inline void
89 phase1_ungetc (int c)
91 if (c != EOF)
92 ungetc (c, fp);
96 /* Phase 2: Read an ISO-8859-1 character, treating CR/LF like a single LF.
97 Max. 2 pushback characters. */
99 static unsigned char phase2_pushback[2];
100 static int phase2_pushback_length;
102 static int
103 phase2_getc ()
105 int c;
107 if (phase2_pushback_length)
108 c = phase2_pushback[--phase2_pushback_length];
109 else
111 c = phase1_getc ();
113 if (c == '\r')
115 int c2 = phase1_getc ();
116 if (c2 == '\n')
117 c = c2;
118 else
119 phase1_ungetc (c2);
123 if (c == '\n')
124 gram_pos.line_number++;
126 return c;
129 static void
130 phase2_ungetc (int c)
132 if (c == '\n')
133 --gram_pos.line_number;
134 if (c != EOF)
135 phase2_pushback[phase2_pushback_length++] = c;
/* Phase 3: Read an ISO-8859-1 character, treating CR/LF like a single LF,
   and joining continuation lines: a backslash immediately followed by a
   newline is dropped together with the whitespace that starts the next
   line.  A backslash followed by anything else is returned as '\\' and the
   following character is left for the next call.
   Max. 1 pushback character.  */

static int
phase3_getc ()
{
  int ch = phase2_getc ();

  while (ch == '\\')
    {
      int next = phase2_getc ();

      if (next != '\n')
        {
          /* Not a continuation: deliver the backslash itself.  */
          phase2_ungetc (next);
          return '\\';
        }

      /* Backslash-newline: swallow it and the leading whitespace of the
         continuation line, then re-examine the first real character.  */
      do
        ch = phase2_getc ();
      while (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\f');
    }

  return ch;
}
/* Phase 3 pushback: C is handed straight down to the phase 2 pushback
   stack.  */
static inline void
phase3_ungetc (int c)
{
  phase2_ungetc (c);
}
174 /* Phase 4: Read an UTF-16 codepoint, treating CR/LF like a single LF,
175 with handling of continuation lines and of \uxxxx sequences. */
177 static int
178 phase4_getuc ()
180 int c = phase3_getc ();
182 if (c == EOF)
183 return -1;
184 if (c == '\\')
186 int c2 = phase3_getc ();
188 if (c2 == 't')
189 return '\t';
190 if (c2 == 'n')
191 return '\n';
192 if (c2 == 'r')
193 return '\r';
194 if (c2 == 'f')
195 return '\f';
196 if (c2 == 'u')
198 unsigned int n = 0;
199 int i;
201 for (i = 0; i < 4; i++)
203 int c1 = phase3_getc ();
205 if (c1 >= '0' && c1 <= '9')
206 n = (n << 4) + (c1 - '0');
207 else if (c1 >= 'A' && c1 <= 'F')
208 n = (n << 4) + (c1 - 'A' + 10);
209 else if (c1 >= 'a' && c1 <= 'f')
210 n = (n << 4) + (c1 - 'a' + 10);
211 else
213 phase3_ungetc (c1);
214 error_with_progname = false;
215 error (0, 0, _("%s:%lu: warning: invalid \\uxxxx syntax for Unicode character"),
216 real_file_name, (unsigned long) gram_pos.line_number);
217 error_with_progname = true;
218 return 'u';
221 return n;
224 return c2;
226 else
227 return c;
/* Converts STRING from ISO-8859-1 encoding to UTF-8 encoding.
   If STRING is pure ASCII it is returned unchanged; otherwise the result
   is a freshly xmalloc'ed string and STRING is left untouched.  */
static char *
conv_from_iso_8859_1 (char *string)
{
  size_t length;
  unsigned char *result;
  unsigned char *out;
  size_t i;

  if (is_ascii_string (string))
    return string;

  length = strlen (string);
  /* Each ISO-8859-1 character needs 2 bytes at worst.  */
  result = (unsigned char *) xmalloc (2 * length + 1);
  out = result;

  for (i = 0; i < length; i++)
    {
      unsigned int uc = (unsigned char) string[i];
      int written = u8_uctomb (out, uc, 6);

      assert (written > 0);
      out += written;
    }
  *out = '\0';
  assert (out - result <= 2 * length);

  return (char *) result;
}
/* Parses exactly four hexadecimal digits starting at S.
   On success stores their value in *VALUEP and returns true.  Returns
   false as soon as a non-hex character is met; since NUL is not a hex
   digit, this never reads past the end of a NUL-terminated string.  */
static bool
conv_from_java_hex4 (const char *s, unsigned int *valuep)
{
  unsigned int value = 0;
  int i;

  for (i = 0; i < 4; i++)
    {
      int c = (unsigned char) s[i];

      if (c >= '0' && c <= '9')
        value = (value << 4) + (c - '0');
      else if (c >= 'A' && c <= 'F')
        value = (value << 4) + (c - 'A' + 10);
      else if (c >= 'a' && c <= 'f')
        value = (value << 4) + (c - 'a' + 10);
      else
        return false;
    }

  *valuep = value;
  return true;
}

/* Converts a string from JAVA encoding (with \uxxxx sequences) to UTF-8
   encoding.  Works in place: STRING is modified and returned.
   A \uxxxx escape in the range D800..DBFF is combined with an immediately
   following \uxxxx escape in the range DC00..DFFF into one character.
   Anything that is not a well-formed escape (including a high surrogate
   without its partner) is copied through literally.  The same literal
   fallback is used if u8_uctomb does not report a positive length, so the
   output pointer can never move backwards.  */
static char *
conv_from_java (char *string)
{
  /* This conversion can only shrink the string, never increase its size.
     So there is no need to xmalloc the result freshly.  */
  const char *p = string;
  unsigned char *q = (unsigned char *) string;

  while (*p != '\0')
    {
      unsigned int n;

      if (p[0] == '\\' && p[1] == 'u' && conv_from_java_hex4 (p + 2, &n))
        {
          unsigned int uc = n;
          size_t consumed = 6;
          bool well_formed = true;

          if (n >= 0xd800 && n < 0xdc00)
            {
              unsigned int m;

              if (p[6] == '\\' && p[7] == 'u'
                  && conv_from_java_hex4 (p + 8, &m)
                  && m >= 0xdc00 && m < 0xe000)
                {
                  /* Combine two UTF-16 words to a character.  */
                  uc = 0x10000 + ((n - 0xd800) << 10) + (m - 0xdc00);
                  consumed = 12;
                }
              else
                well_formed = false;
            }

          if (well_formed)
            {
              /* Encode into a scratch buffer first, so that a failed
                 conversion cannot clobber input that has yet to be
                 copied (q may be equal to p).  */
              unsigned char utf8[6];
              int count = u8_uctomb (utf8, uc, 6);

              if (count > 0)
                {
                  memcpy (q, utf8, count);
                  q += count;
                  p += consumed;
                  continue;
                }
            }
        }

      /* Not a convertible escape: copy one byte unchanged.  */
      *q++ = (unsigned char) *p++;
    }
  *q = '\0';
  return string;
}
/* Reads a key (IN_KEY true) or a value (IN_KEY false) string.
   Returns the string in UTF-8 encoding, freshly xmalloc'ed, or NULL if the
   end of the logical line is reached before any character of the string.
   Where parsing stops:
     - when returning NULL: after the end of the logical line,
     - for a key: after the whitespace and possibly the '=' or ':'
       separator that follows the key; a line end terminating the key is
       pushed back so that the value read sees it,
     - for a value: after the end of the logical line.  */

static char *
read_escaped_string (bool in_key)
{
  /* Accumulator of UTF-16 units, reused across calls.  */
  static unsigned short *units;
  static size_t units_allocated;
  static size_t units_count;
  unsigned char *result;
  unsigned char *out;
  size_t pos;
  int c;

  /* Skip whitespace before the string.  */
  do
    c = phase3_getc ();
  while (c == ' ' || c == '\t' || c == '\r' || c == '\f');

  /* Empty string.  */
  if (c == EOF || c == '\n')
    return NULL;

  /* Accumulate the string as UTF-16 first and convert to UTF-8 afterwards:
     the input may spell one character as a surrogate pair such as
     \uD800\uDF00, and the pair must become a single UTF-8 character.  */
  units_count = 0;
  for (;;)
    {
      bool blank = (c == ' ' || c == '\t' || c == '\r' || c == '\f');

      if (in_key && (blank || c == '=' || c == ':'))
        {
          /* Skip whitespace after the string.  */
          while (c == ' ' || c == '\t' || c == '\r' || c == '\f')
            c = phase3_getc ();
          /* Skip '=' or ':' separator; anything else belongs to the
             value.  */
          if (c != '=' && c != ':')
            phase3_ungetc (c);
          break;
        }

      /* C was only a lookahead; let phase 4 re-read it with escape
         processing.  */
      phase3_ungetc (c);

      /* Read the next UTF-16 codepoint.  */
      c = phase4_getuc ();
      if (c < 0)
        break;

      /* Append it to the buffer.  */
      if (units_count >= units_allocated)
        {
          units_allocated += 100;
          units = xrealloc (units, units_allocated * sizeof (unsigned short));
        }
      units[units_count++] = c;

      c = phase3_getc ();
      if (c == EOF || c == '\n')
        {
          if (in_key)
            phase3_ungetc (c);
          break;
        }
    }

  /* Now convert from UTF-16 to UTF-8.
     Each UTF-16 word needs 3 bytes at worst.  */
  result = (unsigned char *) xmalloc (3 * units_count + 1);
  out = result;
  pos = 0;
  while (pos < units_count)
    {
      unsigned int uc;
      int n;

      pos += u16_mbtouc (&uc, units + pos, units_count - pos);
      n = u8_uctomb (out, uc, 6);
      assert (n > 0);
      out += n;
    }
  *out = '\0';
  assert (out - result <= 3 * units_count);

  return (char *) result;
}
440 /* Read a .properties file from a stream, and dispatch to the various
441 abstract_po_reader_class_ty methods. */
442 void
443 properties_parse (abstract_po_reader_ty *this, FILE *file,
444 const char *real_filename, const char *logical_filename)
446 fp = file;
447 real_file_name = real_filename;
448 gram_pos.file_name = xstrdup (real_file_name);
449 gram_pos.line_number = 1;
451 for (;;)
453 int c;
454 bool comment;
455 bool hidden;
457 c = phase2_getc ();
459 if (c == EOF)
460 break;
462 comment = false;
463 hidden = false;
464 if (c == '#')
465 comment = true;
466 else if (c == '!')
468 /* For compatibility with write-properties.c, we treat '!' not
469 followed by space as a fuzzy or untranslated message. */
470 int c2 = phase2_getc ();
471 if (c2 == ' ' || c2 == '\n' || c2 == EOF)
472 comment = true;
473 else
474 hidden = true;
475 phase2_ungetc (c2);
477 else
478 phase2_ungetc (c);
480 if (comment)
482 /* A comment line. */
483 static char *buffer;
484 static size_t bufmax;
485 static size_t buflen;
487 buflen = 0;
488 for (;;)
490 c = phase2_getc ();
492 if (buflen >= bufmax)
494 bufmax += 100;
495 buffer = xrealloc (buffer, bufmax);
498 if (c == EOF || c == '\n')
499 break;
501 buffer[buflen++] = c;
503 buffer[buflen] = '\0';
505 po_callback_comment_dispatcher (conv_from_java (conv_from_iso_8859_1 (buffer)));
507 else
509 /* A key/value pair. */
510 char *msgid;
511 lex_pos_ty msgid_pos;
513 msgid_pos = gram_pos;
514 msgid = read_escaped_string (true);
515 if (msgid == NULL)
516 /* Skip blank line. */
518 else
520 char *msgstr;
521 lex_pos_ty msgstr_pos;
522 bool force_fuzzy;
524 msgstr_pos = gram_pos;
525 msgstr = read_escaped_string (false);
526 if (msgstr == NULL)
527 msgstr = xstrdup ("");
529 /* Be sure to make the message fuzzy if it was commented out
530 and if it is not already header/fuzzy/untranslated. */
531 force_fuzzy = (hidden && msgid[0] != '\0' && msgstr[0] != '\0');
533 po_callback_message (msgid, &msgid_pos, NULL,
534 msgstr, strlen (msgstr) + 1, &msgstr_pos,
535 force_fuzzy, false);
540 fp = NULL;
541 real_file_name = NULL;
542 gram_pos.line_number = 0;