gnu/dist/gettext/gettext-tools/src/read-properties.c

   1 /* Reading Java .properties files.
   2    Copyright (C) 2003 Free Software Foundation, Inc.
   3    Written by Bruno Haible <bruno@clisp.org>, 2003.
   4
   5    This program is free software; you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation; either version 2, or (at your option)
   8    any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program; if not, write to the Free Software Foundation,
  17    Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  18
  19 #ifdef HAVE_CONFIG_H
  20 # include <config.h>
  21 #endif
  22
  23 /* Specification.  */
  24 #include "read-properties.h"
  25
  26 #include <assert.h>
  27 #include <errno.h>
  28 #include <stdbool.h>
  29 #include <stdio.h>
  30 #include <stdlib.h>
  31 #include <string.h>
  32
  33 #include "error.h"
  34 #include "error-progname.h"
  35 #include "message.h"
  36 #include "read-po-abstract.h"
  37 #include "xalloc.h"
  38 #include "exit.h"
  39 #include "msgl-ascii.h"
  40 #include "utf16-ucs4.h"
  41 #include "ucs4-utf8.h"
  42 #include "gettext.h"
  43
  44 #define _(str) gettext (str)
  45
  46 /* The format of the Java .properties files is documented in the JDK
  47    documentation for class java.util.Properties.  In the case of .properties
  48    files for PropertyResourceBundle, each non-comment line contains a
  49    key/value pair in the form "key = value" or "key : value" or "key value",
  50    where the key is the msgid and the value is the msgstr.  Messages with
  51    plurals are not supported in this format.  */
  52
  53 /* Handling of comments: We copy all comments from the .properties file to
  54    the PO file. This is not really needed; it's a service for translators
  55    who don't like PO files and prefer to maintain the .properties file.  */
  56
/* Real filename, used in error messages about the input file.
   Set by properties_parse () on entry and reset to NULL when it returns.  */
static const char *real_file_name;

/* File name and line number of the current input position.
   Defined in another translation unit; properties_parse () initializes it,
   and the phase 2 reader keeps its line_number field up to date.  */
extern lex_pos_ty gram_pos;

/* The input file stream.
   Set by properties_parse () on entry and reset to NULL when it returns.  */
static FILE *fp;
  65
  66
  67 /* Phase 1: Read an ISO-8859-1 character.
  68    Max. 1 pushback character.  */
  69
  70 static int
  71 phase1_getc ()
  72 {
  73   int c;
  74
  75   c = getc (fp);
  76
  77   if (c == EOF)
  78     {
  79       if (ferror (fp))
  80         error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
  81                real_file_name);
  82       return EOF;
  83     }
  84
  85   return c;
  86 }
  87
  88 static inline void
  89 phase1_ungetc (int c)
  90 {
  91   if (c != EOF)
  92     ungetc (c, fp);
  93 }
  94
  95
  96 /* Phase 2: Read an ISO-8859-1 character, treating CR/LF like a single LF.
  97    Max. 2 pushback characters.  */
  98
  99 static unsigned char phase2_pushback[2];
 100 static int phase2_pushback_length;
 101
 102 static int
 103 phase2_getc ()
 104 {
 105   int c;
 106
 107   if (phase2_pushback_length)
 108     c = phase2_pushback[--phase2_pushback_length];
 109   else
 110     {
 111       c = phase1_getc ();
 112
 113       if (c == '\r')
 114         {
 115           int c2 = phase1_getc ();
 116           if (c2 == '\n')
 117             c = c2;
 118           else
 119             phase1_ungetc (c2);
 120         }
 121     }
 122
 123   if (c == '\n')
 124     gram_pos.line_number++;
 125
 126   return c;
 127 }
 128
 129 static void
 130 phase2_ungetc (int c)
 131 {
 132   if (c == '\n')
 133     --gram_pos.line_number;
 134   if (c != EOF)
 135     phase2_pushback[phase2_pushback_length++] = c;
 136 }
 137
 138
 139 /* Phase 3: Read an ISO-8859-1 character, treating CR/LF like a single LF,
 140    with handling of continuation lines.
 141    Max. 1 pushback character.  */
 142
 143 static int
 144 phase3_getc ()
 145 {
 146   int c = phase2_getc ();
 147
 148   for (;;)
 149     {
 150       if (c != '\\')
 151         return c;
 152
 153       c = phase2_getc ();
 154       if (c != '\n')
 155         {
 156           phase2_ungetc (c);
 157           return '\\';
 158         }
 159
 160       /* Skip the backslash-newline and all whitespace that follows it.  */
 161       do
 162         c = phase2_getc ();
 163       while (c == ' ' || c == '\t' || c == '\r' || c == '\f');
 164     }
 165 }
 166
 167 static inline void
 168 phase3_ungetc (int c)
 169 {
 170   phase2_ungetc (c);
 171 }
 172
 173
 174 /* Phase 4: Read an UTF-16 codepoint, treating CR/LF like a single LF,
 175    with handling of continuation lines and of \uxxxx sequences.  */
 176
 177 static int
 178 phase4_getuc ()
 179 {
 180   int c = phase3_getc ();
 181
 182   if (c == EOF)
 183     return -1;
 184   if (c == '\\')
 185     {
 186       int c2 = phase3_getc ();
 187
 188       if (c2 == 't')
 189         return '\t';
 190       if (c2 == 'n')
 191         return '\n';
 192       if (c2 == 'r')
 193         return '\r';
 194       if (c2 == 'f')
 195         return '\f';
 196       if (c2 == 'u')
 197         {
 198           unsigned int n = 0;
 199           int i;
 200
 201           for (i = 0; i < 4; i++)
 202             {
 203               int c1 = phase3_getc ();
 204
 205               if (c1 >= '0' && c1 <= '9')
 206                 n = (n << 4) + (c1 - '0');
 207               else if (c1 >= 'A' && c1 <= 'F')
 208                 n = (n << 4) + (c1 - 'A' + 10);
 209               else if (c1 >= 'a' && c1 <= 'f')
 210                 n = (n << 4) + (c1 - 'a' + 10);
 211               else
 212                 {
 213                   phase3_ungetc (c1);
 214                   error_with_progname = false;
 215                   error (0, 0, _("%s:%lu: warning: invalid \\uxxxx syntax for Unicode character"),
 216                          real_file_name, (unsigned long) gram_pos.line_number);
 217                   error_with_progname = true;
 218                   return 'u';
 219                 }
 220             }
 221           return n;
 222         }
 223
 224       return c2;
 225     }
 226   else
 227     return c;
 228 }
 229
 230
 231 /* Converts a string from ISO-8859-1 encoding to UTF-8 encoding.  */
 232 static char *
 233 conv_from_iso_8859_1 (char *string)
 234 {
 235   if (is_ascii_string (string))
 236     return string;
 237   else
 238     {
 239       size_t length = strlen (string);
 240       /* Each ISO-8859-1 character needs 2 bytes at worst.  */
 241       unsigned char *utf8_string = (unsigned char *) xmalloc (2 * length + 1);
 242       unsigned char *q = utf8_string;
 243       const char *str = string;
 244       const char *str_limit = str + length;
 245
 246       while (str < str_limit)
 247         {
 248           unsigned int uc = (unsigned char) *str++;
 249           int n = u8_uctomb (q, uc, 6);
 250           assert (n > 0);
 251           q += n;
 252         }
 253       *q = '\0';
 254       assert (q - utf8_string <= 2 * length);
 255
 256       return (char *) utf8_string;
 257     }
 258 }
 259
 260
 261 /* Converts a string from JAVA encoding (with \uxxxx sequences) to UTF-8
 262    encoding.  May destructively modify the argument string.  */
 263 static char *
 264 conv_from_java (char *string)
 265 {
 266   /* This conversion can only shrink the string, never increase its size.
 267      So there is no need to xmalloc the result freshly.  */
 268   const char *p = string;
 269   unsigned char *q = (unsigned char *) string;
 270
 271   while (*p != '\0')
 272     {
 273       if (p[0] == '\\' && p[1] == 'u')
 274         {
 275           unsigned int n = 0;
 276           int i;
 277
 278           for (i = 0; i < 4; i++)
 279             {
 280               int c1 = (unsigned char) p[2 + i];
 281
 282               if (c1 >= '0' && c1 <= '9')
 283                 n = (n << 4) + (c1 - '0');
 284               else if (c1 >= 'A' && c1 <= 'F')
 285                 n = (n << 4) + (c1 - 'A' + 10);
 286               else if (c1 >= 'a' && c1 <= 'f')
 287                 n = (n << 4) + (c1 - 'a' + 10);
 288               else
 289                 goto just_one_byte;
 290             }
 291
 292           if (i == 4)
 293             {
 294               unsigned int uc;
 295
 296               if (n >= 0xd800 && n < 0xdc00)
 297                 {
 298                   if (p[6] == '\\' && p[7] == 'u')
 299                     {
 300                       unsigned int m = 0;
 301
 302                       for (i = 0; i < 4; i++)
 303                         {
 304                           int c1 = (unsigned char) p[8 + i];
 305
 306                           if (c1 >= '0' && c1 <= '9')
 307                             m = (m << 4) + (c1 - '0');
 308                           else if (c1 >= 'A' && c1 <= 'F')
 309                             m = (m << 4) + (c1 - 'A' + 10);
 310                           else if (c1 >= 'a' && c1 <= 'f')
 311                             m = (m << 4) + (c1 - 'a' + 10);
 312                           else
 313                             goto just_one_byte;
 314                         }
 315
 316                       if (i == 4 && (m >= 0xdc00 && m < 0xe000))
 317                         {
 318                           /* Combine two UTF-16 words to a character.  */
 319                           uc = 0x10000 + ((n - 0xd800) << 10) + (m - 0xdc00);
 320                           p += 12;
 321                         }
 322                       else
 323                         goto just_one_byte;
 324                     }
 325                   else
 326                     goto just_one_byte;
 327                 }
 328               else
 329                 {
 330                   uc = n;
 331                   p += 6;
 332                 }
 333
 334               q += u8_uctomb (q, uc, 6);
 335               continue;
 336             }
 337         }
 338       just_one_byte:
 339         *q++ = (unsigned char) *p++;
 340     }
 341   *q = '\0';
 342   return string;
 343 }
 344
 345
 346 /* Reads a key or value string.
 347    Returns the string in UTF-8 encoding, or NULL if the end of the logical
 348    line is reached.
 349    Parsing ends:
 350      - when returning NULL, after the end of the logical line,
 351      - otherwise, if in_key is true, after the whitespace and possibly the
 352        separator that follows after the string,
 353      - otherwise, if in_key is false, after the end of the logical line. */
 354
 355 static char *
 356 read_escaped_string (bool in_key)
 357 {
 358   static unsigned short *buffer;
 359   static size_t bufmax;
 360   static size_t buflen;
 361   int c;
 362
 363   /* Skip whitespace before the string.  */
 364   do
 365     c = phase3_getc ();
 366   while (c == ' ' || c == '\t' || c == '\r' || c == '\f');
 367
 368   if (c == EOF || c == '\n')
 369     /* Empty string.  */
 370     return NULL;
 371
 372   /* Start accumulating the string.  We store the string in UTF-16 before
 373      converting it to UTF-8.  Why not converting every character directly to
 374      UTF-8? Because a string can contain surrogates like \uD800\uDF00, and
 375      we must combine them to a single UTF-8 character.  */
 376   buflen = 0;
 377   for (;;)
 378     {
 379       if (in_key && (c == '=' || c == ':'
 380                      || c == ' ' || c == '\t' || c == '\r' || c == '\f'))
 381         {
 382           /* Skip whitespace after the string.  */
 383           while (c == ' ' || c == '\t' || c == '\r' || c == '\f')
 384             c = phase3_getc ();
 385           /* Skip '=' or ':' separator.  */
 386           if (!(c == '=' || c == ':'))
 387             phase3_ungetc (c);
 388           break;
 389         }
 390
 391       phase3_ungetc (c);
 392
 393       /* Read the next UTF-16 codepoint.  */
 394       c = phase4_getuc ();
 395       if (c < 0)
 396         break;
 397       /* Append it to the buffer.  */
 398       if (buflen >= bufmax)
 399         {
 400           bufmax += 100;
 401           buffer = xrealloc (buffer, bufmax * sizeof (unsigned short));
 402         }
 403       buffer[buflen++] = c;
 404
 405       c = phase3_getc ();
 406       if (c == EOF || c == '\n')
 407         {
 408           if (in_key)
 409             phase3_ungetc (c);
 410           break;
 411         }
 412     }
 413
 414   /* Now convert from UTF-16 to UTF-8.  */
 415   {
 416     size_t pos;
 417     unsigned char *utf8_string;
 418     unsigned char *q;
 419
 420     /* Each UTF-16 word needs 3 bytes at worst.  */
 421     utf8_string = (unsigned char *) xmalloc (3 * buflen + 1);
 422     for (pos = 0, q = utf8_string; pos < buflen; )
 423       {
 424         unsigned int uc;
 425         int n;
 426
 427         pos += u16_mbtouc (&uc, buffer + pos, buflen - pos);
 428         n = u8_uctomb (q, uc, 6);
 429         assert (n > 0);
 430         q += n;
 431       }
 432     *q = '\0';
 433     assert (q - utf8_string <= 3 * buflen);
 434
 435     return (char *) utf8_string;
 436   }
 437 }
 438
 439
 440 /* Read a .properties file from a stream, and dispatch to the various
 441    abstract_po_reader_class_ty methods.  */
 442 void
 443 properties_parse (abstract_po_reader_ty *this, FILE *file,
 444                   const char *real_filename, const char *logical_filename)
 445 {
 446   fp = file;
 447   real_file_name = real_filename;
 448   gram_pos.file_name = xstrdup (real_file_name);
 449   gram_pos.line_number = 1;
 450
 451   for (;;)
 452     {
 453       int c;
 454       bool comment;
 455       bool hidden;
 456
 457       c = phase2_getc ();
 458
 459       if (c == EOF)
 460         break;
 461
 462       comment = false;
 463       hidden = false;
 464       if (c == '#')
 465         comment = true;
 466       else if (c == '!')
 467         {
 468           /* For compatibility with write-properties.c, we treat '!' not
 469              followed by space as a fuzzy or untranslated message.  */
 470           int c2 = phase2_getc ();
 471           if (c2 == ' ' || c2 == '\n' || c2 == EOF)
 472             comment = true;
 473           else
 474             hidden = true;
 475           phase2_ungetc (c2);
 476         }
 477       else
 478         phase2_ungetc (c);
 479
 480       if (comment)
 481         {
 482           /* A comment line.  */
 483           static char *buffer;
 484           static size_t bufmax;
 485           static size_t buflen;
 486
 487           buflen = 0;
 488           for (;;)
 489             {
 490               c = phase2_getc ();
 491
 492               if (buflen >= bufmax)
 493                 {
 494                   bufmax += 100;
 495                   buffer = xrealloc (buffer, bufmax);
 496                 }
 497
 498               if (c == EOF || c == '\n')
 499                 break;
 500
 501               buffer[buflen++] = c;
 502             }
 503           buffer[buflen] = '\0';
 504
 505           po_callback_comment_dispatcher (conv_from_java (conv_from_iso_8859_1 (buffer)));
 506         }
 507       else
 508         {
 509           /* A key/value pair.  */
 510           char *msgid;
 511           lex_pos_ty msgid_pos;
 512
 513           msgid_pos = gram_pos;
 514           msgid = read_escaped_string (true);
 515           if (msgid == NULL)
 516             /* Skip blank line.  */
 517             ;
 518           else
 519             {
 520               char *msgstr;
 521               lex_pos_ty msgstr_pos;
 522               bool force_fuzzy;
 523
 524               msgstr_pos = gram_pos;
 525               msgstr = read_escaped_string (false);
 526               if (msgstr == NULL)
 527                 msgstr = xstrdup ("");
 528
 529               /* Be sure to make the message fuzzy if it was commented out
 530                  and if it is not already header/fuzzy/untranslated.  */
 531               force_fuzzy = (hidden && msgid[0] != '\0' && msgstr[0] != '\0');
 532
 533               po_callback_message (msgid, &msgid_pos, NULL,
 534                                    msgstr, strlen (msgstr) + 1, &msgstr_pos,
 535                                    force_fuzzy, false);
 536             }
 537         }
 538     }
 539
 540   fp = NULL;
 541   real_file_name = NULL;
 542   gram_pos.line_number = 0;
 543 }