gnu/dist/gettext/gettext-tools/src/po-charset.c

   1 /* Charset handling while reading PO files.
   2    Copyright (C) 2001-2005 Free Software Foundation, Inc.
   3    Written by Bruno Haible <haible@clisp.cons.org>, 2001.
   4
   5    This program is free software; you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation; either version 2, or (at your option)
   8    any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program; if not, write to the Free Software Foundation,
  17    Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  18
  19
  20 #ifdef HAVE_CONFIG_H
  21 # include "config.h"
  22 #endif
  23 #include <alloca.h>
  24
  25 /* Specification.  */
  26 #include "po-charset.h"
  27
  28 #include <stdlib.h>
  29 #include <string.h>
  30
  31 #include "xallocsa.h"
  32 #include "xerror.h"
  33 #include "po-error.h"
  34 #include "basename.h"
  35 #include "progname.h"
  36 #include "strstr.h"
  37 #include "c-strcase.h"
  38 #include "gettext.h"
  39
  40 #define _(str) gettext (str)
  41
  42 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
  43
  44 static const char ascii[] = "ASCII";
  45
  46 /* The canonicalized encoding name for ASCII.  */
  47 const char *po_charset_ascii = ascii;
  48
  49 static const char utf8[] = "UTF-8";
  50
  51 /* The canonicalized encoding name for UTF-8.  */
  52 const char *po_charset_utf8 = utf8;
  53
  54 /* Canonicalize an encoding name.  */
  55 const char *
  56 po_charset_canonicalize (const char *charset)
  57 {
  58   /* The list of charsets supported by glibc's iconv() and by the portable
  59      iconv() across platforms.  Taken from intl/config.charset.  */
  60   static const char *standard_charsets[] =
  61   {
  62     ascii, "ANSI_X3.4-1968", "US-ASCII",        /* i = 0..2 */
  63     "ISO-8859-1", "ISO_8859-1",                 /* i = 3, 4 */
  64     "ISO-8859-2", "ISO_8859-2",
  65     "ISO-8859-3", "ISO_8859-3",
  66     "ISO-8859-4", "ISO_8859-4",
  67     "ISO-8859-5", "ISO_8859-5",
  68     "ISO-8859-6", "ISO_8859-6",
  69     "ISO-8859-7", "ISO_8859-7",
  70     "ISO-8859-8", "ISO_8859-8",
  71     "ISO-8859-9", "ISO_8859-9",
  72     "ISO-8859-13", "ISO_8859-13",
  73     "ISO-8859-14", "ISO_8859-14",
  74     "ISO-8859-15", "ISO_8859-15",               /* i = 25, 26 */
  75     "KOI8-R",
  76     "KOI8-U",
  77     "KOI8-T",
  78     "CP850",
  79     "CP866",
  80     "CP874",
  81     "CP932",
  82     "CP949",
  83     "CP950",
  84     "CP1250",
  85     "CP1251",
  86     "CP1252",
  87     "CP1253",
  88     "CP1254",
  89     "CP1255",
  90     "CP1256",
  91     "CP1257",
  92     "GB2312",
  93     "EUC-JP",
  94     "EUC-KR",
  95     "EUC-TW",
  96     "BIG5",
  97     "BIG5-HKSCS",
  98     "GBK",
  99     "GB18030",
 100     "SHIFT_JIS",
 101     "JOHAB",
 102     "TIS-620",
 103     "VISCII",
 104     "GEORGIAN-PS",
 105     utf8
 106   };
 107   size_t i;
 108
 109   for (i = 0; i < SIZEOF (standard_charsets); i++)
 110     if (c_strcasecmp (charset, standard_charsets[i]) == 0)
 111       return standard_charsets[i < 3 ? 0 : i < 27 ? ((i - 3) & ~1) + 3 : i];
 112   return NULL;
 113 }
 114
 115 /* Test for ASCII compatibility.  */
 116 bool
 117 po_charset_ascii_compatible (const char *canon_charset)
 118 {
 119   /* There are only a few exceptions to ASCII compatibility.  */
 120   if (strcmp (canon_charset, "SHIFT_JIS") == 0
 121       || strcmp (canon_charset, "JOHAB") == 0
 122       || strcmp (canon_charset, "VISCII") == 0)
 123     return false;
 124   else
 125     return true;
 126 }
 127
 128 /* Test for a weird encoding, i.e. an encoding which has double-byte
 129    characters ending in 0x5C.  */
 130 bool po_is_charset_weird (const char *canon_charset)
 131 {
 132   static const char *weird_charsets[] =
 133   {
 134     "BIG5",
 135     "BIG5-HKSCS",
 136     "GBK",
 137     "GB18030",
 138     "SHIFT_JIS",
 139     "JOHAB"
 140   };
 141   size_t i;
 142
 143   for (i = 0; i < SIZEOF (weird_charsets); i++)
 144     if (strcmp (canon_charset, weird_charsets[i]) == 0)
 145       return true;
 146   return false;
 147 }
 148
 149 /* Test for a weird CJK encoding, i.e. a weird encoding with CJK structure.
 150    An encoding has CJK structure if every valid character stream is composed
 151    of single bytes in the range 0x{00..7F} and of byte pairs in the range
 152    0x{80..FF}{30..FF}.  */
 153 bool po_is_charset_weird_cjk (const char *canon_charset)
 154 {
 155   static const char *weird_cjk_charsets[] =
 156   {                     /* single bytes   double bytes       */
 157     "BIG5",             /* 0x{00..7F},    0x{A1..F9}{40..FE} */
 158     "BIG5-HKSCS",       /* 0x{00..7F},    0x{88..FE}{40..FE} */
 159     "GBK",              /* 0x{00..7F},    0x{81..FE}{40..FE} */
 160     "GB18030",          /* 0x{00..7F},    0x{81..FE}{30..FE} */
 161     "SHIFT_JIS",        /* 0x{00..7F},    0x{81..F9}{40..FC} */
 162     "JOHAB"             /* 0x{00..7F},    0x{84..F9}{31..FE} */
 163   };
 164   size_t i;
 165
 166   for (i = 0; i < SIZEOF (weird_cjk_charsets); i++)
 167     if (strcmp (canon_charset, weird_cjk_charsets[i]) == 0)
 168       return true;
 169   return false;
 170 }
 171
 172
 173 /* The PO file's encoding, as specified in the header entry.  */
 174 const char *po_lex_charset;
 175
 176 #if HAVE_ICONV
 177 /* Converter from the PO file's encoding to UTF-8.  */
 178 iconv_t po_lex_iconv;
 179 #endif
 180 /* If no converter is available, some information about the structure of the
 181    PO file's encoding.  */
 182 bool po_lex_weird_cjk;
 183
 184 void
 185 po_lex_charset_init ()
 186 {
 187   po_lex_charset = NULL;
 188 #if HAVE_ICONV
 189   po_lex_iconv = (iconv_t)(-1);
 190 #endif
 191   po_lex_weird_cjk = false;
 192 }
 193
 194 void
 195 po_lex_charset_set (const char *header_entry, const char *filename)
 196 {
 197   /* Verify the validity of CHARSET.  It is necessary
 198      1. for the correct treatment of multibyte characters containing
 199         0x5C bytes in the PO lexer,
 200      2. so that at run time, gettext() can call iconv() to convert
 201         msgstr.  */
 202   const char *charsetstr = strstr (header_entry, "charset=");
 203
 204   if (charsetstr != NULL)
 205     {
 206       size_t len;
 207       char *charset;
 208       const char *canon_charset;
 209
 210       charsetstr += strlen ("charset=");
 211       len = strcspn (charsetstr, " \t\n");
 212       charset = (char *) xallocsa (len + 1);
 213       memcpy (charset, charsetstr, len);
 214       charset[len] = '\0';
 215
 216       canon_charset = po_charset_canonicalize (charset);
 217       if (canon_charset == NULL)
 218         {
 219           /* Don't warn for POT files, because POT files usually contain
 220              only ASCII msgids.  */
 221           size_t filenamelen = strlen (filename);
 222
 223           if (!(filenamelen >= 4
 224                 && memcmp (filename + filenamelen - 4, ".pot", 4) == 0
 225                 && strcmp (charset, "CHARSET") == 0))
 226             po_multiline_warning (xasprintf (_("%s: warning: "), filename),
 227                                   xasprintf (_("\
 228 Charset \"%s\" is not a portable encoding name.\n\
 229 Message conversion to user's charset might not work.\n"),
 230                                              charset));
 231         }
 232       else
 233         {
 234           const char *envval;
 235
 236           po_lex_charset = canon_charset;
 237 #if HAVE_ICONV
 238           if (po_lex_iconv != (iconv_t)(-1))
 239             iconv_close (po_lex_iconv);
 240 #endif
 241
 242           /* The old Solaris/openwin msgfmt and GNU msgfmt <= 0.10.35
 243              don't know about multibyte encodings, and require a spurious
 244              backslash after every multibyte character whose last byte is
 245              0x5C.  Some programs, like vim, distribute PO files in this
 246              broken format.  GNU msgfmt must continue to support this old
 247              PO file format when the Makefile requests it.  */
 248           envval = getenv ("OLD_PO_FILE_INPUT");
 249           if (envval != NULL && *envval != '\0')
 250             {
 251               /* Assume the PO file is in old format, with extraneous
 252                  backslashes.  */
 253 #if HAVE_ICONV
 254               po_lex_iconv = (iconv_t)(-1);
 255 #endif
 256               po_lex_weird_cjk = false;
 257             }
 258           else
 259             {
 260               /* Use iconv() to parse multibyte characters.  */
 261 #if HAVE_ICONV
 262               /* Avoid glibc-2.1 bug with EUC-KR.  */
 263 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
 264               if (strcmp (po_lex_charset, "EUC-KR") == 0)
 265                 po_lex_iconv = (iconv_t)(-1);
 266               else
 267 # endif
 268               /* Avoid Solaris 2.9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS,
 269                  GBK, GB18030.  */
 270 # if defined __sun && !defined _LIBICONV_VERSION
 271               if (   strcmp (po_lex_charset, "GB2312") == 0
 272                   || strcmp (po_lex_charset, "EUC-TW") == 0
 273                   || strcmp (po_lex_charset, "BIG5") == 0
 274                   || strcmp (po_lex_charset, "BIG5-HKSCS") == 0
 275                   || strcmp (po_lex_charset, "GBK") == 0
 276                   || strcmp (po_lex_charset, "GB18030") == 0)
 277                 po_lex_iconv = (iconv_t)(-1);
 278               else
 279 # endif
 280               po_lex_iconv = iconv_open ("UTF-8", po_lex_charset);
 281               if (po_lex_iconv == (iconv_t)(-1))
 282                 {
 283                   const char *note;
 284
 285                   /* Test for a charset which has double-byte characters
 286                      ending in 0x5C.  For these encodings, the string parser
 287                      is likely to be confused if it can't see the character
 288                      boundaries.  */
 289                   po_lex_weird_cjk = po_is_charset_weird_cjk (po_lex_charset);
 290                   if (po_is_charset_weird (po_lex_charset)
 291                       && !po_lex_weird_cjk)
 292                     note = _("Continuing anyway, expect parse errors.");
 293                   else
 294                     note = _("Continuing anyway.");
 295
 296                   po_multiline_warning (xasprintf (_("%s: warning: "), filename),
 297                                         xasprintf (_("\
 298 Charset \"%s\" is not supported. %s relies on iconv(),\n\
 299 and iconv() does not support \"%s\".\n"),
 300                                                    po_lex_charset,
 301                                                    basename (program_name),
 302                                                    po_lex_charset));
 303
 304 # if !defined _LIBICONV_VERSION
 305                   po_multiline_warning (NULL,
 306                                         xasprintf (_("\
 307 Installing GNU libiconv and then reinstalling GNU gettext\n\
 308 would fix this problem.\n")));
 309 # endif
 310
 311                   po_multiline_warning (NULL, xasprintf (_("%s\n"), note));
 312                 }
 313 #else
 314               /* Test for a charset which has double-byte characters
 315                  ending in 0x5C.  For these encodings, the string parser
 316                  is likely to be confused if it can't see the character
 317                  boundaries.  */
 318               po_lex_weird_cjk = po_is_charset_weird_cjk (po_lex_charset);
 319               if (po_is_charset_weird (po_lex_charset) && !po_lex_weird_cjk)
 320                 {
 321                   const char *note =
 322                     _("Continuing anyway, expect parse errors.");
 323
 324                   po_multiline_warning (xasprintf (_("%s: warning: "), filename),
 325                                         xasprintf (_("\
 326 Charset \"%s\" is not supported. %s relies on iconv().\n\
 327 This version was built without iconv().\n"),
 328                                                    po_lex_charset,
 329                                                    basename (program_name)));
 330
 331                   po_multiline_warning (NULL,
 332                                         xasprintf (_("\
 333 Installing GNU libiconv and then reinstalling GNU gettext\n\
 334 would fix this problem.\n")));
 335
 336                   po_multiline_warning (NULL, xasprintf (_("%s\n"), note));
 337                 }
 338 #endif
 339             }
 340         }
 341       freesa (charset);
 342     }
 343   else
 344     {
 345       /* Don't warn for POT files, because POT files usually contain
 346          only ASCII msgids.  */
 347       size_t filenamelen = strlen (filename);
 348
 349       if (!(filenamelen >= 4
 350             && memcmp (filename + filenamelen - 4, ".pot", 4) == 0))
 351         po_multiline_warning (xasprintf (_("%s: warning: "), filename),
 352                               xasprintf (_("\
 353 Charset missing in header.\n\
 354 Message conversion to user's charset will not work.\n")));
 355     }
 356 }
 357
 358 void
 359 po_lex_charset_close ()
 360 {
 361   po_lex_charset = NULL;
 362 #if HAVE_ICONV
 363   if (po_lex_iconv != (iconv_t)(-1))
 364     {
 365       iconv_close (po_lex_iconv);
 366       po_lex_iconv = (iconv_t)(-1);
 367     }
 368 #endif
 369   po_lex_weird_cjk = false;
 370 }