Sync usage with man page.
[netbsd-mini2440.git] / gnu / dist / gettext / gettext-tools / src / po-charset.c
blobb7533d6fe6d4773633276750a793bb93eafc192a
/* Charset handling while reading PO files.
   Copyright (C) 2001-2005 Free Software Foundation, Inc.
   Written by Bruno Haible <haible@clisp.cons.org>, 2001.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software Foundation,
   Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
20 #ifdef HAVE_CONFIG_H
21 # include "config.h"
22 #endif
23 #include <alloca.h>
25 /* Specification. */
26 #include "po-charset.h"
28 #include <stdlib.h>
29 #include <string.h>
31 #include "xallocsa.h"
32 #include "xerror.h"
33 #include "po-error.h"
34 #include "basename.h"
35 #include "progname.h"
36 #include "strstr.h"
37 #include "c-strcase.h"
38 #include "gettext.h"
/* Shorthand for marking and translating a message via gettext().  */
#define _(str) gettext (str)

/* Number of elements of the array A.  Only meaningful when A is a real
   array, not a pointer.  The argument is parenthesized so that an
   expression argument is indexed as a whole.  */
#define SIZEOF(a) (sizeof(a) / sizeof((a)[0]))
/* Backing storage for the canonical ASCII name.  */
static const char ascii[] = "ASCII";

/* The canonicalized encoding name for ASCII.  */
const char *po_charset_ascii = ascii;

/* Backing storage for the canonical UTF-8 name.  */
static const char utf8[] = "UTF-8";

/* The canonicalized encoding name for UTF-8.  */
const char *po_charset_utf8 = utf8;
54 /* Canonicalize an encoding name. */
55 const char *
56 po_charset_canonicalize (const char *charset)
58 /* The list of charsets supported by glibc's iconv() and by the portable
59 iconv() across platforms. Taken from intl/config.charset. */
60 static const char *standard_charsets[] =
62 ascii, "ANSI_X3.4-1968", "US-ASCII", /* i = 0..2 */
63 "ISO-8859-1", "ISO_8859-1", /* i = 3, 4 */
64 "ISO-8859-2", "ISO_8859-2",
65 "ISO-8859-3", "ISO_8859-3",
66 "ISO-8859-4", "ISO_8859-4",
67 "ISO-8859-5", "ISO_8859-5",
68 "ISO-8859-6", "ISO_8859-6",
69 "ISO-8859-7", "ISO_8859-7",
70 "ISO-8859-8", "ISO_8859-8",
71 "ISO-8859-9", "ISO_8859-9",
72 "ISO-8859-13", "ISO_8859-13",
73 "ISO-8859-14", "ISO_8859-14",
74 "ISO-8859-15", "ISO_8859-15", /* i = 25, 26 */
75 "KOI8-R",
76 "KOI8-U",
77 "KOI8-T",
78 "CP850",
79 "CP866",
80 "CP874",
81 "CP932",
82 "CP949",
83 "CP950",
84 "CP1250",
85 "CP1251",
86 "CP1252",
87 "CP1253",
88 "CP1254",
89 "CP1255",
90 "CP1256",
91 "CP1257",
92 "GB2312",
93 "EUC-JP",
94 "EUC-KR",
95 "EUC-TW",
96 "BIG5",
97 "BIG5-HKSCS",
98 "GBK",
99 "GB18030",
100 "SHIFT_JIS",
101 "JOHAB",
102 "TIS-620",
103 "VISCII",
104 "GEORGIAN-PS",
105 utf8
107 size_t i;
109 for (i = 0; i < SIZEOF (standard_charsets); i++)
110 if (c_strcasecmp (charset, standard_charsets[i]) == 0)
111 return standard_charsets[i < 3 ? 0 : i < 27 ? ((i - 3) & ~1) + 3 : i];
112 return NULL;
/* Test for ASCII compatibility.
   CANON_CHARSET is compared with strcmp(), i.e. case-sensitively, so it
   must already be in canonical spelling.
   Returns false for SHIFT_JIS, JOHAB and VISCII, true for any other
   name.  */
bool
po_charset_ascii_compatible (const char *canon_charset)
{
  /* There are only a few exceptions to ASCII compatibility.  */
  return !(strcmp (canon_charset, "SHIFT_JIS") == 0
           || strcmp (canon_charset, "JOHAB") == 0
           || strcmp (canon_charset, "VISCII") == 0);
}
/* Test for a weird encoding, i.e. an encoding which has double-byte
   characters ending in 0x5C.
   CANON_CHARSET is compared with strcmp(), i.e. case-sensitively, so it
   must already be in canonical spelling.
   Returns true if it is one of the names listed below, false otherwise.  */
bool
po_is_charset_weird (const char *canon_charset)
{
  static const char *const weird_charsets[] =
  {
    "BIG5",
    "BIG5-HKSCS",
    "GBK",
    "GB18030",
    "SHIFT_JIS",
    "JOHAB"
  };
  const size_t count = sizeof weird_charsets / sizeof weird_charsets[0];
  size_t i;

  for (i = 0; i < count; i++)
    if (strcmp (canon_charset, weird_charsets[i]) == 0)
      return true;
  return false;
}
/* Test for a weird CJK encoding, i.e. a weird encoding with CJK structure.
   An encoding has CJK structure if every valid character stream is composed
   of single bytes in the range 0x{00..7F} and of byte pairs in the range
   0x{80..FF}{30..FF}.
   CANON_CHARSET is compared with strcmp(), i.e. case-sensitively, so it
   must already be in canonical spelling.
   Returns true if it is one of the names listed below, false otherwise.  */
bool
po_is_charset_weird_cjk (const char *canon_charset)
{
  static const char *const weird_cjk_charsets[] =
  {                     /* single bytes   double bytes       */
    "BIG5",             /* 0x{00..7F},    0x{A1..F9}{40..FE} */
    "BIG5-HKSCS",       /* 0x{00..7F},    0x{88..FE}{40..FE} */
    "GBK",              /* 0x{00..7F},    0x{81..FE}{40..FE} */
    "GB18030",          /* 0x{00..7F},    0x{81..FE}{30..FE} */
    "SHIFT_JIS",        /* 0x{00..7F},    0x{81..F9}{40..FC} */
    "JOHAB"             /* 0x{00..7F},    0x{84..F9}{31..FE} */
  };
  const size_t count
    = sizeof weird_cjk_charsets / sizeof weird_cjk_charsets[0];
  size_t i;

  for (i = 0; i < count; i++)
    if (strcmp (canon_charset, weird_cjk_charsets[i]) == 0)
      return true;
  return false;
}
/* The PO file's encoding, as specified in the header entry.  */
const char *po_lex_charset;

#if HAVE_ICONV
/* Converter from the PO file's encoding to UTF-8.  */
iconv_t po_lex_iconv;
#endif
/* If no converter is available, some information about the structure of the
   PO file's encoding.  */
bool po_lex_weird_cjk;
184 void
185 po_lex_charset_init ()
187 po_lex_charset = NULL;
188 #if HAVE_ICONV
189 po_lex_iconv = (iconv_t)(-1);
190 #endif
191 po_lex_weird_cjk = false;
194 void
195 po_lex_charset_set (const char *header_entry, const char *filename)
197 /* Verify the validity of CHARSET. It is necessary
198 1. for the correct treatment of multibyte characters containing
199 0x5C bytes in the PO lexer,
200 2. so that at run time, gettext() can call iconv() to convert
201 msgstr. */
202 const char *charsetstr = strstr (header_entry, "charset=");
204 if (charsetstr != NULL)
206 size_t len;
207 char *charset;
208 const char *canon_charset;
210 charsetstr += strlen ("charset=");
211 len = strcspn (charsetstr, " \t\n");
212 charset = (char *) xallocsa (len + 1);
213 memcpy (charset, charsetstr, len);
214 charset[len] = '\0';
216 canon_charset = po_charset_canonicalize (charset);
217 if (canon_charset == NULL)
219 /* Don't warn for POT files, because POT files usually contain
220 only ASCII msgids. */
221 size_t filenamelen = strlen (filename);
223 if (!(filenamelen >= 4
224 && memcmp (filename + filenamelen - 4, ".pot", 4) == 0
225 && strcmp (charset, "CHARSET") == 0))
226 po_multiline_warning (xasprintf (_("%s: warning: "), filename),
227 xasprintf (_("\
228 Charset \"%s\" is not a portable encoding name.\n\
229 Message conversion to user's charset might not work.\n"),
230 charset));
232 else
234 const char *envval;
236 po_lex_charset = canon_charset;
237 #if HAVE_ICONV
238 if (po_lex_iconv != (iconv_t)(-1))
239 iconv_close (po_lex_iconv);
240 #endif
242 /* The old Solaris/openwin msgfmt and GNU msgfmt <= 0.10.35
243 don't know about multibyte encodings, and require a spurious
244 backslash after every multibyte character whose last byte is
245 0x5C. Some programs, like vim, distribute PO files in this
246 broken format. GNU msgfmt must continue to support this old
247 PO file format when the Makefile requests it. */
248 envval = getenv ("OLD_PO_FILE_INPUT");
249 if (envval != NULL && *envval != '\0')
251 /* Assume the PO file is in old format, with extraneous
252 backslashes. */
253 #if HAVE_ICONV
254 po_lex_iconv = (iconv_t)(-1);
255 #endif
256 po_lex_weird_cjk = false;
258 else
260 /* Use iconv() to parse multibyte characters. */
261 #if HAVE_ICONV
262 /* Avoid glibc-2.1 bug with EUC-KR. */
263 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
264 if (strcmp (po_lex_charset, "EUC-KR") == 0)
265 po_lex_iconv = (iconv_t)(-1);
266 else
267 # endif
268 /* Avoid Solaris 2.9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS,
269 GBK, GB18030. */
270 # if defined __sun && !defined _LIBICONV_VERSION
271 if ( strcmp (po_lex_charset, "GB2312") == 0
272 || strcmp (po_lex_charset, "EUC-TW") == 0
273 || strcmp (po_lex_charset, "BIG5") == 0
274 || strcmp (po_lex_charset, "BIG5-HKSCS") == 0
275 || strcmp (po_lex_charset, "GBK") == 0
276 || strcmp (po_lex_charset, "GB18030") == 0)
277 po_lex_iconv = (iconv_t)(-1);
278 else
279 # endif
280 po_lex_iconv = iconv_open ("UTF-8", po_lex_charset);
281 if (po_lex_iconv == (iconv_t)(-1))
283 const char *note;
285 /* Test for a charset which has double-byte characters
286 ending in 0x5C. For these encodings, the string parser
287 is likely to be confused if it can't see the character
288 boundaries. */
289 po_lex_weird_cjk = po_is_charset_weird_cjk (po_lex_charset);
290 if (po_is_charset_weird (po_lex_charset)
291 && !po_lex_weird_cjk)
292 note = _("Continuing anyway, expect parse errors.");
293 else
294 note = _("Continuing anyway.");
296 po_multiline_warning (xasprintf (_("%s: warning: "), filename),
297 xasprintf (_("\
298 Charset \"%s\" is not supported. %s relies on iconv(),\n\
299 and iconv() does not support \"%s\".\n"),
300 po_lex_charset,
301 basename (program_name),
302 po_lex_charset));
304 # if !defined _LIBICONV_VERSION
305 po_multiline_warning (NULL,
306 xasprintf (_("\
307 Installing GNU libiconv and then reinstalling GNU gettext\n\
308 would fix this problem.\n")));
309 # endif
311 po_multiline_warning (NULL, xasprintf (_("%s\n"), note));
313 #else
314 /* Test for a charset which has double-byte characters
315 ending in 0x5C. For these encodings, the string parser
316 is likely to be confused if it can't see the character
317 boundaries. */
318 po_lex_weird_cjk = po_is_charset_weird_cjk (po_lex_charset);
319 if (po_is_charset_weird (po_lex_charset) && !po_lex_weird_cjk)
321 const char *note =
322 _("Continuing anyway, expect parse errors.");
324 po_multiline_warning (xasprintf (_("%s: warning: "), filename),
325 xasprintf (_("\
326 Charset \"%s\" is not supported. %s relies on iconv().\n\
327 This version was built without iconv().\n"),
328 po_lex_charset,
329 basename (program_name)));
331 po_multiline_warning (NULL,
332 xasprintf (_("\
333 Installing GNU libiconv and then reinstalling GNU gettext\n\
334 would fix this problem.\n")));
336 po_multiline_warning (NULL, xasprintf (_("%s\n"), note));
338 #endif
341 freesa (charset);
343 else
345 /* Don't warn for POT files, because POT files usually contain
346 only ASCII msgids. */
347 size_t filenamelen = strlen (filename);
349 if (!(filenamelen >= 4
350 && memcmp (filename + filenamelen - 4, ".pot", 4) == 0))
351 po_multiline_warning (xasprintf (_("%s: warning: "), filename),
352 xasprintf (_("\
353 Charset missing in header.\n\
354 Message conversion to user's charset will not work.\n")));
358 void
359 po_lex_charset_close ()
361 po_lex_charset = NULL;
362 #if HAVE_ICONV
363 if (po_lex_iconv != (iconv_t)(-1))
365 iconv_close (po_lex_iconv);
366 po_lex_iconv = (iconv_t)(-1);
368 #endif
369 po_lex_weird_cjk = false;