1 /* Charset handling while reading PO files.
2 Copyright (C) 2001-2005 Free Software Foundation, Inc.
3 Written by Bruno Haible <haible@clisp.cons.org>, 2001.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software Foundation,
17 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
26 #include "po-charset.h"
37 #include "c-strcase.h"
/* Shorthand for looking up the translation of a user-visible message.  */
#define _(str) gettext (str)

/* Number of elements of the array A.
   A must be a real array (not a pointer it has decayed to).  The argument
   is parenthesized in both uses so that any array-valued expression can be
   passed without precedence surprises.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
/* Storage for the canonical ASCII name.  The same array is the first entry
   of standard_charsets in po_charset_canonicalize, which returns that entry
   for all three ASCII aliases, so such a result is pointer-identical to
   po_charset_ascii.  */
44 static const char ascii
[] = "ASCII";
46 /* The canonicalized encoding name for ASCII. */
47 const char *po_charset_ascii
= ascii
;
/* Storage for the canonical UTF-8 name, exported below through
   po_charset_utf8.
   NOTE(review): presumably this array is also an entry of standard_charsets,
   like ascii; that part of the table is not visible here, so confirm before
   relying on pointer comparison with po_charset_utf8.  */
49 static const char utf8
[] = "UTF-8";
51 /* The canonicalized encoding name for UTF-8. */
52 const char *po_charset_utf8
= utf8
;
/* Look CHARSET up in the table of standard charset names and return the
   canonical spelling of the entry it matches.  The comparison is done with
   c_strcasecmp, i.e. it ignores case.  The returned pointer refers to the
   static table, not to CHARSET.
   NOTE(review): the table rows after ISO-8859-15, the declaration of the
   index and the value returned when nothing matches are not visible in this
   excerpt; po_lex_charset_set tests the result against NULL.  */
54 /* Canonicalize an encoding name. */
56 po_charset_canonicalize (const char *charset
)
58 /* The list of charsets supported by glibc's iconv() and by the portable
59 iconv() across platforms. Taken from intl/config.charset. */
60 static const char *standard_charsets
[] =
62 ascii
, "ANSI_X3.4-1968", "US-ASCII", /* i = 0..2 */
63 "ISO-8859-1", "ISO_8859-1", /* i = 3, 4 */
64 "ISO-8859-2", "ISO_8859-2",
65 "ISO-8859-3", "ISO_8859-3",
66 "ISO-8859-4", "ISO_8859-4",
67 "ISO-8859-5", "ISO_8859-5",
68 "ISO-8859-6", "ISO_8859-6",
69 "ISO-8859-7", "ISO_8859-7",
70 "ISO-8859-8", "ISO_8859-8",
71 "ISO-8859-9", "ISO_8859-9",
72 "ISO-8859-13", "ISO_8859-13",
73 "ISO-8859-14", "ISO_8859-14",
74 "ISO-8859-15", "ISO_8859-15", /* i = 25, 26 */
/* Map every alias to one representative spelling: indices 0..2 all yield
   entry 0, indices 3..26 come in pairs and yield the first member of their
   pair (clearing the low bit of i - 3), and entries from index 27 on are
   returned unchanged.  */
109 for (i
= 0; i
< SIZEOF (standard_charsets
); i
++)
110 if (c_strcasecmp (charset
, standard_charsets
[i
]) == 0)
111 return standard_charsets
[i
< 3 ? 0 : i
< 27 ? ((i
- 3) & ~1) + 3 : i
];
/* Test for ASCII compatibility.
   CANON_CHARSET must be a non-NULL encoding name; it is compared exactly
   (case-sensitively) with strcmp, so it is expected to be in canonical
   spelling.
   Returns false for the few encodings listed below, true for every other
   name.  */
bool
po_charset_ascii_compatible (const char *canon_charset)
{
  /* There are only a few exceptions to ASCII compatibility.  */
  if (strcmp (canon_charset, "SHIFT_JIS") == 0
      || strcmp (canon_charset, "JOHAB") == 0
      || strcmp (canon_charset, "VISCII") == 0)
    return false;
  else
    return true;
}
/* CANON_CHARSET is compared exactly (strcmp, case-sensitive) against each
   entry of the static table weird_charsets.
   NOTE(review): the table contents, the declaration of the index and the
   return statements are missing from this excerpt; presumably the function
   returns true on the first match and false otherwise - confirm against
   the complete file.  */
128 /* Test for a weird encoding, i.e. an encoding which has double-byte
129 characters ending in 0x5C. */
130 bool po_is_charset_weird (const char *canon_charset
)
132 static const char *weird_charsets
[] =
143 for (i
= 0; i
< SIZEOF (weird_charsets
); i
++)
144 if (strcmp (canon_charset
, weird_charsets
[i
]) == 0)
/* Test for a weird CJK encoding, i.e. a weird encoding with CJK structure.
   An encoding has CJK structure if every valid character stream is composed
   of single bytes in the range 0x{00..7F} and of byte pairs in the range
   0x{80..FF}{30..FF}.

   CANON_CHARSET must be a non-NULL encoding name; it is compared exactly
   (case-sensitively) with strcmp against the table below, so it is expected
   to be in canonical spelling.
   Returns true if CANON_CHARSET is one of the table entries, false
   otherwise.  */
bool
po_is_charset_weird_cjk (const char *canon_charset)
{
  static const char *weird_cjk_charsets[] =
  {                     /* single bytes   double bytes       */
    "BIG5",             /* 0x{00..7F},    0x{A1..F9}{40..FE} */
    "BIG5-HKSCS",       /* 0x{00..7F},    0x{88..FE}{40..FE} */
    "GBK",              /* 0x{00..7F},    0x{81..FE}{40..FE} */
    "GB18030",          /* 0x{00..7F},    0x{81..FE}{30..FE} */
    "SHIFT_JIS",        /* 0x{00..7F},    0x{81..F9}{40..FC} */
    "JOHAB"             /* 0x{00..7F},    0x{84..F9}{31..FE} */
  };
  size_t i;

  /* The element count is computed in place so that the function does not
     depend on a helper macro defined elsewhere.  */
  for (i = 0; i < sizeof weird_cjk_charsets / sizeof weird_cjk_charsets[0]; i++)
    if (strcmp (canon_charset, weird_cjk_charsets[i]) == 0)
      return true;
  return false;
}
173 /* The PO file's encoding, as specified in the header entry. */
/* Assigned the canonicalized name by po_lex_charset_set; reset to NULL by
   po_lex_charset_init and po_lex_charset_close.  */
174 const char *po_lex_charset
;
177 /* Converter from the PO file's encoding to UTF-8. */
/* The value (iconv_t)(-1) is used throughout this file to mean that no
   converter is open.  */
178 iconv_t po_lex_iconv
;
180 /* If no converter is available, some information about the structure of the
181 PO file's encoding. */
/* Assigned from po_is_charset_weird_cjk (po_lex_charset) in
   po_lex_charset_set on the paths that end up without a converter; false
   after po_lex_charset_init and po_lex_charset_close.  */
182 bool po_lex_weird_cjk
;
/* Reset the lexer charset state: no charset (po_lex_charset = NULL), no
   converter (po_lex_iconv = (iconv_t)(-1)) and po_lex_weird_cjk = false.
   Unlike po_lex_charset_close, the code visible here does not call
   iconv_close on a previously opened converter.
   NOTE(review): the return type, the braces and any preprocessor
   conditionals around the iconv assignment are not visible in this
   excerpt.  */
185 po_lex_charset_init ()
187 po_lex_charset
= NULL
;
189 po_lex_iconv
= (iconv_t
)(-1);
191 po_lex_weird_cjk
= false;
/* Determine the charset of a PO file from its header entry and set up the
   lexer charset state accordingly.

   HEADER_ENTRY is the text of the header entry; it is searched for the
   substring charset= and the charset name runs from there up to the next
   space, tab or newline.
   FILENAME is used as the prefix of all warnings and to recognize POT files
   (names ending in .pot).

   - If the name is not known to po_charset_canonicalize, a warning that it
     is not a portable encoding name is issued, except for a POT file whose
     charset is still the CHARSET placeholder.
   - Otherwise the canonical name is stored in po_lex_charset and a
     previously opened po_lex_iconv is closed.  If the environment variable
     OLD_PO_FILE_INPUT is set and non-empty, no converter is used and
     po_lex_weird_cjk is false; else iconv_open is asked for a converter
     from that charset to UTF-8.  When no converter is available,
     po_lex_weird_cjk is set from po_is_charset_weird_cjk and warnings are
     issued through po_multiline_warning.
   - If HEADER_ENTRY contains no charset= at all, a warning about the
     missing charset is issued unless FILENAME ends in .pot.

   NOTE(review): several lines of this function (return type, local
   declarations, braces, preprocessor conditionals, and the end of some
   comments and string literals) are missing from this excerpt; the code
   below is left untouched.  */
195 po_lex_charset_set (const char *header_entry
, const char *filename
)
197 /* Verify the validity of CHARSET. It is necessary
198 1. for the correct treatment of multibyte characters containing
199 0x5C bytes in the PO lexer,
200 2. so that at run time, gettext() can call iconv() to convert
202 const char *charsetstr
= strstr (header_entry
, "charset=");
204 if (charsetstr
!= NULL
)
208 const char *canon_charset
;
210 charsetstr
+= strlen ("charset=");
211 len
= strcspn (charsetstr
, " \t\n");
212 charset
= (char *) xallocsa (len
+ 1);
213 memcpy (charset
, charsetstr
, len
);
216 canon_charset
= po_charset_canonicalize (charset
);
217 if (canon_charset
== NULL
)
219 /* Don't warn for POT files, because POT files usually contain
220 only ASCII msgids. */
221 size_t filenamelen
= strlen (filename
);
223 if (!(filenamelen
>= 4
224 && memcmp (filename
+ filenamelen
- 4, ".pot", 4) == 0
225 && strcmp (charset
, "CHARSET") == 0))
226 po_multiline_warning (xasprintf (_("%s: warning: "), filename
),
228 Charset \"%s\" is not a portable encoding name.\n\
229 Message conversion to user's charset might not work.\n"),
236 po_lex_charset
= canon_charset
;
238 if (po_lex_iconv
!= (iconv_t
)(-1))
239 iconv_close (po_lex_iconv
);
242 /* The old Solaris/openwin msgfmt and GNU msgfmt <= 0.10.35
243 don't know about multibyte encodings, and require a spurious
244 backslash after every multibyte character whose last byte is
245 0x5C. Some programs, like vim, distribute PO files in this
246 broken format. GNU msgfmt must continue to support this old
247 PO file format when the Makefile requests it. */
248 envval
= getenv ("OLD_PO_FILE_INPUT");
249 if (envval
!= NULL
&& *envval
!= '\0')
251 /* Assume the PO file is in old format, with extraneous
254 po_lex_iconv
= (iconv_t
)(-1);
256 po_lex_weird_cjk
= false;
260 /* Use iconv() to parse multibyte characters. */
262 /* Avoid glibc-2.1 bug with EUC-KR. */
263 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
264 if (strcmp (po_lex_charset
, "EUC-KR") == 0)
265 po_lex_iconv
= (iconv_t
)(-1);
268 /* Avoid Solaris 2.9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS,
270 # if defined __sun && !defined _LIBICONV_VERSION
271 if ( strcmp (po_lex_charset
, "GB2312") == 0
272 || strcmp (po_lex_charset
, "EUC-TW") == 0
273 || strcmp (po_lex_charset
, "BIG5") == 0
274 || strcmp (po_lex_charset
, "BIG5-HKSCS") == 0
275 || strcmp (po_lex_charset
, "GBK") == 0
276 || strcmp (po_lex_charset
, "GB18030") == 0)
277 po_lex_iconv
= (iconv_t
)(-1);
280 po_lex_iconv
= iconv_open ("UTF-8", po_lex_charset
);
281 if (po_lex_iconv
== (iconv_t
)(-1))
285 /* Test for a charset which has double-byte characters
286 ending in 0x5C. For these encodings, the string parser
287 is likely to be confused if it can't see the character
289 po_lex_weird_cjk
= po_is_charset_weird_cjk (po_lex_charset
);
290 if (po_is_charset_weird (po_lex_charset
)
291 && !po_lex_weird_cjk
)
292 note
= _("Continuing anyway, expect parse errors.");
294 note
= _("Continuing anyway.");
296 po_multiline_warning (xasprintf (_("%s: warning: "), filename
),
298 Charset \"%s\" is not supported. %s relies on iconv(),\n\
299 and iconv() does not support \"%s\".\n"),
301 basename (program_name
),
304 # if !defined _LIBICONV_VERSION
305 po_multiline_warning (NULL
,
307 Installing GNU libiconv and then reinstalling GNU gettext\n\
308 would fix this problem.\n")));
311 po_multiline_warning (NULL
, xasprintf (_("%s\n"), note
));
314 /* Test for a charset which has double-byte characters
315 ending in 0x5C. For these encodings, the string parser
316 is likely to be confused if it can't see the character
318 po_lex_weird_cjk
= po_is_charset_weird_cjk (po_lex_charset
);
319 if (po_is_charset_weird (po_lex_charset
) && !po_lex_weird_cjk
)
322 _("Continuing anyway, expect parse errors.");
324 po_multiline_warning (xasprintf (_("%s: warning: "), filename
),
326 Charset \"%s\" is not supported. %s relies on iconv().\n\
327 This version was built without iconv().\n"),
329 basename (program_name
)));
331 po_multiline_warning (NULL
,
333 Installing GNU libiconv and then reinstalling GNU gettext\n\
334 would fix this problem.\n")));
336 po_multiline_warning (NULL
, xasprintf (_("%s\n"), note
));
345 /* Don't warn for POT files, because POT files usually contain
346 only ASCII msgids. */
347 size_t filenamelen
= strlen (filename
);
349 if (!(filenamelen
>= 4
350 && memcmp (filename
+ filenamelen
- 4, ".pot", 4) == 0))
351 po_multiline_warning (xasprintf (_("%s: warning: "), filename
),
353 Charset missing in header.\n\
354 Message conversion to user's charset will not work.\n")));
/* Tear down the lexer charset state: forget po_lex_charset, close
   po_lex_iconv with iconv_close if a converter is open and then mark it as
   absent with (iconv_t)(-1), and clear po_lex_weird_cjk.
   NOTE(review): the return type, the braces and any preprocessor
   conditionals around the iconv handling are not visible in this
   excerpt.  */
359 po_lex_charset_close ()
361 po_lex_charset
= NULL
;
363 if (po_lex_iconv
!= (iconv_t
)(-1))
365 iconv_close (po_lex_iconv
);
366 po_lex_iconv
= (iconv_t
)(-1);
369 po_lex_weird_cjk
= false;