Fix error creation and warning
[claws.git] / src / common / codeconv.c
blob2337b4e12dc7ba194b3239d7b77f6709f68394b8
1 /*
2 * Claws Mail -- a GTK based, lightweight, and fast e-mail client
3 * Copyright (C) 1999-2012 Hiroyuki Yamamoto and the Claws Mail team
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 3 of the License, or
8 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
20 #ifdef HAVE_CONFIG_H
21 # include "config.h"
22 #include "claws-features.h"
23 #endif
25 #include "defs.h"
27 #include <glib.h>
28 #include <glib/gi18n.h>
29 #include <string.h>
30 #include <ctype.h>
31 #include <stdlib.h>
32 #include <errno.h>
34 #if HAVE_LOCALE_H
35 # include <locale.h>
36 #endif
38 #include "codeconv.h"
39 #include "unmime.h"
40 #include "quoted-printable.h"
41 #include "utils.h"
/* For unknown reasons the iconv.m4 macro undefines ICONV_CONST when no
   const qualifier is needed.  The iconv() call further down uses it
   inside a cast, so make sure it always expands (possibly to nothing). */
#if !defined(ICONV_CONST)
#  define ICONV_CONST
#endif
/* Character set currently selected while reading or writing an
   ISO-2022-JP (JIS) byte stream. */
typedef enum
{
	JIS_ASCII    = 0,	/* plain ASCII, selected by ESC ( B or ESC ( J */
	JIS_KANJI    = 1,	/* two-byte kanji, selected by ESC $ @ or ESC $ B */
	JIS_HWKANA   = 2,	/* half-width kana, selected by ESC ( I or SO */
	JIS_AUXKANJI = 3	/* auxiliary kanji, selected by ESC $ ( D */
} JISState;
/* Replacement written in place of bytes that cannot be converted
   (0x5f is '_' in ASCII).  Deliberately defined WITHOUT a trailing
   semicolon so the macro can be used inside any expression. */
#define SUBST_CHAR 0x5f
/* ASCII escape, introducer of ISO-2022 shift sequences. */
#define ESC '\033'
/*
 * Byte classification helpers for EUC-JP and Shift_JIS.  Every macro
 * masks its argument to the low eight bits first, so it gives the same
 * answer for signed and unsigned character values.  The argument is
 * evaluated more than once: do not pass expressions with side effects.
 */
#define CONV_BYTE(c)	((c) & 0xff)
#define CONV_BYTE_BETWEEN(c, lo, hi) \
	(CONV_BYTE(c) >= (lo) && CONV_BYTE(c) <= (hi))

/* EUC-JP: either byte of a two-byte kanji */
#define iseuckanji(c)	CONV_BYTE_BETWEEN(c, 0xa1, 0xfe)
/* EUC-JP: lead byte announcing a half-width kana */
#define iseuchwkana1(c)	(CONV_BYTE(c) == 0x8e)
/* EUC-JP: the half-width kana byte that follows 0x8e */
#define iseuchwkana2(c)	CONV_BYTE_BETWEEN(c, 0xa1, 0xdf)
/* EUC-JP: lead byte announcing an auxiliary kanji */
#define iseucaux(c)	(CONV_BYTE(c) == 0x8f)
/* Shift_JIS: first byte of a two-byte character */
#define issjiskanji1(c) \
	(CONV_BYTE_BETWEEN(c, 0x81, 0x9f) || CONV_BYTE_BETWEEN(c, 0xe0, 0xfc))
/* Shift_JIS: second byte of a two-byte character (0x7f is excluded) */
#define issjiskanji2(c) \
	(CONV_BYTE_BETWEEN(c, 0x40, 0x7e) || CONV_BYTE_BETWEEN(c, 0x80, 0xfc))
/* Shift_JIS: single-byte half-width kana */
#define issjishwkana(c)	CONV_BYTE_BETWEEN(c, 0xa1, 0xdf)
/*
 * ISO-2022-JP shift helpers.  Each macro expects two variables in the
 * calling scope: `state` (the current JISState) and `out` (the write
 * cursor).  When the requested character set is not already selected
 * it appends the matching escape sequence through `out` and updates
 * `state`; otherwise it does nothing.
 *
 * The bodies are wrapped in do { } while (0) so that `K_IN();` is a
 * single statement and can safely be used in an unbraced if/else.
 */

/* select two-byte kanji: ESC $ B */
#define K_IN()					\
	do {					\
		if (state != JIS_KANJI) {	\
			*out++ = ESC;		\
			*out++ = '$';		\
			*out++ = 'B';		\
			state = JIS_KANJI;	\
		}				\
	} while (0)

/* return to ASCII: ESC ( B */
#define K_OUT()					\
	do {					\
		if (state != JIS_ASCII) {	\
			*out++ = ESC;		\
			*out++ = '(';		\
			*out++ = 'B';		\
			state = JIS_ASCII;	\
		}				\
	} while (0)

/* select half-width kana: ESC ( I */
#define HW_IN()					\
	do {					\
		if (state != JIS_HWKANA) {	\
			*out++ = ESC;		\
			*out++ = '(';		\
			*out++ = 'I';		\
			state = JIS_HWKANA;	\
		}				\
	} while (0)

/* select auxiliary kanji: ESC $ ( D */
#define AUX_IN()				\
	do {					\
		if (state != JIS_AUXKANJI) {	\
			*out++ = ESC;		\
			*out++ = '$';		\
			*out++ = '(';		\
			*out++ = 'D';		\
			state = JIS_AUXKANJI;	\
		}				\
	} while (0)
110 static CodeConvFunc conv_get_code_conv_func (const gchar *src_charset_str,
111 const gchar *dest_charset_str);
113 static gchar *conv_iconv_strdup_with_cd (const gchar *inbuf,
114 iconv_t cd);
116 static gchar *conv_iconv_strdup (const gchar *inbuf,
117 const gchar *src_code,
118 const gchar *dest_code);
120 static CharSet conv_get_locale_charset (void);
121 static CharSet conv_get_outgoing_charset (void);
122 static CharSet conv_guess_ja_encoding(const gchar *str);
123 static gboolean conv_is_ja_locale (void);
125 static gint conv_jistoeuc(gchar *outbuf, gint outlen, const gchar *inbuf);
126 static gint conv_euctojis(gchar *outbuf, gint outlen, const gchar *inbuf);
127 static gint conv_sjistoeuc(gchar *outbuf, gint outlen, const gchar *inbuf);
129 static gint conv_jistoutf8(gchar *outbuf, gint outlen, const gchar *inbuf);
130 static gint conv_sjistoutf8(gchar *outbuf, gint outlen, const gchar *inbuf);
131 static gint conv_euctoutf8(gchar *outbuf, gint outlen, const gchar *inbuf);
132 static gint conv_anytoutf8(gchar *outbuf, gint outlen, const gchar *inbuf);
134 static gint conv_utf8toeuc(gchar *outbuf, gint outlen, const gchar *inbuf);
135 static gint conv_utf8tojis(gchar *outbuf, gint outlen, const gchar *inbuf);
137 static void conv_unreadable_8bit(gchar *str);
139 static gint conv_jistodisp(gchar *outbuf, gint outlen, const gchar *inbuf);
140 static gint conv_sjistodisp(gchar *outbuf, gint outlen, const gchar *inbuf);
141 static gint conv_euctodisp(gchar *outbuf, gint outlen, const gchar *inbuf);
143 static gint conv_anytodisp(gchar *outbuf, gint outlen, const gchar *inbuf);
144 static gint conv_ustodisp(gchar *outbuf, gint outlen, const gchar *inbuf);
145 static gint conv_noconv(gchar *outbuf, gint outlen, const gchar *inbuf);
147 static gboolean codeconv_strict_mode = FALSE;
148 static gboolean codeconv_allow_jisx0201_kana = FALSE;
149 static gboolean codeconv_broken_are_utf8 = FALSE;
151 void codeconv_set_strict(gboolean mode)
153 codeconv_strict_mode = mode;
156 void codeconv_set_allow_jisx0201_kana(gboolean allow)
158 codeconv_allow_jisx0201_kana = allow;
161 void codeconv_set_broken_are_utf8(gboolean are)
163 codeconv_broken_are_utf8 = are;
166 static gint conv_jistoeuc(gchar *outbuf, gint outlen, const gchar *inbuf)
168 const guchar *in = inbuf;
169 gchar *out = outbuf;
170 JISState state = JIS_ASCII;
172 cm_return_val_if_fail(outbuf != NULL, 0);
175 * Loop outputs up to 3 bytes in each pass (aux kanji) and we
176 * need 1 byte to terminate the output
178 while (*in != '\0' && (out - outbuf) < outlen - 4) {
179 if (*in == ESC) {
180 in++;
181 if (*in == '$') {
182 if (*(in + 1) == '@' || *(in + 1) == 'B') {
183 state = JIS_KANJI;
184 in += 2;
185 } else if (*(in + 1) == '(' &&
186 *(in + 2) == 'D') {
187 state = JIS_AUXKANJI;
188 in += 3;
189 } else {
190 /* unknown escape sequence */
191 state = JIS_ASCII;
193 } else if (*in == '(') {
194 if (*(in + 1) == 'B' || *(in + 1) == 'J') {
195 state = JIS_ASCII;
196 in += 2;
197 } else if (*(in + 1) == 'I') {
198 state = JIS_HWKANA;
199 in += 2;
200 } else {
201 /* unknown escape sequence */
202 state = JIS_ASCII;
204 } else {
205 /* unknown escape sequence */
206 state = JIS_ASCII;
208 } else if (*in == 0x0e) {
209 state = JIS_HWKANA;
210 in++;
211 } else if (*in == 0x0f) {
212 state = JIS_ASCII;
213 in++;
214 } else {
215 switch (state) {
216 case JIS_ASCII:
217 *out++ = *in++;
218 break;
219 case JIS_KANJI:
220 *out++ = *in++ | 0x80;
221 if (*in == '\0') break;
222 *out++ = *in++ | 0x80;
223 break;
224 case JIS_HWKANA:
225 *out++ = 0x8e;
226 *out++ = *in++ | 0x80;
227 break;
228 case JIS_AUXKANJI:
229 *out++ = 0x8f;
230 *out++ = *in++ | 0x80;
231 if (*in == '\0') break;
232 *out++ = *in++ | 0x80;
233 break;
238 *out = '\0';
239 return 0;
242 #define JIS_HWDAKUTEN 0x5e
243 #define JIS_HWHANDAKUTEN 0x5f
245 static gint conv_jis_hantozen(guchar *outbuf, guchar jis_code, guchar sound_sym)
247 static guint16 h2z_tbl[] = {
248 /* 0x20 - 0x2f */
249 0x0000, 0x2123, 0x2156, 0x2157, 0x2122, 0x2126, 0x2572, 0x2521,
250 0x2523, 0x2525, 0x2527, 0x2529, 0x2563, 0x2565, 0x2567, 0x2543,
251 /* 0x30 - 0x3f */
252 0x213c, 0x2522, 0x2524, 0x2526, 0x2528, 0x252a, 0x252b, 0x252d,
253 0x252f, 0x2531, 0x2533, 0x2535, 0x2537, 0x2539, 0x253b, 0x253d,
254 /* 0x40 - 0x4f */
255 0x253f, 0x2541, 0x2544, 0x2546, 0x2548, 0x254a, 0x254b, 0x254c,
256 0x254d, 0x254e, 0x254f, 0x2552, 0x2555, 0x2558, 0x255b, 0x255e,
257 /* 0x50 - 0x5f */
258 0x255f, 0x2560, 0x2561, 0x2562, 0x2564, 0x2566, 0x2568, 0x2569,
259 0x256a, 0x256b, 0x256c, 0x256d, 0x256f, 0x2573, 0x212b, 0x212c
262 static guint16 dakuten_tbl[] = {
263 /* 0x30 - 0x3f */
264 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x252c, 0x252e,
265 0x2530, 0x2532, 0x2534, 0x2536, 0x2538, 0x253a, 0x253c, 0x253e,
266 /* 0x40 - 0x4f */
267 0x2540, 0x2542, 0x2545, 0x2547, 0x2549, 0x0000, 0x0000, 0x0000,
268 0x0000, 0x0000, 0x2550, 0x2553, 0x2556, 0x2559, 0x255c, 0x0000
271 static guint16 handakuten_tbl[] = {
272 /* 0x4a - 0x4e */
273 0x2551, 0x2554, 0x2557, 0x255a, 0x255d
276 guint16 out_code;
278 cm_return_val_if_fail(outbuf != NULL, 0);
280 jis_code &= 0x7f;
281 sound_sym &= 0x7f;
283 if (jis_code < 0x21 || jis_code > 0x5f)
284 return 0;
286 if (sound_sym == JIS_HWDAKUTEN &&
287 jis_code >= 0x36 && jis_code <= 0x4e) {
288 out_code = dakuten_tbl[jis_code - 0x30];
289 if (out_code != 0) {
290 *outbuf = out_code >> 8;
291 *(outbuf + 1) = out_code & 0xff;
292 return 2;
296 if (sound_sym == JIS_HWHANDAKUTEN &&
297 jis_code >= 0x4a && jis_code <= 0x4e) {
298 out_code = handakuten_tbl[jis_code - 0x4a];
299 *outbuf = out_code >> 8;
300 *(outbuf + 1) = out_code & 0xff;
301 return 2;
304 out_code = h2z_tbl[jis_code - 0x20];
305 *outbuf = out_code >> 8;
306 *(outbuf + 1) = out_code & 0xff;
307 return 1;
310 static gint conv_euctojis(gchar *outbuf, gint outlen, const gchar *inbuf)
312 const guchar *in = inbuf;
313 gchar *out = outbuf;
314 JISState state = JIS_ASCII;
316 cm_return_val_if_fail(outbuf != NULL, 0);
319 * Loop outputs up to 6 bytes in each pass (aux shift + aux
320 * kanji) and we need up to 4 bytes to terminate the output
321 * (ASCII shift + null)
323 while (*in != '\0' && (out - outbuf) < outlen - 10) {
324 if (IS_ASCII(*in)) {
325 K_OUT();
326 *out++ = *in++;
327 } else if (iseuckanji(*in)) {
328 if (iseuckanji(*(in + 1))) {
329 K_IN();
330 *out++ = *in++ & 0x7f;
331 *out++ = *in++ & 0x7f;
332 } else {
333 K_OUT();
334 *out++ = SUBST_CHAR;
335 in++;
336 if (*in != '\0' && !IS_ASCII(*in)) {
337 *out++ = SUBST_CHAR;
338 in++;
341 } else if (iseuchwkana1(*in)) {
342 if (iseuchwkana2(*(in + 1))) {
343 if (codeconv_allow_jisx0201_kana) {
344 HW_IN();
345 in++;
346 *out++ = *in++ & 0x7f;
347 } else {
348 guchar jis_ch[2];
349 gint len;
351 if (iseuchwkana1(*(in + 2)) &&
352 iseuchwkana2(*(in + 3)))
353 len = conv_jis_hantozen
354 (jis_ch,
355 *(in + 1), *(in + 3));
356 else
357 len = conv_jis_hantozen
358 (jis_ch,
359 *(in + 1), '\0');
360 if (len == 0)
361 in += 2;
362 else {
363 K_IN();
364 in += len * 2;
365 *out++ = jis_ch[0];
366 *out++ = jis_ch[1];
369 } else {
370 K_OUT();
371 in++;
372 if (*in != '\0' && !IS_ASCII(*in)) {
373 *out++ = SUBST_CHAR;
374 in++;
377 } else if (iseucaux(*in)) {
378 in++;
379 if (iseuckanji(*in) && iseuckanji(*(in + 1))) {
380 AUX_IN();
381 *out++ = *in++ & 0x7f;
382 *out++ = *in++ & 0x7f;
383 } else {
384 K_OUT();
385 if (*in != '\0' && !IS_ASCII(*in)) {
386 *out++ = SUBST_CHAR;
387 in++;
388 if (*in != '\0' && !IS_ASCII(*in)) {
389 *out++ = SUBST_CHAR;
390 in++;
394 } else {
395 K_OUT();
396 *out++ = SUBST_CHAR;
397 in++;
401 K_OUT();
402 *out = '\0';
403 return 0;
406 static gint conv_sjistoeuc(gchar *outbuf, gint outlen, const gchar *inbuf)
408 const guchar *in = inbuf;
409 gchar *out = outbuf;
411 cm_return_val_if_fail(outbuf != NULL, 0);
414 * Loop outputs up to 2 bytes in each pass and we need 1 byte
415 * to terminate the output
417 while (*in != '\0' && (out - outbuf) < outlen - 3) {
418 if (IS_ASCII(*in)) {
419 *out++ = *in++;
420 } else if (issjiskanji1(*in)) {
421 if (issjiskanji2(*(in + 1))) {
422 guchar out1 = *in;
423 guchar out2 = *(in + 1);
424 guchar row;
426 row = out1 < 0xa0 ? 0x70 : 0xb0;
427 if (out2 < 0x9f) {
428 out1 = (out1 - row) * 2 - 1;
429 out2 -= out2 > 0x7f ? 0x20 : 0x1f;
430 } else {
431 out1 = (out1 - row) * 2;
432 out2 -= 0x7e;
435 *out++ = out1 | 0x80;
436 *out++ = out2 | 0x80;
437 in += 2;
438 } else {
439 *out++ = SUBST_CHAR;
440 in++;
441 if (*in != '\0' && !IS_ASCII(*in)) {
442 *out++ = SUBST_CHAR;
443 in++;
446 } else if (issjishwkana(*in)) {
447 *out++ = 0x8e;
448 *out++ = *in++;
449 } else {
450 *out++ = SUBST_CHAR;
451 in++;
455 *out = '\0';
456 return 0;
459 static gint conv_jistoutf8(gchar *outbuf, gint outlen, const gchar *inbuf)
461 gchar *eucstr;
463 cm_return_val_if_fail(inbuf != NULL, 0);
464 cm_return_val_if_fail(outbuf != NULL, 0);
466 Xalloca(eucstr, outlen, return -1);
468 if (conv_jistoeuc(eucstr, outlen, inbuf) <0)
469 return -1;
470 if (conv_euctoutf8(outbuf, outlen, eucstr) < 0)
471 return -1;
472 return 0;
475 static gint conv_sjistoutf8(gchar *outbuf, gint outlen, const gchar *inbuf)
477 gchar *tmpstr;
479 cm_return_val_if_fail(inbuf != NULL, 0);
480 cm_return_val_if_fail(outbuf != NULL, 0);
482 tmpstr = conv_iconv_strdup(inbuf, CS_SHIFT_JIS, CS_UTF_8);
483 if (tmpstr) {
484 strncpy2(outbuf, tmpstr, outlen);
485 g_free(tmpstr);
486 return 0;
487 } else {
488 strncpy2(outbuf, inbuf, outlen);
489 return -1;
493 static gint conv_euctoutf8(gchar *outbuf, gint outlen, const gchar *inbuf)
495 static iconv_t cd = (iconv_t)-1;
496 static gboolean iconv_ok = TRUE;
497 gchar *tmpstr;
499 cm_return_val_if_fail(inbuf != NULL, 0);
500 cm_return_val_if_fail(outbuf != NULL, 0);
502 if (cd == (iconv_t)-1) {
503 if (!iconv_ok) {
504 strncpy2(outbuf, inbuf, outlen);
505 return -1;
507 cd = iconv_open(CS_UTF_8, CS_EUC_JP_MS);
508 if (cd == (iconv_t)-1) {
509 cd = iconv_open(CS_UTF_8, CS_EUC_JP);
510 if (cd == (iconv_t)-1) {
511 g_warning("conv_euctoutf8(): %s",
512 g_strerror(errno));
513 iconv_ok = FALSE;
514 strncpy2(outbuf, inbuf, outlen);
515 return -1;
520 tmpstr = conv_iconv_strdup_with_cd(inbuf, cd);
521 if (tmpstr) {
522 strncpy2(outbuf, tmpstr, outlen);
523 g_free(tmpstr);
524 return 0;
525 } else {
526 strncpy2(outbuf, inbuf, outlen);
527 return -1;
531 static gint conv_anytoutf8(gchar *outbuf, gint outlen, const gchar *inbuf)
533 gint r = -1;
535 cm_return_val_if_fail(inbuf != NULL, 0);
536 cm_return_val_if_fail(outbuf != NULL, 0);
538 switch (conv_guess_ja_encoding(inbuf)) {
539 case C_ISO_2022_JP:
540 r = conv_jistoutf8(outbuf, outlen, inbuf);
541 break;
542 case C_SHIFT_JIS:
543 r = conv_sjistoutf8(outbuf, outlen, inbuf);
544 break;
545 case C_EUC_JP:
546 r = conv_euctoutf8(outbuf, outlen, inbuf);
547 break;
548 default:
549 r = 0;
550 strncpy2(outbuf, inbuf, outlen);
551 break;
554 return r;
557 static gint conv_utf8toeuc(gchar *outbuf, gint outlen, const gchar *inbuf)
559 static iconv_t cd = (iconv_t)-1;
560 static gboolean iconv_ok = TRUE;
561 gchar *tmpstr;
563 cm_return_val_if_fail(inbuf != NULL, 0);
564 cm_return_val_if_fail(outbuf != NULL, 0);
566 if (cd == (iconv_t)-1) {
567 if (!iconv_ok) {
568 strncpy2(outbuf, inbuf, outlen);
569 return -1;
571 cd = iconv_open(CS_EUC_JP_MS, CS_UTF_8);
572 if (cd == (iconv_t)-1) {
573 cd = iconv_open(CS_EUC_JP, CS_UTF_8);
574 if (cd == (iconv_t)-1) {
575 g_warning("conv_utf8toeuc(): %s",
576 g_strerror(errno));
577 iconv_ok = FALSE;
578 strncpy2(outbuf, inbuf, outlen);
579 return -1;
584 tmpstr = conv_iconv_strdup_with_cd(inbuf, cd);
585 if (tmpstr) {
586 strncpy2(outbuf, tmpstr, outlen);
587 g_free(tmpstr);
588 return 0;
589 } else {
590 strncpy2(outbuf, inbuf, outlen);
591 return -1;
595 static gint conv_utf8tojis(gchar *outbuf, gint outlen, const gchar *inbuf)
597 gchar *eucstr;
599 cm_return_val_if_fail(inbuf != NULL, 0);
600 cm_return_val_if_fail(outbuf != NULL, 0);
602 Xalloca(eucstr, outlen, return -1);
604 if (conv_utf8toeuc(eucstr, outlen, inbuf) < 0)
605 return -1;
606 if (conv_euctojis(outbuf, outlen, eucstr) < 0)
607 return -1;
609 return 0;
612 static void conv_unreadable_8bit(gchar *str)
614 register guchar *p = str;
616 while (*p != '\0') {
617 /* convert CR+LF -> LF */
618 if (*p == '\r' && *(p + 1) == '\n')
619 memmove(p, p + 1, strlen(p));
620 else if (!IS_ASCII(*p)) *p = SUBST_CHAR;
621 p++;
625 static CharSet conv_guess_ja_encoding(const gchar *str)
627 const guchar *p = str;
628 CharSet guessed = C_US_ASCII;
630 while (*p != '\0') {
631 if (*p == ESC && (*(p + 1) == '$' || *(p + 1) == '(')) {
632 if (guessed == C_US_ASCII)
633 return C_ISO_2022_JP;
634 p += 2;
635 } else if (IS_ASCII(*p)) {
636 p++;
637 } else if (iseuckanji(*p) && iseuckanji(*(p + 1))) {
638 if (*p >= 0xfd && *p <= 0xfe)
639 return C_EUC_JP;
640 else if (guessed == C_SHIFT_JIS) {
641 if ((issjiskanji1(*p) &&
642 issjiskanji2(*(p + 1))) ||
643 issjishwkana(*p))
644 guessed = C_SHIFT_JIS;
645 else
646 guessed = C_EUC_JP;
647 } else
648 guessed = C_EUC_JP;
649 p += 2;
650 } else if (issjiskanji1(*p) && issjiskanji2(*(p + 1))) {
651 if (iseuchwkana1(*p) && iseuchwkana2(*(p + 1)))
652 guessed = C_SHIFT_JIS;
653 else
654 return C_SHIFT_JIS;
655 p += 2;
656 } else if (issjishwkana(*p)) {
657 guessed = C_SHIFT_JIS;
658 p++;
659 } else {
660 p++;
664 return guessed;
667 static gint conv_jistodisp(gchar *outbuf, gint outlen, const gchar *inbuf)
669 cm_return_val_if_fail(inbuf != NULL, 0);
670 cm_return_val_if_fail(outbuf != NULL, 0);
672 return conv_jistoutf8(outbuf, outlen, inbuf);
675 static gint conv_sjistodisp(gchar *outbuf, gint outlen, const gchar *inbuf)
677 cm_return_val_if_fail(inbuf != NULL, 0);
678 cm_return_val_if_fail(outbuf != NULL, 0);
680 return conv_sjistoutf8(outbuf, outlen, inbuf);
683 static gint conv_euctodisp(gchar *outbuf, gint outlen, const gchar *inbuf)
685 cm_return_val_if_fail(inbuf != NULL, 0);
686 cm_return_val_if_fail(outbuf != NULL, 0);
688 return conv_euctoutf8(outbuf, outlen, inbuf);
691 void conv_utf8todisp(gchar *outbuf, gint outlen, const gchar *inbuf)
693 cm_return_if_fail(inbuf != NULL);
694 cm_return_if_fail(outbuf != NULL);
696 if (g_utf8_validate(inbuf, -1, NULL) == TRUE)
697 strncpy2(outbuf, inbuf, outlen);
698 else
699 conv_ustodisp(outbuf, outlen, inbuf);
702 static gint conv_anytodisp(gchar *outbuf, gint outlen, const gchar *inbuf)
704 gint r = 0;
706 cm_return_val_if_fail(inbuf != NULL, 0);
707 cm_return_val_if_fail(outbuf != NULL, 0);
709 if (conv_anytoutf8(outbuf, outlen, inbuf) < 0)
710 r = -1;
711 if (g_utf8_validate(outbuf, -1, NULL) != TRUE)
712 conv_unreadable_8bit(outbuf);
713 return r;
716 static gint conv_ustodisp(gchar *outbuf, gint outlen, const gchar *inbuf)
718 cm_return_val_if_fail(inbuf != NULL, 0);
719 cm_return_val_if_fail(outbuf != NULL, 0);
721 strncpy2(outbuf, inbuf, outlen);
722 conv_unreadable_8bit(outbuf);
724 return 0;
727 void conv_localetodisp(gchar *outbuf, gint outlen, const gchar *inbuf)
729 gchar *tmpstr;
731 cm_return_if_fail(inbuf != NULL);
732 cm_return_if_fail(outbuf != NULL);
734 codeconv_set_strict(TRUE);
735 tmpstr = conv_iconv_strdup(inbuf, conv_get_locale_charset_str(),
736 CS_INTERNAL);
737 codeconv_set_strict(FALSE);
738 if (tmpstr && g_utf8_validate(tmpstr, -1, NULL)) {
739 strncpy2(outbuf, tmpstr, outlen);
740 g_free(tmpstr);
741 return;
742 } else if (tmpstr && !g_utf8_validate(tmpstr, -1, NULL)) {
743 g_free(tmpstr);
744 codeconv_set_strict(TRUE);
745 tmpstr = conv_iconv_strdup(inbuf,
746 conv_get_locale_charset_str_no_utf8(),
747 CS_INTERNAL);
748 codeconv_set_strict(FALSE);
750 if (tmpstr && g_utf8_validate(tmpstr, -1, NULL)) {
751 strncpy2(outbuf, tmpstr, outlen);
752 g_free(tmpstr);
753 return;
754 } else {
755 g_free(tmpstr);
756 conv_utf8todisp(outbuf, outlen, inbuf);
760 static gint conv_noconv(gchar *outbuf, gint outlen, const gchar *inbuf)
762 cm_return_val_if_fail(inbuf != NULL, 0);
763 cm_return_val_if_fail(outbuf != NULL, 0);
765 strncpy2(outbuf, inbuf, outlen);
766 return 0;
769 static const gchar *
770 conv_get_fallback_for_private_encoding(const gchar *encoding)
772 if (encoding) {
773 if ((encoding[0] == 'X' || encoding[0] == 'x') &&
774 encoding[1] == '-') {
775 if (!g_ascii_strcasecmp(encoding, CS_X_MACCYR))
776 return CS_MACCYR;
777 if (!g_ascii_strcasecmp(encoding, CS_X_GBK))
778 return CS_GBK;
780 else if(!g_ascii_strcasecmp(encoding, CS_ISO_8859_8_I)) {
782 * ISO-8859-8-I is a variant which fully
783 * agrees with ISO-8859-8 on character
784 * codings, and differs only in directionality
785 * implications, which are ignored here
786 * anyway; and is not recognized by iconv
788 return CS_ISO_8859_8;
792 return encoding;
795 CodeConverter *conv_code_converter_new(const gchar *src_charset)
797 CodeConverter *conv;
799 src_charset = conv_get_fallback_for_private_encoding(src_charset);
801 conv = g_new0(CodeConverter, 1);
802 conv->code_conv_func = conv_get_code_conv_func(src_charset, NULL);
803 conv->charset_str = g_strdup(src_charset);
804 conv->charset = conv_get_charset_from_str(src_charset);
806 return conv;
809 void conv_code_converter_destroy(CodeConverter *conv)
811 g_free(conv->charset_str);
812 g_free(conv);
815 gint conv_convert(CodeConverter *conv, gchar *outbuf, gint outlen,
816 const gchar *inbuf)
818 cm_return_val_if_fail(inbuf != NULL, -1);
819 cm_return_val_if_fail(outbuf != NULL, -1);
821 if (conv->code_conv_func != conv_noconv)
822 return conv->code_conv_func(outbuf, outlen, inbuf);
823 else {
824 gchar *str;
826 str = conv_iconv_strdup(inbuf, conv->charset_str, NULL);
827 if (!str)
828 return -1;
829 else {
830 strncpy2(outbuf, str, outlen);
831 g_free(str);
835 return 0;
838 gchar *conv_codeset_strdup(const gchar *inbuf,
839 const gchar *src_code, const gchar *dest_code)
841 gchar *buf;
842 size_t len;
843 CodeConvFunc conv_func;
845 cm_return_val_if_fail(inbuf != NULL, NULL);
847 if (!g_strcmp0(src_code, dest_code)) {
848 CharSet dest_charset = conv_get_charset_from_str(dest_code);
849 if (codeconv_strict_mode && dest_charset == C_UTF_8) {
850 /* ensure valid UTF-8 if target is UTF-8 */
851 if (!g_utf8_validate(inbuf, -1, NULL)) {
852 return NULL;
855 /* otherwise, try for a lucky day */
856 return g_strdup(inbuf);
859 src_code = conv_get_fallback_for_private_encoding(src_code);
860 conv_func = conv_get_code_conv_func(src_code, dest_code);
861 if (conv_func == conv_ustodisp
862 && codeconv_strict_mode
863 && !is_ascii_str(inbuf))
864 return NULL;
866 if (conv_func != conv_noconv) {
867 len = (strlen(inbuf) + 1) * 3;
868 buf = g_malloc(len);
870 if (conv_func(buf, len, inbuf) == 0 || !codeconv_strict_mode)
871 return g_realloc(buf, strlen(buf) + 1);
872 else {
873 g_free(buf);
874 return NULL;
878 return conv_iconv_strdup(inbuf, src_code, dest_code);
881 static CodeConvFunc conv_get_code_conv_func(const gchar *src_charset_str,
882 const gchar *dest_charset_str)
884 CodeConvFunc code_conv = conv_noconv;
885 CharSet src_charset;
886 CharSet dest_charset;
888 if (!src_charset_str)
889 src_charset = conv_get_locale_charset();
890 else
891 src_charset = conv_get_charset_from_str(src_charset_str);
893 /* auto detection mode */
894 if (!src_charset_str && !dest_charset_str) {
895 if (conv_is_ja_locale())
896 return conv_anytodisp;
897 else
898 return conv_noconv;
901 dest_charset = conv_get_charset_from_str(dest_charset_str);
903 if (dest_charset == C_US_ASCII)
904 return conv_ustodisp;
906 switch (src_charset) {
907 case C_US_ASCII:
908 case C_ISO_8859_1:
909 case C_ISO_8859_2:
910 case C_ISO_8859_3:
911 case C_ISO_8859_4:
912 case C_ISO_8859_5:
913 case C_ISO_8859_6:
914 case C_ISO_8859_7:
915 case C_ISO_8859_8:
916 case C_ISO_8859_9:
917 case C_ISO_8859_10:
918 case C_ISO_8859_11:
919 case C_ISO_8859_13:
920 case C_ISO_8859_14:
921 case C_ISO_8859_15:
922 break;
923 case C_ISO_2022_JP:
924 case C_ISO_2022_JP_2:
925 case C_ISO_2022_JP_3:
926 if (dest_charset == C_AUTO)
927 code_conv = conv_jistodisp;
928 else if (dest_charset == C_EUC_JP)
929 code_conv = conv_jistoeuc;
930 else if (dest_charset == C_UTF_8)
931 code_conv = conv_jistoutf8;
932 break;
933 case C_SHIFT_JIS:
934 if (dest_charset == C_AUTO)
935 code_conv = conv_sjistodisp;
936 else if (dest_charset == C_EUC_JP)
937 code_conv = conv_sjistoeuc;
938 else if (dest_charset == C_UTF_8)
939 code_conv = conv_sjistoutf8;
940 break;
941 case C_EUC_JP:
942 if (dest_charset == C_AUTO)
943 code_conv = conv_euctodisp;
944 else if (dest_charset == C_ISO_2022_JP ||
945 dest_charset == C_ISO_2022_JP_2 ||
946 dest_charset == C_ISO_2022_JP_3)
947 code_conv = conv_euctojis;
948 else if (dest_charset == C_UTF_8)
949 code_conv = conv_euctoutf8;
950 break;
951 case C_UTF_8:
952 if (dest_charset == C_EUC_JP)
953 code_conv = conv_utf8toeuc;
954 else if (dest_charset == C_ISO_2022_JP ||
955 dest_charset == C_ISO_2022_JP_2 ||
956 dest_charset == C_ISO_2022_JP_3)
957 code_conv = conv_utf8tojis;
958 break;
959 default:
960 break;
963 return code_conv;
966 static gchar *conv_iconv_strdup(const gchar *inbuf,
967 const gchar *src_code, const gchar *dest_code)
969 iconv_t cd;
970 gchar *outbuf;
972 cm_return_val_if_fail(inbuf != NULL, NULL);
974 if (!src_code && !dest_code &&
975 g_utf8_validate(inbuf, -1, NULL))
976 return g_strdup(inbuf);
978 if (!src_code)
979 src_code = conv_get_outgoing_charset_str();
980 if (!dest_code)
981 dest_code = CS_INTERNAL;
983 /* don't convert if src and dest codeset are identical */
984 if (!strcasecmp(src_code, dest_code))
985 return g_strdup(inbuf);
987 /* don't convert if dest codeset is US-ASCII */
988 if (!strcasecmp(src_code, CS_US_ASCII))
989 return g_strdup(inbuf);
991 /* don't convert if dest codeset is US-ASCII */
992 if (!strcasecmp(dest_code, CS_US_ASCII))
993 return g_strdup(inbuf);
995 cd = iconv_open(dest_code, src_code);
996 if (cd == (iconv_t)-1)
997 return NULL;
999 outbuf = conv_iconv_strdup_with_cd(inbuf, cd);
1001 iconv_close(cd);
1003 return outbuf;
1006 gchar *conv_iconv_strdup_with_cd(const gchar *inbuf, iconv_t cd)
1008 const gchar *inbuf_p;
1009 gchar *outbuf;
1010 gchar *outbuf_p;
1011 size_t in_size;
1012 size_t in_left;
1013 size_t out_size;
1014 size_t out_left;
1015 size_t n_conv;
1016 size_t len;
1018 cm_return_val_if_fail(inbuf != NULL, NULL);
1020 inbuf_p = inbuf;
1021 in_size = strlen(inbuf);
1022 in_left = in_size;
1023 out_size = (in_size + 1) * 2;
1024 outbuf = g_malloc(out_size);
1025 outbuf_p = outbuf;
1026 out_left = out_size;
1028 #define EXPAND_BUF() \
1030 len = outbuf_p - outbuf; \
1031 out_size *= 2; \
1032 outbuf = g_realloc(outbuf, out_size); \
1033 outbuf_p = outbuf + len; \
1034 out_left = out_size - len; \
1037 while ((n_conv = iconv(cd, (ICONV_CONST gchar **)&inbuf_p, &in_left,
1038 &outbuf_p, &out_left)) == (size_t)-1) {
1039 if (EILSEQ == errno) {
1040 if (codeconv_strict_mode) {
1041 g_free(outbuf);
1042 return NULL;
1044 //g_print("iconv(): at %d: %s\n", in_size - in_left, g_strerror(errno));
1045 inbuf_p++;
1046 in_left--;
1047 if (out_left == 0) {
1048 EXPAND_BUF();
1050 *outbuf_p++ = SUBST_CHAR;
1051 out_left--;
1052 } else if (EINVAL == errno) {
1053 break;
1054 } else if (E2BIG == errno) {
1055 EXPAND_BUF();
1056 } else {
1057 g_warning("conv_iconv_strdup(): %s",
1058 g_strerror(errno));
1059 break;
1063 while ((n_conv = iconv(cd, NULL, NULL, &outbuf_p, &out_left)) ==
1064 (size_t)-1) {
1065 if (E2BIG == errno) {
1066 EXPAND_BUF();
1067 } else {
1068 g_warning("conv_iconv_strdup(): %s",
1069 g_strerror(errno));
1070 break;
1074 #undef EXPAND_BUF
1076 len = outbuf_p - outbuf;
1077 outbuf = g_realloc(outbuf, len + 1);
1078 outbuf[len] = '\0';
1080 return outbuf;
1083 static const struct {
1084 CharSet charset;
1085 gchar *const name;
1086 } charsets[] = {
1087 {C_US_ASCII, CS_US_ASCII},
1088 {C_US_ASCII, CS_ANSI_X3_4_1968},
1089 {C_UTF_8, CS_UTF_8},
1090 {C_UTF_7, CS_UTF_7},
1091 {C_ISO_8859_1, CS_ISO_8859_1},
1092 {C_ISO_8859_2, CS_ISO_8859_2},
1093 {C_ISO_8859_3, CS_ISO_8859_3},
1094 {C_ISO_8859_4, CS_ISO_8859_4},
1095 {C_ISO_8859_5, CS_ISO_8859_5},
1096 {C_ISO_8859_6, CS_ISO_8859_6},
1097 {C_ISO_8859_7, CS_ISO_8859_7},
1098 {C_ISO_8859_8, CS_ISO_8859_8},
1099 {C_ISO_8859_9, CS_ISO_8859_9},
1100 {C_ISO_8859_10, CS_ISO_8859_10},
1101 {C_ISO_8859_11, CS_ISO_8859_11},
1102 {C_ISO_8859_13, CS_ISO_8859_13},
1103 {C_ISO_8859_14, CS_ISO_8859_14},
1104 {C_ISO_8859_15, CS_ISO_8859_15},
1105 {C_BALTIC, CS_BALTIC},
1106 {C_CP1250, CS_CP1250},
1107 {C_CP1251, CS_CP1251},
1108 {C_CP1252, CS_CP1252},
1109 {C_CP1253, CS_CP1253},
1110 {C_CP1254, CS_CP1254},
1111 {C_CP1255, CS_CP1255},
1112 {C_CP1256, CS_CP1256},
1113 {C_CP1257, CS_CP1257},
1114 {C_CP1258, CS_CP1258},
1115 {C_WINDOWS_1250, CS_WINDOWS_1250},
1116 {C_WINDOWS_1251, CS_WINDOWS_1251},
1117 {C_WINDOWS_1252, CS_WINDOWS_1252},
1118 {C_WINDOWS_1253, CS_WINDOWS_1253},
1119 {C_WINDOWS_1254, CS_WINDOWS_1254},
1120 {C_WINDOWS_1255, CS_WINDOWS_1255},
1121 {C_WINDOWS_1256, CS_WINDOWS_1256},
1122 {C_WINDOWS_1257, CS_WINDOWS_1257},
1123 {C_WINDOWS_1258, CS_WINDOWS_1258},
1124 {C_KOI8_R, CS_KOI8_R},
1125 {C_MACCYR, CS_MACCYR},
1126 {C_KOI8_T, CS_KOI8_T},
1127 {C_KOI8_U, CS_KOI8_U},
1128 {C_ISO_2022_JP, CS_ISO_2022_JP},
1129 {C_ISO_2022_JP_2, CS_ISO_2022_JP_2},
1130 {C_ISO_2022_JP_3, CS_ISO_2022_JP_3},
1131 {C_EUC_JP, CS_EUC_JP},
1132 {C_EUC_JP, CS_EUCJP},
1133 {C_EUC_JP_MS, CS_EUC_JP_MS},
1134 {C_SHIFT_JIS, CS_SHIFT_JIS},
1135 {C_SHIFT_JIS, CS_SHIFT__JIS},
1136 {C_SHIFT_JIS, CS_SJIS},
1137 {C_ISO_2022_KR, CS_ISO_2022_KR},
1138 {C_EUC_KR, CS_EUC_KR},
1139 {C_ISO_2022_CN, CS_ISO_2022_CN},
1140 {C_EUC_CN, CS_EUC_CN},
1141 {C_GB18030, CS_GB18030},
1142 {C_GB2312, CS_GB2312},
1143 {C_GBK, CS_GBK},
1144 {C_EUC_TW, CS_EUC_TW},
1145 {C_BIG5, CS_BIG5},
1146 {C_BIG5_HKSCS, CS_BIG5_HKSCS},
1147 {C_TIS_620, CS_TIS_620},
1148 {C_WINDOWS_874, CS_WINDOWS_874},
1149 {C_GEORGIAN_PS, CS_GEORGIAN_PS},
1150 {C_TCVN5712_1, CS_TCVN5712_1},
1153 static const struct {
1154 gchar *const locale;
1155 CharSet charset;
1156 CharSet out_charset;
1157 } locale_table[] = {
1158 {"ja_JP.eucJP" , C_EUC_JP , C_ISO_2022_JP},
1159 {"ja_JP.EUC-JP" , C_EUC_JP , C_ISO_2022_JP},
1160 {"ja_JP.EUC" , C_EUC_JP , C_ISO_2022_JP},
1161 {"ja_JP.ujis" , C_EUC_JP , C_ISO_2022_JP},
1162 {"ja_JP.SJIS" , C_SHIFT_JIS , C_ISO_2022_JP},
1163 {"ja_JP.JIS" , C_ISO_2022_JP , C_ISO_2022_JP},
1164 #ifdef G_OS_WIN32
1165 {"ja_JP" , C_SHIFT_JIS , C_ISO_2022_JP},
1166 #else
1167 {"ja_JP" , C_EUC_JP , C_ISO_2022_JP},
1168 #endif
1169 {"ko_KR.EUC-KR" , C_EUC_KR , C_EUC_KR},
1170 {"ko_KR" , C_EUC_KR , C_EUC_KR},
1171 {"zh_CN.GB18030" , C_GB18030 , C_GB18030},
1172 {"zh_CN.GB2312" , C_GB2312 , C_GB2312},
1173 {"zh_CN.GBK" , C_GBK , C_GBK},
1174 {"zh_CN" , C_GB18030 , C_GB18030},
1175 {"zh_HK" , C_BIG5_HKSCS , C_BIG5_HKSCS},
1176 {"zh_TW.eucTW" , C_EUC_TW , C_BIG5},
1177 {"zh_TW.EUC-TW" , C_EUC_TW , C_BIG5},
1178 {"zh_TW.Big5" , C_BIG5 , C_BIG5},
1179 {"zh_TW" , C_BIG5 , C_BIG5},
1181 {"ru_RU.KOI8-R" , C_KOI8_R , C_KOI8_R},
1182 {"ru_RU.KOI8R" , C_KOI8_R , C_KOI8_R},
1183 {"ru_RU.CP1251" , C_WINDOWS_1251, C_KOI8_R},
1184 #ifdef G_OS_WIN32
1185 {"ru_RU" , C_WINDOWS_1251, C_KOI8_R},
1186 #else
1187 {"ru_RU" , C_ISO_8859_5 , C_KOI8_R},
1188 #endif
1189 {"tg_TJ" , C_KOI8_T , C_KOI8_T},
1190 {"ru_UA" , C_KOI8_U , C_KOI8_U},
1191 {"uk_UA.CP1251" , C_WINDOWS_1251, C_KOI8_U},
1192 {"uk_UA" , C_KOI8_U , C_KOI8_U},
1194 {"be_BY" , C_WINDOWS_1251, C_WINDOWS_1251},
1195 {"bg_BG" , C_WINDOWS_1251, C_WINDOWS_1251},
1197 {"yi_US" , C_WINDOWS_1255, C_WINDOWS_1255},
1199 {"af_ZA" , C_ISO_8859_1 , C_ISO_8859_1},
1200 {"br_FR" , C_ISO_8859_1 , C_ISO_8859_1},
1201 {"ca_ES" , C_ISO_8859_1 , C_ISO_8859_1},
1202 {"da_DK" , C_ISO_8859_1 , C_ISO_8859_1},
1203 {"de_AT" , C_ISO_8859_1 , C_ISO_8859_1},
1204 {"de_BE" , C_ISO_8859_1 , C_ISO_8859_1},
1205 {"de_CH" , C_ISO_8859_1 , C_ISO_8859_1},
1206 {"de_DE" , C_ISO_8859_1 , C_ISO_8859_1},
1207 {"de_LU" , C_ISO_8859_1 , C_ISO_8859_1},
1208 {"en_AU" , C_ISO_8859_1 , C_ISO_8859_1},
1209 {"en_BW" , C_ISO_8859_1 , C_ISO_8859_1},
1210 {"en_CA" , C_ISO_8859_1 , C_ISO_8859_1},
1211 {"en_DK" , C_ISO_8859_1 , C_ISO_8859_1},
1212 {"en_GB" , C_ISO_8859_1 , C_ISO_8859_1},
1213 {"en_HK" , C_ISO_8859_1 , C_ISO_8859_1},
1214 {"en_IE" , C_ISO_8859_1 , C_ISO_8859_1},
1215 {"en_NZ" , C_ISO_8859_1 , C_ISO_8859_1},
1216 {"en_PH" , C_ISO_8859_1 , C_ISO_8859_1},
1217 {"en_SG" , C_ISO_8859_1 , C_ISO_8859_1},
1218 {"en_US" , C_ISO_8859_1 , C_ISO_8859_1},
1219 {"en_ZA" , C_ISO_8859_1 , C_ISO_8859_1},
1220 {"en_ZW" , C_ISO_8859_1 , C_ISO_8859_1},
1221 {"es_AR" , C_ISO_8859_1 , C_ISO_8859_1},
1222 {"es_BO" , C_ISO_8859_1 , C_ISO_8859_1},
1223 {"es_CL" , C_ISO_8859_1 , C_ISO_8859_1},
1224 {"es_CO" , C_ISO_8859_1 , C_ISO_8859_1},
1225 {"es_CR" , C_ISO_8859_1 , C_ISO_8859_1},
1226 {"es_DO" , C_ISO_8859_1 , C_ISO_8859_1},
1227 {"es_EC" , C_ISO_8859_1 , C_ISO_8859_1},
1228 {"es_ES" , C_ISO_8859_1 , C_ISO_8859_1},
1229 {"es_GT" , C_ISO_8859_1 , C_ISO_8859_1},
1230 {"es_HN" , C_ISO_8859_1 , C_ISO_8859_1},
1231 {"es_MX" , C_ISO_8859_1 , C_ISO_8859_1},
1232 {"es_NI" , C_ISO_8859_1 , C_ISO_8859_1},
1233 {"es_PA" , C_ISO_8859_1 , C_ISO_8859_1},
1234 {"es_PE" , C_ISO_8859_1 , C_ISO_8859_1},
1235 {"es_PR" , C_ISO_8859_1 , C_ISO_8859_1},
1236 {"es_PY" , C_ISO_8859_1 , C_ISO_8859_1},
1237 {"es_SV" , C_ISO_8859_1 , C_ISO_8859_1},
1238 {"es_US" , C_ISO_8859_1 , C_ISO_8859_1},
1239 {"es_UY" , C_ISO_8859_1 , C_ISO_8859_1},
1240 {"es_VE" , C_ISO_8859_1 , C_ISO_8859_1},
1241 {"et_EE" , C_ISO_8859_1 , C_ISO_8859_1},
1242 {"eu_ES" , C_ISO_8859_1 , C_ISO_8859_1},
1243 {"fi_FI" , C_ISO_8859_1 , C_ISO_8859_1},
1244 {"fo_FO" , C_ISO_8859_1 , C_ISO_8859_1},
1245 {"fr_BE" , C_ISO_8859_1 , C_ISO_8859_1},
1246 {"fr_CA" , C_ISO_8859_1 , C_ISO_8859_1},
1247 {"fr_CH" , C_ISO_8859_1 , C_ISO_8859_1},
1248 {"fr_FR" , C_ISO_8859_1 , C_ISO_8859_1},
1249 {"fr_LU" , C_ISO_8859_1 , C_ISO_8859_1},
1250 {"ga_IE" , C_ISO_8859_1 , C_ISO_8859_1},
1251 {"gl_ES" , C_ISO_8859_1 , C_ISO_8859_1},
1252 {"gv_GB" , C_ISO_8859_1 , C_ISO_8859_1},
1253 {"id_ID" , C_ISO_8859_1 , C_ISO_8859_1},
1254 {"is_IS" , C_ISO_8859_1 , C_ISO_8859_1},
1255 {"it_CH" , C_ISO_8859_1 , C_ISO_8859_1},
1256 {"it_IT" , C_ISO_8859_1 , C_ISO_8859_1},
1257 {"kl_GL" , C_ISO_8859_1 , C_ISO_8859_1},
1258 {"kw_GB" , C_ISO_8859_1 , C_ISO_8859_1},
1259 {"ms_MY" , C_ISO_8859_1 , C_ISO_8859_1},
1260 {"nl_BE" , C_ISO_8859_1 , C_ISO_8859_1},
1261 {"nl_NL" , C_ISO_8859_1 , C_ISO_8859_1},
1262 {"nb_NO" , C_ISO_8859_1 , C_ISO_8859_1},
1263 {"nn_NO" , C_ISO_8859_1 , C_ISO_8859_1},
1264 {"no_NO" , C_ISO_8859_1 , C_ISO_8859_1},
1265 {"oc_FR" , C_ISO_8859_1 , C_ISO_8859_1},
1266 {"pt_BR" , C_ISO_8859_1 , C_ISO_8859_1},
1267 {"pt_PT" , C_ISO_8859_1 , C_ISO_8859_1},
1268 {"sq_AL" , C_ISO_8859_1 , C_ISO_8859_1},
1269 {"sv_FI" , C_ISO_8859_1 , C_ISO_8859_1},
1270 {"sv_SE" , C_ISO_8859_1 , C_ISO_8859_1},
1271 {"tl_PH" , C_ISO_8859_1 , C_ISO_8859_1},
1272 {"uz_UZ" , C_ISO_8859_1 , C_ISO_8859_1},
1273 {"wa_BE" , C_ISO_8859_1 , C_ISO_8859_1},
1275 {"bs_BA" , C_ISO_8859_2 , C_ISO_8859_2},
1276 {"cs_CZ" , C_ISO_8859_2 , C_ISO_8859_2},
1277 {"hr_HR" , C_ISO_8859_2 , C_ISO_8859_2},
1278 {"hu_HU" , C_ISO_8859_2 , C_ISO_8859_2},
1279 {"pl_PL" , C_ISO_8859_2 , C_ISO_8859_2},
1280 {"ro_RO" , C_ISO_8859_2 , C_ISO_8859_2},
1281 {"sk_SK" , C_ISO_8859_2 , C_ISO_8859_2},
1282 {"sl_SI" , C_ISO_8859_2 , C_ISO_8859_2},
1284 {"sr_YU@cyrillic" , C_ISO_8859_5 , C_ISO_8859_5},
1285 {"sr_YU" , C_ISO_8859_2 , C_ISO_8859_2},
1287 {"mt_MT" , C_ISO_8859_3 , C_ISO_8859_3},
1289 {"lt_LT.iso88594" , C_ISO_8859_4 , C_ISO_8859_4},
1290 {"lt_LT.ISO8859-4" , C_ISO_8859_4 , C_ISO_8859_4},
1291 {"lt_LT.ISO_8859-4" , C_ISO_8859_4 , C_ISO_8859_4},
1292 {"lt_LT" , C_ISO_8859_13 , C_ISO_8859_13},
1294 {"mk_MK" , C_ISO_8859_5 , C_ISO_8859_5},
1296 {"ar_AE" , C_ISO_8859_6 , C_ISO_8859_6},
1297 {"ar_BH" , C_ISO_8859_6 , C_ISO_8859_6},
1298 {"ar_DZ" , C_ISO_8859_6 , C_ISO_8859_6},
1299 {"ar_EG" , C_ISO_8859_6 , C_ISO_8859_6},
1300 {"ar_IQ" , C_ISO_8859_6 , C_ISO_8859_6},
1301 {"ar_JO" , C_ISO_8859_6 , C_ISO_8859_6},
1302 {"ar_KW" , C_ISO_8859_6 , C_ISO_8859_6},
1303 {"ar_LB" , C_ISO_8859_6 , C_ISO_8859_6},
1304 {"ar_LY" , C_ISO_8859_6 , C_ISO_8859_6},
1305 {"ar_MA" , C_ISO_8859_6 , C_ISO_8859_6},
1306 {"ar_OM" , C_ISO_8859_6 , C_ISO_8859_6},
1307 {"ar_QA" , C_ISO_8859_6 , C_ISO_8859_6},
1308 {"ar_SA" , C_ISO_8859_6 , C_ISO_8859_6},
1309 {"ar_SD" , C_ISO_8859_6 , C_ISO_8859_6},
1310 {"ar_SY" , C_ISO_8859_6 , C_ISO_8859_6},
1311 {"ar_TN" , C_ISO_8859_6 , C_ISO_8859_6},
1312 {"ar_YE" , C_ISO_8859_6 , C_ISO_8859_6},
1314 {"el_GR" , C_ISO_8859_7 , C_ISO_8859_7},
1315 {"he_IL" , C_ISO_8859_8 , C_ISO_8859_8},
1316 {"iw_IL" , C_ISO_8859_8 , C_ISO_8859_8},
1317 {"tr_TR" , C_ISO_8859_9 , C_ISO_8859_9},
1319 {"lv_LV" , C_ISO_8859_13 , C_ISO_8859_13},
1320 {"mi_NZ" , C_ISO_8859_13 , C_ISO_8859_13},
1322 {"cy_GB" , C_ISO_8859_14 , C_ISO_8859_14},
1324 {"ar_IN" , C_UTF_8 , C_UTF_8},
1325 {"en_IN" , C_UTF_8 , C_UTF_8},
1326 {"se_NO" , C_UTF_8 , C_UTF_8},
1327 {"ta_IN" , C_UTF_8 , C_UTF_8},
1328 {"te_IN" , C_UTF_8 , C_UTF_8},
1329 {"ur_PK" , C_UTF_8 , C_UTF_8},
1331 {"th_TH" , C_TIS_620 , C_TIS_620},
1332 /* {"th_TH" , C_WINDOWS_874}, */
1333 /* {"th_TH" , C_ISO_8859_11}, */
1335 {"ka_GE" , C_GEORGIAN_PS , C_GEORGIAN_PS},
1336 {"vi_VN.TCVN" , C_TCVN5712_1 , C_TCVN5712_1},
1338 {"C" , C_US_ASCII , C_US_ASCII},
1339 {"POSIX" , C_US_ASCII , C_US_ASCII},
1340 {"ANSI_X3.4-1968" , C_US_ASCII , C_US_ASCII},
1343 static GHashTable *conv_get_charset_to_str_table(void)
1345 static GHashTable *table;
1346 gint i;
1348 if (table)
1349 return table;
1351 table = g_hash_table_new(NULL, g_direct_equal);
1353 for (i = 0; i < sizeof(charsets) / sizeof(charsets[0]); i++) {
1354 if (g_hash_table_lookup(table, GUINT_TO_POINTER(charsets[i].charset))
1355 == NULL) {
1356 g_hash_table_insert
1357 (table, GUINT_TO_POINTER(charsets[i].charset),
1358 charsets[i].name);
1362 return table;
1365 static GHashTable *conv_get_charset_from_str_table(void)
1367 static GHashTable *table;
1368 gint i;
1370 if (table)
1371 return table;
1373 table = g_hash_table_new(str_case_hash, str_case_equal);
1375 for (i = 0; i < sizeof(charsets) / sizeof(charsets[0]); i++) {
1376 g_hash_table_insert(table, charsets[i].name,
1377 GUINT_TO_POINTER(charsets[i].charset));
1380 return table;
1383 const gchar *conv_get_charset_str(CharSet charset)
1385 GHashTable *table;
1387 table = conv_get_charset_to_str_table();
1388 return g_hash_table_lookup(table, GUINT_TO_POINTER(charset));
1391 CharSet conv_get_charset_from_str(const gchar *charset)
1393 GHashTable *table;
1395 if (!charset) return C_AUTO;
1397 table = conv_get_charset_from_str_table();
1398 return GPOINTER_TO_UINT(g_hash_table_lookup(table, charset));
1401 static CharSet conv_get_locale_charset(void)
1403 static CharSet cur_charset = C_UNINITIALIZED;
1404 const gchar *cur_locale;
1405 const gchar *p;
1406 gint i;
1408 if (cur_charset != C_UNINITIALIZED)
1409 return cur_charset;
1411 cur_locale = conv_get_current_locale();
1412 if (!cur_locale) {
1413 cur_charset = C_US_ASCII;
1414 return cur_charset;
1417 if (strcasestr(cur_locale, "UTF-8") ||
1418 strcasestr(cur_locale, "utf8")) {
1419 cur_charset = C_UTF_8;
1420 return cur_charset;
1423 if ((p = strcasestr(cur_locale, "@euro")) && p[5] == '\0') {
1424 cur_charset = C_ISO_8859_15;
1425 return cur_charset;
1428 for (i = 0; i < sizeof(locale_table) / sizeof(locale_table[0]); i++) {
1429 const gchar *p;
1431 /* "ja_JP.EUC" matches with "ja_JP.eucJP", "ja_JP.EUC" and
1432 "ja_JP". "ja_JP" matches with "ja_JP.xxxx" and "ja" */
1433 if (!g_ascii_strncasecmp(cur_locale, locale_table[i].locale,
1434 strlen(locale_table[i].locale))) {
1435 cur_charset = locale_table[i].charset;
1436 return cur_charset;
1437 } else if ((p = strchr(locale_table[i].locale, '_')) &&
1438 !strchr(p + 1, '.')) {
1439 if (strlen(cur_locale) == 2 &&
1440 !g_ascii_strncasecmp(cur_locale, locale_table[i].locale, 2)) {
1441 cur_charset = locale_table[i].charset;
1442 return cur_charset;
1447 cur_charset = C_AUTO;
1448 return cur_charset;
1451 static CharSet conv_get_locale_charset_no_utf8(void)
1453 static CharSet cur_charset = C_UNINITIALIZED;
1454 const gchar *cur_locale;
1455 const gchar *p;
1456 gint i;
1458 if (codeconv_broken_are_utf8) {
1459 cur_charset = C_UTF_8;
1460 return cur_charset;
1463 cur_locale = conv_get_current_locale();
1464 if (!cur_locale) {
1465 cur_charset = C_US_ASCII;
1466 return cur_charset;
1469 if (strcasestr(cur_locale, "UTF-8") ||
1470 strcasestr(cur_locale, "utf8")) {
1471 cur_charset = C_UTF_8;
1472 return cur_charset;
1475 if ((p = strcasestr(cur_locale, "@euro")) && p[5] == '\0') {
1476 cur_charset = C_ISO_8859_15;
1477 return cur_charset;
1480 for (i = 0; i < sizeof(locale_table) / sizeof(locale_table[0]); i++) {
1481 const gchar *p;
1483 /* "ja_JP.EUC" matches with "ja_JP.eucJP", "ja_JP.EUC" and
1484 "ja_JP". "ja_JP" matches with "ja_JP.xxxx" and "ja" */
1485 if (!g_ascii_strncasecmp(cur_locale, locale_table[i].locale,
1486 strlen(locale_table[i].locale))) {
1487 cur_charset = locale_table[i].charset;
1488 return cur_charset;
1489 } else if ((p = strchr(locale_table[i].locale, '_')) &&
1490 !strchr(p + 1, '.')) {
1491 if (strlen(cur_locale) == 2 &&
1492 !g_ascii_strncasecmp(cur_locale, locale_table[i].locale, 2)) {
1493 cur_charset = locale_table[i].charset;
1494 return cur_charset;
1499 cur_charset = C_AUTO;
1500 return cur_charset;
1503 const gchar *conv_get_locale_charset_str(void)
1505 static const gchar *codeset = NULL;
1507 if (!codeset)
1508 codeset = conv_get_charset_str(conv_get_locale_charset());
1510 return codeset ? codeset : CS_INTERNAL;
1513 const gchar *conv_get_locale_charset_str_no_utf8(void)
1515 static const gchar *codeset = NULL;
1517 if (!codeset)
1518 codeset = conv_get_charset_str(conv_get_locale_charset_no_utf8());
1520 return codeset ? codeset : CS_INTERNAL;
1523 static CharSet conv_get_outgoing_charset(void)
1525 static CharSet out_charset = C_UNINITIALIZED;
1526 const gchar *cur_locale;
1527 const gchar *p;
1528 gint i;
1530 if (out_charset != C_UNINITIALIZED)
1531 return out_charset;
1533 cur_locale = conv_get_current_locale();
1534 if (!cur_locale) {
1535 out_charset = C_AUTO;
1536 return out_charset;
1539 if (strcasestr(cur_locale, "UTF-8") ||
1540 strcasestr(cur_locale, "utf8")) {
1541 out_charset = C_UTF_8;
1542 return out_charset;
1545 if ((p = strcasestr(cur_locale, "@euro")) && p[5] == '\0') {
1546 out_charset = C_ISO_8859_15;
1547 return out_charset;
1550 for (i = 0; i < sizeof(locale_table) / sizeof(locale_table[0]); i++) {
1551 const gchar *p;
1553 if (!g_ascii_strncasecmp(cur_locale, locale_table[i].locale,
1554 strlen(locale_table[i].locale))) {
1555 out_charset = locale_table[i].out_charset;
1556 break;
1557 } else if ((p = strchr(locale_table[i].locale, '_')) &&
1558 !strchr(p + 1, '.')) {
1559 if (strlen(cur_locale) == 2 &&
1560 !g_ascii_strncasecmp(cur_locale, locale_table[i].locale, 2)) {
1561 out_charset = locale_table[i].out_charset;
1562 break;
1567 return out_charset;
1570 const gchar *conv_get_outgoing_charset_str(void)
1572 CharSet out_charset;
1573 const gchar *str;
1575 out_charset = conv_get_outgoing_charset();
1576 str = conv_get_charset_str(out_charset);
1578 return str ? str : CS_UTF_8;
1581 const gchar *conv_get_current_locale(void)
1583 const gchar *cur_locale;
1585 #ifdef G_OS_WIN32
1586 cur_locale = g_win32_getlocale();
1587 #else
1588 cur_locale = g_getenv("LC_ALL");
1589 if (!cur_locale) cur_locale = g_getenv("LC_CTYPE");
1590 if (!cur_locale) cur_locale = g_getenv("LANG");
1591 if (!cur_locale) cur_locale = setlocale(LC_CTYPE, NULL);
1592 #endif /* G_OS_WIN32 */
1594 debug_print("current locale: %s\n",
1595 cur_locale ? cur_locale : "(none)");
1597 return cur_locale;
1600 static gboolean conv_is_ja_locale(void)
1602 static gint is_ja_locale = -1;
1603 const gchar *cur_locale;
1605 if (is_ja_locale != -1)
1606 return is_ja_locale != 0;
1608 is_ja_locale = 0;
1609 cur_locale = conv_get_current_locale();
1610 if (cur_locale) {
1611 if (g_ascii_strncasecmp(cur_locale, "ja", 2) == 0)
1612 is_ja_locale = 1;
1615 return is_ja_locale != 0;
1618 gchar *conv_unmime_header(const gchar *str, const gchar *default_encoding,
1619 gboolean addr_field)
1621 gchar buf[BUFFSIZE];
1623 cm_return_val_if_fail(str != NULL, NULL);
1625 if (is_ascii_str(str))
1626 return unmime_header(str, addr_field);
1628 if (default_encoding) {
1629 gchar *utf8_buf;
1631 utf8_buf = conv_codeset_strdup
1632 (str, default_encoding, CS_INTERNAL);
1633 if (utf8_buf) {
1634 gchar *decoded_str;
1636 decoded_str = unmime_header(utf8_buf, addr_field);
1637 g_free(utf8_buf);
1638 return decoded_str;
1642 if (conv_is_ja_locale())
1643 conv_anytodisp(buf, sizeof(buf), str);
1644 else
1645 conv_localetodisp(buf, sizeof(buf), str);
1647 return unmime_header(buf, addr_field);
/* column budget of one header line before a fold is attempted */
#define MAX_LINELEN		76
/* upper bound on the length of a line when a word cannot be folded */
#define MAX_HARD_LINELEN	996
/* delimiters that open and close a MIME encoded-word */
#define MIMESEP_BEGIN		"=?"
#define MIMESEP_END		"?="
/*
 * Helper for conv_encode_header_full(): fold the output line if needed.
 *
 * Expands in place and uses the enclosing function's locals `dest`,
 * `destp`, `len`, `srcp` and `left`.
 *
 *  - If fewer than MAX_LINELEN + 2 bytes of `dest` remain, the output is
 *    terminated and the enclosing function RETURNS.
 *  - Otherwise, when `cond` holds and input remains, a fold ("\n ") is
 *    written and `left` is reset to MAX_LINELEN - 1, provided either
 *    something was already written and the line is not still empty, or
 *    nothing was written and fewer than 7 columns are left.
 *  - One whitespace character next to the fold is dropped: the one just
 *    written, or (plain text only) the next input one.
 *
 * This is deliberately a bare { } block rather than do { } while (0):
 * it is also invoked without a trailing semicolon.
 */
#define LBREAK_IF_REQUIRED(cond, is_plain_text)				\
{									\
	/* not enough room left in dest: terminate and bail out */	\
	if (len - (destp - (guchar *)dest) < MAX_LINELEN + 2) {		\
		*destp = '\0';						\
		return;							\
	}								\
									\
	if ((cond) && *srcp) {						\
		if (destp > (guchar *)dest && left < MAX_LINELEN - 1) {	\
			/* swallow one blank around the fold */		\
			if (isspace(destp[-1]))				\
				destp--;				\
			else if ((is_plain_text) && isspace(*srcp))	\
				srcp++;					\
			if (*srcp) {					\
				*destp++ = '\n';			\
				*destp++ = ' ';				\
				left = MAX_LINELEN - 1;			\
			}						\
		} else if (destp == (guchar *)dest && left < 7) {	\
			/* nothing written yet, header name ate the line */ \
			if ((is_plain_text) && isspace(*srcp))		\
				srcp++;					\
			if (*srcp) {					\
				*destp++ = '\n';			\
				*destp++ = ' ';				\
				left = MAX_LINELEN - 1;			\
			}						\
		}							\
	}								\
}
/* length of the Base64 encoding of `len` bytes: 4 characters per
   started group of 3 input bytes */
#define B64LEN(len)	(4 * ((len) / 3) + (((len) % 3) != 0 ? 4 : 0))
1687 void conv_encode_header_full(gchar *dest, gint len, const gchar *src,
1688 gint header_len, gboolean addr_field,
1689 const gchar *out_encoding_)
1691 const gchar *cur_encoding;
1692 const gchar *out_encoding;
1693 gint mimestr_len;
1694 gchar *mimesep_enc;
1695 gint left;
1696 const guchar *srcp = src;
1697 guchar *destp = dest;
1698 gboolean use_base64;
1700 cm_return_if_fail(g_utf8_validate(src, -1, NULL) == TRUE);
1701 cm_return_if_fail(destp != NULL);
1703 if (MB_CUR_MAX > 1) {
1704 use_base64 = TRUE;
1705 mimesep_enc = "?B?";
1706 } else {
1707 use_base64 = FALSE;
1708 mimesep_enc = "?Q?";
1711 cur_encoding = CS_INTERNAL;
1713 if (out_encoding_)
1714 out_encoding = out_encoding_;
1715 else
1716 out_encoding = conv_get_outgoing_charset_str();
1718 if (!strcmp(out_encoding, CS_US_ASCII))
1719 out_encoding = CS_ISO_8859_1;
1721 mimestr_len = strlen(MIMESEP_BEGIN) + strlen(out_encoding) +
1722 strlen(mimesep_enc) + strlen(MIMESEP_END);
1724 left = MAX_LINELEN - header_len;
1726 while (*srcp) {
1727 LBREAK_IF_REQUIRED(left <= 0, TRUE);
1729 while (isspace(*srcp)) {
1730 *destp++ = *srcp++;
1731 left--;
1732 LBREAK_IF_REQUIRED(left <= 0, TRUE);
1735 /* output as it is if the next word is ASCII string */
1736 if (!is_next_nonascii(srcp)) {
1737 gint word_len;
1739 word_len = get_next_word_len(srcp);
1740 LBREAK_IF_REQUIRED(left < word_len, TRUE);
1741 while (word_len > 0) {
1742 LBREAK_IF_REQUIRED(left + (MAX_HARD_LINELEN - MAX_LINELEN) <= 0, TRUE)
1743 *destp++ = *srcp++;
1744 left--;
1745 word_len--;
1748 continue;
1751 /* don't include parentheses and quotes in encoded strings */
1752 if (addr_field && (*srcp == '(' || *srcp == ')' || *srcp == '"')) {
1753 LBREAK_IF_REQUIRED(left < 2, FALSE);
1754 *destp++ = *srcp++;
1755 left--;
1758 while (1) {
1759 gint mb_len = 0;
1760 gint cur_len = 0;
1761 gchar *part_str;
1762 gchar *out_str;
1763 gchar *enc_str;
1764 const guchar *p = srcp;
1765 gint out_str_len;
1766 gint out_enc_str_len;
1767 gint mime_block_len;
1768 gboolean cont = FALSE;
1770 while (*p != '\0') {
1771 if (isspace(*p) && !is_next_nonascii(p + 1))
1772 break;
1773 /* don't include parentheses in encoded
1774 strings */
1775 if (addr_field && (*p == '(' || *p == ')' || *p == '"'))
1776 break;
1778 mb_len = g_utf8_skip[*p];
1780 Xstrndup_a(part_str, srcp, cur_len + mb_len, );
1781 out_str = conv_codeset_strdup
1782 (part_str, cur_encoding, out_encoding);
1783 if (!out_str) {
1784 if (codeconv_strict_mode) {
1785 *dest = '\0';
1786 return;
1787 } else {
1788 g_warning("conv_encode_header_full(): code conversion failed");
1789 conv_unreadable_8bit(part_str);
1790 out_str = g_strdup(part_str);
1793 out_str_len = strlen(out_str);
1795 if (use_base64)
1796 out_enc_str_len = B64LEN(out_str_len);
1797 else
1798 out_enc_str_len =
1799 qp_get_q_encoding_len(out_str);
1801 g_free(out_str);
1803 if (mimestr_len + out_enc_str_len <= left) {
1804 cur_len += mb_len;
1805 p += mb_len;
1806 } else if (cur_len == 0) {
1807 left = 0;
1808 LBREAK_IF_REQUIRED(1, FALSE);
1809 continue;
1810 } else {
1811 cont = TRUE;
1812 break;
1816 if (cur_len > 0) {
1817 Xstrndup_a(part_str, srcp, cur_len, );
1818 out_str = conv_codeset_strdup
1819 (part_str, cur_encoding, out_encoding);
1820 if (!out_str) {
1821 g_warning("conv_encode_header_full(): code conversion failed");
1822 conv_unreadable_8bit(part_str);
1823 out_str = g_strdup(part_str);
1825 out_str_len = strlen(out_str);
1827 if (use_base64)
1828 out_enc_str_len = B64LEN(out_str_len);
1829 else
1830 out_enc_str_len =
1831 qp_get_q_encoding_len(out_str);
1833 if (use_base64)
1834 enc_str = g_base64_encode(out_str, out_str_len);
1835 else {
1836 Xalloca(enc_str, out_enc_str_len + 1, );
1837 qp_q_encode(enc_str, out_str);
1840 g_free(out_str);
1842 /* output MIME-encoded string block */
1843 mime_block_len = mimestr_len + strlen(enc_str);
1844 g_snprintf(destp, mime_block_len + 1,
1845 MIMESEP_BEGIN "%s%s%s" MIMESEP_END,
1846 out_encoding, mimesep_enc, enc_str);
1848 if (use_base64)
1849 g_free(enc_str);
1851 destp += mime_block_len;
1852 srcp += cur_len;
1854 left -= mime_block_len;
1857 LBREAK_IF_REQUIRED(cont, FALSE);
1859 if (cur_len == 0)
1860 break;
1864 *destp = '\0';
1867 void conv_encode_header(gchar *dest, gint len, const gchar *src,
1868 gint header_len, gboolean addr_field)
1870 conv_encode_header_full(dest,len,src,header_len,addr_field,NULL);
1873 #undef LBREAK_IF_REQUIRED
1874 #undef B64LEN
1876 gchar *conv_filename_from_utf8(const gchar *utf8_file)
1878 gchar *fs_file;
1879 GError *error = NULL;
1881 cm_return_val_if_fail(utf8_file != NULL, NULL);
1883 fs_file = g_filename_from_utf8(utf8_file, -1, NULL, NULL, &error);
1884 if (error) {
1885 debug_print("failed to convert encoding of file name: %s\n",
1886 error->message);
1887 g_error_free(error);
1889 if (!fs_file)
1890 fs_file = g_strdup(utf8_file);
1892 return fs_file;
1895 gchar *conv_filename_to_utf8(const gchar *fs_file)
1897 gchar *utf8_file = NULL;
1898 GError *error = NULL;
1900 cm_return_val_if_fail(fs_file != NULL, NULL);
1902 utf8_file = g_filename_to_utf8(fs_file, -1, NULL, NULL, &error);
1903 if (error) {
1904 g_warning("failed to convert encoding of file name: %s",
1905 error->message);
1906 g_error_free(error);
1909 if (!utf8_file || !g_utf8_validate(utf8_file, -1, NULL)) {
1910 g_free(utf8_file);
1911 utf8_file = g_strdup(fs_file);
1912 conv_unreadable_8bit(utf8_file);
1915 return utf8_file;