Replace FSF snail-mail address with URL.
[libiconv.git] / lib / loop_unicode.h
blobe857fa88d1ae03d9353d15f38f8e51d445089624
/*
 * Copyright (C) 1999-2003, 2005-2006, 2008 Free Software Foundation, Inc.
 * This file is part of the GNU LIBICONV Library.
 *
 * The GNU LIBICONV Library is free software; you can redistribute it
 * and/or modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * The GNU LIBICONV Library is distributed in the hope that it will be
 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with the GNU LIBICONV Library; see the file COPYING.LIB.
 * If not, see <http://www.gnu.org/licenses/>.
 */

/* This file defines the conversion loop via Unicode as a pivot encoding. */

22 /* Attempt to transliterate wc. Return code as in xxx_wctomb. */
23 static int unicode_transliterate (conv_t cd, ucs4_t wc,
24 unsigned char* outptr, size_t outleft)
26 if (cd->oflags & HAVE_HANGUL_JAMO) {
27 /* Decompose Hangul into Jamo. Use double-width Jamo (contained
28 in all Korean encodings and ISO-2022-JP-2), not half-width Jamo
29 (contained in Unicode only). */
30 ucs4_t buf[3];
31 int ret = johab_hangul_decompose(cd,buf,wc);
32 if (ret != RET_ILUNI) {
33 /* we know 1 <= ret <= 3 */
34 state_t backup_state = cd->ostate;
35 unsigned char* backup_outptr = outptr;
36 size_t backup_outleft = outleft;
37 int i, sub_outcount;
38 for (i = 0; i < ret; i++) {
39 if (outleft == 0) {
40 sub_outcount = RET_TOOSMALL;
41 goto johab_hangul_failed;
43 sub_outcount = cd->ofuncs.xxx_wctomb(cd,outptr,buf[i],outleft);
44 if (sub_outcount <= RET_ILUNI)
45 goto johab_hangul_failed;
46 if (!(sub_outcount <= outleft)) abort();
47 outptr += sub_outcount; outleft -= sub_outcount;
49 return outptr-backup_outptr;
50 johab_hangul_failed:
51 cd->ostate = backup_state;
52 outptr = backup_outptr;
53 outleft = backup_outleft;
54 if (sub_outcount != RET_ILUNI)
55 return RET_TOOSMALL;
59 /* Try to use a variant, but postfix it with
60 U+303E IDEOGRAPHIC VARIATION INDICATOR
61 (cf. Ken Lunde's "CJKV information processing", p. 188). */
62 int indx = -1;
63 if (wc == 0x3006)
64 indx = 0;
65 else if (wc == 0x30f6)
66 indx = 1;
67 else if (wc >= 0x4e00 && wc < 0xa000)
68 indx = cjk_variants_indx[wc-0x4e00];
69 if (indx >= 0) {
70 for (;; indx++) {
71 ucs4_t buf[2];
72 unsigned short variant = cjk_variants[indx];
73 unsigned short last = variant & 0x8000;
74 variant &= 0x7fff;
75 variant += 0x3000;
76 buf[0] = variant; buf[1] = 0x303e;
78 state_t backup_state = cd->ostate;
79 unsigned char* backup_outptr = outptr;
80 size_t backup_outleft = outleft;
81 int i, sub_outcount;
82 for (i = 0; i < 2; i++) {
83 if (outleft == 0) {
84 sub_outcount = RET_TOOSMALL;
85 goto variant_failed;
87 sub_outcount = cd->ofuncs.xxx_wctomb(cd,outptr,buf[i],outleft);
88 if (sub_outcount <= RET_ILUNI)
89 goto variant_failed;
90 if (!(sub_outcount <= outleft)) abort();
91 outptr += sub_outcount; outleft -= sub_outcount;
93 return outptr-backup_outptr;
94 variant_failed:
95 cd->ostate = backup_state;
96 outptr = backup_outptr;
97 outleft = backup_outleft;
98 if (sub_outcount != RET_ILUNI)
99 return RET_TOOSMALL;
101 if (last)
102 break;
106 if (wc >= 0x2018 && wc <= 0x201a) {
107 /* Special case for quotation marks 0x2018, 0x2019, 0x201a */
108 ucs4_t substitute =
109 (cd->oflags & HAVE_QUOTATION_MARKS
110 ? (wc == 0x201a ? 0x2018 : wc)
111 : (cd->oflags & HAVE_ACCENTS
112 ? (wc==0x2019 ? 0x00b4 : 0x0060) /* use accents */
113 : 0x0027 /* use apostrophe */
114 ) );
115 int outcount = cd->ofuncs.xxx_wctomb(cd,outptr,substitute,outleft);
116 if (outcount != RET_ILUNI)
117 return outcount;
120 /* Use the transliteration table. */
121 int indx = translit_index(wc);
122 if (indx >= 0) {
123 const unsigned int * cp = &translit_data[indx];
124 unsigned int num = *cp++;
125 state_t backup_state = cd->ostate;
126 unsigned char* backup_outptr = outptr;
127 size_t backup_outleft = outleft;
128 unsigned int i;
129 int sub_outcount;
130 for (i = 0; i < num; i++) {
131 if (outleft == 0) {
132 sub_outcount = RET_TOOSMALL;
133 goto translit_failed;
135 sub_outcount = cd->ofuncs.xxx_wctomb(cd,outptr,cp[i],outleft);
136 if (sub_outcount == RET_ILUNI)
137 /* Recursive transliteration. */
138 sub_outcount = unicode_transliterate(cd,cp[i],outptr,outleft);
139 if (sub_outcount <= RET_ILUNI)
140 goto translit_failed;
141 if (!(sub_outcount <= outleft)) abort();
142 outptr += sub_outcount; outleft -= sub_outcount;
144 return outptr-backup_outptr;
145 translit_failed:
146 cd->ostate = backup_state;
147 outptr = backup_outptr;
148 outleft = backup_outleft;
149 if (sub_outcount != RET_ILUNI)
150 return RET_TOOSMALL;
153 return RET_ILUNI;
#ifndef LIBICONV_PLUG

/* State shared between a conversion loop and uc_to_mb_write_replacement
   while a uc_to_mb fallback handler runs. */
struct uc_to_mb_fallback_locals {
  unsigned char* l_outbuf;   /* next free byte of the output buffer */
  size_t l_outbytesleft;     /* bytes still available at l_outbuf */
  int l_errno;               /* 0, or the error that stopped the output */
};

/* Callback handed to a uc_to_mb fallback handler: appends the buflen bytes
   at buf to the output buffer described by callback_arg, which must point
   to a struct uc_to_mb_fallback_locals.

   If the bytes do not fit, nothing is copied and l_errno is set to E2BIG.
   Once l_errno is non-zero, all further calls are no-ops.  */
static void uc_to_mb_write_replacement (const char *buf, size_t buflen,
                                        void* callback_arg)
{
  struct uc_to_mb_fallback_locals * plocals =
    (struct uc_to_mb_fallback_locals *) callback_arg;
  /* Do nothing if already encountered an error in a previous call. */
  if (plocals->l_errno == 0) {
    /* Attempt to copy the passed buffer to the output buffer. */
    if (plocals->l_outbytesleft < buflen)
      plocals->l_errno = E2BIG;
    else if (buflen > 0) {
      /* An empty replacement is skipped entirely: memcpy must not be
         called with a null pointer, even for a length of zero. */
      memcpy(plocals->l_outbuf, buf, buflen);
      plocals->l_outbuf += buflen;
      plocals->l_outbytesleft -= buflen;
    }
  }
}

182 struct mb_to_uc_fallback_locals {
183 conv_t l_cd;
184 unsigned char* l_outbuf;
185 size_t l_outbytesleft;
186 int l_errno;
189 static void mb_to_uc_write_replacement (const unsigned int *buf, size_t buflen,
190 void* callback_arg)
192 struct mb_to_uc_fallback_locals * plocals =
193 (struct mb_to_uc_fallback_locals *) callback_arg;
194 /* Do nothing if already encountered an error in a previous call. */
195 if (plocals->l_errno == 0) {
196 /* Attempt to convert the passed buffer to the target encoding. */
197 conv_t cd = plocals->l_cd;
198 unsigned char* outptr = plocals->l_outbuf;
199 size_t outleft = plocals->l_outbytesleft;
200 for (; buflen > 0; buf++, buflen--) {
201 ucs4_t wc = *buf;
202 int outcount;
203 if (outleft == 0) {
204 plocals->l_errno = E2BIG;
205 break;
207 outcount = cd->ofuncs.xxx_wctomb(cd,outptr,wc,outleft);
208 if (outcount != RET_ILUNI)
209 goto outcount_ok;
210 /* Handle Unicode tag characters (range U+E0000..U+E007F). */
211 if ((wc >> 7) == (0xe0000 >> 7))
212 goto outcount_zero;
213 /* Try transliteration. */
214 if (cd->transliterate) {
215 outcount = unicode_transliterate(cd,wc,outptr,outleft);
216 if (outcount != RET_ILUNI)
217 goto outcount_ok;
219 if (cd->discard_ilseq) {
220 outcount = 0;
221 goto outcount_ok;
223 #ifndef LIBICONV_PLUG
224 else if (cd->fallbacks.uc_to_mb_fallback != NULL) {
225 struct uc_to_mb_fallback_locals locals;
226 locals.l_outbuf = outptr;
227 locals.l_outbytesleft = outleft;
228 locals.l_errno = 0;
229 cd->fallbacks.uc_to_mb_fallback(wc,
230 uc_to_mb_write_replacement,
231 &locals,
232 cd->fallbacks.data);
233 if (locals.l_errno != 0) {
234 plocals->l_errno = locals.l_errno;
235 break;
237 outptr = locals.l_outbuf;
238 outleft = locals.l_outbytesleft;
239 outcount = 0;
240 goto outcount_ok;
242 #endif
243 outcount = cd->ofuncs.xxx_wctomb(cd,outptr,0xFFFD,outleft);
244 if (outcount != RET_ILUNI)
245 goto outcount_ok;
246 plocals->l_errno = EILSEQ;
247 break;
248 outcount_ok:
249 if (outcount < 0) {
250 plocals->l_errno = E2BIG;
251 break;
253 #ifndef LIBICONV_PLUG
254 if (cd->hooks.uc_hook)
255 (*cd->hooks.uc_hook)(wc, cd->hooks.data);
256 #endif
257 if (!(outcount <= outleft)) abort();
258 outptr += outcount; outleft -= outcount;
259 outcount_zero: ;
261 plocals->l_outbuf = outptr;
262 plocals->l_outbytesleft = outleft;
#endif /* !LIBICONV_PLUG */

268 static size_t unicode_loop_convert (iconv_t icd,
269 const char* * inbuf, size_t *inbytesleft,
270 char* * outbuf, size_t *outbytesleft)
272 conv_t cd = (conv_t) icd;
273 size_t result = 0;
274 const unsigned char* inptr = (const unsigned char*) *inbuf;
275 size_t inleft = *inbytesleft;
276 unsigned char* outptr = (unsigned char*) *outbuf;
277 size_t outleft = *outbytesleft;
278 while (inleft > 0) {
279 state_t last_istate = cd->istate;
280 ucs4_t wc;
281 int incount;
282 int outcount;
283 incount = cd->ifuncs.xxx_mbtowc(cd,&wc,inptr,inleft);
284 if (incount < 0) {
285 if ((unsigned int)(-1-incount) % 2 == (unsigned int)(-1-RET_ILSEQ) % 2) {
286 /* Case 1: invalid input, possibly after a shift sequence */
287 incount = DECODE_SHIFT_ILSEQ(incount);
288 if (cd->discard_ilseq) {
289 switch (cd->iindex) {
290 case ei_ucs4: case ei_ucs4be: case ei_ucs4le:
291 case ei_utf32: case ei_utf32be: case ei_utf32le:
292 case ei_ucs4internal: case ei_ucs4swapped:
293 incount += 4; break;
294 case ei_ucs2: case ei_ucs2be: case ei_ucs2le:
295 case ei_utf16: case ei_utf16be: case ei_utf16le:
296 case ei_ucs2internal: case ei_ucs2swapped:
297 incount += 2; break;
298 default:
299 incount += 1; break;
301 goto outcount_zero;
303 #ifndef LIBICONV_PLUG
304 else if (cd->fallbacks.mb_to_uc_fallback != NULL) {
305 unsigned int incount2;
306 struct mb_to_uc_fallback_locals locals;
307 switch (cd->iindex) {
308 case ei_ucs4: case ei_ucs4be: case ei_ucs4le:
309 case ei_utf32: case ei_utf32be: case ei_utf32le:
310 case ei_ucs4internal: case ei_ucs4swapped:
311 incount2 = 4; break;
312 case ei_ucs2: case ei_ucs2be: case ei_ucs2le:
313 case ei_utf16: case ei_utf16be: case ei_utf16le:
314 case ei_ucs2internal: case ei_ucs2swapped:
315 incount2 = 2; break;
316 default:
317 incount2 = 1; break;
319 locals.l_cd = cd;
320 locals.l_outbuf = outptr;
321 locals.l_outbytesleft = outleft;
322 locals.l_errno = 0;
323 cd->fallbacks.mb_to_uc_fallback((const char*)inptr+incount, incount2,
324 mb_to_uc_write_replacement,
325 &locals,
326 cd->fallbacks.data);
327 if (locals.l_errno != 0) {
328 inptr += incount; inleft -= incount;
329 errno = locals.l_errno;
330 result = -1;
331 break;
333 incount += incount2;
334 outptr = locals.l_outbuf;
335 outleft = locals.l_outbytesleft;
336 result += 1;
337 goto outcount_zero;
339 #endif
340 inptr += incount; inleft -= incount;
341 errno = EILSEQ;
342 result = -1;
343 break;
345 if (incount == RET_TOOFEW(0)) {
346 /* Case 2: not enough bytes available to detect anything */
347 errno = EINVAL;
348 result = -1;
349 break;
351 /* Case 3: k bytes read, but only a shift sequence */
352 incount = DECODE_TOOFEW(incount);
353 } else {
354 /* Case 4: k bytes read, making up a wide character */
355 if (outleft == 0) {
356 cd->istate = last_istate;
357 errno = E2BIG;
358 result = -1;
359 break;
361 outcount = cd->ofuncs.xxx_wctomb(cd,outptr,wc,outleft);
362 if (outcount != RET_ILUNI)
363 goto outcount_ok;
364 /* Handle Unicode tag characters (range U+E0000..U+E007F). */
365 if ((wc >> 7) == (0xe0000 >> 7))
366 goto outcount_zero;
367 /* Try transliteration. */
368 result++;
369 if (cd->transliterate) {
370 outcount = unicode_transliterate(cd,wc,outptr,outleft);
371 if (outcount != RET_ILUNI)
372 goto outcount_ok;
374 if (cd->discard_ilseq) {
375 outcount = 0;
376 goto outcount_ok;
378 #ifndef LIBICONV_PLUG
379 else if (cd->fallbacks.uc_to_mb_fallback != NULL) {
380 struct uc_to_mb_fallback_locals locals;
381 locals.l_outbuf = outptr;
382 locals.l_outbytesleft = outleft;
383 locals.l_errno = 0;
384 cd->fallbacks.uc_to_mb_fallback(wc,
385 uc_to_mb_write_replacement,
386 &locals,
387 cd->fallbacks.data);
388 if (locals.l_errno != 0) {
389 cd->istate = last_istate;
390 errno = locals.l_errno;
391 return -1;
393 outptr = locals.l_outbuf;
394 outleft = locals.l_outbytesleft;
395 outcount = 0;
396 goto outcount_ok;
398 #endif
399 outcount = cd->ofuncs.xxx_wctomb(cd,outptr,0xFFFD,outleft);
400 if (outcount != RET_ILUNI)
401 goto outcount_ok;
402 cd->istate = last_istate;
403 errno = EILSEQ;
404 result = -1;
405 break;
406 outcount_ok:
407 if (outcount < 0) {
408 cd->istate = last_istate;
409 errno = E2BIG;
410 result = -1;
411 break;
413 #ifndef LIBICONV_PLUG
414 if (cd->hooks.uc_hook)
415 (*cd->hooks.uc_hook)(wc, cd->hooks.data);
416 #endif
417 if (!(outcount <= outleft)) abort();
418 outptr += outcount; outleft -= outcount;
420 outcount_zero:
421 if (!(incount <= inleft)) abort();
422 inptr += incount; inleft -= incount;
424 *inbuf = (const char*) inptr;
425 *inbytesleft = inleft;
426 *outbuf = (char*) outptr;
427 *outbytesleft = outleft;
428 return result;
431 static size_t unicode_loop_reset (iconv_t icd,
432 char* * outbuf, size_t *outbytesleft)
434 conv_t cd = (conv_t) icd;
435 if (outbuf == NULL || *outbuf == NULL) {
436 /* Reset the states. */
437 memset(&cd->istate,'\0',sizeof(state_t));
438 memset(&cd->ostate,'\0',sizeof(state_t));
439 return 0;
440 } else {
441 size_t result = 0;
442 if (cd->ifuncs.xxx_flushwc) {
443 state_t last_istate = cd->istate;
444 ucs4_t wc;
445 if (cd->ifuncs.xxx_flushwc(cd, &wc)) {
446 unsigned char* outptr = (unsigned char*) *outbuf;
447 size_t outleft = *outbytesleft;
448 int outcount = cd->ofuncs.xxx_wctomb(cd,outptr,wc,outleft);
449 if (outcount != RET_ILUNI)
450 goto outcount_ok;
451 /* Handle Unicode tag characters (range U+E0000..U+E007F). */
452 if ((wc >> 7) == (0xe0000 >> 7))
453 goto outcount_zero;
454 /* Try transliteration. */
455 result++;
456 if (cd->transliterate) {
457 outcount = unicode_transliterate(cd,wc,outptr,outleft);
458 if (outcount != RET_ILUNI)
459 goto outcount_ok;
461 if (cd->discard_ilseq) {
462 outcount = 0;
463 goto outcount_ok;
465 #ifndef LIBICONV_PLUG
466 else if (cd->fallbacks.uc_to_mb_fallback != NULL) {
467 struct uc_to_mb_fallback_locals locals;
468 locals.l_outbuf = outptr;
469 locals.l_outbytesleft = outleft;
470 locals.l_errno = 0;
471 cd->fallbacks.uc_to_mb_fallback(wc,
472 uc_to_mb_write_replacement,
473 &locals,
474 cd->fallbacks.data);
475 if (locals.l_errno != 0) {
476 cd->istate = last_istate;
477 errno = locals.l_errno;
478 return -1;
480 outptr = locals.l_outbuf;
481 outleft = locals.l_outbytesleft;
482 outcount = 0;
483 goto outcount_ok;
485 #endif
486 outcount = cd->ofuncs.xxx_wctomb(cd,outptr,0xFFFD,outleft);
487 if (outcount != RET_ILUNI)
488 goto outcount_ok;
489 cd->istate = last_istate;
490 errno = EILSEQ;
491 return -1;
492 outcount_ok:
493 if (outcount < 0) {
494 cd->istate = last_istate;
495 errno = E2BIG;
496 return -1;
498 #ifndef LIBICONV_PLUG
499 if (cd->hooks.uc_hook)
500 (*cd->hooks.uc_hook)(wc, cd->hooks.data);
501 #endif
502 if (!(outcount <= outleft)) abort();
503 outptr += outcount;
504 outleft -= outcount;
505 outcount_zero:
506 *outbuf = (char*) outptr;
507 *outbytesleft = outleft;
510 if (cd->ofuncs.xxx_reset) {
511 unsigned char* outptr = (unsigned char*) *outbuf;
512 size_t outleft = *outbytesleft;
513 int outcount = cd->ofuncs.xxx_reset(cd,outptr,outleft);
514 if (outcount < 0) {
515 errno = E2BIG;
516 return -1;
518 if (!(outcount <= outleft)) abort();
519 *outbuf = (char*) (outptr + outcount);
520 *outbytesleft = outleft - outcount;
522 memset(&cd->istate,'\0',sizeof(state_t));
523 memset(&cd->ostate,'\0',sizeof(state_t));
524 return result;