missing project/build files
[client-tools.git] / src / external / 3rd / library / libxml / encoding.c
blobebb1eeca4dd591f411110d82e544886365b896d3
1 /*
2 * encoding.c : implements the encoding conversion functions needed for XML
4 * Related specs:
5 * rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies
6 * rfc2781 UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau
7 * [ISO-10646] UTF-8 and UTF-16 in Annexes
8 * [ISO-8859-1] ISO Latin-1 characters codes.
9 * [UNICODE] The Unicode Consortium, "The Unicode Standard --
10 * Worldwide Character Encoding -- Version 1.0", Addison-
11 * Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is
12 * described in Unicode Technical Report #4.
13 * [US-ASCII] Coded Character Set--7-bit American Standard Code for
14 * Information Interchange, ANSI X3.4-1986.
16 * See Copyright for the status of this software.
18 * daniel@veillard.com
20 * UTF8 string routines from:
21 * "William M. Brack" <wbrack@mmm.com.hk>
23 * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
26 #define IN_LIBXML
27 #include "libxml.h"
29 #include <string.h>
31 #ifdef HAVE_CTYPE_H
32 #include <ctype.h>
33 #endif
34 #ifdef HAVE_STDLIB_H
35 #include <stdlib.h>
36 #endif
37 #ifdef LIBXML_ICONV_ENABLED
38 #ifdef HAVE_ERRNO_H
39 #include <errno.h>
40 #endif
41 #endif
42 #include <libxml/encoding.h>
43 #include <libxml/xmlmemory.h>
44 #ifdef LIBXML_HTML_ENABLED
45 #include <libxml/HTMLparser.h>
46 #endif
47 #include <libxml/globals.h>
48 #include <libxml/xmlerror.h>
50 static xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
51 static xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
53 typedef struct _xmlCharEncodingAlias xmlCharEncodingAlias;
54 typedef xmlCharEncodingAlias *xmlCharEncodingAliasPtr;
55 struct _xmlCharEncodingAlias {
56 const char *name;
57 const char *alias;
60 static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
61 static int xmlCharEncodingAliasesNb = 0;
62 static int xmlCharEncodingAliasesMax = 0;
64 #ifdef LIBXML_ICONV_ENABLED
65 #if 0
66 #define DEBUG_ENCODING /* Define this to get encoding traces */
67 #endif
68 #endif
70 static int xmlLittleEndian = 1;
72 /************************************************************************
73 * *
74 * Generic UTF8 handling routines *
75 * *
76 * From rfc2044: encoding of the Unicode values on UTF-8: *
77 * *
78 * UCS-4 range (hex.) UTF-8 octet sequence (binary) *
79 * 0000 0000-0000 007F 0xxxxxxx *
80 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx *
81 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx *
82 * *
83 * I hope we won't use values > 0xFFFF anytime soon ! *
84 * *
85 ************************************************************************/
87 /**
88 * xmlUTF8Strlen:
89 * @utf: a sequence of UTF-8 encoded bytes
91 * compute the length of an UTF8 string, it doesn't do a full UTF8
92 * checking of the content of the string.
94 * Returns the number of characters in the string or -1 in case of error
96 int
97 xmlUTF8Strlen(const xmlChar *utf) {
98 int ret = 0;
100 if (utf == NULL)
101 return(-1);
103 while (*utf != 0) {
104 if (utf[0] & 0x80) {
105 if ((utf[1] & 0xc0) != 0x80)
106 return(-1);
107 if ((utf[0] & 0xe0) == 0xe0) {
108 if ((utf[2] & 0xc0) != 0x80)
109 return(-1);
110 if ((utf[0] & 0xf0) == 0xf0) {
111 if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
112 return(-1);
113 utf += 4;
114 } else {
115 utf += 3;
117 } else {
118 utf += 2;
120 } else {
121 utf++;
123 ret++;
125 return(ret);
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  in: the number of bytes available at @utf;
 *        out: the number of bytes used by the character (0 on error)
 *
 * Read one UTF-8 encoded character from @utf.
 *
 * Returns the char value or -1 in case of error (NULL argument, not
 *         enough bytes available, or a malformed sequence) and updates
 *         @len with the number of bytes used
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    if (utf == NULL)
        goto error;
    if (len == NULL)
        goto error;
    if (*len < 1)
        goto error;

    c = utf[0];
    if (c & 0x80) {
        if (*len < 2)
            goto error;
        if ((utf[1] & 0xc0) != 0x80)
            goto error;
        if ((c & 0xe0) == 0xe0) {
            if (*len < 3)
                goto error;
            if ((utf[2] & 0xc0) != 0x80)
                goto error;
            if ((c & 0xf0) == 0xf0) {
                if (*len < 4)
                    goto error;
                if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
                    goto error;
                *len = 4;
                /* 4-byte code */
                c = (utf[0] & 0x7) << 18;
                c |= (utf[1] & 0x3f) << 12;
                c |= (utf[2] & 0x3f) << 6;
                c |= utf[3] & 0x3f;
            } else {
                /* 3-byte code */
                *len = 3;
                c = (utf[0] & 0xf) << 12;
                c |= (utf[1] & 0x3f) << 6;
                c |= utf[2] & 0x3f;
            }
        } else {
            /* 2-byte code */
            *len = 2;
            c = (utf[0] & 0x1f) << 6;
            c |= utf[1] & 0x3f;
        }
    } else {
        /* 1-byte code */
        *len = 1;
    }
    return(c);

error:
    /* this label is also reached when len itself is NULL */
    if (len != NULL)
        *len = 0;
    return(-1);
}
/**
 * xmlCheckUTF8:
 * @utf:  Pointer to putative utf-8 encoded string.
 *
 * Checks @utf for being valid utf-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer utf-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * Return value: true if @utf is valid, false otherwise (including when
 *               @utf is NULL).
 */
int
xmlCheckUTF8(const unsigned char *utf)
{
    int ix;
    unsigned char c;

    if (utf == NULL)
        return(0);

    /*
     * Each branch tests the continuation bytes left to right; the
     * short-circuit evaluation guarantees the terminating 0 is never
     * stepped over.
     */
    for (ix = 0; (c = utf[ix]) != 0;) {
        if ((c & 0x80) == 0x00) {
            /* 1-byte code: 0xxxxxxx */
            ix++;
        } else if ((c & 0xe0) == 0xc0) {
            /* 2-byte code: 110xxxxx */
            if ((utf[ix + 1] & 0xc0) != 0x80)
                return(0);
            ix += 2;
        } else if ((c & 0xf0) == 0xe0) {
            /* 3-byte code: 1110xxxx */
            if (((utf[ix + 1] & 0xc0) != 0x80) ||
                ((utf[ix + 2] & 0xc0) != 0x80))
                return(0);
            ix += 3;
        } else if ((c & 0xf8) == 0xf0) {
            /* 4-byte code: 11110xxx */
            if (((utf[ix + 1] & 0xc0) != 0x80) ||
                ((utf[ix + 2] & 0xc0) != 0x80) ||
                ((utf[ix + 3] & 0xc0) != 0x80))
                return(0);
            ix += 4;
        } else {
            /* continuation byte in lead position, or 5+ byte lead */
            return(0);
        }
    }
    return(1);
}
240 * xmlUTF8Strsize:
241 * @utf: a sequence of UTF-8 encoded bytes
242 * @len: the number of characters in the array
244 * storage size of an UTF8 string
246 * Returns the storage size of
247 * the first 'len' characters of ARRAY
252 xmlUTF8Strsize(const xmlChar *utf, int len) {
253 const xmlChar *ptr=utf;
254 xmlChar ch;
256 if (len <= 0)
257 return(0);
259 while ( len-- > 0) {
260 if ( !*ptr )
261 break;
262 if ( (ch = *ptr++) & 0x80)
263 while ( (ch<<=1) & 0x80 )
264 ptr++;
266 return (ptr - utf);
271 * xmlUTF8Strndup:
272 * @utf: the input UTF8 *
273 * @len: the len of @utf (in chars)
275 * a strndup for array of UTF8's
277 * Returns a new UTF8 * or NULL
279 xmlChar *
280 xmlUTF8Strndup(const xmlChar *utf, int len) {
281 xmlChar *ret;
282 int i;
284 if ((utf == NULL) || (len < 0)) return(NULL);
285 i = xmlUTF8Strsize(utf, len);
286 ret = (xmlChar *) xmlMalloc((i + 1) * sizeof(xmlChar));
287 if (ret == NULL) {
288 xmlGenericError(xmlGenericErrorContext,
289 "malloc of %ld byte failed\n",
290 (len + 1) * (long)sizeof(xmlChar));
291 return(NULL);
293 memcpy(ret, utf, i * sizeof(xmlChar));
294 ret[i] = 0;
295 return(ret);
299 * xmlUTF8Strpos:
300 * @utf: the input UTF8 *
301 * @pos: the position of the desired UTF8 char (in chars)
303 * a function to provide the equivalent of fetching a
304 * character from a string array
306 * Returns a pointer to the UTF8 character or NULL
308 xmlChar *
309 xmlUTF8Strpos(const xmlChar *utf, int pos) {
310 xmlChar ch;
312 if (utf == NULL) return(NULL);
313 if ( (pos < 0) || (pos >= xmlUTF8Strlen(utf)) )
314 return(NULL);
315 while (pos--) {
316 if ((ch=*utf++) == 0) return(NULL);
317 if ( ch & 0x80 ) {
318 /* if not simple ascii, verify proper format */
319 if ( (ch & 0xc0) != 0xc0 )
320 return(NULL);
321 /* then skip over remaining bytes for this char */
322 while ( (ch <<= 1) & 0x80 )
323 if ( (*utf++ & 0xc0) != 0x80 )
324 return(NULL);
327 return((xmlChar *)utf);
331 * xmlUTF8Strloc:
332 * @utf: the input UTF8 *
333 * @utfchar: the UTF8 character to be found
335 * a function to provide relative location of a UTF8 char
337 * Returns the relative character position of the desired char
338 * or -1 if not found
341 xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
342 int i, size;
343 xmlChar ch;
345 if (utf==NULL || utfchar==NULL) return -1;
346 size = xmlUTF8Strsize(utfchar, 1);
347 for(i=0; (ch=*utf) != 0; i++) {
348 if (xmlStrncmp(utf, utfchar, size)==0)
349 return(i);
350 utf++;
351 if ( ch & 0x80 ) {
352 /* if not simple ascii, verify proper format */
353 if ( (ch & 0xc0) != 0xc0 )
354 return(-1);
355 /* then skip over remaining bytes for this char */
356 while ( (ch <<= 1) & 0x80 )
357 if ( (*utf++ & 0xc0) != 0x80 )
358 return(-1);
362 return(-1);
365 * xmlUTF8Strsub:
366 * @utf: a sequence of UTF-8 encoded bytes
367 * @start: relative pos of first char
368 * @len: total number to copy
370 * Note: positions are given in units of UTF-8 chars
372 * Returns a pointer to a newly created string
373 * or NULL if any problem
376 xmlChar *
377 xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
378 int i;
379 xmlChar ch;
381 if (utf == NULL) return(NULL);
382 if (start < 0) return(NULL);
383 if (len < 0) return(NULL);
386 * Skip over any leading chars
388 for (i = 0;i < start;i++) {
389 if ((ch=*utf++) == 0) return(NULL);
390 if ( ch & 0x80 ) {
391 /* if not simple ascii, verify proper format */
392 if ( (ch & 0xc0) != 0xc0 )
393 return(NULL);
394 /* then skip over remaining bytes for this char */
395 while ( (ch <<= 1) & 0x80 )
396 if ( (*utf++ & 0xc0) != 0x80 )
397 return(NULL);
401 return(xmlUTF8Strndup(utf, len));
404 /************************************************************************
406 * Conversions To/From UTF8 encoding *
408 ************************************************************************/
/**
 * asciiToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ASCII chars
 * @inlen:  the length of @in
 *
 * Take a block of ASCII chars in and try to convert it to an UTF-8
 * block of chars out. ASCII is a subset of UTF-8, so each input byte
 * below 0x80 is copied unchanged.
 *
 * Returns 0 if success, or -1 if a byte >= 0x80 is met
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
static int
asciiToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
    const unsigned char* base = in;
    const unsigned char* processed = in;
    const unsigned char* inend;
    unsigned int c;

    inend = in + (*inlen);
    /*
     * The loop keeps the same safety margin as the UTF-16 decoder:
     * conversion stops once fewer than 6 bytes are free in @out, so
     * no further bound check is needed inside the body.
     */
    while ((in < inend) && (out - outstart + 5 < *outlen)) {
        c = *in++;

        if (c >= 0x80) {
            /* not ASCII: report what was converted so far */
            *outlen = out - outstart;
            *inlen = processed - base;
            return(-1);
        }
        *out++ = c;
        processed = in;
    }
    *outlen = out - outstart;
    *inlen = processed - base;
    return(0);
}
/**
 * UTF8Toascii:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
 * block of chars out. A NULL @in is an initialization call and does
 * nothing. An incomplete sequence at the end of @in is left unconsumed.
 *
 * Returns 0 if success, -2 if the transcoding fails (malformed UTF-8 or
 *         a character outside ASCII), or -1 on NULL @out/@outlen/@inlen
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
static int
UTF8Toascii(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const unsigned char* processed = in;
    const unsigned char* outend;
    const unsigned char* outstart = out;
    const unsigned char* instart = in;
    const unsigned char* inend;
    unsigned int c, d;
    int trailing;

    if ((out == NULL) || (outlen == NULL) || (inlen == NULL))
        return(-1);
    if (in == NULL) {
        /*
         * initialization nothing to do
         */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + (*inlen);
    outend = out + (*outlen);
    while (in < inend) {
        d = *in++;
        if      (d < 0x80)  { c = d; trailing = 0; }
        else if (d < 0xC0)  goto failed; /* trailing byte in leading position */
        else if (d < 0xE0)  { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0)  { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8)  { c = d & 0x07; trailing = 3; }
        else                goto failed; /* no chance for this in Ascii */

        /* sequence cut by the end of the buffer: wait for more input */
        if (inend - in < trailing)
            break;

        for ( ; trailing; trailing--) {
            d = *in++;
            /* same handling as UTF8Toisolat1: a bad continuation is an error */
            if ((d & 0xC0) != 0x80)
                goto failed;
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c >= 0x80)
            goto failed;                 /* no chance for this in Ascii */
        if (out >= outend)
            break;
        *out++ = c;
        processed = in;
    }
    *outlen = out - outstart;
    *inlen = processed - instart;
    return(0);

failed:
    /* report only the fully converted characters */
    *outlen = out - outstart;
    *inlen = processed - instart;
    return(-2);
}
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  the length of @in
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out. Conversion stops when either the input is
 * exhausted or @out has no room for the next character.
 *
 * Returns 0 if success, or -1 if one of the arguments is NULL
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
int
isolat1ToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
    const unsigned char* base = in;
    unsigned char* outend;
    const unsigned char* inend;
    const unsigned char* instop;

    if ((out == NULL) || (in == NULL) || (outlen == NULL) || (inlen == NULL))
        return(-1);

    outend = out + *outlen;
    inend = in + (*inlen);
    instop = inend;

    /*
     * *in is only dereferenced after checking in against inend/instop,
     * so the byte following the input block is never read.
     */
    while ((in < inend) && (outend - out >= 2)) {
        if (*in >= 0x80) {
            /* 0x80..0xFF map to a 2-byte UTF-8 sequence */
            *out++ = (((*in) >> 6) & 0x1F) | 0xC0;
            *out++ = ((*in) & 0x3F) | 0x80;
            ++in;
        }
        /* limit the ASCII fast path to the room left in out */
        if ((instop - in) > (outend - out))
            instop = in + (outend - out);
        while ((in < instop) && (*in < 0x80)) {
            *out++ = *in++;
        }
    }
    /* exactly one byte of room left: it can still take an ASCII char */
    if ((in < inend) && (out < outend) && (*in < 0x80)) {
        *out++ = *in++;
    }
    *outlen = out - outstart;
    *inlen = in - base;
    return(0);
}
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out. A NULL @in is an initialization call and does
 * nothing. An incomplete sequence at the end of @in is left unconsumed.
 *
 * Returns 0 if success, -2 if the transcoding fails (malformed UTF-8 or
 *         a character above 0xFF), or -1 on NULL @out/@outlen/@inlen
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
int
UTF8Toisolat1(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const unsigned char* processed = in;
    const unsigned char* outend;
    const unsigned char* outstart = out;
    const unsigned char* instart = in;
    const unsigned char* inend;
    unsigned int c, d;
    int trailing;

    if ((out == NULL) || (outlen == NULL) || (inlen == NULL))
        return(-1);
    if (in == NULL) {
        /*
         * initialization nothing to do
         */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + (*inlen);
    outend = out + (*outlen);
    while (in < inend) {
        d = *in++;
        if      (d < 0x80)  { c = d; trailing = 0; }
        else if (d < 0xC0)  goto failed; /* trailing byte in leading position */
        else if (d < 0xE0)  { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0)  { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8)  { c = d & 0x07; trailing = 3; }
        else                goto failed; /* no chance for this in IsoLat1 */

        /* sequence cut by the end of the buffer: wait for more input */
        if (inend - in < trailing)
            break;

        /* the check above guarantees all trailing bytes are available */
        for ( ; trailing; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80)
                goto failed;
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c > 0xFF)
            goto failed;                 /* no chance for this in IsoLat1 */
        if (out >= outend)
            break;
        *out++ = c;
        processed = in;
    }
    *outlen = out - outstart;
    *inlen = processed - instart;
    return(0);

failed:
    /* report only the fully converted characters */
    *outlen = out - outstart;
    *inlen = processed - instart;
    return(-2);
}
/**
 * UTF16LEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16LE passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16LE code units in and try to convert it to an
 * UTF-8 block of chars out. The input is decoded byte by byte, so the
 * result does not depend on the byte order or alignment rules of the
 * host. A trailing odd byte, or a high surrogate whose partner is not
 * in the buffer yet, is left unconsumed.
 *
 * Returns 0 if success, or -2 if the transcoding fails (a high
 *         surrogate is not followed by a low surrogate)
 * The value of *inlenb after return is the number of octets consumed.
 * The value of *outlen after return is the number of octets produced.
 */
static int
UTF16LEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    const unsigned char* processed = inb;
    const unsigned char* in = inb;
    const unsigned char* inend;
    unsigned int c, d;
    int bits;

    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = in + (*inlenb / 2) * 2;
    /*
     * The loop only runs while at least 6 bytes are free in out; one
     * character needs at most 4, so no bound check is needed below.
     */
    while ((in < inend) && (out - outstart + 5 < *outlen)) {
        c = (unsigned int) in[0] | ((unsigned int) in[1] << 8);
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (in >= inend)             /* pair split by the buffer end */
                break;
            d = (unsigned int) in[0] | ((unsigned int) in[1] << 8);
            in += 2;
            if ((d & 0xFC00) == 0xDC00) {
                c &= 0x03FF;
                c <<= 10;
                c |= d & 0x03FF;
                c += 0x10000;
            } else {
                *outlen = out - outstart;
                *inlenb = processed - inb;
                return(-2);
            }
        }

        /* assertion: c is a single UCS-4 value */
        if      (c <    0x80) { *out++ =  c;                         bits = -6; }
        else if (c <   0x800) { *out++ = ((c >>  6) & 0x1F) | 0xC0;  bits =  0; }
        else if (c < 0x10000) { *out++ = ((c >> 12) & 0x0F) | 0xE0;  bits =  6; }
        else                  { *out++ = ((c >> 18) & 0x07) | 0xF0;  bits = 12; }

        for ( ; bits >= 0; bits -= 6)
            *out++ = ((c >> bits) & 0x3F) | 0x80;

        processed = in;
    }
    *outlen = out - outstart;
    *inlenb = processed - inb;
    return(0);
}
/**
 * UTF8ToUTF16LE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
 * block of chars out. The output is written byte by byte, so it does
 * not depend on the byte order or alignment rules of the host.
 * A NULL @in is an initialization call: the byte order mark FF FE is
 * written if @outb has room for it.
 *
 * Returns 2 when the byte order mark was written, 0 otherwise on
 *         success, or -2 if the transcoding failed (malformed UTF-8)
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
static int
UTF8ToUTF16LE(unsigned char* outb, int *outlen,
            const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    const unsigned char* processed = in;
    const unsigned char* const instart = in;
    unsigned char* outend;
    const unsigned char* inend;
    unsigned int c, d;
    int trailing;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFF;
            outb[1] = 0xFE;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            xmlGenericError(xmlGenericErrorContext,
                    "Added FFFE Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + *inlen;
    /* only whole 16-bit units can be stored */
    outend = outb + (*outlen / 2) * 2;
    while (in < inend) {
        d = *in++;
        if      (d < 0x80)  { c = d; trailing = 0; }
        else if (d < 0xC0)  goto failed; /* trailing byte in leading position */
        else if (d < 0xE0)  { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0)  { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8)  { c = d & 0x07; trailing = 3; }
        else                goto failed; /* no chance for this in UTF-16 */

        /* sequence cut by the end of the buffer: wait for more input */
        if (inend - in < trailing)
            break;

        for ( ; trailing; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80)
                goto failed;
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            out[0] = (unsigned char) (c & 0xFF);
            out[1] = (unsigned char) (c >> 8);
            out += 2;
        } else if (c < 0x110000) {
            unsigned int hi, lo;

            /* needs a surrogate pair: two 16-bit units */
            if (outend - out < 4)
                break;
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            out[0] = (unsigned char) (hi & 0xFF);
            out[1] = (unsigned char) (hi >> 8);
            out[2] = (unsigned char) (lo & 0xFF);
            out[3] = (unsigned char) (lo >> 8);
            out += 4;
        } else {
            /* beyond the UTF-16 range: stop without consuming it */
            break;
        }
        processed = in;
    }
    *outlen = out - outb;
    *inlen = processed - instart;
    return(0);

failed:
    /* report only the fully converted characters */
    *outlen = out - outb;
    *inlen = processed - instart;
    return(-2);
}
/**
 * UTF16BEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16BE passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16BE code units in and try to convert it to an
 * UTF-8 block of chars out. The input is decoded byte by byte, so the
 * result does not depend on the byte order or alignment rules of the
 * host. A trailing odd byte, or a high surrogate whose partner is not
 * in the buffer yet, is left unconsumed, as in UTF16LEToUTF8.
 *
 * Returns 0 if success, or -2 if the transcoding fails (a high
 *         surrogate is not followed by a low surrogate)
 * The value of *inlenb after return is the number of octets consumed.
 * The value of *outlen after return is the number of octets produced.
 */
static int
UTF16BEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    const unsigned char* processed = inb;
    const unsigned char* in = inb;
    const unsigned char* inend;
    unsigned int c, d;
    int bits;

    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = in + (*inlenb / 2) * 2;
    /*
     * Same margin as UTF16LEToUTF8: the loop only runs while at least
     * 6 bytes are free in out, so a character (at most 4 bytes) is
     * never written partially.
     */
    while ((in < inend) && (out - outstart + 5 < *outlen)) {
        c = ((unsigned int) in[0] << 8) | (unsigned int) in[1];
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (in >= inend)             /* pair split by the buffer end */
                break;
            d = ((unsigned int) in[0] << 8) | (unsigned int) in[1];
            in += 2;
            if ((d & 0xFC00) == 0xDC00) {
                c &= 0x03FF;
                c <<= 10;
                c |= d & 0x03FF;
                c += 0x10000;
            } else {
                *outlen = out - outstart;
                *inlenb = processed - inb;
                return(-2);
            }
        }

        /* assertion: c is a single UCS-4 value */
        if      (c <    0x80) { *out++ =  c;                         bits = -6; }
        else if (c <   0x800) { *out++ = ((c >>  6) & 0x1F) | 0xC0;  bits =  0; }
        else if (c < 0x10000) { *out++ = ((c >> 12) & 0x0F) | 0xE0;  bits =  6; }
        else                  { *out++ = ((c >> 18) & 0x07) | 0xF0;  bits = 12; }

        for ( ; bits >= 0; bits -= 6)
            *out++ = ((c >> bits) & 0x3F) | 0x80;

        processed = in;
    }
    *outlen = out - outstart;
    *inlenb = processed - inb;
    return(0);
}
/**
 * UTF8ToUTF16BE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
 * block of chars out. The output is written byte by byte, so it does
 * not depend on the byte order or alignment rules of the host.
 * A NULL @in is an initialization call: the byte order mark FE FF is
 * written if @outb has room for it.
 *
 * Returns 2 when the byte order mark was written, 0 otherwise on
 *         success, or -2 if the transcoding failed (malformed UTF-8)
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
static int
UTF8ToUTF16BE(unsigned char* outb, int *outlen,
            const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    const unsigned char* processed = in;
    const unsigned char* const instart = in;
    unsigned char* outend;
    const unsigned char* inend;
    unsigned int c, d;
    int trailing;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFE;
            outb[1] = 0xFF;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            xmlGenericError(xmlGenericErrorContext,
                    "Added FEFF Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + *inlen;
    /* only whole 16-bit units can be stored */
    outend = outb + (*outlen / 2) * 2;
    while (in < inend) {
        d = *in++;
        if      (d < 0x80)  { c = d; trailing = 0; }
        else if (d < 0xC0)  goto failed; /* trailing byte in leading position */
        else if (d < 0xE0)  { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0)  { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8)  { c = d & 0x07; trailing = 3; }
        else                goto failed; /* no chance for this in UTF-16 */

        /* sequence cut by the end of the buffer: wait for more input */
        if (inend - in < trailing)
            break;

        for ( ; trailing; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80)
                goto failed;
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            out[0] = (unsigned char) (c >> 8);
            out[1] = (unsigned char) (c & 0xFF);
            out += 2;
        } else if (c < 0x110000) {
            unsigned int hi, lo;

            /* needs a surrogate pair: two 16-bit units */
            if (outend - out < 4)
                break;
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            out[0] = (unsigned char) (hi >> 8);
            out[1] = (unsigned char) (hi & 0xFF);
            out[2] = (unsigned char) (lo >> 8);
            out[3] = (unsigned char) (lo & 0xFF);
            out += 4;
        } else {
            /* beyond the UTF-16 range: stop without consuming it */
            break;
        }
        processed = in;
    }
    *outlen = out - outb;
    *inlen = processed - instart;
    return(0);

failed:
    /* report only the fully converted characters, counted in bytes */
    *outlen = out - outb;
    *inlen = processed - instart;
    return(-2);
}
1098 /************************************************************************
1100 * Generic encoding handling routines *
1102 ************************************************************************/
1105 * xmlDetectCharEncoding:
1106 * @in: a pointer to the first bytes of the XML entity, must be at least
1107 * 4 bytes long.
1108 * @len: pointer to the length of the buffer
1110 * Guess the encoding of the entity using the first bytes of the entity content
1111 * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
1113 * Returns one of the XML_CHAR_ENCODING_... values.
1115 xmlCharEncoding
1116 xmlDetectCharEncoding(const unsigned char* in, int len)
1118 if (len >= 4) {
1119 if ((in[0] == 0x00) && (in[1] == 0x00) &&
1120 (in[2] == 0x00) && (in[3] == 0x3C))
1121 return(XML_CHAR_ENCODING_UCS4BE);
1122 if ((in[0] == 0x3C) && (in[1] == 0x00) &&
1123 (in[2] == 0x00) && (in[3] == 0x00))
1124 return(XML_CHAR_ENCODING_UCS4LE);
1125 if ((in[0] == 0x00) && (in[1] == 0x00) &&
1126 (in[2] == 0x3C) && (in[3] == 0x00))
1127 return(XML_CHAR_ENCODING_UCS4_2143);
1128 if ((in[0] == 0x00) && (in[1] == 0x3C) &&
1129 (in[2] == 0x00) && (in[3] == 0x00))
1130 return(XML_CHAR_ENCODING_UCS4_3412);
1131 if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
1132 (in[2] == 0xA7) && (in[3] == 0x94))
1133 return(XML_CHAR_ENCODING_EBCDIC);
1134 if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
1135 (in[2] == 0x78) && (in[3] == 0x6D))
1136 return(XML_CHAR_ENCODING_UTF8);
1138 if (len >= 3) {
1140 * Errata on XML-1.0 June 20 2001
1141 * We now allow an UTF8 encoded BOM
1143 if ((in[0] == 0xEF) && (in[1] == 0xBB) &&
1144 (in[2] == 0xBF))
1145 return(XML_CHAR_ENCODING_UTF8);
1147 if (len >= 2) {
1148 if ((in[0] == 0xFE) && (in[1] == 0xFF))
1149 return(XML_CHAR_ENCODING_UTF16BE);
1150 if ((in[0] == 0xFF) && (in[1] == 0xFE))
1151 return(XML_CHAR_ENCODING_UTF16LE);
1153 return(XML_CHAR_ENCODING_NONE);
1157 * xmlCleanupEncodingAliases:
1159 * Unregisters all aliases
1161 void
1162 xmlCleanupEncodingAliases(void) {
1163 int i;
1165 if (xmlCharEncodingAliases == NULL)
1166 return;
1168 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1169 if (xmlCharEncodingAliases[i].name != NULL)
1170 xmlFree((char *) xmlCharEncodingAliases[i].name);
1171 if (xmlCharEncodingAliases[i].alias != NULL)
1172 xmlFree((char *) xmlCharEncodingAliases[i].alias);
1174 xmlCharEncodingAliasesNb = 0;
1175 xmlCharEncodingAliasesMax = 0;
1176 xmlFree(xmlCharEncodingAliases);
1177 xmlCharEncodingAliases = NULL;
1181 * xmlGetEncodingAlias:
1182 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1184 * Lookup an encoding name for the given alias.
1186 * Returns NULL if not found the original name otherwise
1188 const char *
1189 xmlGetEncodingAlias(const char *alias) {
1190 int i;
1191 char upper[100];
1193 if (alias == NULL)
1194 return(NULL);
1196 if (xmlCharEncodingAliases == NULL)
1197 return(NULL);
1199 for (i = 0;i < 99;i++) {
1200 upper[i] = toupper(alias[i]);
1201 if (upper[i] == 0) break;
1203 upper[i] = 0;
1206 * Walk down the list looking for a definition of the alias
1208 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1209 if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
1210 return(xmlCharEncodingAliases[i].name);
1213 return(NULL);
1217 * xmlAddEncodingAlias:
1218 * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
1219 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1221 * Registers and alias @alias for an encoding named @name. Existing alias
1222 * will be overwritten.
1224 * Returns 0 in case of success, -1 in case of error
1227 xmlAddEncodingAlias(const char *name, const char *alias) {
1228 int i;
1229 char upper[100];
1231 if ((name == NULL) || (alias == NULL))
1232 return(-1);
1234 for (i = 0;i < 99;i++) {
1235 upper[i] = toupper(alias[i]);
1236 if (upper[i] == 0) break;
1238 upper[i] = 0;
1240 if (xmlCharEncodingAliases == NULL) {
1241 xmlCharEncodingAliasesNb = 0;
1242 xmlCharEncodingAliasesMax = 20;
1243 xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
1244 xmlMalloc(xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
1245 if (xmlCharEncodingAliases == NULL)
1246 return(-1);
1247 } else if (xmlCharEncodingAliasesNb >= xmlCharEncodingAliasesMax) {
1248 xmlCharEncodingAliasesMax *= 2;
1249 xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
1250 xmlRealloc(xmlCharEncodingAliases,
1251 xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
1254 * Walk down the list looking for a definition of the alias
1256 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1257 if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
1259 * Replace the definition.
1261 xmlFree((char *) xmlCharEncodingAliases[i].name);
1262 xmlCharEncodingAliases[i].name = xmlMemStrdup(name);
1263 return(0);
1267 * Add the definition
1269 xmlCharEncodingAliases[xmlCharEncodingAliasesNb].name = xmlMemStrdup(name);
1270 xmlCharEncodingAliases[xmlCharEncodingAliasesNb].alias = xmlMemStrdup(upper);
1271 xmlCharEncodingAliasesNb++;
1272 return(0);
1276 * xmlDelEncodingAlias:
1277 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1279 * Unregisters an encoding alias @alias
1281 * Returns 0 in case of success, -1 in case of error
1284 xmlDelEncodingAlias(const char *alias) {
1285 int i;
1287 if (alias == NULL)
1288 return(-1);
1290 if (xmlCharEncodingAliases == NULL)
1291 return(-1);
1293 * Walk down the list looking for a definition of the alias
1295 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1296 if (!strcmp(xmlCharEncodingAliases[i].alias, alias)) {
1297 xmlFree((char *) xmlCharEncodingAliases[i].name);
1298 xmlFree((char *) xmlCharEncodingAliases[i].alias);
1299 xmlCharEncodingAliasesNb--;
1300 memmove(&xmlCharEncodingAliases[i], &xmlCharEncodingAliases[i + 1],
1301 sizeof(xmlCharEncodingAlias) * (xmlCharEncodingAliasesNb - i));
1302 return(0);
1305 return(-1);
1309 * xmlParseCharEncoding:
1310 * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
1312 * Compare the string to the known encoding schemes already known. Note
1313 * that the comparison is case insensitive accordingly to the section
1314 * [XML] 4.3.3 Character Encoding in Entities.
1316 * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
1317 * if not recognized.
1319 xmlCharEncoding
1320 xmlParseCharEncoding(const char* name)
1322 const char *alias;
1323 char upper[500];
1324 int i;
1326 if (name == NULL)
1327 return(XML_CHAR_ENCODING_NONE);
1330 * Do the alias resolution
1332 alias = xmlGetEncodingAlias(name);
1333 if (alias != NULL)
1334 name = alias;
1336 for (i = 0;i < 499;i++) {
1337 upper[i] = toupper(name[i]);
1338 if (upper[i] == 0) break;
1340 upper[i] = 0;
1342 if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
1343 if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
1344 if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
1347 * NOTE: if we were able to parse this, the endianness of UTF16 is
1348 * already found and in use
1350 if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
1351 if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
1353 if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
1354 if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
1355 if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
1358 * NOTE: if we were able to parse this, the endianness of UCS4 is
1359 * already found and in use
1361 if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
1362 if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
1363 if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
1366 if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
1367 if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
1368 if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
1370 if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
1371 if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
1372 if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
1374 if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
1375 if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
1376 if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
1377 if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
1378 if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
1379 if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
1380 if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
1382 if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
1383 if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
1384 if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
1386 #ifdef DEBUG_ENCODING
1387 xmlGenericError(xmlGenericErrorContext, "Unknown encoding %s\n", name);
1388 #endif
1389 return(XML_CHAR_ENCODING_ERROR);
1393 * xmlGetCharEncodingName:
1394 * @enc: the encoding
1396 * The "canonical" name for XML encoding.
1397 * C.f. http://www.w3.org/TR/REC-xml#charencoding
1398 * Section 4.3.3 Character Encoding in Entities
1400 * Returns the canonical name for the given encoding
1403 const char*
1404 xmlGetCharEncodingName(xmlCharEncoding enc) {
1405 switch (enc) {
1406 case XML_CHAR_ENCODING_ERROR:
1407 return(NULL);
1408 case XML_CHAR_ENCODING_NONE:
1409 return(NULL);
1410 case XML_CHAR_ENCODING_UTF8:
1411 return("UTF-8");
1412 case XML_CHAR_ENCODING_UTF16LE:
1413 return("UTF-16");
1414 case XML_CHAR_ENCODING_UTF16BE:
1415 return("UTF-16");
1416 case XML_CHAR_ENCODING_EBCDIC:
1417 return("EBCDIC");
1418 case XML_CHAR_ENCODING_UCS4LE:
1419 return("ISO-10646-UCS-4");
1420 case XML_CHAR_ENCODING_UCS4BE:
1421 return("ISO-10646-UCS-4");
1422 case XML_CHAR_ENCODING_UCS4_2143:
1423 return("ISO-10646-UCS-4");
1424 case XML_CHAR_ENCODING_UCS4_3412:
1425 return("ISO-10646-UCS-4");
1426 case XML_CHAR_ENCODING_UCS2:
1427 return("ISO-10646-UCS-2");
1428 case XML_CHAR_ENCODING_8859_1:
1429 return("ISO-8859-1");
1430 case XML_CHAR_ENCODING_8859_2:
1431 return("ISO-8859-2");
1432 case XML_CHAR_ENCODING_8859_3:
1433 return("ISO-8859-3");
1434 case XML_CHAR_ENCODING_8859_4:
1435 return("ISO-8859-4");
1436 case XML_CHAR_ENCODING_8859_5:
1437 return("ISO-8859-5");
1438 case XML_CHAR_ENCODING_8859_6:
1439 return("ISO-8859-6");
1440 case XML_CHAR_ENCODING_8859_7:
1441 return("ISO-8859-7");
1442 case XML_CHAR_ENCODING_8859_8:
1443 return("ISO-8859-8");
1444 case XML_CHAR_ENCODING_8859_9:
1445 return("ISO-8859-9");
1446 case XML_CHAR_ENCODING_2022_JP:
1447 return("ISO-2022-JP");
1448 case XML_CHAR_ENCODING_SHIFT_JIS:
1449 return("Shift-JIS");
1450 case XML_CHAR_ENCODING_EUC_JP:
1451 return("EUC-JP");
1452 case XML_CHAR_ENCODING_ASCII:
1453 return(NULL);
1455 return(NULL);
1458 /************************************************************************
1460 * Char encoding handlers *
1462 ************************************************************************/
1465 /* the size should be growable, but it's not a big deal ... */
1466 #define MAX_ENCODING_HANDLERS 50
1467 static xmlCharEncodingHandlerPtr *handlers = NULL;
1468 static int nbCharEncodingHandler = 0;
1471 * The default is UTF-8 for XML, that's also the default used for the
1472 * parser internals, so the default encoding handler is NULL
1475 static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
1478 * xmlNewCharEncodingHandler:
1479 * @name: the encoding name, in UTF-8 format (ASCII actually)
1480 * @input: the xmlCharEncodingInputFunc to read that encoding
1481 * @output: the xmlCharEncodingOutputFunc to write that encoding
1483 * Create and registers an xmlCharEncodingHandler.
1485 * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
1487 xmlCharEncodingHandlerPtr
1488 xmlNewCharEncodingHandler(const char *name,
1489 xmlCharEncodingInputFunc input,
1490 xmlCharEncodingOutputFunc output) {
1491 xmlCharEncodingHandlerPtr handler;
1492 const char *alias;
1493 char upper[500];
1494 int i;
1495 char *up = 0;
1498 * Do the alias resolution
1500 alias = xmlGetEncodingAlias(name);
1501 if (alias != NULL)
1502 name = alias;
1505 * Keep only the uppercase version of the encoding.
1507 if (name == NULL) {
1508 xmlGenericError(xmlGenericErrorContext,
1509 "xmlNewCharEncodingHandler : no name !\n");
1510 return(NULL);
1512 for (i = 0;i < 499;i++) {
1513 upper[i] = toupper(name[i]);
1514 if (upper[i] == 0) break;
1516 upper[i] = 0;
1517 up = xmlMemStrdup(upper);
1518 if (up == NULL) {
1519 xmlGenericError(xmlGenericErrorContext,
1520 "xmlNewCharEncodingHandler : out of memory !\n");
1521 return(NULL);
1525 * allocate and fill-up an handler block.
1527 handler = (xmlCharEncodingHandlerPtr)
1528 xmlMalloc(sizeof(xmlCharEncodingHandler));
1529 if (handler == NULL) {
1530 xmlGenericError(xmlGenericErrorContext,
1531 "xmlNewCharEncodingHandler : out of memory !\n");
1532 return(NULL);
1534 handler->input = input;
1535 handler->output = output;
1536 handler->name = up;
1538 #ifdef LIBXML_ICONV_ENABLED
1539 handler->iconv_in = NULL;
1540 handler->iconv_out = NULL;
1541 #endif /* LIBXML_ICONV_ENABLED */
1544 * registers and returns the handler.
1546 xmlRegisterCharEncodingHandler(handler);
1547 #ifdef DEBUG_ENCODING
1548 xmlGenericError(xmlGenericErrorContext,
1549 "Registered encoding handler for %s\n", name);
1550 #endif
1551 return(handler);
1555 * xmlInitCharEncodingHandlers:
1557 * Initialize the char encoding support, it registers the default
1558 * encoding supported.
1559 * NOTE: while public, this function usually doesn't need to be called
1560 * in normal processing.
1562 void
1563 xmlInitCharEncodingHandlers(void) {
1564 unsigned short int tst = 0x1234;
1565 unsigned char *ptr = (unsigned char *) &tst;
1567 if (handlers != NULL) return;
1569 handlers = (xmlCharEncodingHandlerPtr *)
1570 xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
1572 if (*ptr == 0x12) xmlLittleEndian = 0;
1573 else if (*ptr == 0x34) xmlLittleEndian = 1;
1574 else xmlGenericError(xmlGenericErrorContext,
1575 "Odd problem at endianness detection\n");
1577 if (handlers == NULL) {
1578 xmlGenericError(xmlGenericErrorContext,
1579 "xmlInitCharEncodingHandlers : out of memory !\n");
1580 return;
1582 xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
1583 xmlUTF16LEHandler =
1584 xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
1585 xmlUTF16BEHandler =
1586 xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
1587 xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
1588 xmlNewCharEncodingHandler("ASCII", asciiToUTF8, UTF8Toascii);
1589 xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, UTF8Toascii);
1590 #ifdef LIBXML_HTML_ENABLED
1591 xmlNewCharEncodingHandler("HTML", NULL, UTF8ToHtml);
1592 #endif
1596 * xmlCleanupCharEncodingHandlers:
1598 * Cleanup the memory allocated for the char encoding support, it
1599 * unregisters all the encoding handlers and the aliases.
1601 void
1602 xmlCleanupCharEncodingHandlers(void) {
1603 xmlCleanupEncodingAliases();
1605 if (handlers == NULL) return;
1607 for (;nbCharEncodingHandler > 0;) {
1608 nbCharEncodingHandler--;
1609 if (handlers[nbCharEncodingHandler] != NULL) {
1610 if (handlers[nbCharEncodingHandler]->name != NULL)
1611 xmlFree(handlers[nbCharEncodingHandler]->name);
1612 xmlFree(handlers[nbCharEncodingHandler]);
1615 xmlFree(handlers);
1616 handlers = NULL;
1617 nbCharEncodingHandler = 0;
1618 xmlDefaultCharEncodingHandler = NULL;
1622 * xmlRegisterCharEncodingHandler:
1623 * @handler: the xmlCharEncodingHandlerPtr handler block
1625 * Register the char encoding handler, surprising, isn't it ?
1627 void
1628 xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
1629 if (handlers == NULL) xmlInitCharEncodingHandlers();
1630 if (handler == NULL) {
1631 xmlGenericError(xmlGenericErrorContext,
1632 "xmlRegisterCharEncodingHandler: NULL handler !\n");
1633 return;
1636 if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
1637 xmlGenericError(xmlGenericErrorContext,
1638 "xmlRegisterCharEncodingHandler: Too many handler registered\n");
1639 xmlGenericError(xmlGenericErrorContext,
1640 "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
1641 return;
1643 handlers[nbCharEncodingHandler++] = handler;
1647 * xmlGetCharEncodingHandler:
1648 * @enc: an xmlCharEncoding value.
1650 * Search in the registered set the handler able to read/write that encoding.
1652 * Returns the handler or NULL if not found
1654 xmlCharEncodingHandlerPtr
1655 xmlGetCharEncodingHandler(xmlCharEncoding enc) {
1656 xmlCharEncodingHandlerPtr handler;
1658 if (handlers == NULL) xmlInitCharEncodingHandlers();
1659 switch (enc) {
1660 case XML_CHAR_ENCODING_ERROR:
1661 return(NULL);
1662 case XML_CHAR_ENCODING_NONE:
1663 return(NULL);
1664 case XML_CHAR_ENCODING_UTF8:
1665 return(NULL);
1666 case XML_CHAR_ENCODING_UTF16LE:
1667 return(xmlUTF16LEHandler);
1668 case XML_CHAR_ENCODING_UTF16BE:
1669 return(xmlUTF16BEHandler);
1670 case XML_CHAR_ENCODING_EBCDIC:
1671 handler = xmlFindCharEncodingHandler("EBCDIC");
1672 if (handler != NULL) return(handler);
1673 handler = xmlFindCharEncodingHandler("ebcdic");
1674 if (handler != NULL) return(handler);
1675 break;
1676 case XML_CHAR_ENCODING_UCS4BE:
1677 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1678 if (handler != NULL) return(handler);
1679 handler = xmlFindCharEncodingHandler("UCS-4");
1680 if (handler != NULL) return(handler);
1681 handler = xmlFindCharEncodingHandler("UCS4");
1682 if (handler != NULL) return(handler);
1683 break;
1684 case XML_CHAR_ENCODING_UCS4LE:
1685 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1686 if (handler != NULL) return(handler);
1687 handler = xmlFindCharEncodingHandler("UCS-4");
1688 if (handler != NULL) return(handler);
1689 handler = xmlFindCharEncodingHandler("UCS4");
1690 if (handler != NULL) return(handler);
1691 break;
1692 case XML_CHAR_ENCODING_UCS4_2143:
1693 break;
1694 case XML_CHAR_ENCODING_UCS4_3412:
1695 break;
1696 case XML_CHAR_ENCODING_UCS2:
1697 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2");
1698 if (handler != NULL) return(handler);
1699 handler = xmlFindCharEncodingHandler("UCS-2");
1700 if (handler != NULL) return(handler);
1701 handler = xmlFindCharEncodingHandler("UCS2");
1702 if (handler != NULL) return(handler);
1703 break;
1706 * We used to keep ISO Latin encodings native in the
1707 * generated data. This led to so many problems that
1708 * this has been removed. One can still change this
1709 * back by registering no-ops encoders for those
1711 case XML_CHAR_ENCODING_8859_1:
1712 handler = xmlFindCharEncodingHandler("ISO-8859-1");
1713 if (handler != NULL) return(handler);
1714 break;
1715 case XML_CHAR_ENCODING_8859_2:
1716 handler = xmlFindCharEncodingHandler("ISO-8859-2");
1717 if (handler != NULL) return(handler);
1718 break;
1719 case XML_CHAR_ENCODING_8859_3:
1720 handler = xmlFindCharEncodingHandler("ISO-8859-3");
1721 if (handler != NULL) return(handler);
1722 break;
1723 case XML_CHAR_ENCODING_8859_4:
1724 handler = xmlFindCharEncodingHandler("ISO-8859-4");
1725 if (handler != NULL) return(handler);
1726 break;
1727 case XML_CHAR_ENCODING_8859_5:
1728 handler = xmlFindCharEncodingHandler("ISO-8859-5");
1729 if (handler != NULL) return(handler);
1730 break;
1731 case XML_CHAR_ENCODING_8859_6:
1732 handler = xmlFindCharEncodingHandler("ISO-8859-6");
1733 if (handler != NULL) return(handler);
1734 break;
1735 case XML_CHAR_ENCODING_8859_7:
1736 handler = xmlFindCharEncodingHandler("ISO-8859-7");
1737 if (handler != NULL) return(handler);
1738 break;
1739 case XML_CHAR_ENCODING_8859_8:
1740 handler = xmlFindCharEncodingHandler("ISO-8859-8");
1741 if (handler != NULL) return(handler);
1742 break;
1743 case XML_CHAR_ENCODING_8859_9:
1744 handler = xmlFindCharEncodingHandler("ISO-8859-9");
1745 if (handler != NULL) return(handler);
1746 break;
1749 case XML_CHAR_ENCODING_2022_JP:
1750 handler = xmlFindCharEncodingHandler("ISO-2022-JP");
1751 if (handler != NULL) return(handler);
1752 break;
1753 case XML_CHAR_ENCODING_SHIFT_JIS:
1754 handler = xmlFindCharEncodingHandler("SHIFT-JIS");
1755 if (handler != NULL) return(handler);
1756 handler = xmlFindCharEncodingHandler("SHIFT_JIS");
1757 if (handler != NULL) return(handler);
1758 handler = xmlFindCharEncodingHandler("Shift_JIS");
1759 if (handler != NULL) return(handler);
1760 break;
1761 case XML_CHAR_ENCODING_EUC_JP:
1762 handler = xmlFindCharEncodingHandler("EUC-JP");
1763 if (handler != NULL) return(handler);
1764 break;
1765 default:
1766 break;
1769 #ifdef DEBUG_ENCODING
1770 xmlGenericError(xmlGenericErrorContext,
1771 "No handler found for encoding %d\n", enc);
1772 #endif
1773 return(NULL);
1777 * xmlFindCharEncodingHandler:
1778 * @name: a string describing the char encoding.
1780 * Search in the registered set the handler able to read/write that encoding.
1782 * Returns the handler or NULL if not found
1784 xmlCharEncodingHandlerPtr
1785 xmlFindCharEncodingHandler(const char *name) {
1786 const char *nalias;
1787 const char *norig;
1788 xmlCharEncoding alias;
1789 #ifdef LIBXML_ICONV_ENABLED
1790 xmlCharEncodingHandlerPtr enc;
1791 iconv_t icv_in, icv_out;
1792 #endif /* LIBXML_ICONV_ENABLED */
1793 char upper[100];
1794 int i;
1796 if (handlers == NULL) xmlInitCharEncodingHandlers();
1797 if (name == NULL) return(xmlDefaultCharEncodingHandler);
1798 if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
1801 * Do the alias resolution
1803 norig = name;
1804 nalias = xmlGetEncodingAlias(name);
1805 if (nalias != NULL)
1806 name = nalias;
1809 * Check first for directly registered encoding names
1811 for (i = 0;i < 99;i++) {
1812 upper[i] = toupper(name[i]);
1813 if (upper[i] == 0) break;
1815 upper[i] = 0;
1817 for (i = 0;i < nbCharEncodingHandler; i++)
1818 if (!strcmp(upper, handlers[i]->name)) {
1819 #ifdef DEBUG_ENCODING
1820 xmlGenericError(xmlGenericErrorContext,
1821 "Found registered handler for encoding %s\n", name);
1822 #endif
1823 return(handlers[i]);
1826 #ifdef LIBXML_ICONV_ENABLED
1827 /* check whether iconv can handle this */
1828 icv_in = iconv_open("UTF-8", name);
1829 icv_out = iconv_open(name, "UTF-8");
1830 if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
1831 enc = (xmlCharEncodingHandlerPtr)
1832 xmlMalloc(sizeof(xmlCharEncodingHandler));
1833 if (enc == NULL) {
1834 iconv_close(icv_in);
1835 iconv_close(icv_out);
1836 return(NULL);
1838 enc->name = xmlMemStrdup(name);
1839 enc->input = NULL;
1840 enc->output = NULL;
1841 enc->iconv_in = icv_in;
1842 enc->iconv_out = icv_out;
1843 #ifdef DEBUG_ENCODING
1844 xmlGenericError(xmlGenericErrorContext,
1845 "Found iconv handler for encoding %s\n", name);
1846 #endif
1847 return enc;
1848 } else if ((icv_in != (iconv_t) -1) || icv_out != (iconv_t) -1) {
1849 xmlGenericError(xmlGenericErrorContext,
1850 "iconv : problems with filters for '%s'\n", name);
1852 #endif /* LIBXML_ICONV_ENABLED */
1854 #ifdef DEBUG_ENCODING
1855 xmlGenericError(xmlGenericErrorContext,
1856 "No handler found for encoding %s\n", name);
1857 #endif
1860 * Fallback using the canonical names
1862 alias = xmlParseCharEncoding(norig);
1863 if (alias != XML_CHAR_ENCODING_ERROR) {
1864 const char* canon;
1865 canon = xmlGetCharEncodingName(alias);
1866 if ((canon != NULL) && (strcmp(name, canon))) {
1867 return(xmlFindCharEncodingHandler(canon));
1871 return(NULL);
/************************************************************************
 *									*
 *		ICONV based generic conversion functions		*
 *									*
 ************************************************************************/

#ifdef LIBXML_ICONV_ENABLED
/**
 * xmlIconvWrapper:
 * @cd:  iconv converter data structure
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to the array of bytes handed to iconv(), may be NULL
 * @inlen:  the length of @in
 *
 * Runs one iconv() call over the given buffers and maps its outcome
 * to the error codes used by the encoding handlers.
 *
 * Returns 0 if success, or
 *     -1 by lack of space (errno is E2BIG), or
 *     -2 if the transcoding fails (errno is EILSEQ), or
 *     -3 for any other failure, including unconsumed input.
 *
 * On return @inlen holds the number of octets consumed and @outlen the
 * number of octets produced; both are set to 0 when @in is NULL.
 */
static int
xmlIconvWrapper(iconv_t cd,
                unsigned char *out, int *outlen,
                const unsigned char *in, int *inlen) {
    size_t srcLeft = *inlen;
    size_t dstLeft = *outlen;
    const char *src = (const char *) in;
    char *dst = (char *) out;
    int rc;

    rc = iconv(cd, (char **) &src, &srcLeft, &dst, &dstLeft);
    if (in == NULL) {
        *inlen = 0;
        *outlen = 0;
    } else {
        /* turn the "bytes left" counters into "bytes used" counters */
        *inlen -= srcLeft;
        *outlen -= dstLeft;
    }

    if ((srcLeft == 0) && (rc != -1))
        return 0;

#ifdef EILSEQ
    if (errno == EILSEQ)
        return -2;
#endif
#ifdef E2BIG
    if (errno == E2BIG)
        return -1;
#endif
    /* EINVAL and every other errno value are reported the same way */
    return -3;
}
#endif /* LIBXML_ICONV_ENABLED */
1941 /************************************************************************
1943 * The real API used by libxml for on-the-fly conversion *
1945 ************************************************************************/
1948 * xmlCharEncFirstLine:
1949 * @handler: char enconding transformation data structure
1950 * @out: an xmlBuffer for the output.
1951 * @in: an xmlBuffer for the input
1953 * Front-end for the encoding handler input function, but handle only
1954 * the very first line, i.e. limit itself to 45 chars.
1956 * Returns the number of byte written if success, or
1957 * -1 general error
1958 * -2 if the transcoding fails (for *in is not valid utf8 string or
1959 * the result of transformation can't fit into the encoding we want), or
1962 xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1963 xmlBufferPtr in) {
1964 int ret = -2;
1965 int written;
1966 int toconv;
1968 if (handler == NULL) return(-1);
1969 if (out == NULL) return(-1);
1970 if (in == NULL) return(-1);
1972 written = out->size - out->use;
1973 toconv = in->use;
1974 if (toconv * 2 >= written) {
1975 xmlBufferGrow(out, toconv);
1976 written = out->size - out->use - 1;
1980 * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38
1981 * 45 chars should be sufficient to reach the end of the encoding
1982 * declaration without going too far inside the document content.
1984 written = 45;
1986 if (handler->input != NULL) {
1987 ret = handler->input(&out->content[out->use], &written,
1988 in->content, &toconv);
1989 xmlBufferShrink(in, toconv);
1990 out->use += written;
1991 out->content[out->use] = 0;
1993 #ifdef LIBXML_ICONV_ENABLED
1994 else if (handler->iconv_in != NULL) {
1995 ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
1996 &written, in->content, &toconv);
1997 xmlBufferShrink(in, toconv);
1998 out->use += written;
1999 out->content[out->use] = 0;
2000 if (ret == -1) ret = -3;
2002 #endif /* LIBXML_ICONV_ENABLED */
2003 #ifdef DEBUG_ENCODING
2004 switch (ret) {
2005 case 0:
2006 xmlGenericError(xmlGenericErrorContext,
2007 "converted %d bytes to %d bytes of input\n",
2008 toconv, written);
2009 break;
2010 case -1:
2011 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
2012 toconv, written, in->use);
2013 break;
2014 case -2:
2015 xmlGenericError(xmlGenericErrorContext,
2016 "input conversion failed due to input error\n");
2017 break;
2018 case -3:
2019 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
2020 toconv, written, in->use);
2021 break;
2022 default:
2023 xmlGenericError(xmlGenericErrorContext,"Unknown input conversion failed %d\n", ret);
2025 #endif /* DEBUG_ENCODING */
2027 * Ignore when input buffer is not on a boundary
2029 if (ret == -3) ret = 0;
2030 if (ret == -1) ret = 0;
2031 return(ret);
2035 * xmlCharEncInFunc:
2036 * @handler: char encoding transformation data structure
2037 * @out: an xmlBuffer for the output.
2038 * @in: an xmlBuffer for the input
2040 * Generic front-end for the encoding handler input function
2042 * Returns the number of byte written if success, or
2043 * -1 general error
2044 * -2 if the transcoding fails (for *in is not valid utf8 string or
2045 * the result of transformation can't fit into the encoding we want), or
2048 xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
2049 xmlBufferPtr in)
2051 int ret = -2;
2052 int written;
2053 int toconv;
2055 if (handler == NULL)
2056 return (-1);
2057 if (out == NULL)
2058 return (-1);
2059 if (in == NULL)
2060 return (-1);
2062 toconv = in->use;
2063 if (toconv == 0)
2064 return (0);
2065 written = out->size - out->use;
2066 if (toconv * 2 >= written) {
2067 xmlBufferGrow(out, out->size + toconv * 2);
2068 written = out->size - out->use - 1;
2070 if (handler->input != NULL) {
2071 ret = handler->input(&out->content[out->use], &written,
2072 in->content, &toconv);
2073 xmlBufferShrink(in, toconv);
2074 out->use += written;
2075 out->content[out->use] = 0;
2077 #ifdef LIBXML_ICONV_ENABLED
2078 else if (handler->iconv_in != NULL) {
2079 ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
2080 &written, in->content, &toconv);
2081 xmlBufferShrink(in, toconv);
2082 out->use += written;
2083 out->content[out->use] = 0;
2084 if (ret == -1)
2085 ret = -3;
2087 #endif /* LIBXML_ICONV_ENABLED */
2088 switch (ret) {
2089 case 0:
2090 #ifdef DEBUG_ENCODING
2091 xmlGenericError(xmlGenericErrorContext,
2092 "converted %d bytes to %d bytes of input\n",
2093 toconv, written);
2094 #endif
2095 break;
2096 case -1:
2097 #ifdef DEBUG_ENCODING
2098 xmlGenericError(xmlGenericErrorContext,
2099 "converted %d bytes to %d bytes of input, %d left\n",
2100 toconv, written, in->use);
2101 #endif
2102 break;
2103 case -3:
2104 #ifdef DEBUG_ENCODING
2105 xmlGenericError(xmlGenericErrorContext,
2106 "converted %d bytes to %d bytes of input, %d left\n",
2107 toconv, written, in->use);
2108 #endif
2109 break;
2110 case -2:
2111 xmlGenericError(xmlGenericErrorContext,
2112 "input conversion failed due to input error\n");
2113 xmlGenericError(xmlGenericErrorContext,
2114 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2115 in->content[0], in->content[1],
2116 in->content[2], in->content[3]);
2119 * Ignore when input buffer is not on a boundary
2121 if (ret == -3)
2122 ret = 0;
2123 return (written);
2127 * xmlCharEncOutFunc:
2128 * @handler: char enconding transformation data structure
2129 * @out: an xmlBuffer for the output.
2130 * @in: an xmlBuffer for the input
2132 * Generic front-end for the encoding handler output function
2133 * a first call with @in == NULL has to be made firs to initiate the
2134 * output in case of non-stateless encoding needing to initiate their
2135 * state or the output (like the BOM in UTF16).
2136 * In case of UTF8 sequence conversion errors for the given encoder,
2137 * the content will be automatically remapped to a CharRef sequence.
2139 * Returns the number of byte written if success, or
2140 * -1 general error
2141 * -2 if the transcoding fails (for *in is not valid utf8 string or
2142 * the result of transformation can't fit into the encoding we want), or
2145 xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
2146 xmlBufferPtr in) {
2147 int ret = -2;
2148 int written;
2149 int writtentot = 0;
2150 int toconv;
2151 int output = 0;
2153 if (handler == NULL) return(-1);
2154 if (out == NULL) return(-1);
2156 retry:
2158 written = out->size - out->use;
2161 * First specific handling of in = NULL, i.e. the initialization call
2163 if (in == NULL) {
2164 toconv = 0;
2165 if (handler->output != NULL) {
2166 ret = handler->output(&out->content[out->use], &written,
2167 NULL, &toconv);
2168 out->use += written;
2169 out->content[out->use] = 0;
2171 #ifdef LIBXML_ICONV_ENABLED
2172 else if (handler->iconv_out != NULL) {
2173 ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
2174 &written, NULL, &toconv);
2175 out->use += written;
2176 out->content[out->use] = 0;
2178 #endif /* LIBXML_ICONV_ENABLED */
2179 #ifdef DEBUG_ENCODING
2180 xmlGenericError(xmlGenericErrorContext,
2181 "initialized encoder\n");
2182 #endif
2183 return(0);
2187 * Conversion itself.
2189 toconv = in->use;
2190 if (toconv == 0)
2191 return(0);
2192 if (toconv * 2 >= written) {
2193 xmlBufferGrow(out, toconv * 2);
2194 written = out->size - out->use - 1;
2196 if (handler->output != NULL) {
2197 ret = handler->output(&out->content[out->use], &written,
2198 in->content, &toconv);
2199 xmlBufferShrink(in, toconv);
2200 out->use += written;
2201 writtentot += written;
2202 out->content[out->use] = 0;
2204 #ifdef LIBXML_ICONV_ENABLED
2205 else if (handler->iconv_out != NULL) {
2206 ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
2207 &written, in->content, &toconv);
2208 xmlBufferShrink(in, toconv);
2209 out->use += written;
2210 writtentot += written;
2211 out->content[out->use] = 0;
2212 if (ret == -1) {
2213 if (written > 0) {
2215 * Can be a limitation of iconv
2217 goto retry;
2219 ret = -3;
2222 #endif /* LIBXML_ICONV_ENABLED */
2223 else {
2224 xmlGenericError(xmlGenericErrorContext,
2225 "xmlCharEncOutFunc: no output function !\n");
2226 return(-1);
2229 if (ret >= 0) output += ret;
2232 * Attempt to handle error cases
2234 switch (ret) {
2235 case 0:
2236 #ifdef DEBUG_ENCODING
2237 xmlGenericError(xmlGenericErrorContext,
2238 "converted %d bytes to %d bytes of output\n",
2239 toconv, written);
2240 #endif
2241 break;
2242 case -1:
2243 #ifdef DEBUG_ENCODING
2244 xmlGenericError(xmlGenericErrorContext,
2245 "output conversion failed by lack of space\n");
2246 #endif
2247 break;
2248 case -3:
2249 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of output %d left\n",
2250 toconv, written, in->use);
2251 break;
2252 case -2: {
2253 int len = in->use;
2254 const xmlChar *utf = (const xmlChar *) in->content;
2255 int cur;
2257 cur = xmlGetUTF8Char(utf, &len);
2258 if (cur > 0) {
2259 xmlChar charref[20];
2261 #ifdef DEBUG_ENCODING
2262 xmlGenericError(xmlGenericErrorContext,
2263 "handling output conversion error\n");
2264 xmlGenericError(xmlGenericErrorContext,
2265 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2266 in->content[0], in->content[1],
2267 in->content[2], in->content[3]);
2268 #endif
2270 * Removes the UTF8 sequence, and replace it by a charref
2271 * and continue the transcoding phase, hoping the error
2272 * did not mangle the encoder state.
2274 snprintf((char *) charref, sizeof(charref), "&#%d;", cur);
2275 xmlBufferShrink(in, len);
2276 xmlBufferAddHead(in, charref, -1);
2278 goto retry;
2279 } else {
2280 xmlGenericError(xmlGenericErrorContext,
2281 "output conversion failed due to conv error\n");
2282 xmlGenericError(xmlGenericErrorContext,
2283 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2284 in->content[0], in->content[1],
2285 in->content[2], in->content[3]);
2286 in->content[0] = ' ';
2288 break;
2291 return(ret);
2295 * xmlCharEncCloseFunc:
2296 * @handler: char enconding transformation data structure
2298 * Generic front-end for encoding handler close function
2300 * Returns 0 if success, or -1 in case of error
2303 xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
2304 int ret = 0;
2305 if (handler == NULL) return(-1);
2306 if (handler->name == NULL) return(-1);
2307 #ifdef LIBXML_ICONV_ENABLED
2309 * Iconv handlers can be used only once, free the whole block.
2310 * and the associated icon resources.
2312 if ((handler->iconv_out != NULL) || (handler->iconv_in != NULL)) {
2313 if (handler->name != NULL)
2314 xmlFree(handler->name);
2315 handler->name = NULL;
2316 if (handler->iconv_out != NULL) {
2317 if (iconv_close(handler->iconv_out))
2318 ret = -1;
2319 handler->iconv_out = NULL;
2321 if (handler->iconv_in != NULL) {
2322 if (iconv_close(handler->iconv_in))
2323 ret = -1;
2324 handler->iconv_in = NULL;
2326 xmlFree(handler);
2328 #endif /* LIBXML_ICONV_ENABLED */
2329 #ifdef DEBUG_ENCODING
2330 if (ret)
2331 xmlGenericError(xmlGenericErrorContext,
2332 "failed to close the encoding handler\n");
2333 else
2334 xmlGenericError(xmlGenericErrorContext,
2335 "closed the encoding handler\n");
2336 #endif
2338 return(ret);