2 * encoding.c : implements the encoding conversion functions needed for XML
5 * rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies
6 * rfc2781 UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau
7 * [ISO-10646] UTF-8 and UTF-16 in Annexes
8 * [ISO-8859-1] ISO Latin-1 characters codes.
9 * [UNICODE] The Unicode Consortium, "The Unicode Standard --
10 * Worldwide Character Encoding -- Version 1.0", Addison-
11 * Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is
12 * described in Unicode Technical Report #4.
13 * [US-ASCII] Coded Character Set--7-bit American Standard Code for
14 * Information Interchange, ANSI X3.4-1986.
16 * See Copyright for the status of this software.
20 * UTF8 string routines from:
21 * "William M. Brack" <wbrack@mmm.com.hk>
23 * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
37 #ifdef LIBXML_ICONV_ENABLED
42 #include <libxml/encoding.h>
43 #include <libxml/xmlmemory.h>
44 #ifdef LIBXML_HTML_ENABLED
45 #include <libxml/HTMLparser.h>
47 #include <libxml/globals.h>
48 #include <libxml/xmlerror.h>
50 static xmlCharEncodingHandlerPtr xmlUTF16LEHandler
= NULL
;
51 static xmlCharEncodingHandlerPtr xmlUTF16BEHandler
= NULL
;
53 typedef struct _xmlCharEncodingAlias xmlCharEncodingAlias
;
54 typedef xmlCharEncodingAlias
*xmlCharEncodingAliasPtr
;
55 struct _xmlCharEncodingAlias
{
60 static xmlCharEncodingAliasPtr xmlCharEncodingAliases
= NULL
;
61 static int xmlCharEncodingAliasesNb
= 0;
62 static int xmlCharEncodingAliasesMax
= 0;
64 #ifdef LIBXML_ICONV_ENABLED
66 #define DEBUG_ENCODING /* Define this to get encoding traces */
70 static int xmlLittleEndian
= 1;
72 /************************************************************************
74 * Generic UTF8 handling routines *
76 * From rfc2044: encoding of the Unicode values on UTF-8: *
78 * UCS-4 range (hex.) UTF-8 octet sequence (binary) *
79 * 0000 0000-0000 007F 0xxxxxxx *
80 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx *
81 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx *
83 * I hope we won't use values > 0xFFFF anytime soon ! *
85 ************************************************************************/
89 * @utf: a sequence of UTF-8 encoded bytes
91 * compute the length of an UTF8 string, it doesn't do a full UTF8
92 * checking of the content of the string.
94 * Returns the number of characters in the string or -1 in case of error
97 xmlUTF8Strlen(const xmlChar
*utf
) {
105 if ((utf
[1] & 0xc0) != 0x80)
107 if ((utf
[0] & 0xe0) == 0xe0) {
108 if ((utf
[2] & 0xc0) != 0x80)
110 if ((utf
[0] & 0xf0) == 0xf0) {
111 if ((utf
[0] & 0xf8) != 0xf0 || (utf
[3] & 0xc0) != 0x80)
130 * @utf: a sequence of UTF-8 encoded bytes
131 * @len: a pointer to @bytes len
133 * Read one UTF8 Char from @utf
135 * Returns the char value or -1 in case of error and update @len with the
136 * number of bytes used
139 xmlGetUTF8Char(const unsigned char *utf
, int *len
) {
153 if ((utf
[1] & 0xc0) != 0x80)
155 if ((c
& 0xe0) == 0xe0) {
158 if ((utf
[2] & 0xc0) != 0x80)
160 if ((c
& 0xf0) == 0xf0) {
163 if ((c
& 0xf8) != 0xf0 || (utf
[3] & 0xc0) != 0x80)
167 c
= (utf
[0] & 0x7) << 18;
168 c
|= (utf
[1] & 0x3f) << 12;
169 c
|= (utf
[2] & 0x3f) << 6;
174 c
= (utf
[0] & 0xf) << 12;
175 c
|= (utf
[1] & 0x3f) << 6;
181 c
= (utf
[0] & 0x1f) << 6;
196 * xmlCheckUTF8: Check utf-8 string for legality.
197 * @utf: Pointer to putative utf-8 encoded string.
199 * Checks @utf for being valid utf-8. @utf is assumed to be
200 * null-terminated. This function is not super-strict, as it will
201 * allow longer utf-8 sequences than necessary. Note that Java is
202 * capable of producing these sequences if provoked. Also note, this
203 * routine checks for the 4-byte maximum size, but does not check for
204 * 0x10ffff maximum value.
206 * Return value: true if @utf is valid.
209 xmlCheckUTF8(const unsigned char *utf
)
214 for (ix
= 0; (c
= utf
[ix
]);) {
216 if ((utf
[ix
+ 1] & 0xc0) != 0x80)
218 if ((c
& 0xe0) == 0xe0) {
219 if ((utf
[ix
+ 2] & 0xc0) != 0x80)
221 if ((c
& 0xf0) == 0xf0) {
222 if ((c
& 0xf8) != 0xf0 || (utf
[ix
+ 3] & 0xc0) != 0x80)
241 * @utf: a sequence of UTF-8 encoded bytes
242 * @len: the number of characters in the array
244 * storage size of an UTF8 string
246 * Returns the storage size of
247 * the first 'len' characters of ARRAY
252 xmlUTF8Strsize(const xmlChar
*utf
, int len
) {
253 const xmlChar
*ptr
=utf
;
262 if ( (ch
= *ptr
++) & 0x80)
263 while ( (ch
<<=1) & 0x80 )
272 * @utf: the input UTF8 *
273 * @len: the len of @utf (in chars)
275 * a strndup for array of UTF8's
277 * Returns a new UTF8 * or NULL
280 xmlUTF8Strndup(const xmlChar
*utf
, int len
) {
284 if ((utf
== NULL
) || (len
< 0)) return(NULL
);
285 i
= xmlUTF8Strsize(utf
, len
);
286 ret
= (xmlChar
*) xmlMalloc((i
+ 1) * sizeof(xmlChar
));
288 xmlGenericError(xmlGenericErrorContext
,
289 "malloc of %ld byte failed\n",
290 (len
+ 1) * (long)sizeof(xmlChar
));
293 memcpy(ret
, utf
, i
* sizeof(xmlChar
));
300 * @utf: the input UTF8 *
301 * @pos: the position of the desired UTF8 char (in chars)
303 * a function to provide the equivalent of fetching a
304 * character from a string array
306 * Returns a pointer to the UTF8 character or NULL
309 xmlUTF8Strpos(const xmlChar
*utf
, int pos
) {
312 if (utf
== NULL
) return(NULL
);
313 if ( (pos
< 0) || (pos
>= xmlUTF8Strlen(utf
)) )
316 if ((ch
=*utf
++) == 0) return(NULL
);
318 /* if not simple ascii, verify proper format */
319 if ( (ch
& 0xc0) != 0xc0 )
321 /* then skip over remaining bytes for this char */
322 while ( (ch
<<= 1) & 0x80 )
323 if ( (*utf
++ & 0xc0) != 0x80 )
327 return((xmlChar
*)utf
);
332 * @utf: the input UTF8 *
333 * @utfchar: the UTF8 character to be found
335 * a function to provide relative location of a UTF8 char
337 * Returns the relative character position of the desired char
341 xmlUTF8Strloc(const xmlChar
*utf
, const xmlChar
*utfchar
) {
345 if (utf
==NULL
|| utfchar
==NULL
) return -1;
346 size
= xmlUTF8Strsize(utfchar
, 1);
347 for(i
=0; (ch
=*utf
) != 0; i
++) {
348 if (xmlStrncmp(utf
, utfchar
, size
)==0)
352 /* if not simple ascii, verify proper format */
353 if ( (ch
& 0xc0) != 0xc0 )
355 /* then skip over remaining bytes for this char */
356 while ( (ch
<<= 1) & 0x80 )
357 if ( (*utf
++ & 0xc0) != 0x80 )
366 * @utf: a sequence of UTF-8 encoded bytes
367 * @start: relative pos of first char
368 * @len: total number to copy
370 * Note: positions are given in units of UTF-8 chars
372 * Returns a pointer to a newly created string
373 * or NULL if any problem
377 xmlUTF8Strsub(const xmlChar
*utf
, int start
, int len
) {
381 if (utf
== NULL
) return(NULL
);
382 if (start
< 0) return(NULL
);
383 if (len
< 0) return(NULL
);
386 * Skip over any leading chars
388 for (i
= 0;i
< start
;i
++) {
389 if ((ch
=*utf
++) == 0) return(NULL
);
391 /* if not simple ascii, verify proper format */
392 if ( (ch
& 0xc0) != 0xc0 )
394 /* then skip over remaining bytes for this char */
395 while ( (ch
<<= 1) & 0x80 )
396 if ( (*utf
++ & 0xc0) != 0x80 )
401 return(xmlUTF8Strndup(utf
, len
));
404 /************************************************************************
406 * Conversions To/From UTF8 encoding *
408 ************************************************************************/
412 * @out: a pointer to an array of bytes to store the result
413 * @outlen: the length of @out
414 * @in: a pointer to an array of ASCII chars
415 * @inlen: the length of @in
417 * Take a block of ASCII chars in and try to convert it to an UTF-8
418 * block of chars out.
419 * Returns 0 if success, or -1 otherwise
420 * The value of @inlen after return is the number of octets consumed
421 * on success, else unpredictable.
422 * The value of @outlen after return is the number of octets written.
425 asciiToUTF8(unsigned char* out
, int *outlen
,
426 const unsigned char* in
, int *inlen
) {
427 unsigned char* outstart
= out
;
428 const unsigned char* base
= in
;
429 const unsigned char* processed
= in
;
430 unsigned char* outend
= out
+ *outlen
;
431 const unsigned char* inend
;
435 inend
= in
+ (*inlen
);
436 while ((in
< inend
) && (out
- outstart
+ 5 < *outlen
)) {
439 /* assertion: c is a single UCS-4 value */
442 if (c
< 0x80) { *out
++= c
; bits
= -6; }
444 *outlen
= out
- outstart
;
445 *inlen
= processed
- base
;
449 for ( ; bits
>= 0; bits
-= 6) {
452 *out
++= ((c
>> bits
) & 0x3F) | 0x80;
454 processed
= (const unsigned char*) in
;
456 *outlen
= out
- outstart
;
457 *inlen
= processed
- base
;
463 * @out: a pointer to an array of bytes to store the result
464 * @outlen: the length of @out
465 * @in: a pointer to an array of UTF-8 chars
466 * @inlen: the length of @in
468 * Take a block of UTF-8 chars in and try to convert it to an ASCII
469 * block of chars out.
471 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
472 * The value of @inlen after return is the number of octets consumed
473 * on success, else unpredictable.
474 * The value of @outlen after return is the number of octets written.
477 UTF8Toascii(unsigned char* out
, int *outlen
,
478 const unsigned char* in
, int *inlen
) {
479 const unsigned char* processed
= in
;
480 const unsigned char* outend
;
481 const unsigned char* outstart
= out
;
482 const unsigned char* instart
= in
;
483 const unsigned char* inend
;
489 * initialization nothing to do
495 inend
= in
+ (*inlen
);
496 outend
= out
+ (*outlen
);
499 if (d
< 0x80) { c
= d
; trailing
= 0; }
501 /* trailing byte in leading position */
502 *outlen
= out
- outstart
;
503 *inlen
= processed
- instart
;
505 } else if (d
< 0xE0) { c
= d
& 0x1F; trailing
= 1; }
506 else if (d
< 0xF0) { c
= d
& 0x0F; trailing
= 2; }
507 else if (d
< 0xF8) { c
= d
& 0x07; trailing
= 3; }
509 /* no chance for this in Ascii */
510 *outlen
= out
- outstart
;
511 *inlen
= processed
- instart
;
515 if (inend
- in
< trailing
) {
519 for ( ; trailing
; trailing
--) {
520 if ((in
>= inend
) || (((d
= *in
++) & 0xC0) != 0x80))
526 /* assertion: c is a single UCS-4 value */
532 /* no chance for this in Ascii */
533 *outlen
= out
- outstart
;
534 *inlen
= processed
- instart
;
539 *outlen
= out
- outstart
;
540 *inlen
= processed
- instart
;
546 * @out: a pointer to an array of bytes to store the result
547 * @outlen: the length of @out
548 * @in: a pointer to an array of ISO Latin 1 chars
549 * @inlen: the length of @in
551 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
552 * block of chars out.
553 * Returns 0 if success, or -1 otherwise
554 * The value of @inlen after return is the number of octets consumed
555 * on success, else unpredictable.
556 * The value of @outlen after return is the number of octets written.
559 isolat1ToUTF8(unsigned char* out
, int *outlen
,
560 const unsigned char* in
, int *inlen
) {
561 unsigned char* outstart
= out
;
562 const unsigned char* base
= in
;
563 unsigned char* outend
= out
+ *outlen
;
564 const unsigned char* inend
;
565 const unsigned char* instop
;
568 inend
= in
+ (*inlen
);
571 while (in
< inend
&& out
< outend
- 1) {
573 *out
++= ((c
>> 6) & 0x1F) | 0xC0;
574 *out
++= (c
& 0x3F) | 0x80;
578 if (instop
- in
> outend
- out
) instop
= in
+ (outend
- out
);
579 while (c
< 0x80 && in
< instop
) {
585 if (in
< inend
&& out
< outend
&& c
< 0x80) {
589 *outlen
= out
- outstart
;
597 * @out: a pointer to an array of bytes to store the result
598 * @outlen: the length of @out
599 * @in: a pointer to an array of UTF-8 chars
600 * @inlen: the length of @in
602 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
603 * block of chars out.
605 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
606 * The value of @inlen after return is the number of octets consumed
607 * on success, else unpredictable.
608 * The value of @outlen after return is the number of octets written.
611 UTF8Toisolat1(unsigned char* out
, int *outlen
,
612 const unsigned char* in
, int *inlen
) {
613 const unsigned char* processed
= in
;
614 const unsigned char* outend
;
615 const unsigned char* outstart
= out
;
616 const unsigned char* instart
= in
;
617 const unsigned char* inend
;
623 * initialization nothing to do
629 inend
= in
+ (*inlen
);
630 outend
= out
+ (*outlen
);
633 if (d
< 0x80) { c
= d
; trailing
= 0; }
635 /* trailing byte in leading position */
636 *outlen
= out
- outstart
;
637 *inlen
= processed
- instart
;
639 } else if (d
< 0xE0) { c
= d
& 0x1F; trailing
= 1; }
640 else if (d
< 0xF0) { c
= d
& 0x0F; trailing
= 2; }
641 else if (d
< 0xF8) { c
= d
& 0x07; trailing
= 3; }
643 /* no chance for this in IsoLat1 */
644 *outlen
= out
- outstart
;
645 *inlen
= processed
- instart
;
649 if (inend
- in
< trailing
) {
653 for ( ; trailing
; trailing
--) {
656 if (((d
= *in
++) & 0xC0) != 0x80) {
657 *outlen
= out
- outstart
;
658 *inlen
= processed
- instart
;
665 /* assertion: c is a single UCS-4 value */
671 /* no chance for this in IsoLat1 */
672 *outlen
= out
- outstart
;
673 *inlen
= processed
- instart
;
678 *outlen
= out
- outstart
;
679 *inlen
= processed
- instart
;
685 * @out: a pointer to an array of bytes to store the result
686 * @outlen: the length of @out
687 * @inb: a pointer to an array of UTF-16LE passed as a byte array
688 * @inlenb: the length of @inb in bytes
690 * Take a block of UTF-16LE ushorts in and try to convert it to an UTF-8
691 * block of chars out. This function assume the endian property
692 * is the same between the native type of this machine and the
695 * Returns the number of bytes written, or -1 for lack of space, or -2
696 * if the transcoding fails (i.e. @inb is not a valid UTF-16 string)
697 * The value of *inlenb after return is the number of octets consumed
698 * if the return value is positive, else unpredictable.
701 UTF16LEToUTF8(unsigned char* out
, int *outlen
,
702 const unsigned char* inb
, int *inlenb
)
704 unsigned char* outstart
= out
;
705 const unsigned char* processed
= inb
;
706 unsigned char* outend
= out
+ *outlen
;
707 unsigned short* in
= (unsigned short*) inb
;
708 unsigned short* inend
;
709 unsigned int c
, d
, inlen
;
713 if ((*inlenb
% 2) == 1)
717 while ((in
< inend
) && (out
- outstart
+ 5 < *outlen
)) {
718 if (xmlLittleEndian
) {
721 tmp
= (unsigned char *) in
;
723 c
= c
| (((unsigned int)*tmp
) << 8);
726 if ((c
& 0xFC00) == 0xD800) { /* surrogates */
727 if (in
>= inend
) { /* (in > inend) shouldn't happens */
730 if (xmlLittleEndian
) {
733 tmp
= (unsigned char *) in
;
735 d
= d
| (((unsigned int)*tmp
) << 8);
738 if ((d
& 0xFC00) == 0xDC00) {
745 *outlen
= out
- outstart
;
746 *inlenb
= processed
- inb
;
751 /* assertion: c is a single UCS-4 value */
754 if (c
< 0x80) { *out
++= c
; bits
= -6; }
755 else if (c
< 0x800) { *out
++= ((c
>> 6) & 0x1F) | 0xC0; bits
= 0; }
756 else if (c
< 0x10000) { *out
++= ((c
>> 12) & 0x0F) | 0xE0; bits
= 6; }
757 else { *out
++= ((c
>> 18) & 0x07) | 0xF0; bits
= 12; }
759 for ( ; bits
>= 0; bits
-= 6) {
762 *out
++= ((c
>> bits
) & 0x3F) | 0x80;
764 processed
= (const unsigned char*) in
;
766 *outlen
= out
- outstart
;
767 *inlenb
= processed
- inb
;
773 * @outb: a pointer to an array of bytes to store the result
774 * @outlen: the length of @outb
775 * @in: a pointer to an array of UTF-8 chars
776 * @inlen: the length of @in
778 * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
779 * block of chars out.
781 * Returns the number of bytes written, or -1 for lack of space, or -2
782 * if the transcoding failed.
785 UTF8ToUTF16LE(unsigned char* outb
, int *outlen
,
786 const unsigned char* in
, int *inlen
)
788 unsigned short* out
= (unsigned short*) outb
;
789 const unsigned char* processed
= in
;
790 unsigned short* outstart
= out
;
791 unsigned short* outend
;
792 const unsigned char* inend
= in
+*inlen
;
796 unsigned short tmp1
, tmp2
;
800 * initialization, add the Byte Order Mark
807 #ifdef DEBUG_ENCODING
808 xmlGenericError(xmlGenericErrorContext
,
809 "Added FFFE Byte Order Mark\n");
817 outend
= out
+ (*outlen
/ 2);
820 if (d
< 0x80) { c
= d
; trailing
= 0; }
822 /* trailing byte in leading position */
823 *outlen
= (out
- outstart
) * 2;
824 *inlen
= processed
- in
;
826 } else if (d
< 0xE0) { c
= d
& 0x1F; trailing
= 1; }
827 else if (d
< 0xF0) { c
= d
& 0x0F; trailing
= 2; }
828 else if (d
< 0xF8) { c
= d
& 0x07; trailing
= 3; }
830 /* no chance for this in UTF-16 */
831 *outlen
= (out
- outstart
) * 2;
832 *inlen
= processed
- in
;
836 if (inend
- in
< trailing
) {
840 for ( ; trailing
; trailing
--) {
841 if ((in
>= inend
) || (((d
= *in
++) & 0xC0) != 0x80))
847 /* assertion: c is a single UCS-4 value */
851 if (xmlLittleEndian
) {
854 tmp
= (unsigned char *) out
;
856 *(tmp
+ 1) = c
>> 8 ;
860 else if (c
< 0x110000) {
864 if (xmlLittleEndian
) {
865 *out
++ = 0xD800 | (c
>> 10);
866 *out
++ = 0xDC00 | (c
& 0x03FF);
868 tmp1
= 0xD800 | (c
>> 10);
869 tmp
= (unsigned char *) out
;
870 *tmp
= (unsigned char) tmp1
;
871 *(tmp
+ 1) = tmp1
>> 8;
874 tmp2
= 0xDC00 | (c
& 0x03FF);
875 tmp
= (unsigned char *) out
;
876 *tmp
= (unsigned char) tmp2
;
877 *(tmp
+ 1) = tmp2
>> 8;
885 *outlen
= (out
- outstart
) * 2;
886 *inlen
= processed
- in
;
892 * @out: a pointer to an array of bytes to store the result
893 * @outlen: the length of @out
894 * @inb: a pointer to an array of UTF-16 passed as a byte array
895 * @inlenb: the length of @inb in bytes
897 * Take a block of UTF-16 ushorts in and try to convert it to an UTF-8
898 * block of chars out. This function assume the endian property
899 * is the same between the native type of this machine and the
902 * Returns the number of bytes written, or -1 for lack of space, or -2
903 * if the transcoding fails (i.e. @inb is not a valid UTF-16 string)
904 * The value of *inlenb after return is the number of octets consumed
905 * if the return value is positive, else unpredictable.
908 UTF16BEToUTF8(unsigned char* out
, int *outlen
,
909 const unsigned char* inb
, int *inlenb
)
911 unsigned char* outstart
= out
;
912 const unsigned char* processed
= inb
;
913 unsigned char* outend
= out
+ *outlen
;
914 unsigned short* in
= (unsigned short*) inb
;
915 unsigned short* inend
;
916 unsigned int c
, d
, inlen
;
920 if ((*inlenb
% 2) == 1)
925 if (xmlLittleEndian
) {
926 tmp
= (unsigned char *) in
;
929 c
= c
| (unsigned int) *tmp
;
934 if ((c
& 0xFC00) == 0xD800) { /* surrogates */
935 if (in
>= inend
) { /* (in > inend) shouldn't happens */
936 *outlen
= out
- outstart
;
937 *inlenb
= processed
- inb
;
940 if (xmlLittleEndian
) {
941 tmp
= (unsigned char *) in
;
944 d
= d
| (unsigned int) *tmp
;
949 if ((d
& 0xFC00) == 0xDC00) {
956 *outlen
= out
- outstart
;
957 *inlenb
= processed
- inb
;
962 /* assertion: c is a single UCS-4 value */
965 if (c
< 0x80) { *out
++= c
; bits
= -6; }
966 else if (c
< 0x800) { *out
++= ((c
>> 6) & 0x1F) | 0xC0; bits
= 0; }
967 else if (c
< 0x10000) { *out
++= ((c
>> 12) & 0x0F) | 0xE0; bits
= 6; }
968 else { *out
++= ((c
>> 18) & 0x07) | 0xF0; bits
= 12; }
970 for ( ; bits
>= 0; bits
-= 6) {
973 *out
++= ((c
>> bits
) & 0x3F) | 0x80;
975 processed
= (const unsigned char*) in
;
977 *outlen
= out
- outstart
;
978 *inlenb
= processed
- inb
;
984 * @outb: a pointer to an array of bytes to store the result
985 * @outlen: the length of @outb
986 * @in: a pointer to an array of UTF-8 chars
987 * @inlen: the length of @in
989 * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
990 * block of chars out.
992 * Returns the number of bytes written, or -1 for lack of space, or -2
993 * if the transcoding failed.
996 UTF8ToUTF16BE(unsigned char* outb
, int *outlen
,
997 const unsigned char* in
, int *inlen
)
999 unsigned short* out
= (unsigned short*) outb
;
1000 const unsigned char* processed
= in
;
1001 unsigned short* outstart
= out
;
1002 unsigned short* outend
;
1003 const unsigned char* inend
= in
+*inlen
;
1007 unsigned short tmp1
, tmp2
;
1011 * initialization, add the Byte Order Mark
1018 #ifdef DEBUG_ENCODING
1019 xmlGenericError(xmlGenericErrorContext
,
1020 "Added FEFF Byte Order Mark\n");
1028 outend
= out
+ (*outlen
/ 2);
1029 while (in
< inend
) {
1031 if (d
< 0x80) { c
= d
; trailing
= 0; }
1032 else if (d
< 0xC0) {
1033 /* trailing byte in leading position */
1034 *outlen
= out
- outstart
;
1035 *inlen
= processed
- in
;
1037 } else if (d
< 0xE0) { c
= d
& 0x1F; trailing
= 1; }
1038 else if (d
< 0xF0) { c
= d
& 0x0F; trailing
= 2; }
1039 else if (d
< 0xF8) { c
= d
& 0x07; trailing
= 3; }
1041 /* no chance for this in UTF-16 */
1042 *outlen
= out
- outstart
;
1043 *inlen
= processed
- in
;
1047 if (inend
- in
< trailing
) {
1051 for ( ; trailing
; trailing
--) {
1052 if ((in
>= inend
) || (((d
= *in
++) & 0xC0) != 0x80)) break;
1057 /* assertion: c is a single UCS-4 value */
1059 if (out
>= outend
) break;
1060 if (xmlLittleEndian
) {
1061 tmp
= (unsigned char *) out
;
1069 else if (c
< 0x110000) {
1070 if (out
+1 >= outend
) break;
1072 if (xmlLittleEndian
) {
1073 tmp1
= 0xD800 | (c
>> 10);
1074 tmp
= (unsigned char *) out
;
1076 *(tmp
+ 1) = (unsigned char) tmp1
;
1079 tmp2
= 0xDC00 | (c
& 0x03FF);
1080 tmp
= (unsigned char *) out
;
1082 *(tmp
+ 1) = (unsigned char) tmp2
;
1085 *out
++ = 0xD800 | (c
>> 10);
1086 *out
++ = 0xDC00 | (c
& 0x03FF);
1093 *outlen
= (out
- outstart
) * 2;
1094 *inlen
= processed
- in
;
1098 /************************************************************************
1100 * Generic encoding handling routines *
1102 ************************************************************************/
1105 * xmlDetectCharEncoding:
1106 * @in: a pointer to the first bytes of the XML entity, must be at least
1108 * @len: the length of the buffer
1110 * Guess the encoding of the entity using the first bytes of the entity content
1111 * according to the non-normative appendix F of the XML-1.0 recommendation.
1113 * Returns one of the XML_CHAR_ENCODING_... values.
1116 xmlDetectCharEncoding(const unsigned char* in
, int len
)
1119 if ((in
[0] == 0x00) && (in
[1] == 0x00) &&
1120 (in
[2] == 0x00) && (in
[3] == 0x3C))
1121 return(XML_CHAR_ENCODING_UCS4BE
);
1122 if ((in
[0] == 0x3C) && (in
[1] == 0x00) &&
1123 (in
[2] == 0x00) && (in
[3] == 0x00))
1124 return(XML_CHAR_ENCODING_UCS4LE
);
1125 if ((in
[0] == 0x00) && (in
[1] == 0x00) &&
1126 (in
[2] == 0x3C) && (in
[3] == 0x00))
1127 return(XML_CHAR_ENCODING_UCS4_2143
);
1128 if ((in
[0] == 0x00) && (in
[1] == 0x3C) &&
1129 (in
[2] == 0x00) && (in
[3] == 0x00))
1130 return(XML_CHAR_ENCODING_UCS4_3412
);
1131 if ((in
[0] == 0x4C) && (in
[1] == 0x6F) &&
1132 (in
[2] == 0xA7) && (in
[3] == 0x94))
1133 return(XML_CHAR_ENCODING_EBCDIC
);
1134 if ((in
[0] == 0x3C) && (in
[1] == 0x3F) &&
1135 (in
[2] == 0x78) && (in
[3] == 0x6D))
1136 return(XML_CHAR_ENCODING_UTF8
);
1140 * Errata on XML-1.0 June 20 2001
1141 * We now allow an UTF8 encoded BOM
1143 if ((in
[0] == 0xEF) && (in
[1] == 0xBB) &&
1145 return(XML_CHAR_ENCODING_UTF8
);
1148 if ((in
[0] == 0xFE) && (in
[1] == 0xFF))
1149 return(XML_CHAR_ENCODING_UTF16BE
);
1150 if ((in
[0] == 0xFF) && (in
[1] == 0xFE))
1151 return(XML_CHAR_ENCODING_UTF16LE
);
1153 return(XML_CHAR_ENCODING_NONE
);
1157 * xmlCleanupEncodingAliases:
1159 * Unregisters all aliases
1162 xmlCleanupEncodingAliases(void) {
1165 if (xmlCharEncodingAliases
== NULL
)
1168 for (i
= 0;i
< xmlCharEncodingAliasesNb
;i
++) {
1169 if (xmlCharEncodingAliases
[i
].name
!= NULL
)
1170 xmlFree((char *) xmlCharEncodingAliases
[i
].name
);
1171 if (xmlCharEncodingAliases
[i
].alias
!= NULL
)
1172 xmlFree((char *) xmlCharEncodingAliases
[i
].alias
);
1174 xmlCharEncodingAliasesNb
= 0;
1175 xmlCharEncodingAliasesMax
= 0;
1176 xmlFree(xmlCharEncodingAliases
);
1177 xmlCharEncodingAliases
= NULL
;
1181 * xmlGetEncodingAlias:
1182 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1184 * Lookup an encoding name for the given alias.
1186 * Returns NULL if not found, otherwise the original name
1189 xmlGetEncodingAlias(const char *alias
) {
1196 if (xmlCharEncodingAliases
== NULL
)
1199 for (i
= 0;i
< 99;i
++) {
1200 upper
[i
] = toupper(alias
[i
]);
1201 if (upper
[i
] == 0) break;
1206 * Walk down the list looking for a definition of the alias
1208 for (i
= 0;i
< xmlCharEncodingAliasesNb
;i
++) {
1209 if (!strcmp(xmlCharEncodingAliases
[i
].alias
, upper
)) {
1210 return(xmlCharEncodingAliases
[i
].name
);
1217 * xmlAddEncodingAlias:
1218 * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
1219 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1221 * Registers an alias @alias for an encoding named @name. An existing alias
1222 * will be overwritten.
1224 * Returns 0 in case of success, -1 in case of error
1227 xmlAddEncodingAlias(const char *name
, const char *alias
) {
1231 if ((name
== NULL
) || (alias
== NULL
))
1234 for (i
= 0;i
< 99;i
++) {
1235 upper
[i
] = toupper(alias
[i
]);
1236 if (upper
[i
] == 0) break;
1240 if (xmlCharEncodingAliases
== NULL
) {
1241 xmlCharEncodingAliasesNb
= 0;
1242 xmlCharEncodingAliasesMax
= 20;
1243 xmlCharEncodingAliases
= (xmlCharEncodingAliasPtr
)
1244 xmlMalloc(xmlCharEncodingAliasesMax
* sizeof(xmlCharEncodingAlias
));
1245 if (xmlCharEncodingAliases
== NULL
)
1247 } else if (xmlCharEncodingAliasesNb
>= xmlCharEncodingAliasesMax
) {
1248 xmlCharEncodingAliasesMax
*= 2;
1249 xmlCharEncodingAliases
= (xmlCharEncodingAliasPtr
)
1250 xmlRealloc(xmlCharEncodingAliases
,
1251 xmlCharEncodingAliasesMax
* sizeof(xmlCharEncodingAlias
));
1254 * Walk down the list looking for a definition of the alias
1256 for (i
= 0;i
< xmlCharEncodingAliasesNb
;i
++) {
1257 if (!strcmp(xmlCharEncodingAliases
[i
].alias
, upper
)) {
1259 * Replace the definition.
1261 xmlFree((char *) xmlCharEncodingAliases
[i
].name
);
1262 xmlCharEncodingAliases
[i
].name
= xmlMemStrdup(name
);
1267 * Add the definition
1269 xmlCharEncodingAliases
[xmlCharEncodingAliasesNb
].name
= xmlMemStrdup(name
);
1270 xmlCharEncodingAliases
[xmlCharEncodingAliasesNb
].alias
= xmlMemStrdup(upper
);
1271 xmlCharEncodingAliasesNb
++;
1276 * xmlDelEncodingAlias:
1277 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1279 * Unregisters an encoding alias @alias
1281 * Returns 0 in case of success, -1 in case of error
1284 xmlDelEncodingAlias(const char *alias
) {
1290 if (xmlCharEncodingAliases
== NULL
)
1293 * Walk down the list looking for a definition of the alias
1295 for (i
= 0;i
< xmlCharEncodingAliasesNb
;i
++) {
1296 if (!strcmp(xmlCharEncodingAliases
[i
].alias
, alias
)) {
1297 xmlFree((char *) xmlCharEncodingAliases
[i
].name
);
1298 xmlFree((char *) xmlCharEncodingAliases
[i
].alias
);
1299 xmlCharEncodingAliasesNb
--;
1300 memmove(&xmlCharEncodingAliases
[i
], &xmlCharEncodingAliases
[i
+ 1],
1301 sizeof(xmlCharEncodingAlias
) * (xmlCharEncodingAliasesNb
- i
));
1309 * xmlParseCharEncoding:
1310 * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
1312 * Compare the string to the known encoding schemes. Note
1313 * that the comparison is case insensitive, according to section
1314 * [XML] 4.3.3 Character Encoding in Entities.
1316 * Returns one of the XML_CHAR_ENCODING_... values, XML_CHAR_ENCODING_NONE
1317 * for an empty name, or XML_CHAR_ENCODING_ERROR if not recognized.
1320 xmlParseCharEncoding(const char* name
)
1327 return(XML_CHAR_ENCODING_NONE
);
1330 * Do the alias resolution
1332 alias
= xmlGetEncodingAlias(name
);
1336 for (i
= 0;i
< 499;i
++) {
1337 upper
[i
] = toupper(name
[i
]);
1338 if (upper
[i
] == 0) break;
1342 if (!strcmp(upper
, "")) return(XML_CHAR_ENCODING_NONE
);
1343 if (!strcmp(upper
, "UTF-8")) return(XML_CHAR_ENCODING_UTF8
);
1344 if (!strcmp(upper
, "UTF8")) return(XML_CHAR_ENCODING_UTF8
);
1347 * NOTE: if we were able to parse this, the endianness of UTF16 is
1348 * already found and in use
1350 if (!strcmp(upper
, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE
);
1351 if (!strcmp(upper
, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE
);
1353 if (!strcmp(upper
, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2
);
1354 if (!strcmp(upper
, "UCS-2")) return(XML_CHAR_ENCODING_UCS2
);
1355 if (!strcmp(upper
, "UCS2")) return(XML_CHAR_ENCODING_UCS2
);
1358 * NOTE: if we were able to parse this, the endianness of UCS4 is
1359 * already found and in use
1361 if (!strcmp(upper
, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE
);
1362 if (!strcmp(upper
, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE
);
1363 if (!strcmp(upper
, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE
);
1366 if (!strcmp(upper
, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1
);
1367 if (!strcmp(upper
, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1
);
1368 if (!strcmp(upper
, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1
);
1370 if (!strcmp(upper
, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2
);
1371 if (!strcmp(upper
, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2
);
1372 if (!strcmp(upper
, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2
);
1374 if (!strcmp(upper
, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3
);
1375 if (!strcmp(upper
, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4
);
1376 if (!strcmp(upper
, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5
);
1377 if (!strcmp(upper
, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6
);
1378 if (!strcmp(upper
, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7
);
1379 if (!strcmp(upper
, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8
);
1380 if (!strcmp(upper
, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9
);
1382 if (!strcmp(upper
, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP
);
1383 if (!strcmp(upper
, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS
);
1384 if (!strcmp(upper
, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP
);
1386 #ifdef DEBUG_ENCODING
1387 xmlGenericError(xmlGenericErrorContext
, "Unknown encoding %s\n", name
);
1389 return(XML_CHAR_ENCODING_ERROR
);
1393 * xmlGetCharEncodingName:
1394 * @enc: the encoding
1396 * The "canonical" name for XML encoding.
1397 * C.f. http://www.w3.org/TR/REC-xml#charencoding
1398 * Section 4.3.3 Character Encoding in Entities
1400 * Returns the canonical name for the given encoding
1404 xmlGetCharEncodingName(xmlCharEncoding enc
) {
1406 case XML_CHAR_ENCODING_ERROR
:
1408 case XML_CHAR_ENCODING_NONE
:
1410 case XML_CHAR_ENCODING_UTF8
:
1412 case XML_CHAR_ENCODING_UTF16LE
:
1414 case XML_CHAR_ENCODING_UTF16BE
:
1416 case XML_CHAR_ENCODING_EBCDIC
:
1418 case XML_CHAR_ENCODING_UCS4LE
:
1419 return("ISO-10646-UCS-4");
1420 case XML_CHAR_ENCODING_UCS4BE
:
1421 return("ISO-10646-UCS-4");
1422 case XML_CHAR_ENCODING_UCS4_2143
:
1423 return("ISO-10646-UCS-4");
1424 case XML_CHAR_ENCODING_UCS4_3412
:
1425 return("ISO-10646-UCS-4");
1426 case XML_CHAR_ENCODING_UCS2
:
1427 return("ISO-10646-UCS-2");
1428 case XML_CHAR_ENCODING_8859_1
:
1429 return("ISO-8859-1");
1430 case XML_CHAR_ENCODING_8859_2
:
1431 return("ISO-8859-2");
1432 case XML_CHAR_ENCODING_8859_3
:
1433 return("ISO-8859-3");
1434 case XML_CHAR_ENCODING_8859_4
:
1435 return("ISO-8859-4");
1436 case XML_CHAR_ENCODING_8859_5
:
1437 return("ISO-8859-5");
1438 case XML_CHAR_ENCODING_8859_6
:
1439 return("ISO-8859-6");
1440 case XML_CHAR_ENCODING_8859_7
:
1441 return("ISO-8859-7");
1442 case XML_CHAR_ENCODING_8859_8
:
1443 return("ISO-8859-8");
1444 case XML_CHAR_ENCODING_8859_9
:
1445 return("ISO-8859-9");
1446 case XML_CHAR_ENCODING_2022_JP
:
1447 return("ISO-2022-JP");
1448 case XML_CHAR_ENCODING_SHIFT_JIS
:
1449 return("Shift-JIS");
1450 case XML_CHAR_ENCODING_EUC_JP
:
1452 case XML_CHAR_ENCODING_ASCII
:
1458 /************************************************************************
1460 * Char encoding handlers *
1462 ************************************************************************/
/*
 * Registry state for the encoding handlers.
 * `handlers` is the table allocated by xmlInitCharEncodingHandlers() with
 * room for MAX_ENCODING_HANDLERS pointers; `nbCharEncodingHandler` is the
 * number of slots currently filled by xmlRegisterCharEncodingHandler().
 */
1465 /* the size should be growable, but it's not a big deal ... */
1466 #define MAX_ENCODING_HANDLERS 50
1467 static xmlCharEncodingHandlerPtr
*handlers
= NULL
;
1468 static int nbCharEncodingHandler
= 0;
1471 * The default is UTF-8 for XML, that's also the default used for the
1472 * parser internals, so the default encoding handler is NULL
/* Returned by xmlFindCharEncodingHandler() for a NULL or empty name. */
1475 static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler
= NULL
;
1478 * xmlNewCharEncodingHandler:
1479 * @name: the encoding name, in UTF-8 format (ASCII actually)
1480 * @input: the xmlCharEncodingInputFunc to read that encoding
1481 * @output: the xmlCharEncodingOutputFunc to write that encoding
1483 * Create and registers an xmlCharEncodingHandler.
1485 * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
1487 xmlCharEncodingHandlerPtr
1488 xmlNewCharEncodingHandler(const char *name
,
1489 xmlCharEncodingInputFunc input
,
1490 xmlCharEncodingOutputFunc output
) {
1491 xmlCharEncodingHandlerPtr handler
;
1498 * Do the alias resolution
1500 alias
= xmlGetEncodingAlias(name
);
1505 * Keep only the uppercase version of the encoding.
1508 xmlGenericError(xmlGenericErrorContext
,
1509 "xmlNewCharEncodingHandler : no name !\n");
1512 for (i
= 0;i
< 499;i
++) {
1513 upper
[i
] = toupper(name
[i
]);
1514 if (upper
[i
] == 0) break;
1517 up
= xmlMemStrdup(upper
);
1519 xmlGenericError(xmlGenericErrorContext
,
1520 "xmlNewCharEncodingHandler : out of memory !\n");
1525 * allocate and fill-up an handler block.
1527 handler
= (xmlCharEncodingHandlerPtr
)
1528 xmlMalloc(sizeof(xmlCharEncodingHandler
));
1529 if (handler
== NULL
) {
1530 xmlGenericError(xmlGenericErrorContext
,
1531 "xmlNewCharEncodingHandler : out of memory !\n");
1534 handler
->input
= input
;
1535 handler
->output
= output
;
1538 #ifdef LIBXML_ICONV_ENABLED
1539 handler
->iconv_in
= NULL
;
1540 handler
->iconv_out
= NULL
;
1541 #endif /* LIBXML_ICONV_ENABLED */
1544 * registers and returns the handler.
1546 xmlRegisterCharEncodingHandler(handler
);
1547 #ifdef DEBUG_ENCODING
1548 xmlGenericError(xmlGenericErrorContext
,
1549 "Registered encoding handler for %s\n", name
);
1555 * xmlInitCharEncodingHandlers:
1557 * Initialize the char encoding support, it registers the default
1558 * encoding supported.
1559 * NOTE: while public, this function usually doesn't need to be called
1560 * in normal processing.
/*
 * One-time setup of the handler registry: allocates the `handlers` table,
 * detects the host byte order and registers the built-in converters.
 * Calling it again is a no-op because `handlers` is already non-NULL.
 */
1563 xmlInitCharEncodingHandlers(void) {
/* Byte-order probe: the first byte of 0x1234 in memory tells which end
 * is stored first. */
1564 unsigned short int tst
= 0x1234;
1565 unsigned char *ptr
= (unsigned char *) &tst
;
/* Already initialized. */
1567 if (handlers
!= NULL
) return;
1569 handlers
= (xmlCharEncodingHandlerPtr
*)
1570 xmlMalloc(MAX_ENCODING_HANDLERS
* sizeof(xmlCharEncodingHandlerPtr
));
/* 0x12 first => most significant byte first => not little endian. */
1572 if (*ptr
== 0x12) xmlLittleEndian
= 0;
1573 else if (*ptr
== 0x34) xmlLittleEndian
= 1;
1574 else xmlGenericError(xmlGenericErrorContext
,
1575 "Odd problem at endianness detection\n");
/* The allocation result is only checked here, after the byte-order probe. */
1577 if (handlers
== NULL
) {
1578 xmlGenericError(xmlGenericErrorContext
,
1579 "xmlInitCharEncodingHandlers : out of memory !\n");
/* UTF-8 is registered without converter functions (NULL input/output). */
1582 xmlNewCharEncodingHandler("UTF-8", NULL
, NULL
);
1584 xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8
, UTF8ToUTF16LE
);
1586 xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8
, UTF8ToUTF16BE
);
1587 xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8
, UTF8Toisolat1
);
/* "ASCII" and "US-ASCII" share the same pair of converters. */
1588 xmlNewCharEncodingHandler("ASCII", asciiToUTF8
, UTF8Toascii
);
1589 xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8
, UTF8Toascii
);
/* The "HTML" handler has an output function only and is compiled in
 * only when LIBXML_HTML_ENABLED is defined. */
1590 #ifdef LIBXML_HTML_ENABLED
1591 xmlNewCharEncodingHandler("HTML", NULL
, UTF8ToHtml
);
1596 * xmlCleanupCharEncodingHandlers:
1598 * Cleanup the memory allocated for the char encoding support, it
1599 * unregisters all the encoding handlers and the aliases.
/*
 * Release the encoding aliases, then every registered handler and its
 * name, and reset the registry counters.
 */
1602 xmlCleanupCharEncodingHandlers(void) {
/* Aliases are dropped even when no handler table was ever allocated. */
1603 xmlCleanupEncodingAliases();
1605 if (handlers
== NULL
) return;
/* Walk the table from the last used slot down to slot 0; the counter is
 * decremented before each access so it always indexes a filled slot. */
1607 for (;nbCharEncodingHandler
> 0;) {
1608 nbCharEncodingHandler
--;
1609 if (handlers
[nbCharEncodingHandler
] != NULL
) {
1610 if (handlers
[nbCharEncodingHandler
]->name
!= NULL
)
1611 xmlFree(handlers
[nbCharEncodingHandler
]->name
);
1612 xmlFree(handlers
[nbCharEncodingHandler
]);
/* Leave the registry in its pristine state. */
1617 nbCharEncodingHandler
= 0;
1618 xmlDefaultCharEncodingHandler
= NULL
;
1622 * xmlRegisterCharEncodingHandler:
1623 * @handler: the xmlCharEncodingHandlerPtr handler block
1625 * Register the char encoding handler, surprising, isn't it ?
1628 xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler
) {
1629 if (handlers
== NULL
) xmlInitCharEncodingHandlers();
1630 if (handler
== NULL
) {
1631 xmlGenericError(xmlGenericErrorContext
,
1632 "xmlRegisterCharEncodingHandler: NULL handler !\n");
1636 if (nbCharEncodingHandler
>= MAX_ENCODING_HANDLERS
) {
1637 xmlGenericError(xmlGenericErrorContext
,
1638 "xmlRegisterCharEncodingHandler: Too many handler registered\n");
1639 xmlGenericError(xmlGenericErrorContext
,
1640 "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__
);
1643 handlers
[nbCharEncodingHandler
++] = handler
;
1647 * xmlGetCharEncodingHandler:
1648 * @enc: an xmlCharEncoding value.
1650 * Search in the registered set the handler able to read/write that encoding.
1652 * Returns the handler or NULL if not found
/*
 * Map an xmlCharEncoding value to a handler. UTF-16LE/BE return the
 * cached xmlUTF16LEHandler / xmlUTF16BEHandler pointers; the other
 * encodings are resolved by trying one or more spellings of the encoding
 * name through xmlFindCharEncodingHandler() and returning the first hit.
 */
1654 xmlCharEncodingHandlerPtr
1655 xmlGetCharEncodingHandler(xmlCharEncoding enc
) {
1656 xmlCharEncodingHandlerPtr handler
;
/* Lazily initialize the registry on first use. */
1658 if (handlers
== NULL
) xmlInitCharEncodingHandlers();
/* No name lookup is performed for ERROR, NONE and UTF8. */
1660 case XML_CHAR_ENCODING_ERROR
:
1662 case XML_CHAR_ENCODING_NONE
:
1664 case XML_CHAR_ENCODING_UTF8
:
1666 case XML_CHAR_ENCODING_UTF16LE
:
1667 return(xmlUTF16LEHandler
);
1668 case XML_CHAR_ENCODING_UTF16BE
:
1669 return(xmlUTF16BEHandler
);
1670 case XML_CHAR_ENCODING_EBCDIC
:
1671 handler
= xmlFindCharEncodingHandler("EBCDIC");
1672 if (handler
!= NULL
) return(handler
);
1673 handler
= xmlFindCharEncodingHandler("ebcdic");
1674 if (handler
!= NULL
) return(handler
);
/* NOTE(review): UCS4BE and UCS4LE try exactly the same three names, so
 * the byte order is not encoded in the lookup here. */
1676 case XML_CHAR_ENCODING_UCS4BE
:
1677 handler
= xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1678 if (handler
!= NULL
) return(handler
);
1679 handler
= xmlFindCharEncodingHandler("UCS-4");
1680 if (handler
!= NULL
) return(handler
);
1681 handler
= xmlFindCharEncodingHandler("UCS4");
1682 if (handler
!= NULL
) return(handler
);
1684 case XML_CHAR_ENCODING_UCS4LE
:
1685 handler
= xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1686 if (handler
!= NULL
) return(handler
);
1687 handler
= xmlFindCharEncodingHandler("UCS-4");
1688 if (handler
!= NULL
) return(handler
);
1689 handler
= xmlFindCharEncodingHandler("UCS4");
1690 if (handler
!= NULL
) return(handler
);
/* The unusual UCS-4 byte orders have no name lookup. */
1692 case XML_CHAR_ENCODING_UCS4_2143
:
1694 case XML_CHAR_ENCODING_UCS4_3412
:
1696 case XML_CHAR_ENCODING_UCS2
:
1697 handler
= xmlFindCharEncodingHandler("ISO-10646-UCS-2");
1698 if (handler
!= NULL
) return(handler
);
1699 handler
= xmlFindCharEncodingHandler("UCS-2");
1700 if (handler
!= NULL
) return(handler
);
1701 handler
= xmlFindCharEncodingHandler("UCS2");
1702 if (handler
!= NULL
) return(handler
);
1706 * We used to keep ISO Latin encodings native in the
1707 * generated data. This led to so many problems that
1708 * this has been removed. One can still change this
1709 * back by registering no-ops encoders for those
/* ISO-8859-1 .. ISO-8859-9: one lookup each, by canonical name. */
1711 case XML_CHAR_ENCODING_8859_1
:
1712 handler
= xmlFindCharEncodingHandler("ISO-8859-1");
1713 if (handler
!= NULL
) return(handler
);
1715 case XML_CHAR_ENCODING_8859_2
:
1716 handler
= xmlFindCharEncodingHandler("ISO-8859-2");
1717 if (handler
!= NULL
) return(handler
);
1719 case XML_CHAR_ENCODING_8859_3
:
1720 handler
= xmlFindCharEncodingHandler("ISO-8859-3");
1721 if (handler
!= NULL
) return(handler
);
1723 case XML_CHAR_ENCODING_8859_4
:
1724 handler
= xmlFindCharEncodingHandler("ISO-8859-4");
1725 if (handler
!= NULL
) return(handler
);
1727 case XML_CHAR_ENCODING_8859_5
:
1728 handler
= xmlFindCharEncodingHandler("ISO-8859-5");
1729 if (handler
!= NULL
) return(handler
);
1731 case XML_CHAR_ENCODING_8859_6
:
1732 handler
= xmlFindCharEncodingHandler("ISO-8859-6");
1733 if (handler
!= NULL
) return(handler
);
1735 case XML_CHAR_ENCODING_8859_7
:
1736 handler
= xmlFindCharEncodingHandler("ISO-8859-7");
1737 if (handler
!= NULL
) return(handler
);
1739 case XML_CHAR_ENCODING_8859_8
:
1740 handler
= xmlFindCharEncodingHandler("ISO-8859-8");
1741 if (handler
!= NULL
) return(handler
);
1743 case XML_CHAR_ENCODING_8859_9
:
1744 handler
= xmlFindCharEncodingHandler("ISO-8859-9");
1745 if (handler
!= NULL
) return(handler
);
1749 case XML_CHAR_ENCODING_2022_JP
:
1750 handler
= xmlFindCharEncodingHandler("ISO-2022-JP");
1751 if (handler
!= NULL
) return(handler
);
/* Three spellings of Shift-JIS are tried in turn. */
1753 case XML_CHAR_ENCODING_SHIFT_JIS
:
1754 handler
= xmlFindCharEncodingHandler("SHIFT-JIS");
1755 if (handler
!= NULL
) return(handler
);
1756 handler
= xmlFindCharEncodingHandler("SHIFT_JIS");
1757 if (handler
!= NULL
) return(handler
);
1758 handler
= xmlFindCharEncodingHandler("Shift_JIS");
1759 if (handler
!= NULL
) return(handler
);
1761 case XML_CHAR_ENCODING_EUC_JP
:
1762 handler
= xmlFindCharEncodingHandler("EUC-JP");
1763 if (handler
!= NULL
) return(handler
);
/* Trace emitted only in DEBUG_ENCODING builds when nothing matched. */
1769 #ifdef DEBUG_ENCODING
1770 xmlGenericError(xmlGenericErrorContext
,
1771 "No handler found for encoding %d\n", enc
);
1777 * xmlFindCharEncodingHandler:
1778 * @name: a string describing the char encoding.
1780 * Search in the registered set the handler able to read/write that encoding.
1782 * Returns the handler or NULL if not found
1784 xmlCharEncodingHandlerPtr
1785 xmlFindCharEncodingHandler(const char *name
) {
1788 xmlCharEncoding alias
;
1789 #ifdef LIBXML_ICONV_ENABLED
1790 xmlCharEncodingHandlerPtr enc
;
1791 iconv_t icv_in
, icv_out
;
1792 #endif /* LIBXML_ICONV_ENABLED */
1796 if (handlers
== NULL
) xmlInitCharEncodingHandlers();
1797 if (name
== NULL
) return(xmlDefaultCharEncodingHandler
);
1798 if (name
[0] == 0) return(xmlDefaultCharEncodingHandler
);
1801 * Do the alias resolution
1804 nalias
= xmlGetEncodingAlias(name
);
1809 * Check first for directly registered encoding names
1811 for (i
= 0;i
< 99;i
++) {
1812 upper
[i
] = toupper(name
[i
]);
1813 if (upper
[i
] == 0) break;
1817 for (i
= 0;i
< nbCharEncodingHandler
; i
++)
1818 if (!strcmp(upper
, handlers
[i
]->name
)) {
1819 #ifdef DEBUG_ENCODING
1820 xmlGenericError(xmlGenericErrorContext
,
1821 "Found registered handler for encoding %s\n", name
);
1823 return(handlers
[i
]);
1826 #ifdef LIBXML_ICONV_ENABLED
1827 /* check whether iconv can handle this */
1828 icv_in
= iconv_open("UTF-8", name
);
1829 icv_out
= iconv_open(name
, "UTF-8");
1830 if ((icv_in
!= (iconv_t
) -1) && (icv_out
!= (iconv_t
) -1)) {
1831 enc
= (xmlCharEncodingHandlerPtr
)
1832 xmlMalloc(sizeof(xmlCharEncodingHandler
));
1834 iconv_close(icv_in
);
1835 iconv_close(icv_out
);
1838 enc
->name
= xmlMemStrdup(name
);
1841 enc
->iconv_in
= icv_in
;
1842 enc
->iconv_out
= icv_out
;
1843 #ifdef DEBUG_ENCODING
1844 xmlGenericError(xmlGenericErrorContext
,
1845 "Found iconv handler for encoding %s\n", name
);
1848 } else if ((icv_in
!= (iconv_t
) -1) || icv_out
!= (iconv_t
) -1) {
1849 xmlGenericError(xmlGenericErrorContext
,
1850 "iconv : problems with filters for '%s'\n", name
);
1852 #endif /* LIBXML_ICONV_ENABLED */
1854 #ifdef DEBUG_ENCODING
1855 xmlGenericError(xmlGenericErrorContext
,
1856 "No handler found for encoding %s\n", name
);
1860 * Fallback using the canonical names
1862 alias
= xmlParseCharEncoding(norig
);
1863 if (alias
!= XML_CHAR_ENCODING_ERROR
) {
1865 canon
= xmlGetCharEncodingName(alias
);
1866 if ((canon
!= NULL
) && (strcmp(name
, canon
))) {
1867 return(xmlFindCharEncodingHandler(canon
));
1874 /************************************************************************
1876 * ICONV based generic conversion functions *
1878 ************************************************************************/
1880 #ifdef LIBXML_ICONV_ENABLED
1883 * @cd: iconv converter data structure
1884 * @out: a pointer to an array of bytes to store the result
1885 * @outlen: the length of @out
1886 * @in: a pointer to an array of ISO Latin 1 chars
1887 * @inlen: the length of @in
1889 * Returns 0 if success, or
1890 * -1 by lack of space, or
1891 * -2 if the transcoding fails (for *in is not valid utf8 string or
1892 * the result of transformation can't fit into the encoding we want), or
1893 * -3 if the last byte can't form a single output char.
1895 * The value of @inlen after return is the number of octets consumed
1896 * as the return value is positive, else unpredictable.
1897 * The value of @outlen after return is the number of octets produced.
/*
 * Adapter between the int based length convention used by the encoding
 * handlers and iconv(), which works on size_t counters and char pointers.
 */
1900 xmlIconvWrapper(iconv_t cd
,
1901 unsigned char *out
, int *outlen
,
1902 const unsigned char *in
, int *inlen
) {
/* Local copies in the types iconv() expects; iconv() advances the
 * pointers and decrements the counters. */
1904 size_t icv_inlen
= *inlen
, icv_outlen
= *outlen
;
1905 const char *icv_in
= (const char *) in
;
1906 char *icv_out
= (char *) out
;
1909 ret
= iconv(cd
, (char **) &icv_in
, &icv_inlen
, &icv_out
, &icv_outlen
);
/* The counters now hold what is left over, so the subtraction turns
 * *inlen into the bytes consumed and *outlen into the bytes produced. */
1911 *inlen
-= icv_inlen
;
1912 *outlen
-= icv_outlen
;
/* Leftover input or an iconv() failure: errno tells the cause.
 * NOTE(review): the return values chosen per errno are not visible in
 * this view; presumably they follow the -2 / -1 / -3 contract of the
 * header comment - confirm against the full source. */
1917 if ((icv_inlen
!= 0) || (ret
== -1)) {
1919 if (errno
== EILSEQ
) {
1924 if (errno
== E2BIG
) {
1929 if (errno
== EINVAL
) {
1939 #endif /* LIBXML_ICONV_ENABLED */
1941 /************************************************************************
1943 * The real API used by libxml for on-the-fly conversion *
1945 ************************************************************************/
1948 * xmlCharEncFirstLine:
1949 * @handler: char encoding transformation data structure
1950 * @out: an xmlBuffer for the output.
1951 * @in: an xmlBuffer for the input
1953 * Front-end for the encoding handler input function, but handle only
1954 * the very first line, i.e. limit itself to 45 chars.
1956 * Returns the number of byte written if success, or
1958 * -2 if the transcoding fails (for *in is not valid utf8 string or
1959 * the result of transformation can't fit into the encoding we want), or
/*
 * Convert the beginning of @in into @out, using the handler's native
 * input function or, in iconv enabled builds, its iconv_in descriptor.
 * Returns -1 at once when @handler, @out or @in is NULL.
 */
1962 xmlCharEncFirstLine(xmlCharEncodingHandler
*handler
, xmlBufferPtr out
,
1968 if (handler
== NULL
) return(-1);
1969 if (out
== NULL
) return(-1);
1970 if (in
== NULL
) return(-1);
/* Free room currently available at the end of @out. */
1972 written
= out
->size
- out
->use
;
/* Grow @out when the free room is not more than twice the pending
 * input; one byte is then held back for the terminating 0 stored after
 * the conversion. */
1974 if (toconv
* 2 >= written
) {
1975 xmlBufferGrow(out
, toconv
);
1976 written
= out
->size
- out
->use
- 1;
1980 * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38
1981 * 45 chars should be sufficient to reach the end of the encoding
1982 * declaration without going too far inside the document content.
/* Native converter: the consumed input is removed from @in, the produced
 * bytes are accounted in @out and the content is kept 0-terminated. */
1986 if (handler
->input
!= NULL
) {
1987 ret
= handler
->input(&out
->content
[out
->use
], &written
,
1988 in
->content
, &toconv
);
1989 xmlBufferShrink(in
, toconv
);
1990 out
->use
+= written
;
1991 out
->content
[out
->use
] = 0;
/* iconv converter: same bookkeeping; a -1 result is remapped to -3. */
1993 #ifdef LIBXML_ICONV_ENABLED
1994 else if (handler
->iconv_in
!= NULL
) {
1995 ret
= xmlIconvWrapper(handler
->iconv_in
, &out
->content
[out
->use
],
1996 &written
, in
->content
, &toconv
);
1997 xmlBufferShrink(in
, toconv
);
1998 out
->use
+= written
;
1999 out
->content
[out
->use
] = 0;
2000 if (ret
== -1) ret
= -3;
2002 #endif /* LIBXML_ICONV_ENABLED */
/* Conversion traces, compiled only with DEBUG_ENCODING. */
2003 #ifdef DEBUG_ENCODING
2006 xmlGenericError(xmlGenericErrorContext
,
2007 "converted %d bytes to %d bytes of input\n",
2011 xmlGenericError(xmlGenericErrorContext
,"converted %d bytes to %d bytes of input, %d left\n",
2012 toconv
, written
, in
->use
);
2015 xmlGenericError(xmlGenericErrorContext
,
2016 "input conversion failed due to input error\n");
2019 xmlGenericError(xmlGenericErrorContext
,"converted %d bytes to %d bytes of input, %d left\n",
2020 toconv
, written
, in
->use
);
2023 xmlGenericError(xmlGenericErrorContext
,"Unknown input conversion failed %d\n", ret
);
2025 #endif /* DEBUG_ENCODING */
2027 * Ignore when input buffer is not on a boundary
/* Both -3 and -1 are reported to the caller as 0. */
2029 if (ret
== -3) ret
= 0;
2030 if (ret
== -1) ret
= 0;
2036 * @handler: char encoding transformation data structure
2037 * @out: an xmlBuffer for the output.
2038 * @in: an xmlBuffer for the input
2040 * Generic front-end for the encoding handler input function
2042 * Returns the number of byte written if success, or
2044 * -2 if the transcoding fails (for *in is not valid utf8 string or
2045 * the result of transformation can't fit into the encoding we want), or
/*
 * Generic input conversion: feed the content of @in through the handler's
 * native input function or, in iconv enabled builds, its iconv_in
 * descriptor, appending the result to @out.
 */
2048 xmlCharEncInFunc(xmlCharEncodingHandler
* handler
, xmlBufferPtr out
,
2055 if (handler
== NULL
)
/* Free room currently available at the end of @out. */
2065 written
= out
->size
- out
->use
;
/* Grow @out when the free room is not more than twice the pending
 * input; one byte is then held back for the terminating 0. */
2066 if (toconv
* 2 >= written
) {
2067 xmlBufferGrow(out
, out
->size
+ toconv
* 2);
2068 written
= out
->size
- out
->use
- 1;
/* Native converter: the consumed input is removed from @in, the produced
 * bytes are accounted in @out and the content is kept 0-terminated. */
2070 if (handler
->input
!= NULL
) {
2071 ret
= handler
->input(&out
->content
[out
->use
], &written
,
2072 in
->content
, &toconv
);
2073 xmlBufferShrink(in
, toconv
);
2074 out
->use
+= written
;
2075 out
->content
[out
->use
] = 0;
/* iconv converter: same bookkeeping through xmlIconvWrapper(). */
2077 #ifdef LIBXML_ICONV_ENABLED
2078 else if (handler
->iconv_in
!= NULL
) {
2079 ret
= xmlIconvWrapper(handler
->iconv_in
, &out
->content
[out
->use
],
2080 &written
, in
->content
, &toconv
);
2081 xmlBufferShrink(in
, toconv
);
2082 out
->use
+= written
;
2083 out
->content
[out
->use
] = 0;
2087 #endif /* LIBXML_ICONV_ENABLED */
/* Conversion traces, compiled only with DEBUG_ENCODING. */
2090 #ifdef DEBUG_ENCODING
2091 xmlGenericError(xmlGenericErrorContext
,
2092 "converted %d bytes to %d bytes of input\n",
2097 #ifdef DEBUG_ENCODING
2098 xmlGenericError(xmlGenericErrorContext
,
2099 "converted %d bytes to %d bytes of input, %d left\n",
2100 toconv
, written
, in
->use
);
2104 #ifdef DEBUG_ENCODING
2105 xmlGenericError(xmlGenericErrorContext
,
2106 "converted %d bytes to %d bytes of input, %d left\n",
2107 toconv
, written
, in
->use
);
/* Input error report; it dumps the first four bytes still in @in.
 * NOTE(review): the dump reads in->content[0..3] without a visible check
 * that four bytes remain - confirm against the full source. */
2111 xmlGenericError(xmlGenericErrorContext
,
2112 "input conversion failed due to input error\n");
2113 xmlGenericError(xmlGenericErrorContext
,
2114 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2115 in
->content
[0], in
->content
[1],
2116 in
->content
[2], in
->content
[3]);
2119 * Ignore when input buffer is not on a boundary
2127 * xmlCharEncOutFunc:
2128 * @handler: char encoding transformation data structure
2129 * @out: an xmlBuffer for the output.
2130 * @in: an xmlBuffer for the input
2132 * Generic front-end for the encoding handler output function
2133 * a first call with @in == NULL has to be made first to initiate the
2134 * output in case of non-stateless encoding needing to initiate their
2135 * state or the output (like the BOM in UTF16).
2136 * In case of UTF8 sequence conversion errors for the given encoder,
2137 * the content will be automatically remapped to a CharRef sequence.
2139 * Returns the number of byte written if success, or
2141 * -2 if the transcoding fails (for *in is not valid utf8 string or
2142 * the result of transformation can't fit into the encoding we want), or
/*
 * Generic output conversion: an initialization pass for the @in == NULL
 * call, then the conversion of @in into @out through the handler's
 * native output function or, in iconv enabled builds, its iconv_out
 * descriptor. A character that cannot be converted is replaced in @in by
 * a numeric character reference.
 * Returns -1 at once when @handler or @out is NULL.
 */
2145 xmlCharEncOutFunc(xmlCharEncodingHandler
*handler
, xmlBufferPtr out
,
2153 if (handler
== NULL
) return(-1);
2154 if (out
== NULL
) return(-1);
/* Free room currently available at the end of @out. */
2158 written
= out
->size
- out
->use
;
2161 * First specific handling of in = NULL, i.e. the initialization call
/* Initialization pass: the encoder is invoked without input data so that
 * it can emit whatever must come first; the iconv variant passes a NULL
 * input pointer to the wrapper. */
2165 if (handler
->output
!= NULL
) {
2166 ret
= handler
->output(&out
->content
[out
->use
], &written
,
2168 out
->use
+= written
;
2169 out
->content
[out
->use
] = 0;
2171 #ifdef LIBXML_ICONV_ENABLED
2172 else if (handler
->iconv_out
!= NULL
) {
2173 ret
= xmlIconvWrapper(handler
->iconv_out
, &out
->content
[out
->use
],
2174 &written
, NULL
, &toconv
);
2175 out
->use
+= written
;
2176 out
->content
[out
->use
] = 0;
2178 #endif /* LIBXML_ICONV_ENABLED */
2179 #ifdef DEBUG_ENCODING
2180 xmlGenericError(xmlGenericErrorContext
,
2181 "initialized encoder\n");
2187 * Conversion itself.
/* Grow @out when the free room is not more than twice the pending
 * input; one byte is then held back for the terminating 0. */
2192 if (toconv
* 2 >= written
) {
2193 xmlBufferGrow(out
, toconv
* 2);
2194 written
= out
->size
- out
->use
- 1;
/* Native converter: consumed input leaves @in, produced bytes are added
 * to @out and to the running total `writtentot`. */
2196 if (handler
->output
!= NULL
) {
2197 ret
= handler
->output(&out
->content
[out
->use
], &written
,
2198 in
->content
, &toconv
);
2199 xmlBufferShrink(in
, toconv
);
2200 out
->use
+= written
;
2201 writtentot
+= written
;
2202 out
->content
[out
->use
] = 0;
/* iconv converter: same bookkeeping through xmlIconvWrapper(). */
2204 #ifdef LIBXML_ICONV_ENABLED
2205 else if (handler
->iconv_out
!= NULL
) {
2206 ret
= xmlIconvWrapper(handler
->iconv_out
, &out
->content
[out
->use
],
2207 &written
, in
->content
, &toconv
);
2208 xmlBufferShrink(in
, toconv
);
2209 out
->use
+= written
;
2210 writtentot
+= written
;
2211 out
->content
[out
->use
] = 0;
2215 * Can be a limitation of iconv
2222 #endif /* LIBXML_ICONV_ENABLED */
/* Neither a native output function nor an iconv descriptor is set. */
2224 xmlGenericError(xmlGenericErrorContext
,
2225 "xmlCharEncOutFunc: no output function !\n");
/* Non-negative results are accumulated. */
2229 if (ret
>= 0) output
+= ret
;
2232 * Attempt to handle error cases
2236 #ifdef DEBUG_ENCODING
2237 xmlGenericError(xmlGenericErrorContext
,
2238 "converted %d bytes to %d bytes of output\n",
2243 #ifdef DEBUG_ENCODING
2244 xmlGenericError(xmlGenericErrorContext
,
2245 "output conversion failed by lack of space\n");
2249 xmlGenericError(xmlGenericErrorContext
,"converted %d bytes to %d bytes of output %d left\n",
2250 toconv
, written
, in
->use
);
/* Conversion error recovery: decode the UTF-8 character at the head of
 * @in so that it can be replaced by a character reference. */
2254 const xmlChar
*utf
= (const xmlChar
*) in
->content
;
2257 cur
= xmlGetUTF8Char(utf
, &len
);
2259 xmlChar charref
[20];
2261 #ifdef DEBUG_ENCODING
2262 xmlGenericError(xmlGenericErrorContext
,
2263 "handling output conversion error\n");
2264 xmlGenericError(xmlGenericErrorContext
,
2265 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2266 in
->content
[0], in
->content
[1],
2267 in
->content
[2], in
->content
[3]);
2270 * Removes the UTF8 sequence, and replace it by a charref
2271 * and continue the transcoding phase, hoping the error
2272 * did not mangle the encoder state.
/* "&#<code point>;" is formatted into the 20 byte buffer, the offending
 * sequence is dropped from @in and the reference is put in its place. */
2274 snprintf((char *) charref
, sizeof(charref
), "&#%d;", cur
);
2275 xmlBufferShrink(in
, len
);
2276 xmlBufferAddHead(in
, charref
, -1);
/* Failure report; the first byte of @in is then overwritten with a
 * space. */
2280 xmlGenericError(xmlGenericErrorContext
,
2281 "output conversion failed due to conv error\n");
2282 xmlGenericError(xmlGenericErrorContext
,
2283 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2284 in
->content
[0], in
->content
[1],
2285 in
->content
[2], in
->content
[3]);
2286 in
->content
[0] = ' ';
2295 * xmlCharEncCloseFunc:
2296 * @handler: char encoding transformation data structure
2298 * Generic front-end for encoding handler close function
2300 * Returns 0 if success, or -1 in case of error
2303 xmlCharEncCloseFunc(xmlCharEncodingHandler
*handler
) {
2305 if (handler
== NULL
) return(-1);
2306 if (handler
->name
== NULL
) return(-1);
2307 #ifdef LIBXML_ICONV_ENABLED
2309 * Iconv handlers can be used only once, free the whole block.
2310 * and the associated iconv resources.
2312 if ((handler
->iconv_out
!= NULL
) || (handler
->iconv_in
!= NULL
)) {
2313 if (handler
->name
!= NULL
)
2314 xmlFree(handler
->name
);
2315 handler
->name
= NULL
;
2316 if (handler
->iconv_out
!= NULL
) {
2317 if (iconv_close(handler
->iconv_out
))
2319 handler
->iconv_out
= NULL
;
2321 if (handler
->iconv_in
!= NULL
) {
2322 if (iconv_close(handler
->iconv_in
))
2324 handler
->iconv_in
= NULL
;
2328 #endif /* LIBXML_ICONV_ENABLED */
2329 #ifdef DEBUG_ENCODING
2331 xmlGenericError(xmlGenericErrorContext
,
2332 "failed to close the encoding handler\n");
2334 xmlGenericError(xmlGenericErrorContext
,
2335 "closed the encoding handler\n");