src/external/3rd/library/libxml/encoding.c

   1 /*
   2  * encoding.c : implements the encoding conversion functions needed for XML
   3  *
   4  * Related specs:
   5  * rfc2044        (UTF-8 and UTF-16) F. Yergeau Alis Technologies
   6  * rfc2781        UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau
   7  * [ISO-10646]    UTF-8 and UTF-16 in Annexes
   8  * [ISO-8859-1]   ISO Latin-1 characters codes.
   9  * [UNICODE]      The Unicode Consortium, "The Unicode Standard --
  10  *                Worldwide Character Encoding -- Version 1.0", Addison-
  11  *                Wesley, Volume 1, 1991, Volume 2, 1992.  UTF-8 is
  12  *                described in Unicode Technical Report #4.
  13  * [US-ASCII]     Coded Character Set--7-bit American Standard Code for
  14  *                Information Interchange, ANSI X3.4-1986.
  15  *
  16  * See Copyright for the status of this software.
  17  *
  18  * daniel@veillard.com
  19  *
  20  * UTF8 string routines from:
  21  * "William M. Brack" <wbrack@mmm.com.hk>
  22  *
  23  * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
  24  */
  25
  26 #define IN_LIBXML
  27 #include "libxml.h"
  28
  29 #include <string.h>
  30
  31 #ifdef HAVE_CTYPE_H
  32 #include <ctype.h>
  33 #endif
  34 #ifdef HAVE_STDLIB_H
  35 #include <stdlib.h>
  36 #endif
  37 #ifdef LIBXML_ICONV_ENABLED
  38 #ifdef HAVE_ERRNO_H
  39 #include <errno.h>
  40 #endif
  41 #endif
  42 #include <libxml/encoding.h>
  43 #include <libxml/xmlmemory.h>
  44 #ifdef LIBXML_HTML_ENABLED
  45 #include <libxml/HTMLparser.h>
  46 #endif
  47 #include <libxml/globals.h>
  48 #include <libxml/xmlerror.h>
  49
   /* Handler slots for the two UTF-16 byte orders; both start out NULL.
    * NOTE(review): presumably filled in by the handler initialisation
    * code elsewhere in this file -- confirm. */
   50 static xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
   51 static xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
   52
      /* One alias entry: a pair of constant strings (name, alias). */
   53 typedef struct _xmlCharEncodingAlias xmlCharEncodingAlias;
   54 typedef xmlCharEncodingAlias *xmlCharEncodingAliasPtr;
   55 struct _xmlCharEncodingAlias {
   56     const char *name;   /* presumably the real encoding name -- confirm */
   57     const char *alias;  /* presumably the alternate spelling -- confirm */
   58 };
   59
      /* Alias table pointer plus two counters, all initially empty/zero.
       * NOTE(review): Nb/Max look like "entries used" / "entries
       * allocated"; the code maintaining them is outside this excerpt. */
   60 static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
   61 static int xmlCharEncodingAliasesNb = 0;
   62 static int xmlCharEncodingAliasesMax = 0;
   63
      /* Compile-time switch: change the "#if 0" to enable DEBUG_ENCODING
       * (only considered when LIBXML_ICONV_ENABLED is defined). */
   64 #ifdef LIBXML_ICONV_ENABLED
   65 #if 0
   66 #define DEBUG_ENCODING  /* Define this to get encoding traces */
   67 #endif
   68 #endif
   69
      /* Non-zero selects the "host is little endian" paths in the UTF-16
       * converters below; defaults to 1. */
   70 static int xmlLittleEndian = 1;
  71
  72 /************************************************************************
  73  *                                                                      *
  74  *                      Generic UTF8 handling routines                  *
  75  *                                                                      *
  76  * From rfc2044: encoding of the Unicode values on UTF-8:               *
  77  *                                                                      *
  78  * UCS-4 range (hex.)           UTF-8 octet sequence (binary)           *
  79  * 0000 0000-0000 007F   0xxxxxxx                                       *
  80  * 0000 0080-0000 07FF   110xxxxx 10xxxxxx                              *
  81  * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx                     *
  82  *                                                                      *
  83  * I hope we won't use values > 0xFFFF anytime soon !                   *
  84  *                                                                      *
  85  ************************************************************************/
  86
  87 /**
  88  * xmlUTF8Strlen:
  89  * @utf:  a sequence of UTF-8 encoded bytes
  90  *
  91  * compute the length of an UTF8 string, it doesn't do a full UTF8
  92  * checking of the content of the string.
  93  *
  94  * Returns the number of characters in the string or -1 in case of error
  95  */
  96 int
  97 xmlUTF8Strlen(const xmlChar *utf) {
  98     int ret = 0;
  99
 100     if (utf == NULL)
 101         return(-1);
 102
 103     while (*utf != 0) {
 104         if (utf[0] & 0x80) {
 105             if ((utf[1] & 0xc0) != 0x80)
 106                 return(-1);
 107             if ((utf[0] & 0xe0) == 0xe0) {
 108                 if ((utf[2] & 0xc0) != 0x80)
 109                     return(-1);
 110                 if ((utf[0] & 0xf0) == 0xf0) {
 111                     if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
 112                         return(-1);
 113                     utf += 4;
 114                 } else {
 115                     utf += 3;
 116                 }
 117             } else {
 118                 utf += 2;
 119             }
 120         } else {
 121             utf++;
 122         }
 123         ret++;
 124     }
 125     return(ret);
 126 }
 127
 128 /**
 129  * xmlGetUTF8Char:
 130  * @utf:  a sequence of UTF-8 encoded bytes
 131  * @len:  a pointer to @bytes len
 132  *
 133  * Read one UTF8 Char from @utf
 134  *
 135  * Returns the char value or -1 in case of error and update @len with the
 136  *        number of bytes used
 137  */
 138 int
 139 xmlGetUTF8Char(const unsigned char *utf, int *len) {
 140     unsigned int c;
 141
 142     if (utf == NULL)
 143         goto error;
 144     if (len == NULL)
 145         goto error;
 146     if (*len < 1)
 147         goto error;
 148
 149     c = utf[0];
 150     if (c & 0x80) {
 151         if (*len < 2)
 152             goto error;
 153         if ((utf[1] & 0xc0) != 0x80)
 154             goto error;
 155         if ((c & 0xe0) == 0xe0) {
 156             if (*len < 3)
 157                 goto error;
 158             if ((utf[2] & 0xc0) != 0x80)
 159                 goto error;
 160             if ((c & 0xf0) == 0xf0) {
 161                 if (*len < 4)
 162                     goto error;
 163                 if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
 164                     goto error;
 165                 *len = 4;
 166                 /* 4-byte code */
 167                 c = (utf[0] & 0x7) << 18;
 168                 c |= (utf[1] & 0x3f) << 12;
 169                 c |= (utf[2] & 0x3f) << 6;
 170                 c |= utf[3] & 0x3f;
 171             } else {
 172               /* 3-byte code */
 173                 *len = 3;
 174                 c = (utf[0] & 0xf) << 12;
 175                 c |= (utf[1] & 0x3f) << 6;
 176                 c |= utf[2] & 0x3f;
 177             }
 178         } else {
 179           /* 2-byte code */
 180             *len = 2;
 181             c = (utf[0] & 0x1f) << 6;
 182             c |= utf[1] & 0x3f;
 183         }
 184     } else {
 185         /* 1-byte code */
 186         *len = 1;
 187     }
 188     return(c);
 189
 190 error:
 191     *len = 0;
 192     return(-1);
 193 }
 194
 195 /**
 196  * xmlCheckUTF8: Check utf-8 string for legality.
 197  * @utf: Pointer to putative utf-8 encoded string.
 198  *
 199  * Checks @utf for being valid utf-8. @utf is assumed to be
 200  * null-terminated. This function is not super-strict, as it will
 201  * allow longer utf-8 sequences than necessary. Note that Java is
 202  * capable of producing these sequences if provoked. Also note, this
 203  * routine checks for the 4-byte maximum size, but does not check for
 204  * 0x10ffff maximum value.
 205  *
 206  * Return value: true if @utf is valid.
 207  **/
 208 int
 209 xmlCheckUTF8(const unsigned char *utf)
 210 {
 211     int ix;
 212     unsigned char c;
 213
 214     for (ix = 0; (c = utf[ix]);) {
 215         if (c & 0x80) {
 216             if ((utf[ix + 1] & 0xc0) != 0x80)
 217                 return(0);
 218             if ((c & 0xe0) == 0xe0) {
 219                 if ((utf[ix + 2] & 0xc0) != 0x80)
 220                     return(0);
 221                 if ((c & 0xf0) == 0xf0) {
 222                     if ((c & 0xf8) != 0xf0 || (utf[ix + 3] & 0xc0) != 0x80)
 223                         return(0);
 224                     ix += 4;
 225                     /* 4-byte code */
 226                 } else
 227                   /* 3-byte code */
 228                     ix += 3;
 229             } else
 230               /* 2-byte code */
 231                 ix += 2;
 232         } else
 233             /* 1-byte code */
 234             ix++;
 235       }
 236       return(1);
 237 }
 238
 239 /**
 240  * xmlUTF8Strsize:
 241  * @utf:  a sequence of UTF-8 encoded bytes
 242  * @len:  the number of characters in the array
 243  *
 244  * storage size of an UTF8 string
 245  *
 246  * Returns the storage size of
 247  * the first 'len' characters of ARRAY
 248  *
 249  */
 250
 251 int
 252 xmlUTF8Strsize(const xmlChar *utf, int len) {
 253     const xmlChar       *ptr=utf;
 254     xmlChar     ch;
 255
 256     if (len <= 0)
 257         return(0);
 258
 259     while ( len-- > 0) {
 260         if ( !*ptr )
 261             break;
 262         if ( (ch = *ptr++) & 0x80)
 263             while ( (ch<<=1) & 0x80 )
 264                 ptr++;
 265     }
 266     return (ptr - utf);
 267 }
 268
 269
 270 /**
 271  * xmlUTF8Strndup:
 272  * @utf:  the input UTF8 *
 273  * @len:  the len of @utf (in chars)
 274  *
 275  * a strndup for array of UTF8's
 276  *
 277  * Returns a new UTF8 * or NULL
 278  */
 279 xmlChar *
 280 xmlUTF8Strndup(const xmlChar *utf, int len) {
 281     xmlChar *ret;
 282     int i;
 283
 284     if ((utf == NULL) || (len < 0)) return(NULL);
 285     i = xmlUTF8Strsize(utf, len);
 286     ret = (xmlChar *) xmlMalloc((i + 1) * sizeof(xmlChar));
 287     if (ret == NULL) {
 288         xmlGenericError(xmlGenericErrorContext,
 289                 "malloc of %ld byte failed\n",
 290                 (len + 1) * (long)sizeof(xmlChar));
 291         return(NULL);
 292     }
 293     memcpy(ret, utf, i * sizeof(xmlChar));
 294     ret[i] = 0;
 295     return(ret);
 296 }
 297
 298 /**
 299  * xmlUTF8Strpos:
 300  * @utf:  the input UTF8 *
 301  * @pos:  the position of the desired UTF8 char (in chars)
 302  *
 303  * a function to provide the equivalent of fetching a
 304  * character from a string array
 305  *
 306  * Returns a pointer to the UTF8 character or NULL
 307  */
 308 xmlChar *
 309 xmlUTF8Strpos(const xmlChar *utf, int pos) {
 310     xmlChar ch;
 311
 312     if (utf == NULL) return(NULL);
 313     if ( (pos < 0) || (pos >= xmlUTF8Strlen(utf)) )
 314         return(NULL);
 315     while (pos--) {
 316         if ((ch=*utf++) == 0) return(NULL);
 317         if ( ch & 0x80 ) {
 318             /* if not simple ascii, verify proper format */
 319             if ( (ch & 0xc0) != 0xc0 )
 320                 return(NULL);
 321             /* then skip over remaining bytes for this char */
 322             while ( (ch <<= 1) & 0x80 )
 323                 if ( (*utf++ & 0xc0) != 0x80 )
 324                     return(NULL);
 325         }
 326     }
 327     return((xmlChar *)utf);
 328 }
 329
 330 /**
 331  * xmlUTF8Strloc:
 332  * @utf:  the input UTF8 *
 333  * @utfchar:  the UTF8 character to be found
 334  *
 335  * a function to provide relative location of a UTF8 char
 336  *
 337  * Returns the relative character position of the desired char
 338  * or -1 if not found
 339  */
 340 int
 341 xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
 342     int i, size;
 343     xmlChar ch;
 344
 345     if (utf==NULL || utfchar==NULL) return -1;
 346     size = xmlUTF8Strsize(utfchar, 1);
 347         for(i=0; (ch=*utf) != 0; i++) {
 348             if (xmlStrncmp(utf, utfchar, size)==0)
 349                 return(i);
 350             utf++;
 351             if ( ch & 0x80 ) {
 352                 /* if not simple ascii, verify proper format */
 353                 if ( (ch & 0xc0) != 0xc0 )
 354                     return(-1);
 355                 /* then skip over remaining bytes for this char */
 356                 while ( (ch <<= 1) & 0x80 )
 357                     if ( (*utf++ & 0xc0) != 0x80 )
 358                         return(-1);
 359             }
 360         }
 361
 362     return(-1);
 363 }
 364 /**
 365  * xmlUTF8Strsub:
 366  * @utf:  a sequence of UTF-8 encoded bytes
 367  * @start: relative pos of first char
 368  * @len:   total number to copy
 369  *
 370  * Note:  positions are given in units of UTF-8 chars
 371  *
 372  * Returns a pointer to a newly created string
 373  * or NULL if any problem
 374  */
 375
 376 xmlChar *
 377 xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
 378     int     i;
 379     xmlChar ch;
 380
 381     if (utf == NULL) return(NULL);
 382     if (start < 0) return(NULL);
 383     if (len < 0) return(NULL);
 384
 385     /*
 386      * Skip over any leading chars
 387      */
 388     for (i = 0;i < start;i++) {
 389         if ((ch=*utf++) == 0) return(NULL);
 390         if ( ch & 0x80 ) {
 391             /* if not simple ascii, verify proper format */
 392             if ( (ch & 0xc0) != 0xc0 )
 393                 return(NULL);
 394             /* then skip over remaining bytes for this char */
 395             while ( (ch <<= 1) & 0x80 )
 396                 if ( (*utf++ & 0xc0) != 0x80 )
 397                     return(NULL);
 398         }
 399     }
 400
 401     return(xmlUTF8Strndup(utf, len));
 402 }
 403
 404 /************************************************************************
 405  *                                                                      *
 406  *              Conversions To/From UTF8 encoding                       *
 407  *                                                                      *
 408  ************************************************************************/
 409
 410 /**
 411  * asciiToUTF8:
 412  * @out:  a pointer to an array of bytes to store the result
 413  * @outlen:  the length of @out
 414  * @in:  a pointer to an array of ASCII chars
 415  * @inlen:  the length of @in
 416  *
 417  * Take a block of ASCII chars in and try to convert it to an UTF-8
 418  * block of chars out.
 419  * Returns 0 if success, or -1 otherwise
 420  * The value of @inlen after return is the number of octets consumed
 421  *     as the return value is positive, else unpredictable.
 422  * The value of @outlen after return is the number of ocetes consumed.
 423  */
 424 static int
 425 asciiToUTF8(unsigned char* out, int *outlen,
 426               const unsigned char* in, int *inlen) {
 427     unsigned char* outstart = out;
 428     const unsigned char* base = in;
 429     const unsigned char* processed = in;
 430     unsigned char* outend = out + *outlen;
 431     const unsigned char* inend;
 432     unsigned int c;
 433     int bits;
 434
 435     inend = in + (*inlen);
 436     while ((in < inend) && (out - outstart + 5 < *outlen)) {
 437         c= *in++;
 438
 439         /* assertion: c is a single UTF-4 value */
 440         if (out >= outend)
 441             break;
 442         if      (c <    0x80) {  *out++=  c;                bits= -6; }
 443         else {
 444             *outlen = out - outstart;
 445             *inlen = processed - base;
 446             return(-1);
 447         }
 448
 449         for ( ; bits >= 0; bits-= 6) {
 450             if (out >= outend)
 451                 break;
 452             *out++= ((c >> bits) & 0x3F) | 0x80;
 453         }
 454         processed = (const unsigned char*) in;
 455     }
 456     *outlen = out - outstart;
 457     *inlen = processed - base;
 458     return(0);
 459 }
 460
 461 /**
 462  * UTF8Toascii:
 463  * @out:  a pointer to an array of bytes to store the result
 464  * @outlen:  the length of @out
 465  * @in:  a pointer to an array of UTF-8 chars
 466  * @inlen:  the length of @in
 467  *
 468  * Take a block of UTF-8 chars in and try to convert it to an ASCII
 469  * block of chars out.
 470  *
 471  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
 472  * The value of @inlen after return is the number of octets consumed
 473  *     as the return value is positive, else unpredictable.
 474  * The value of @outlen after return is the number of ocetes consumed.
 475  */
 476 static int
 477 UTF8Toascii(unsigned char* out, int *outlen,
 478               const unsigned char* in, int *inlen) {
 479     const unsigned char* processed = in;
 480     const unsigned char* outend;
 481     const unsigned char* outstart = out;
 482     const unsigned char* instart = in;
 483     const unsigned char* inend;
 484     unsigned int c, d;
 485     int trailing;
 486
 487     if (in == NULL) {
 488         /*
 489          * initialization nothing to do
 490          */
 491         *outlen = 0;
 492         *inlen = 0;
 493         return(0);
 494     }
 495     inend = in + (*inlen);
 496     outend = out + (*outlen);
 497     while (in < inend) {
 498         d = *in++;
 499         if      (d < 0x80)  { c= d; trailing= 0; }
 500         else if (d < 0xC0) {
 501             /* trailing byte in leading position */
 502             *outlen = out - outstart;
 503             *inlen = processed - instart;
 504             return(-2);
 505         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
 506         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
 507         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
 508         else {
 509             /* no chance for this in Ascii */
 510             *outlen = out - outstart;
 511             *inlen = processed - instart;
 512             return(-2);
 513         }
 514
 515         if (inend - in < trailing) {
 516             break;
 517         }
 518
 519         for ( ; trailing; trailing--) {
 520             if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
 521                 break;
 522             c <<= 6;
 523             c |= d & 0x3F;
 524         }
 525
 526         /* assertion: c is a single UTF-4 value */
 527         if (c < 0x80) {
 528             if (out >= outend)
 529                 break;
 530             *out++ = c;
 531         } else {
 532             /* no chance for this in Ascii */
 533             *outlen = out - outstart;
 534             *inlen = processed - instart;
 535             return(-2);
 536         }
 537         processed = in;
 538     }
 539     *outlen = out - outstart;
 540     *inlen = processed - instart;
 541     return(0);
 542 }
 543
 544 /**
 545  * isolat1ToUTF8:
 546  * @out:  a pointer to an array of bytes to store the result
 547  * @outlen:  the length of @out
 548  * @in:  a pointer to an array of ISO Latin 1 chars
 549  * @inlen:  the length of @in
 550  *
 551  * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 552  * block of chars out.
 553  * Returns 0 if success, or -1 otherwise
 554  * The value of @inlen after return is the number of octets consumed
 555  *     as the return value is positive, else unpredictable.
 556  * The value of @outlen after return is the number of ocetes consumed.
 557  */
 558 int
 559 isolat1ToUTF8(unsigned char* out, int *outlen,
 560               const unsigned char* in, int *inlen) {
 561     unsigned char* outstart = out;
 562     const unsigned char* base = in;
 563     unsigned char* outend = out + *outlen;
 564     const unsigned char* inend;
 565     const unsigned char* instop;
 566     xmlChar c = *in;
 567
 568     inend = in + (*inlen);
 569     instop = inend;
 570
 571     while (in < inend && out < outend - 1) {
 572         if (c >= 0x80) {
 573             *out++= ((c >>  6) & 0x1F) | 0xC0;
 574             *out++= (c & 0x3F) | 0x80;
 575             ++in;
 576             c = *in;
 577         }
 578         if (instop - in > outend - out) instop = in + (outend - out);
 579         while (c < 0x80 && in < instop) {
 580             *out++ =  c;
 581             ++in;
 582             c = *in;
 583         }
 584     }
 585     if (in < inend && out < outend && c < 0x80) {
 586         *out++ =  c;
 587         ++in;
 588     }
 589     *outlen = out - outstart;
 590     *inlen = in - base;
 591     return(0);
 592 }
 593
 594
 595 /**
 596  * UTF8Toisolat1:
 597  * @out:  a pointer to an array of bytes to store the result
 598  * @outlen:  the length of @out
 599  * @in:  a pointer to an array of UTF-8 chars
 600  * @inlen:  the length of @in
 601  *
 602  * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 603  * block of chars out.
 604  *
 605  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
 606  * The value of @inlen after return is the number of octets consumed
 607  *     as the return value is positive, else unpredictable.
 608  * The value of @outlen after return is the number of ocetes consumed.
 609  */
 610 int
 611 UTF8Toisolat1(unsigned char* out, int *outlen,
 612               const unsigned char* in, int *inlen) {
 613     const unsigned char* processed = in;
 614     const unsigned char* outend;
 615     const unsigned char* outstart = out;
 616     const unsigned char* instart = in;
 617     const unsigned char* inend;
 618     unsigned int c, d;
 619     int trailing;
 620
 621     if (in == NULL) {
 622         /*
 623          * initialization nothing to do
 624          */
 625         *outlen = 0;
 626         *inlen = 0;
 627         return(0);
 628     }
 629     inend = in + (*inlen);
 630     outend = out + (*outlen);
 631     while (in < inend) {
 632         d = *in++;
 633         if      (d < 0x80)  { c= d; trailing= 0; }
 634         else if (d < 0xC0) {
 635             /* trailing byte in leading position */
 636             *outlen = out - outstart;
 637             *inlen = processed - instart;
 638             return(-2);
 639         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
 640         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
 641         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
 642         else {
 643             /* no chance for this in IsoLat1 */
 644             *outlen = out - outstart;
 645             *inlen = processed - instart;
 646             return(-2);
 647         }
 648
 649         if (inend - in < trailing) {
 650             break;
 651         }
 652
 653         for ( ; trailing; trailing--) {
 654             if (in >= inend)
 655                 break;
 656             if (((d= *in++) & 0xC0) != 0x80) {
 657                 *outlen = out - outstart;
 658                 *inlen = processed - instart;
 659                 return(-2);
 660             }
 661             c <<= 6;
 662             c |= d & 0x3F;
 663         }
 664
 665         /* assertion: c is a single UTF-4 value */
 666         if (c <= 0xFF) {
 667             if (out >= outend)
 668                 break;
 669             *out++ = c;
 670         } else {
 671             /* no chance for this in IsoLat1 */
 672             *outlen = out - outstart;
 673             *inlen = processed - instart;
 674             return(-2);
 675         }
 676         processed = in;
 677     }
 678     *outlen = out - outstart;
 679     *inlen = processed - instart;
 680     return(0);
 681 }
 682
 683 /**
 684  * UTF16LEToUTF8:
 685  * @out:  a pointer to an array of bytes to store the result
 686  * @outlen:  the length of @out
 687  * @inb:  a pointer to an array of UTF-16LE passwd as a byte array
 688  * @inlenb:  the length of @in in UTF-16LE chars
 689  *
 690  * Take a block of UTF-16LE ushorts in and try to convert it to an UTF-8
 691  * block of chars out. This function assume the endian property
 692  * is the same between the native type of this machine and the
 693  * inputed one.
 694  *
 695  * Returns the number of byte written, or -1 by lack of space, or -2
 696  *     if the transcoding fails (for *in is not valid utf16 string)
 697  *     The value of *inlen after return is the number of octets consumed
 698  *     as the return value is positive, else unpredictable.
 699  */
 700 static int
 701 UTF16LEToUTF8(unsigned char* out, int *outlen,
 702             const unsigned char* inb, int *inlenb)
 703 {
 704     unsigned char* outstart = out;
 705     const unsigned char* processed = inb;
 706     unsigned char* outend = out + *outlen;
 707     unsigned short* in = (unsigned short*) inb;
 708     unsigned short* inend;
 709     unsigned int c, d, inlen;
 710     unsigned char *tmp;
 711     int bits;
 712
 713     if ((*inlenb % 2) == 1)
 714         (*inlenb)--;
 715     inlen = *inlenb / 2;
 716     inend = in + inlen;
 717     while ((in < inend) && (out - outstart + 5 < *outlen)) {
 718         if (xmlLittleEndian) {
 719             c= *in++;
 720         } else {
 721             tmp = (unsigned char *) in;
 722             c = *tmp++;
 723             c = c | (((unsigned int)*tmp) << 8);
 724             in++;
 725         }
 726         if ((c & 0xFC00) == 0xD800) {    /* surrogates */
 727             if (in >= inend) {           /* (in > inend) shouldn't happens */
 728                 break;
 729             }
 730             if (xmlLittleEndian) {
 731                 d = *in++;
 732             } else {
 733                 tmp = (unsigned char *) in;
 734                 d = *tmp++;
 735                 d = d | (((unsigned int)*tmp) << 8);
 736                 in++;
 737             }
 738             if ((d & 0xFC00) == 0xDC00) {
 739                 c &= 0x03FF;
 740                 c <<= 10;
 741                 c |= d & 0x03FF;
 742                 c += 0x10000;
 743             }
 744             else {
 745                 *outlen = out - outstart;
 746                 *inlenb = processed - inb;
 747                 return(-2);
 748             }
 749         }
 750
 751         /* assertion: c is a single UTF-4 value */
 752         if (out >= outend)
 753             break;
 754         if      (c <    0x80) {  *out++=  c;                bits= -6; }
 755         else if (c <   0x800) {  *out++= ((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
 756         else if (c < 0x10000) {  *out++= ((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
 757         else                  {  *out++= ((c >> 18) & 0x07) | 0xF0;  bits= 12; }
 758
 759         for ( ; bits >= 0; bits-= 6) {
 760             if (out >= outend)
 761                 break;
 762             *out++= ((c >> bits) & 0x3F) | 0x80;
 763         }
 764         processed = (const unsigned char*) in;
 765     }
 766     *outlen = out - outstart;
 767     *inlenb = processed - inb;
 768     return(0);
 769 }
 770
 771 /**
 772  * UTF8ToUTF16LE:
 773  * @outb:  a pointer to an array of bytes to store the result
 774  * @outlen:  the length of @outb
 775  * @in:  a pointer to an array of UTF-8 chars
 776  * @inlen:  the length of @in
 777  *
 778  * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
 779  * block of chars out.
 780  *
 781  * Returns the number of byte written, or -1 by lack of space, or -2
 782  *     if the transcoding failed.
 783  */
 784 static int
 785 UTF8ToUTF16LE(unsigned char* outb, int *outlen,
 786             const unsigned char* in, int *inlen)
 787 {
 788     unsigned short* out = (unsigned short*) outb;
 789     const unsigned char* processed = in;
 790     unsigned short* outstart= out;
 791     unsigned short* outend;
 792     const unsigned char* inend= in+*inlen;
 793     unsigned int c, d;
 794     int trailing;
 795     unsigned char *tmp;
 796     unsigned short tmp1, tmp2;
 797
 798     if (in == NULL) {
 799         /*
 800          * initialization, add the Byte Order Mark
 801          */
 802         if (*outlen >= 2) {
 803             outb[0] = 0xFF;
 804             outb[1] = 0xFE;
 805             *outlen = 2;
 806             *inlen = 0;
 807 #ifdef DEBUG_ENCODING
 808             xmlGenericError(xmlGenericErrorContext,
 809                     "Added FFFE Byte Order Mark\n");
 810 #endif
 811             return(2);
 812         }
 813         *outlen = 0;
 814         *inlen = 0;
 815         return(0);
 816     }
 817     outend = out + (*outlen / 2);
 818     while (in < inend) {
 819       d= *in++;
 820       if      (d < 0x80)  { c= d; trailing= 0; }
 821       else if (d < 0xC0) {
 822           /* trailing byte in leading position */
 823           *outlen = (out - outstart) * 2;
 824           *inlen = processed - in;
 825           return(-2);
 826       } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
 827       else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
 828       else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
 829       else {
 830         /* no chance for this in UTF-16 */
 831         *outlen = (out - outstart) * 2;
 832         *inlen = processed - in;
 833         return(-2);
 834       }
 835
 836       if (inend - in < trailing) {
 837           break;
 838       }
 839
 840       for ( ; trailing; trailing--) {
 841           if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
 842               break;
 843           c <<= 6;
 844           c |= d & 0x3F;
 845       }
 846
 847       /* assertion: c is a single UTF-4 value */
 848         if (c < 0x10000) {
 849             if (out >= outend)
 850                 break;
 851             if (xmlLittleEndian) {
 852                 *out++ = c;
 853             } else {
 854                 tmp = (unsigned char *) out;
 855                 *tmp = c ;
 856                 *(tmp + 1) = c >> 8 ;
 857                 out++;
 858             }
 859         }
 860         else if (c < 0x110000) {
 861             if (out+1 >= outend)
 862                 break;
 863             c -= 0x10000;
 864             if (xmlLittleEndian) {
 865                 *out++ = 0xD800 | (c >> 10);
 866                 *out++ = 0xDC00 | (c & 0x03FF);
 867             } else {
 868                 tmp1 = 0xD800 | (c >> 10);
 869                 tmp = (unsigned char *) out;
 870                 *tmp = (unsigned char) tmp1;
 871                 *(tmp + 1) = tmp1 >> 8;
 872                 out++;
 873
 874                 tmp2 = 0xDC00 | (c & 0x03FF);
 875                 tmp = (unsigned char *) out;
 876                 *tmp  = (unsigned char) tmp2;
 877                 *(tmp + 1) = tmp2 >> 8;
 878                 out++;
 879             }
 880         }
 881         else
 882             break;
 883         processed = in;
 884     }
 885     *outlen = (out - outstart) * 2;
 886     *inlen = processed - in;
 887     return(0);
 888 }
 889
 890 /**
 891  * UTF16BEToUTF8:
 892  * @out:  a pointer to an array of bytes to store the result
 893  * @outlen:  the length of @out
 894  * @inb:  a pointer to an array of UTF-16 passwd as a byte array
 895  * @inlenb:  the length of @in in UTF-16 chars
 896  *
 897  * Take a block of UTF-16 ushorts in and try to convert it to an UTF-8
 898  * block of chars out. This function assume the endian property
 899  * is the same between the native type of this machine and the
 900  * inputed one.
 901  *
 902  * Returns the number of byte written, or -1 by lack of space, or -2
 903  *     if the transcoding fails (for *in is not valid utf16 string)
 904  * The value of *inlen after return is the number of octets consumed
 905  *     as the return value is positive, else unpredictable.
 906  */
 907 static int
 908 UTF16BEToUTF8(unsigned char* out, int *outlen,
 909             const unsigned char* inb, int *inlenb)
 910 {
 911     unsigned char* outstart = out;
 912     const unsigned char* processed = inb;
 913     unsigned char* outend = out + *outlen;
 914     unsigned short* in = (unsigned short*) inb;
 915     unsigned short* inend;
 916     unsigned int c, d, inlen;
 917     unsigned char *tmp;
 918     int bits;
 919
 920     if ((*inlenb % 2) == 1)
 921         (*inlenb)--;
 922     inlen = *inlenb / 2;
 923     inend= in + inlen;
 924     while (in < inend) {
 925         if (xmlLittleEndian) {
 926             tmp = (unsigned char *) in;
 927             c = *tmp++;
 928             c = c << 8;
 929             c = c | (unsigned int) *tmp;
 930             in++;
 931         } else {
 932             c= *in++;
 933         }
 934         if ((c & 0xFC00) == 0xD800) {    /* surrogates */
 935             if (in >= inend) {           /* (in > inend) shouldn't happens */
 936                 *outlen = out - outstart;
 937                 *inlenb = processed - inb;
 938                 return(-2);
 939             }
 940             if (xmlLittleEndian) {
 941                 tmp = (unsigned char *) in;
 942                 d = *tmp++;
 943                 d = d << 8;
 944                 d = d | (unsigned int) *tmp;
 945                 in++;
 946             } else {
 947                 d= *in++;
 948             }
 949             if ((d & 0xFC00) == 0xDC00) {
 950                 c &= 0x03FF;
 951                 c <<= 10;
 952                 c |= d & 0x03FF;
 953                 c += 0x10000;
 954             }
 955             else {
 956                 *outlen = out - outstart;
 957                 *inlenb = processed - inb;
 958                 return(-2);
 959             }
 960         }
 961
 962         /* assertion: c is a single UTF-4 value */
 963         if (out >= outend)
 964             break;
 965         if      (c <    0x80) {  *out++=  c;                bits= -6; }
 966         else if (c <   0x800) {  *out++= ((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
 967         else if (c < 0x10000) {  *out++= ((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
 968         else                  {  *out++= ((c >> 18) & 0x07) | 0xF0;  bits= 12; }
 969
 970         for ( ; bits >= 0; bits-= 6) {
 971             if (out >= outend)
 972                 break;
 973             *out++= ((c >> bits) & 0x3F) | 0x80;
 974         }
 975         processed = (const unsigned char*) in;
 976     }
 977     *outlen = out - outstart;
 978     *inlenb = processed - inb;
 979     return(0);
 980 }
 981
 982 /**
 983  * UTF8ToUTF16BE:
 984  * @outb:  a pointer to an array of bytes to store the result
 985  * @outlen:  the length of @outb
 986  * @in:  a pointer to an array of UTF-8 chars
 987  * @inlen:  the length of @in
 988  *
 989  * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
 990  * block of chars out.
 991  *
 992  * Returns the number of byte written, or -1 by lack of space, or -2
 993  *     if the transcoding failed.
 994  */
 995 static int
 996 UTF8ToUTF16BE(unsigned char* outb, int *outlen,
 997             const unsigned char* in, int *inlen)
 998 {
 999     unsigned short* out = (unsigned short*) outb;
1000     const unsigned char* processed = in;
1001     unsigned short* outstart= out;
1002     unsigned short* outend;
1003     const unsigned char* inend= in+*inlen;
1004     unsigned int c, d;
1005     int trailing;
1006     unsigned char *tmp;
1007     unsigned short tmp1, tmp2;
1008
1009     if (in == NULL) {
1010         /*
1011          * initialization, add the Byte Order Mark
1012          */
1013         if (*outlen >= 2) {
1014             outb[0] = 0xFE;
1015             outb[1] = 0xFF;
1016             *outlen = 2;
1017             *inlen = 0;
1018 #ifdef DEBUG_ENCODING
1019             xmlGenericError(xmlGenericErrorContext,
1020                     "Added FEFF Byte Order Mark\n");
1021 #endif
1022             return(2);
1023         }
1024         *outlen = 0;
1025         *inlen = 0;
1026         return(0);
1027     }
1028     outend = out + (*outlen / 2);
1029     while (in < inend) {
1030       d= *in++;
1031       if      (d < 0x80)  { c= d; trailing= 0; }
1032       else if (d < 0xC0)  {
1033           /* trailing byte in leading position */
1034           *outlen = out - outstart;
1035           *inlen = processed - in;
1036           return(-2);
1037       } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
1038       else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
1039       else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
1040       else {
1041           /* no chance for this in UTF-16 */
1042           *outlen = out - outstart;
1043           *inlen = processed - in;
1044           return(-2);
1045       }
1046
1047       if (inend - in < trailing) {
1048           break;
1049       }
1050
1051       for ( ; trailing; trailing--) {
1052           if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))  break;
1053           c <<= 6;
1054           c |= d & 0x3F;
1055       }
1056
1057       /* assertion: c is a single UTF-4 value */
1058         if (c < 0x10000) {
1059             if (out >= outend)  break;
1060             if (xmlLittleEndian) {
1061                 tmp = (unsigned char *) out;
1062                 *tmp = c >> 8;
1063                 *(tmp + 1) = c;
1064                 out++;
1065             } else {
1066                 *out++ = c;
1067             }
1068         }
1069         else if (c < 0x110000) {
1070             if (out+1 >= outend)  break;
1071             c -= 0x10000;
1072             if (xmlLittleEndian) {
1073                 tmp1 = 0xD800 | (c >> 10);
1074                 tmp = (unsigned char *) out;
1075                 *tmp = tmp1 >> 8;
1076                 *(tmp + 1) = (unsigned char) tmp1;
1077                 out++;
1078
1079                 tmp2 = 0xDC00 | (c & 0x03FF);
1080                 tmp = (unsigned char *) out;
1081                 *tmp = tmp2 >> 8;
1082                 *(tmp + 1) = (unsigned char) tmp2;
1083                 out++;
1084             } else {
1085                 *out++ = 0xD800 | (c >> 10);
1086                 *out++ = 0xDC00 | (c & 0x03FF);
1087             }
1088         }
1089         else
1090             break;
1091         processed = in;
1092     }
1093     *outlen = (out - outstart) * 2;
1094     *inlen = processed - in;
1095     return(0);
1096 }
1097
1098 /************************************************************************
1099  *                                                                      *
1100  *              Generic encoding handling routines                      *
1101  *                                                                      *
1102  ************************************************************************/
1103
1104 /**
1105  * xmlDetectCharEncoding:
1106  * @in:  a pointer to the first bytes of the XML entity, must be at least
1107  *       4 bytes long.
1108  * @len:  pointer to the length of the buffer
1109  *
1110  * Guess the encoding of the entity using the first bytes of the entity content
1111  * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
1112  *
1113  * Returns one of the XML_CHAR_ENCODING_... values.
1114  */
1115 xmlCharEncoding
1116 xmlDetectCharEncoding(const unsigned char* in, int len)
1117 {
1118     if (len >= 4) {
1119         if ((in[0] == 0x00) && (in[1] == 0x00) &&
1120             (in[2] == 0x00) && (in[3] == 0x3C))
1121             return(XML_CHAR_ENCODING_UCS4BE);
1122         if ((in[0] == 0x3C) && (in[1] == 0x00) &&
1123             (in[2] == 0x00) && (in[3] == 0x00))
1124             return(XML_CHAR_ENCODING_UCS4LE);
1125         if ((in[0] == 0x00) && (in[1] == 0x00) &&
1126             (in[2] == 0x3C) && (in[3] == 0x00))
1127             return(XML_CHAR_ENCODING_UCS4_2143);
1128         if ((in[0] == 0x00) && (in[1] == 0x3C) &&
1129             (in[2] == 0x00) && (in[3] == 0x00))
1130             return(XML_CHAR_ENCODING_UCS4_3412);
1131         if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
1132             (in[2] == 0xA7) && (in[3] == 0x94))
1133             return(XML_CHAR_ENCODING_EBCDIC);
1134         if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
1135             (in[2] == 0x78) && (in[3] == 0x6D))
1136             return(XML_CHAR_ENCODING_UTF8);
1137     }
1138     if (len >= 3) {
1139         /*
1140          * Errata on XML-1.0 June 20 2001
1141          * We now allow an UTF8 encoded BOM
1142          */
1143         if ((in[0] == 0xEF) && (in[1] == 0xBB) &&
1144             (in[2] == 0xBF))
1145             return(XML_CHAR_ENCODING_UTF8);
1146     }
1147     if (len >= 2) {
1148         if ((in[0] == 0xFE) && (in[1] == 0xFF))
1149             return(XML_CHAR_ENCODING_UTF16BE);
1150         if ((in[0] == 0xFF) && (in[1] == 0xFE))
1151             return(XML_CHAR_ENCODING_UTF16LE);
1152     }
1153     return(XML_CHAR_ENCODING_NONE);
1154 }
1155
1156 /**
1157  * xmlCleanupEncodingAliases:
1158  *
1159  * Unregisters all aliases
1160  */
1161 void
1162 xmlCleanupEncodingAliases(void) {
1163     int i;
1164
1165     if (xmlCharEncodingAliases == NULL)
1166         return;
1167
1168     for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1169         if (xmlCharEncodingAliases[i].name != NULL)
1170             xmlFree((char *) xmlCharEncodingAliases[i].name);
1171         if (xmlCharEncodingAliases[i].alias != NULL)
1172             xmlFree((char *) xmlCharEncodingAliases[i].alias);
1173     }
1174     xmlCharEncodingAliasesNb = 0;
1175     xmlCharEncodingAliasesMax = 0;
1176     xmlFree(xmlCharEncodingAliases);
1177     xmlCharEncodingAliases = NULL;
1178 }
1179
1180 /**
1181  * xmlGetEncodingAlias:
1182  * @alias:  the alias name as parsed, in UTF-8 format (ASCII actually)
1183  *
1184  * Lookup an encoding name for the given alias.
1185  *
1186  * Returns NULL if not found the original name otherwise
1187  */
1188 const char *
1189 xmlGetEncodingAlias(const char *alias) {
1190     int i;
1191     char upper[100];
1192
1193     if (alias == NULL)
1194         return(NULL);
1195
1196     if (xmlCharEncodingAliases == NULL)
1197         return(NULL);
1198
1199     for (i = 0;i < 99;i++) {
1200         upper[i] = toupper(alias[i]);
1201         if (upper[i] == 0) break;
1202     }
1203     upper[i] = 0;
1204
1205     /*
1206      * Walk down the list looking for a definition of the alias
1207      */
1208     for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1209         if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
1210             return(xmlCharEncodingAliases[i].name);
1211         }
1212     }
1213     return(NULL);
1214 }
1215
1216 /**
1217  * xmlAddEncodingAlias:
1218  * @name:  the encoding name as parsed, in UTF-8 format (ASCII actually)
1219  * @alias:  the alias name as parsed, in UTF-8 format (ASCII actually)
1220  *
1221  * Registers and alias @alias for an encoding named @name. Existing alias
1222  * will be overwritten.
1223  *
1224  * Returns 0 in case of success, -1 in case of error
1225  */
1226 int
1227 xmlAddEncodingAlias(const char *name, const char *alias) {
1228     int i;
1229     char upper[100];
1230
1231     if ((name == NULL) || (alias == NULL))
1232         return(-1);
1233
1234     for (i = 0;i < 99;i++) {
1235         upper[i] = toupper(alias[i]);
1236         if (upper[i] == 0) break;
1237     }
1238     upper[i] = 0;
1239
1240     if (xmlCharEncodingAliases == NULL) {
1241         xmlCharEncodingAliasesNb = 0;
1242         xmlCharEncodingAliasesMax = 20;
1243         xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
1244               xmlMalloc(xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
1245         if (xmlCharEncodingAliases == NULL)
1246             return(-1);
1247     } else if (xmlCharEncodingAliasesNb >= xmlCharEncodingAliasesMax) {
1248         xmlCharEncodingAliasesMax *= 2;
1249         xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
1250               xmlRealloc(xmlCharEncodingAliases,
1251                          xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
1252     }
1253     /*
1254      * Walk down the list looking for a definition of the alias
1255      */
1256     for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1257         if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
1258             /*
1259              * Replace the definition.
1260              */
1261             xmlFree((char *) xmlCharEncodingAliases[i].name);
1262             xmlCharEncodingAliases[i].name = xmlMemStrdup(name);
1263             return(0);
1264         }
1265     }
1266     /*
1267      * Add the definition
1268      */
1269     xmlCharEncodingAliases[xmlCharEncodingAliasesNb].name = xmlMemStrdup(name);
1270     xmlCharEncodingAliases[xmlCharEncodingAliasesNb].alias = xmlMemStrdup(upper);
1271     xmlCharEncodingAliasesNb++;
1272     return(0);
1273 }
1274
1275 /**
1276  * xmlDelEncodingAlias:
1277  * @alias:  the alias name as parsed, in UTF-8 format (ASCII actually)
1278  *
1279  * Unregisters an encoding alias @alias
1280  *
1281  * Returns 0 in case of success, -1 in case of error
1282  */
1283 int
1284 xmlDelEncodingAlias(const char *alias) {
1285     int i;
1286
1287     if (alias == NULL)
1288         return(-1);
1289
1290     if (xmlCharEncodingAliases == NULL)
1291         return(-1);
1292     /*
1293      * Walk down the list looking for a definition of the alias
1294      */
1295     for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1296         if (!strcmp(xmlCharEncodingAliases[i].alias, alias)) {
1297             xmlFree((char *) xmlCharEncodingAliases[i].name);
1298             xmlFree((char *) xmlCharEncodingAliases[i].alias);
1299             xmlCharEncodingAliasesNb--;
1300             memmove(&xmlCharEncodingAliases[i], &xmlCharEncodingAliases[i + 1],
1301                     sizeof(xmlCharEncodingAlias) * (xmlCharEncodingAliasesNb - i));
1302             return(0);
1303         }
1304     }
1305     return(-1);
1306 }
1307
1308 /**
1309  * xmlParseCharEncoding:
1310  * @name:  the encoding name as parsed, in UTF-8 format (ASCII actually)
1311  *
1312  * Compare the string to the known encoding schemes already known. Note
1313  * that the comparison is case insensitive accordingly to the section
1314  * [XML] 4.3.3 Character Encoding in Entities.
1315  *
1316  * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
1317  * if not recognized.
1318  */
1319 xmlCharEncoding
1320 xmlParseCharEncoding(const char* name)
1321 {
1322     const char *alias;
1323     char upper[500];
1324     int i;
1325
1326     if (name == NULL)
1327         return(XML_CHAR_ENCODING_NONE);
1328
1329     /*
1330      * Do the alias resolution
1331      */
1332     alias = xmlGetEncodingAlias(name);
1333     if (alias != NULL)
1334         name = alias;
1335
1336     for (i = 0;i < 499;i++) {
1337         upper[i] = toupper(name[i]);
1338         if (upper[i] == 0) break;
1339     }
1340     upper[i] = 0;
1341
1342     if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
1343     if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
1344     if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
1345
1346     /*
1347      * NOTE: if we were able to parse this, the endianness of UTF16 is
1348      *       already found and in use
1349      */
1350     if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
1351     if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
1352
1353     if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
1354     if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
1355     if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
1356
1357     /*
1358      * NOTE: if we were able to parse this, the endianness of UCS4 is
1359      *       already found and in use
1360      */
1361     if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
1362     if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
1363     if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
1364
1365
1366     if (!strcmp(upper,  "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
1367     if (!strcmp(upper,  "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
1368     if (!strcmp(upper,  "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
1369
1370     if (!strcmp(upper,  "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
1371     if (!strcmp(upper,  "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
1372     if (!strcmp(upper,  "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
1373
1374     if (!strcmp(upper,  "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
1375     if (!strcmp(upper,  "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
1376     if (!strcmp(upper,  "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
1377     if (!strcmp(upper,  "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
1378     if (!strcmp(upper,  "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
1379     if (!strcmp(upper,  "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
1380     if (!strcmp(upper,  "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
1381
1382     if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
1383     if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
1384     if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
1385
1386 #ifdef DEBUG_ENCODING
1387     xmlGenericError(xmlGenericErrorContext, "Unknown encoding %s\n", name);
1388 #endif
1389     return(XML_CHAR_ENCODING_ERROR);
1390 }
1391
1392 /**
1393  * xmlGetCharEncodingName:
1394  * @enc:  the encoding
1395  *
1396  * The "canonical" name for XML encoding.
1397  * C.f. http://www.w3.org/TR/REC-xml#charencoding
1398  * Section 4.3.3  Character Encoding in Entities
1399  *
1400  * Returns the canonical name for the given encoding
1401  */
1402
1403 const char*
1404 xmlGetCharEncodingName(xmlCharEncoding enc) {
1405     switch (enc) {
1406         case XML_CHAR_ENCODING_ERROR:
1407             return(NULL);
1408         case XML_CHAR_ENCODING_NONE:
1409             return(NULL);
1410         case XML_CHAR_ENCODING_UTF8:
1411             return("UTF-8");
1412         case XML_CHAR_ENCODING_UTF16LE:
1413             return("UTF-16");
1414         case XML_CHAR_ENCODING_UTF16BE:
1415             return("UTF-16");
1416         case XML_CHAR_ENCODING_EBCDIC:
1417             return("EBCDIC");
1418         case XML_CHAR_ENCODING_UCS4LE:
1419             return("ISO-10646-UCS-4");
1420         case XML_CHAR_ENCODING_UCS4BE:
1421             return("ISO-10646-UCS-4");
1422         case XML_CHAR_ENCODING_UCS4_2143:
1423             return("ISO-10646-UCS-4");
1424         case XML_CHAR_ENCODING_UCS4_3412:
1425             return("ISO-10646-UCS-4");
1426         case XML_CHAR_ENCODING_UCS2:
1427             return("ISO-10646-UCS-2");
1428         case XML_CHAR_ENCODING_8859_1:
1429             return("ISO-8859-1");
1430         case XML_CHAR_ENCODING_8859_2:
1431             return("ISO-8859-2");
1432         case XML_CHAR_ENCODING_8859_3:
1433             return("ISO-8859-3");
1434         case XML_CHAR_ENCODING_8859_4:
1435             return("ISO-8859-4");
1436         case XML_CHAR_ENCODING_8859_5:
1437             return("ISO-8859-5");
1438         case XML_CHAR_ENCODING_8859_6:
1439             return("ISO-8859-6");
1440         case XML_CHAR_ENCODING_8859_7:
1441             return("ISO-8859-7");
1442         case XML_CHAR_ENCODING_8859_8:
1443             return("ISO-8859-8");
1444         case XML_CHAR_ENCODING_8859_9:
1445             return("ISO-8859-9");
1446         case XML_CHAR_ENCODING_2022_JP:
1447             return("ISO-2022-JP");
1448         case XML_CHAR_ENCODING_SHIFT_JIS:
1449             return("Shift-JIS");
1450         case XML_CHAR_ENCODING_EUC_JP:
1451             return("EUC-JP");
1452         case XML_CHAR_ENCODING_ASCII:
1453             return(NULL);
1454     }
1455     return(NULL);
1456 }
1457
1458 /************************************************************************
1459  *                                                                      *
1460  *                      Char encoding handlers                          *
1461  *                                                                      *
1462  ************************************************************************/
1463
1464
/*
 * Registry of the char encoding handlers.
 *
 * The table is allocated by xmlInitCharEncodingHandlers() with room for
 * MAX_ENCODING_HANDLERS entries and filled by
 * xmlRegisterCharEncodingHandler(), which refuses new entries once the
 * table is full.
 */
/* the size should be growable, but it's not a big deal ... */
#define MAX_ENCODING_HANDLERS 50
/* the table itself, NULL until the module is initialized */
static xmlCharEncodingHandlerPtr *handlers = NULL;
/* number of entries of the table currently in use */
static int nbCharEncodingHandler = 0;

/*
 * The default is UTF-8 for XML, that's also the default used for the
 * parser internals, so the default encoding handler is NULL.
 * This is the value xmlFindCharEncodingHandler() returns for a NULL or
 * empty encoding name.
 */

static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
1476
1477 /**
1478  * xmlNewCharEncodingHandler:
1479  * @name:  the encoding name, in UTF-8 format (ASCII actually)
1480  * @input:  the xmlCharEncodingInputFunc to read that encoding
1481  * @output:  the xmlCharEncodingOutputFunc to write that encoding
1482  *
1483  * Create and registers an xmlCharEncodingHandler.
1484  *
1485  * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
1486  */
1487 xmlCharEncodingHandlerPtr
1488 xmlNewCharEncodingHandler(const char *name,
1489                           xmlCharEncodingInputFunc input,
1490                           xmlCharEncodingOutputFunc output) {
1491     xmlCharEncodingHandlerPtr handler;
1492     const char *alias;
1493     char upper[500];
1494     int i;
1495     char *up = 0;
1496
1497     /*
1498      * Do the alias resolution
1499      */
1500     alias = xmlGetEncodingAlias(name);
1501     if (alias != NULL)
1502         name = alias;
1503
1504     /*
1505      * Keep only the uppercase version of the encoding.
1506      */
1507     if (name == NULL) {
1508         xmlGenericError(xmlGenericErrorContext,
1509                 "xmlNewCharEncodingHandler : no name !\n");
1510         return(NULL);
1511     }
1512     for (i = 0;i < 499;i++) {
1513         upper[i] = toupper(name[i]);
1514         if (upper[i] == 0) break;
1515     }
1516     upper[i] = 0;
1517     up = xmlMemStrdup(upper);
1518     if (up == NULL) {
1519         xmlGenericError(xmlGenericErrorContext,
1520                 "xmlNewCharEncodingHandler : out of memory !\n");
1521         return(NULL);
1522     }
1523
1524     /*
1525      * allocate and fill-up an handler block.
1526      */
1527     handler = (xmlCharEncodingHandlerPtr)
1528               xmlMalloc(sizeof(xmlCharEncodingHandler));
1529     if (handler == NULL) {
1530         xmlGenericError(xmlGenericErrorContext,
1531                 "xmlNewCharEncodingHandler : out of memory !\n");
1532         return(NULL);
1533     }
1534     handler->input = input;
1535     handler->output = output;
1536     handler->name = up;
1537
1538 #ifdef LIBXML_ICONV_ENABLED
1539     handler->iconv_in = NULL;
1540     handler->iconv_out = NULL;
1541 #endif /* LIBXML_ICONV_ENABLED */
1542
1543     /*
1544      * registers and returns the handler.
1545      */
1546     xmlRegisterCharEncodingHandler(handler);
1547 #ifdef DEBUG_ENCODING
1548     xmlGenericError(xmlGenericErrorContext,
1549             "Registered encoding handler for %s\n", name);
1550 #endif
1551     return(handler);
1552 }
1553
1554 /**
1555  * xmlInitCharEncodingHandlers:
1556  *
1557  * Initialize the char encoding support, it registers the default
1558  * encoding supported.
1559  * NOTE: while public, this function usually doesn't need to be called
1560  *       in normal processing.
1561  */
1562 void
1563 xmlInitCharEncodingHandlers(void) {
1564     unsigned short int tst = 0x1234;
1565     unsigned char *ptr = (unsigned char *) &tst;
1566
1567     if (handlers != NULL) return;
1568
1569     handlers = (xmlCharEncodingHandlerPtr *)
1570         xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
1571
1572     if (*ptr == 0x12) xmlLittleEndian = 0;
1573     else if (*ptr == 0x34) xmlLittleEndian = 1;
1574     else xmlGenericError(xmlGenericErrorContext,
1575             "Odd problem at endianness detection\n");
1576
1577     if (handlers == NULL) {
1578         xmlGenericError(xmlGenericErrorContext,
1579                 "xmlInitCharEncodingHandlers : out of memory !\n");
1580         return;
1581     }
1582     xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
1583     xmlUTF16LEHandler =
1584           xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
1585     xmlUTF16BEHandler =
1586           xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
1587     xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
1588     xmlNewCharEncodingHandler("ASCII", asciiToUTF8, UTF8Toascii);
1589     xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, UTF8Toascii);
1590 #ifdef LIBXML_HTML_ENABLED
1591     xmlNewCharEncodingHandler("HTML", NULL, UTF8ToHtml);
1592 #endif
1593 }
1594
1595 /**
1596  * xmlCleanupCharEncodingHandlers:
1597  *
1598  * Cleanup the memory allocated for the char encoding support, it
1599  * unregisters all the encoding handlers and the aliases.
1600  */
1601 void
1602 xmlCleanupCharEncodingHandlers(void) {
1603     xmlCleanupEncodingAliases();
1604
1605     if (handlers == NULL) return;
1606
1607     for (;nbCharEncodingHandler > 0;) {
1608         nbCharEncodingHandler--;
1609         if (handlers[nbCharEncodingHandler] != NULL) {
1610             if (handlers[nbCharEncodingHandler]->name != NULL)
1611                 xmlFree(handlers[nbCharEncodingHandler]->name);
1612             xmlFree(handlers[nbCharEncodingHandler]);
1613         }
1614     }
1615     xmlFree(handlers);
1616     handlers = NULL;
1617     nbCharEncodingHandler = 0;
1618     xmlDefaultCharEncodingHandler = NULL;
1619 }
1620
1621 /**
1622  * xmlRegisterCharEncodingHandler:
1623  * @handler:  the xmlCharEncodingHandlerPtr handler block
1624  *
1625  * Register the char encoding handler, surprising, isn't it ?
1626  */
1627 void
1628 xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
1629     if (handlers == NULL) xmlInitCharEncodingHandlers();
1630     if (handler == NULL) {
1631         xmlGenericError(xmlGenericErrorContext,
1632                 "xmlRegisterCharEncodingHandler: NULL handler !\n");
1633         return;
1634     }
1635
1636     if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
1637         xmlGenericError(xmlGenericErrorContext,
1638         "xmlRegisterCharEncodingHandler: Too many handler registered\n");
1639         xmlGenericError(xmlGenericErrorContext,
1640                 "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
1641         return;
1642     }
1643     handlers[nbCharEncodingHandler++] = handler;
1644 }
1645
1646 /**
1647  * xmlGetCharEncodingHandler:
1648  * @enc:  an xmlCharEncoding value.
1649  *
1650  * Search in the registered set the handler able to read/write that encoding.
1651  *
1652  * Returns the handler or NULL if not found
1653  */
1654 xmlCharEncodingHandlerPtr
1655 xmlGetCharEncodingHandler(xmlCharEncoding enc) {
1656     xmlCharEncodingHandlerPtr handler;
1657
1658     if (handlers == NULL) xmlInitCharEncodingHandlers();
1659     switch (enc) {
1660         case XML_CHAR_ENCODING_ERROR:
1661             return(NULL);
1662         case XML_CHAR_ENCODING_NONE:
1663             return(NULL);
1664         case XML_CHAR_ENCODING_UTF8:
1665             return(NULL);
1666         case XML_CHAR_ENCODING_UTF16LE:
1667             return(xmlUTF16LEHandler);
1668         case XML_CHAR_ENCODING_UTF16BE:
1669             return(xmlUTF16BEHandler);
1670         case XML_CHAR_ENCODING_EBCDIC:
1671             handler = xmlFindCharEncodingHandler("EBCDIC");
1672             if (handler != NULL) return(handler);
1673             handler = xmlFindCharEncodingHandler("ebcdic");
1674             if (handler != NULL) return(handler);
1675             break;
1676         case XML_CHAR_ENCODING_UCS4BE:
1677             handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1678             if (handler != NULL) return(handler);
1679             handler = xmlFindCharEncodingHandler("UCS-4");
1680             if (handler != NULL) return(handler);
1681             handler = xmlFindCharEncodingHandler("UCS4");
1682             if (handler != NULL) return(handler);
1683             break;
1684         case XML_CHAR_ENCODING_UCS4LE:
1685             handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1686             if (handler != NULL) return(handler);
1687             handler = xmlFindCharEncodingHandler("UCS-4");
1688             if (handler != NULL) return(handler);
1689             handler = xmlFindCharEncodingHandler("UCS4");
1690             if (handler != NULL) return(handler);
1691             break;
1692         case XML_CHAR_ENCODING_UCS4_2143:
1693             break;
1694         case XML_CHAR_ENCODING_UCS4_3412:
1695             break;
1696         case XML_CHAR_ENCODING_UCS2:
1697             handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2");
1698             if (handler != NULL) return(handler);
1699             handler = xmlFindCharEncodingHandler("UCS-2");
1700             if (handler != NULL) return(handler);
1701             handler = xmlFindCharEncodingHandler("UCS2");
1702             if (handler != NULL) return(handler);
1703             break;
1704
1705             /*
1706              * We used to keep ISO Latin encodings native in the
1707              * generated data. This led to so many problems that
1708              * this has been removed. One can still change this
1709              * back by registering no-ops encoders for those
1710              */
1711         case XML_CHAR_ENCODING_8859_1:
1712             handler = xmlFindCharEncodingHandler("ISO-8859-1");
1713             if (handler != NULL) return(handler);
1714             break;
1715         case XML_CHAR_ENCODING_8859_2:
1716             handler = xmlFindCharEncodingHandler("ISO-8859-2");
1717             if (handler != NULL) return(handler);
1718             break;
1719         case XML_CHAR_ENCODING_8859_3:
1720             handler = xmlFindCharEncodingHandler("ISO-8859-3");
1721             if (handler != NULL) return(handler);
1722             break;
1723         case XML_CHAR_ENCODING_8859_4:
1724             handler = xmlFindCharEncodingHandler("ISO-8859-4");
1725             if (handler != NULL) return(handler);
1726             break;
1727         case XML_CHAR_ENCODING_8859_5:
1728             handler = xmlFindCharEncodingHandler("ISO-8859-5");
1729             if (handler != NULL) return(handler);
1730             break;
1731         case XML_CHAR_ENCODING_8859_6:
1732             handler = xmlFindCharEncodingHandler("ISO-8859-6");
1733             if (handler != NULL) return(handler);
1734             break;
1735         case XML_CHAR_ENCODING_8859_7:
1736             handler = xmlFindCharEncodingHandler("ISO-8859-7");
1737             if (handler != NULL) return(handler);
1738             break;
1739         case XML_CHAR_ENCODING_8859_8:
1740             handler = xmlFindCharEncodingHandler("ISO-8859-8");
1741             if (handler != NULL) return(handler);
1742             break;
1743         case XML_CHAR_ENCODING_8859_9:
1744             handler = xmlFindCharEncodingHandler("ISO-8859-9");
1745             if (handler != NULL) return(handler);
1746             break;
1747
1748
1749         case XML_CHAR_ENCODING_2022_JP:
1750             handler = xmlFindCharEncodingHandler("ISO-2022-JP");
1751             if (handler != NULL) return(handler);
1752             break;
1753         case XML_CHAR_ENCODING_SHIFT_JIS:
1754             handler = xmlFindCharEncodingHandler("SHIFT-JIS");
1755             if (handler != NULL) return(handler);
1756             handler = xmlFindCharEncodingHandler("SHIFT_JIS");
1757             if (handler != NULL) return(handler);
1758             handler = xmlFindCharEncodingHandler("Shift_JIS");
1759             if (handler != NULL) return(handler);
1760             break;
1761         case XML_CHAR_ENCODING_EUC_JP:
1762             handler = xmlFindCharEncodingHandler("EUC-JP");
1763             if (handler != NULL) return(handler);
1764             break;
1765         default:
1766             break;
1767     }
1768
1769 #ifdef DEBUG_ENCODING
1770     xmlGenericError(xmlGenericErrorContext,
1771             "No handler found for encoding %d\n", enc);
1772 #endif
1773     return(NULL);
1774 }
1775
1776 /**
1777  * xmlFindCharEncodingHandler:
1778  * @name:  a string describing the char encoding.
1779  *
1780  * Search in the registered set the handler able to read/write that encoding.
1781  *
1782  * Returns the handler or NULL if not found
1783  */
1784 xmlCharEncodingHandlerPtr
1785 xmlFindCharEncodingHandler(const char *name) {
1786     const char *nalias;
1787     const char *norig;
1788     xmlCharEncoding alias;
1789 #ifdef LIBXML_ICONV_ENABLED
1790     xmlCharEncodingHandlerPtr enc;
1791     iconv_t icv_in, icv_out;
1792 #endif /* LIBXML_ICONV_ENABLED */
1793     char upper[100];
1794     int i;
1795
1796     if (handlers == NULL) xmlInitCharEncodingHandlers();
1797     if (name == NULL) return(xmlDefaultCharEncodingHandler);
1798     if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
1799
1800     /*
1801      * Do the alias resolution
1802      */
1803     norig = name;
1804     nalias = xmlGetEncodingAlias(name);
1805     if (nalias != NULL)
1806         name = nalias;
1807
1808     /*
1809      * Check first for directly registered encoding names
1810      */
1811     for (i = 0;i < 99;i++) {
1812         upper[i] = toupper(name[i]);
1813         if (upper[i] == 0) break;
1814     }
1815     upper[i] = 0;
1816
1817     for (i = 0;i < nbCharEncodingHandler; i++)
1818         if (!strcmp(upper, handlers[i]->name)) {
1819 #ifdef DEBUG_ENCODING
1820             xmlGenericError(xmlGenericErrorContext,
1821                     "Found registered handler for encoding %s\n", name);
1822 #endif
1823             return(handlers[i]);
1824         }
1825
1826 #ifdef LIBXML_ICONV_ENABLED
1827     /* check whether iconv can handle this */
1828     icv_in = iconv_open("UTF-8", name);
1829     icv_out = iconv_open(name, "UTF-8");
1830     if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
1831             enc = (xmlCharEncodingHandlerPtr)
1832                   xmlMalloc(sizeof(xmlCharEncodingHandler));
1833             if (enc == NULL) {
1834                 iconv_close(icv_in);
1835                 iconv_close(icv_out);
1836                 return(NULL);
1837             }
1838             enc->name = xmlMemStrdup(name);
1839             enc->input = NULL;
1840             enc->output = NULL;
1841             enc->iconv_in = icv_in;
1842             enc->iconv_out = icv_out;
1843 #ifdef DEBUG_ENCODING
1844             xmlGenericError(xmlGenericErrorContext,
1845                     "Found iconv handler for encoding %s\n", name);
1846 #endif
1847             return enc;
1848     } else if ((icv_in != (iconv_t) -1) || icv_out != (iconv_t) -1) {
1849             xmlGenericError(xmlGenericErrorContext,
1850                     "iconv : problems with filters for '%s'\n", name);
1851     }
1852 #endif /* LIBXML_ICONV_ENABLED */
1853
1854 #ifdef DEBUG_ENCODING
1855     xmlGenericError(xmlGenericErrorContext,
1856             "No handler found for encoding %s\n", name);
1857 #endif
1858
1859     /*
1860      * Fallback using the canonical names
1861      */
1862     alias = xmlParseCharEncoding(norig);
1863     if (alias != XML_CHAR_ENCODING_ERROR) {
1864         const char* canon;
1865         canon = xmlGetCharEncodingName(alias);
1866         if ((canon != NULL) && (strcmp(name, canon))) {
1867             return(xmlFindCharEncodingHandler(canon));
1868         }
1869     }
1870
1871     return(NULL);
1872 }
1873
1874 /************************************************************************
1875  *                                                                      *
1876  *              ICONV based generic conversion functions                *
1877  *                                                                      *
1878  ************************************************************************/
1879
1880 #ifdef LIBXML_ICONV_ENABLED
1881 /**
1882  * xmlIconvWrapper:
1883  * @cd:         iconv converter data structure
1884  * @out:  a pointer to an array of bytes to store the result
1885  * @outlen:  the length of @out
1886  * @in:  a pointer to an array of ISO Latin 1 chars
1887  * @inlen:  the length of @in
1888  *
1889  * Returns 0 if success, or
1890  *     -1 by lack of space, or
1891  *     -2 if the transcoding fails (for *in is not valid utf8 string or
1892  *        the result of transformation can't fit into the encoding we want), or
1893  *     -3 if there the last byte can't form a single output char.
1894  *
1895  * The value of @inlen after return is the number of octets consumed
1896  *     as the return value is positive, else unpredictable.
1897  * The value of @outlen after return is the number of ocetes consumed.
1898  */
1899 static int
1900 xmlIconvWrapper(iconv_t cd,
1901     unsigned char *out, int *outlen,
1902     const unsigned char *in, int *inlen) {
1903
1904     size_t icv_inlen = *inlen, icv_outlen = *outlen;
1905     const char *icv_in = (const char *) in;
1906     char *icv_out = (char *) out;
1907     int ret;
1908
1909     ret = iconv(cd, (char **) &icv_in, &icv_inlen, &icv_out, &icv_outlen);
1910     if (in != NULL) {
1911         *inlen -= icv_inlen;
1912         *outlen -= icv_outlen;
1913     } else {
1914         *inlen = 0;
1915         *outlen = 0;
1916     }
1917     if ((icv_inlen != 0) || (ret == -1)) {
1918 #ifdef EILSEQ
1919         if (errno == EILSEQ) {
1920             return -2;
1921         } else
1922 #endif
1923 #ifdef E2BIG
1924         if (errno == E2BIG) {
1925             return -1;
1926         } else
1927 #endif
1928 #ifdef EINVAL
1929         if (errno == EINVAL) {
1930             return -3;
1931         } else
1932 #endif
1933         {
1934             return -3;
1935         }
1936     }
1937     return 0;
1938 }
1939 #endif /* LIBXML_ICONV_ENABLED */
1940
1941 /************************************************************************
1942  *                                                                      *
1943  *              The real API used by libxml for on-the-fly conversion   *
1944  *                                                                      *
1945  ************************************************************************/
1946
1947 /**
1948  * xmlCharEncFirstLine:
1949  * @handler:    char enconding transformation data structure
1950  * @out:  an xmlBuffer for the output.
1951  * @in:  an xmlBuffer for the input
1952  *
1953  * Front-end for the encoding handler input function, but handle only
1954  * the very first line, i.e. limit itself to 45 chars.
1955  *
1956  * Returns the number of byte written if success, or
1957  *     -1 general error
1958  *     -2 if the transcoding fails (for *in is not valid utf8 string or
1959  *        the result of transformation can't fit into the encoding we want), or
1960  */
1961 int
1962 xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1963                  xmlBufferPtr in) {
1964     int ret = -2;
1965     int written;
1966     int toconv;
1967
1968     if (handler == NULL) return(-1);
1969     if (out == NULL) return(-1);
1970     if (in == NULL) return(-1);
1971
1972     written = out->size - out->use;
1973     toconv = in->use;
1974     if (toconv * 2 >= written) {
1975         xmlBufferGrow(out, toconv);
1976         written = out->size - out->use - 1;
1977     }
1978
1979     /*
1980      * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38
1981      * 45 chars should be sufficient to reach the end of the encoding
1982      * declaration without going too far inside the document content.
1983      */
1984     written = 45;
1985
1986     if (handler->input != NULL) {
1987         ret = handler->input(&out->content[out->use], &written,
1988                              in->content, &toconv);
1989         xmlBufferShrink(in, toconv);
1990         out->use += written;
1991         out->content[out->use] = 0;
1992     }
1993 #ifdef LIBXML_ICONV_ENABLED
1994     else if (handler->iconv_in != NULL) {
1995         ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
1996                               &written, in->content, &toconv);
1997         xmlBufferShrink(in, toconv);
1998         out->use += written;
1999         out->content[out->use] = 0;
2000         if (ret == -1) ret = -3;
2001     }
2002 #endif /* LIBXML_ICONV_ENABLED */
2003 #ifdef DEBUG_ENCODING
2004     switch (ret) {
2005         case 0:
2006             xmlGenericError(xmlGenericErrorContext,
2007                     "converted %d bytes to %d bytes of input\n",
2008                     toconv, written);
2009             break;
2010         case -1:
2011             xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
2012                     toconv, written, in->use);
2013             break;
2014         case -2:
2015             xmlGenericError(xmlGenericErrorContext,
2016                     "input conversion failed due to input error\n");
2017             break;
2018         case -3:
2019             xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
2020                     toconv, written, in->use);
2021             break;
2022         default:
2023             xmlGenericError(xmlGenericErrorContext,"Unknown input conversion failed %d\n", ret);
2024     }
2025 #endif /* DEBUG_ENCODING */
2026     /*
2027      * Ignore when input buffer is not on a boundary
2028      */
2029     if (ret == -3) ret = 0;
2030     if (ret == -1) ret = 0;
2031     return(ret);
2032 }
2033
2034 /**
2035  * xmlCharEncInFunc:
2036  * @handler:    char encoding transformation data structure
2037  * @out:  an xmlBuffer for the output.
2038  * @in:  an xmlBuffer for the input
2039  *
2040  * Generic front-end for the encoding handler input function
2041  *
2042  * Returns the number of byte written if success, or
2043  *     -1 general error
2044  *     -2 if the transcoding fails (for *in is not valid utf8 string or
2045  *        the result of transformation can't fit into the encoding we want), or
2046  */
2047 int
2048 xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
2049                  xmlBufferPtr in)
2050 {
2051     int ret = -2;
2052     int written;
2053     int toconv;
2054
2055     if (handler == NULL)
2056         return (-1);
2057     if (out == NULL)
2058         return (-1);
2059     if (in == NULL)
2060         return (-1);
2061
2062     toconv = in->use;
2063     if (toconv == 0)
2064         return (0);
2065     written = out->size - out->use;
2066     if (toconv * 2 >= written) {
2067         xmlBufferGrow(out, out->size + toconv * 2);
2068         written = out->size - out->use - 1;
2069     }
2070     if (handler->input != NULL) {
2071         ret = handler->input(&out->content[out->use], &written,
2072                              in->content, &toconv);
2073         xmlBufferShrink(in, toconv);
2074         out->use += written;
2075         out->content[out->use] = 0;
2076     }
2077 #ifdef LIBXML_ICONV_ENABLED
2078     else if (handler->iconv_in != NULL) {
2079         ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
2080                               &written, in->content, &toconv);
2081         xmlBufferShrink(in, toconv);
2082         out->use += written;
2083         out->content[out->use] = 0;
2084         if (ret == -1)
2085             ret = -3;
2086     }
2087 #endif /* LIBXML_ICONV_ENABLED */
2088     switch (ret) {
2089         case 0:
2090 #ifdef DEBUG_ENCODING
2091             xmlGenericError(xmlGenericErrorContext,
2092                             "converted %d bytes to %d bytes of input\n",
2093                             toconv, written);
2094 #endif
2095             break;
2096         case -1:
2097 #ifdef DEBUG_ENCODING
2098             xmlGenericError(xmlGenericErrorContext,
2099                          "converted %d bytes to %d bytes of input, %d left\n",
2100                             toconv, written, in->use);
2101 #endif
2102             break;
2103         case -3:
2104 #ifdef DEBUG_ENCODING
2105             xmlGenericError(xmlGenericErrorContext,
2106                         "converted %d bytes to %d bytes of input, %d left\n",
2107                             toconv, written, in->use);
2108 #endif
2109             break;
2110         case -2:
2111             xmlGenericError(xmlGenericErrorContext,
2112                             "input conversion failed due to input error\n");
2113             xmlGenericError(xmlGenericErrorContext,
2114                             "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2115                             in->content[0], in->content[1],
2116                             in->content[2], in->content[3]);
2117     }
2118     /*
2119      * Ignore when input buffer is not on a boundary
2120      */
2121     if (ret == -3)
2122         ret = 0;
2123     return (written);
2124 }
2125
2126 /**
2127  * xmlCharEncOutFunc:
2128  * @handler:    char enconding transformation data structure
2129  * @out:  an xmlBuffer for the output.
2130  * @in:  an xmlBuffer for the input
2131  *
2132  * Generic front-end for the encoding handler output function
2133  * a first call with @in == NULL has to be made firs to initiate the
2134  * output in case of non-stateless encoding needing to initiate their
2135  * state or the output (like the BOM in UTF16).
2136  * In case of UTF8 sequence conversion errors for the given encoder,
2137  * the content will be automatically remapped to a CharRef sequence.
2138  *
2139  * Returns the number of byte written if success, or
2140  *     -1 general error
2141  *     -2 if the transcoding fails (for *in is not valid utf8 string or
2142  *        the result of transformation can't fit into the encoding we want), or
2143  */
2144 int
2145 xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
2146                   xmlBufferPtr in) {
2147     int ret = -2;
2148     int written;
2149     int writtentot = 0;
2150     int toconv;
2151     int output = 0;
2152
2153     if (handler == NULL) return(-1);
2154     if (out == NULL) return(-1);
2155
2156 retry:
2157
2158     written = out->size - out->use;
2159
2160     /*
2161      * First specific handling of in = NULL, i.e. the initialization call
2162      */
2163     if (in == NULL) {
2164         toconv = 0;
2165         if (handler->output != NULL) {
2166             ret = handler->output(&out->content[out->use], &written,
2167                                   NULL, &toconv);
2168             out->use += written;
2169             out->content[out->use] = 0;
2170         }
2171 #ifdef LIBXML_ICONV_ENABLED
2172         else if (handler->iconv_out != NULL) {
2173             ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
2174                                   &written, NULL, &toconv);
2175             out->use += written;
2176             out->content[out->use] = 0;
2177         }
2178 #endif /* LIBXML_ICONV_ENABLED */
2179 #ifdef DEBUG_ENCODING
2180         xmlGenericError(xmlGenericErrorContext,
2181                 "initialized encoder\n");
2182 #endif
2183         return(0);
2184     }
2185
2186     /*
2187      * Conversion itself.
2188      */
2189     toconv = in->use;
2190     if (toconv == 0)
2191         return(0);
2192     if (toconv * 2 >= written) {
2193         xmlBufferGrow(out, toconv * 2);
2194         written = out->size - out->use - 1;
2195     }
2196     if (handler->output != NULL) {
2197         ret = handler->output(&out->content[out->use], &written,
2198                               in->content, &toconv);
2199         xmlBufferShrink(in, toconv);
2200         out->use += written;
2201         writtentot += written;
2202         out->content[out->use] = 0;
2203     }
2204 #ifdef LIBXML_ICONV_ENABLED
2205     else if (handler->iconv_out != NULL) {
2206         ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
2207                               &written, in->content, &toconv);
2208         xmlBufferShrink(in, toconv);
2209         out->use += written;
2210         writtentot += written;
2211         out->content[out->use] = 0;
2212         if (ret == -1) {
2213             if (written > 0) {
2214                 /*
2215                  * Can be a limitation of iconv
2216                  */
2217                 goto retry;
2218             }
2219             ret = -3;
2220         }
2221     }
2222 #endif /* LIBXML_ICONV_ENABLED */
2223     else {
2224         xmlGenericError(xmlGenericErrorContext,
2225                 "xmlCharEncOutFunc: no output function !\n");
2226         return(-1);
2227     }
2228
2229     if (ret >= 0) output += ret;
2230
2231     /*
2232      * Attempt to handle error cases
2233      */
2234     switch (ret) {
2235         case 0:
2236 #ifdef DEBUG_ENCODING
2237             xmlGenericError(xmlGenericErrorContext,
2238                     "converted %d bytes to %d bytes of output\n",
2239                     toconv, written);
2240 #endif
2241             break;
2242         case -1:
2243 #ifdef DEBUG_ENCODING
2244             xmlGenericError(xmlGenericErrorContext,
2245                     "output conversion failed by lack of space\n");
2246 #endif
2247             break;
2248         case -3:
2249             xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of output %d left\n",
2250                     toconv, written, in->use);
2251             break;
2252         case -2: {
2253             int len = in->use;
2254             const xmlChar *utf = (const xmlChar *) in->content;
2255             int cur;
2256
2257             cur = xmlGetUTF8Char(utf, &len);
2258             if (cur > 0) {
2259                 xmlChar charref[20];
2260
2261 #ifdef DEBUG_ENCODING
2262                 xmlGenericError(xmlGenericErrorContext,
2263                         "handling output conversion error\n");
2264                 xmlGenericError(xmlGenericErrorContext,
2265                         "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2266                         in->content[0], in->content[1],
2267                         in->content[2], in->content[3]);
2268 #endif
2269                 /*
2270                  * Removes the UTF8 sequence, and replace it by a charref
2271                  * and continue the transcoding phase, hoping the error
2272                  * did not mangle the encoder state.
2273                  */
2274                 snprintf((char *) charref, sizeof(charref), "&#%d;", cur);
2275                 xmlBufferShrink(in, len);
2276                 xmlBufferAddHead(in, charref, -1);
2277
2278                 goto retry;
2279             } else {
2280                 xmlGenericError(xmlGenericErrorContext,
2281                         "output conversion failed due to conv error\n");
2282                 xmlGenericError(xmlGenericErrorContext,
2283                         "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2284                         in->content[0], in->content[1],
2285                         in->content[2], in->content[3]);
2286                 in->content[0] = ' ';
2287             }
2288             break;
2289         }
2290     }
2291     return(ret);
2292 }
2293
2294 /**
2295  * xmlCharEncCloseFunc:
2296  * @handler:    char enconding transformation data structure
2297  *
2298  * Generic front-end for encoding handler close function
2299  *
2300  * Returns 0 if success, or -1 in case of error
2301  */
2302 int
2303 xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
2304     int ret = 0;
2305     if (handler == NULL) return(-1);
2306     if (handler->name == NULL) return(-1);
2307 #ifdef LIBXML_ICONV_ENABLED
2308     /*
2309      * Iconv handlers can be used only once, free the whole block.
2310      * and the associated icon resources.
2311      */
2312     if ((handler->iconv_out != NULL) || (handler->iconv_in != NULL)) {
2313         if (handler->name != NULL)
2314             xmlFree(handler->name);
2315         handler->name = NULL;
2316         if (handler->iconv_out != NULL) {
2317             if (iconv_close(handler->iconv_out))
2318                 ret = -1;
2319             handler->iconv_out = NULL;
2320         }
2321         if (handler->iconv_in != NULL) {
2322             if (iconv_close(handler->iconv_in))
2323                 ret = -1;
2324             handler->iconv_in = NULL;
2325         }
2326         xmlFree(handler);
2327     }
2328 #endif /* LIBXML_ICONV_ENABLED */
2329 #ifdef DEBUG_ENCODING
2330     if (ret)
2331         xmlGenericError(xmlGenericErrorContext,
2332                 "failed to close the encoding handler\n");
2333     else
2334         xmlGenericError(xmlGenericErrorContext,
2335                 "closed the encoding handler\n");
2336 #endif
2337
2338     return(ret);
2339 }
2340