src/protocols/jabber/xmltok.c

   1 /*
   2 The contents of this file are subject to the Mozilla Public License
   3 Version 1.1 (the "License"); you may not use this file except in
   4 compliance with the License. You may obtain a copy of the License at
   5 http://www.mozilla.org/MPL/
   6
   7 Software distributed under the License is distributed on an "AS IS"
   8 basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
   9 License for the specific language governing rights and limitations
  10 under the License.
  11
  12 The Original Code is expat.
  13
  14 The Initial Developer of the Original Code is James Clark.
  15 Portions created by James Clark are Copyright (C) 1998, 1999
  16 James Clark. All Rights Reserved.
  17
  18 Contributor(s):
  19
  20 Alternatively, the contents of this file may be used under the terms
  21 of the GNU General Public License (the "GPL"), in which case the
  22 provisions of the GPL are applicable instead of those above.  If you
  23 wish to allow use of your version of this file only under the terms of
  24 the GPL and not to allow others to use your version of this file under
  25 the MPL, indicate your decision by deleting the provisions above and
  26 replace them with the notice and other provisions required by the
  27 GPL. If you do not delete the provisions above, a recipient may use
  28 your version of this file under either the MPL or the GPL.
  29 */
  30
  31 #include "xmldef.h"
  32 #include "xmltok.h"
  33 #include "nametab.h"
  34
  35 #define VTABLE1 \
  36   { PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok) }, \
  37   { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
  38   PREFIX(sameName), \
  39   PREFIX(nameMatchesAscii), \
  40   PREFIX(nameLength), \
  41   PREFIX(skipS), \
  42   PREFIX(getAtts), \
  43   PREFIX(charRefNumber), \
  44   PREFIX(predefinedEntityName), \
  45   PREFIX(updatePosition), \
  46   PREFIX(isPublicId)
  47
  48 #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
  49
  50 #define UCS2_GET_NAMING(pages, hi, lo) \
  51    (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F)))
  52
  53 /* A 2 byte UTF-8 representation splits the characters 11 bits
  54 between the bottom 5 and 6 bits of the bytes.
  55 We need 8 bits to index into pages, 3 bits to add to that index and
  56 5 bits to generate the mask. */
  57 #define UTF8_GET_NAMING2(pages, byte) \
  58     (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
  59                       + ((((byte)[0]) & 3) << 1) \
  60                       + ((((byte)[1]) >> 5) & 1)] \
  61          & (1 << (((byte)[1]) & 0x1F)))
  62
  63 /* A 3 byte UTF-8 representation splits the characters 16 bits
  64 between the bottom 4, 6 and 6 bits of the bytes.
  65 We need 8 bits to index into pages, 3 bits to add to that index and
  66 5 bits to generate the mask. */
  67 #define UTF8_GET_NAMING3(pages, byte) \
  68   (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
  69                              + ((((byte)[1]) >> 2) & 0xF)] \
  70                << 3) \
  71                       + ((((byte)[1]) & 3) << 1) \
  72                       + ((((byte)[2]) >> 5) & 1)] \
  73          & (1 << (((byte)[2]) & 0x1F)))
  74
  75 #define UTF8_GET_NAMING(pages, p, n) \
  76   ((n) == 2 \
  77   ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
  78   : ((n) == 3 \
  79      ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
  80      : 0))
  81
  82 #define UTF8_INVALID3(p) \
  83   ((*p) == 0xED \
  84   ? (((p)[1] & 0x20) != 0) \
  85   : ((*p) == 0xEF \
  86      ? ((p)[1] == 0xBF && ((p)[2] == 0xBF || (p)[2] == 0xBE)) \
  87      : 0))
  88
  89 #define UTF8_INVALID4(p) ((*p) == 0xF4 && ((p)[1] & 0x30) != 0)
  90
  91 static
  92 int isNever(const ENCODING *enc, const char *p)
  93 {
  94     return 0;
  95 }
  96
  97 static
  98 int utf8_isName2(const ENCODING *enc, const char *p)
  99 {
 100     return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
 101 }
 102
 103 static
 104 int utf8_isName3(const ENCODING *enc, const char *p)
 105 {
 106     return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
 107 }
 108
  /* The 4-byte name-character check always answers 0 (see isNever). */
  109 #define utf8_isName4 isNever
 110
 111 static
 112 int utf8_isNmstrt2(const ENCODING *enc, const char *p)
 113 {
 114     return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
 115 }
 116
 117 static
 118 int utf8_isNmstrt3(const ENCODING *enc, const char *p)
 119 {
 120     return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
 121 }
 122
  /* The 4-byte name-start check always answers 0 (see isNever). */
  123 #define utf8_isNmstrt4 isNever
  124
  /* The 2-byte "is invalid" check always answers 0, i.e. it never
     flags a 2-byte sequence. */
  125 #define utf8_isInvalid2 isNever
 126
 127 static
 128 int utf8_isInvalid3(const ENCODING *enc, const char *p)
 129 {
 130     return UTF8_INVALID3((const unsigned char *)p);
 131 }
 132
 133 static
 134 int utf8_isInvalid4(const ENCODING *enc, const char *p)
 135 {
 136     return UTF8_INVALID4((const unsigned char *)p);
 137 }
 138
  /* An ENCODING followed by a per-byte classification table and the
     predicates that the BYTE_TYPE / IS_NAME_CHAR / IS_NMSTRT_CHAR /
     IS_INVALID_CHAR macros in this file reach by casting an ENCODING
     pointer to this type.  `enc` is the first member. */
  139 struct normal_encoding {
  140     ENCODING enc;
      /* Byte type for every possible byte value; indexed with an
         unsigned char (see SB_BYTE_TYPE). */
  141     unsigned char type[256];
  142 #ifdef XML_MIN_SIZE
      /* Extra slots present only in XML_MIN_SIZE builds; they are filled
         by STANDARD_VTABLE(). */
  143     int (*byteType)(const ENCODING *, const char *);
  144     int (*isNameMin)(const ENCODING *, const char *);
  145     int (*isNmstrtMin)(const ENCODING *, const char *);
  146     int (*byteToAscii)(const ENCODING *, const char *);
  147     int (*charMatches)(const ENCODING *, const char *, int);
  148 #endif /* XML_MIN_SIZE */
      /* Predicates for 2-, 3- and 4-byte characters; filled by
         NORMAL_VTABLE().  Tables that omit NORMAL_VTABLE leave these
         zero-initialized. */
  149     int (*isName2)(const ENCODING *, const char *);
  150     int (*isName3)(const ENCODING *, const char *);
  151     int (*isName4)(const ENCODING *, const char *);
  152     int (*isNmstrt2)(const ENCODING *, const char *);
  153     int (*isNmstrt3)(const ENCODING *, const char *);
  154     int (*isNmstrt4)(const ENCODING *, const char *);
  155     int (*isInvalid2)(const ENCODING *, const char *);
  156     int (*isInvalid3)(const ENCODING *, const char *);
  157     int (*isInvalid4)(const ENCODING *, const char *);
  158 };
 159
  /* STANDARD_VTABLE(E) supplies the initializers for the XML_MIN_SIZE-only
     members of struct normal_encoding, using functions named E##byteType
     etc.  It ends with a comma so NORMAL_VTABLE() can follow directly.
     Without XML_MIN_SIZE those members do not exist and the macro expands
     to nothing. */
  160 #ifdef XML_MIN_SIZE
  161
  162 #define STANDARD_VTABLE(E) \
  163  E ## byteType, \
  164  E ## isNameMin, \
  165  E ## isNmstrtMin, \
  166  E ## byteToAscii, \
  167  E ## charMatches,
  168
  169 #else
  170
  171 #define STANDARD_VTABLE(E) /* as nothing */
  172
  173 #endif
 174
  /* NORMAL_VTABLE(E) supplies the initializers for the nine multi-byte
     predicate members of struct normal_encoding (isName2 .. isInvalid4),
     in declaration order, using functions or aliases named E##isName2
     etc. */
  175 #define NORMAL_VTABLE(E) \
  176  E ## isName2, \
  177  E ## isName3, \
  178  E ## isName4, \
  179  E ## isNmstrt2, \
  180  E ## isNmstrt3, \
  181  E ## isNmstrt4, \
  182  E ## isInvalid2, \
  183  E ## isInvalid3, \
  184  E ## isInvalid4
 185
 186 static int checkCharRefNumber(int);
 187
 188 #include "xmltok_impl.h"
 189
  /* In XML_MIN_SIZE builds the single-byte ("sb_") family needs entries
     for the isNameMin / isNmstrtMin slots; both always answer 0. */
  190 #ifdef XML_MIN_SIZE
  191 #define sb_isNameMin isNever
  192 #define sb_isNmstrtMin isNever
  193 #endif
  194
  /* MINBPC: read from the encoding at run time in XML_MIN_SIZE builds,
     otherwise the constant 1 for the instantiation that follows. */
  195 #ifdef XML_MIN_SIZE
  196 #define MINBPC(enc) ((enc)->minBytesPerChar)
  197 #else
  198 /* minimum bytes per character */
  199 #define MINBPC(enc) 1
  200 #endif
 201
 202 #define SB_BYTE_TYPE(enc, p) \
 203   (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
 204
  /* BYTE_TYPE for the single-byte instantiation.  XML_MIN_SIZE builds
     dispatch through the byteType pointer stored in the encoding
     (sb_byteType is the single-byte implementation); other builds use the
     table lookup directly. */
  205 #ifdef XML_MIN_SIZE
  206 static
  207 int sb_byteType(const ENCODING *enc, const char *p)
  208 {
  209     return SB_BYTE_TYPE(enc, p);
  210 }
  211 #define BYTE_TYPE(enc, p) \
  212  (((const struct normal_encoding *)(enc))->byteType(enc, p))
  213 #else
  214 #define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
  215 #endif
 216
 217 #ifdef XML_MIN_SIZE
 218 #define BYTE_TO_ASCII(enc, p) \
 219  (((const struct normal_encoding *)(enc))->byteToAscii(enc, p))
 220 static
 221 int sb_byteToAscii(const ENCODING *enc, const char *p)
 222 {
 223     return *p;
 224 }
 225 #else
 226 #define BYTE_TO_ASCII(enc, p) (*p)
 227 #endif
 228
  /* Multi-byte predicates for the single-byte/UTF-8 instantiation: n is
     pasted onto the member name, so IS_NAME_CHAR(enc, p, 2) calls the
     isName2 pointer of the struct normal_encoding behind enc. */
  229 #define IS_NAME_CHAR(enc, p, n) \
  230  (((const struct normal_encoding *)(enc))->isName ## n(enc, p))
  231 #define IS_NMSTRT_CHAR(enc, p, n) \
  232  (((const struct normal_encoding *)(enc))->isNmstrt ## n(enc, p))
  233 #define IS_INVALID_CHAR(enc, p, n) \
  234  (((const struct normal_encoding *)(enc))->isInvalid ## n(enc, p))
  235
  /* Predicates for a character of exactly MINBPC bytes: dispatched
     through the encoding in XML_MIN_SIZE builds, constant 0 otherwise. */
  236 #ifdef XML_MIN_SIZE
  237 #define IS_NAME_CHAR_MINBPC(enc, p) \
  238  (((const struct normal_encoding *)(enc))->isNameMin(enc, p))
  239 #define IS_NMSTRT_CHAR_MINBPC(enc, p) \
  240  (((const struct normal_encoding *)(enc))->isNmstrtMin(enc, p))
  241 #else
  242 #define IS_NAME_CHAR_MINBPC(enc, p) (0)
  243 #define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
  244 #endif
 245
 246 #ifdef XML_MIN_SIZE
 247 #define CHAR_MATCHES(enc, p, c) \
 248  (((const struct normal_encoding *)(enc))->charMatches(enc, p, c))
 249 static
 250 int sb_charMatches(const ENCODING *enc, const char *p, int c)
 251 {
 252     return *p == c;
 253 }
 254 #else
 255 /* c is an ASCII character */
 256 #define CHAR_MATCHES(enc, p, c) (*(p) == c)
 257 #endif
 258
  /* Instantiate the tokenizer template once with every generated name
     prefixed "normal_", using the macros defined above, then remove those
     macros so the next instantiation can define its own. */
  259 #define PREFIX(ident) normal_ ## ident
  260 #include "xmltok_impl.c"
  261
  262 #undef MINBPC
  263 #undef BYTE_TYPE
  264 #undef BYTE_TO_ASCII
  265 #undef CHAR_MATCHES
  266 #undef IS_NAME_CHAR
  267 #undef IS_NAME_CHAR_MINBPC
  268 #undef IS_NMSTRT_CHAR
  269 #undef IS_NMSTRT_CHAR_MINBPC
  270 #undef IS_INVALID_CHAR
 271
  /* Marker bits OR-ed into the first byte of an N-byte UTF-8 sequence by
     the *_toUtf8 converters in this file. */
  272 enum {  /* UTF8_cvalN is value of masked first byte of N byte sequence */
  273     UTF8_cval1 = 0x00,
  274     UTF8_cval2 = 0xc0,
  275     UTF8_cval3 = 0xe0,
  276     UTF8_cval4 = 0xf0
  277 };
 278
 279 static
 280 void utf8_toUtf8(const ENCODING *enc,
 281                  const char **fromP, const char *fromLim,
 282                  char **toP, const char *toLim)
 283 {
 284     char *to;
 285     const char *from;
 286     if (fromLim - *fromP > toLim - *toP) {
 287         /* Avoid copying partial characters. */
 288         for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--)
 289             if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
 290                 break;
 291     }
 292     for (to = *toP, from = *fromP; from != fromLim; from++, to++)
 293         *to = *from;
 294     *fromP = from;
 295     *toP = to;
 296 }
 297
 298 static
 299 void utf8_toUtf16(const ENCODING *enc,
 300                   const char **fromP, const char *fromLim,
 301                   unsigned short **toP, const unsigned short *toLim)
 302 {
 303     unsigned short *to = *toP;
 304     const char *from = *fromP;
 305     while (from != fromLim && to != toLim) {
 306         switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
 307         case BT_LEAD2:
 308             *to++ = ((from[0] & 0x1f) << 6) | (from[1] & 0x3f);
 309             from += 2;
 310             break;
 311         case BT_LEAD3:
 312             *to++ = ((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f);
 313             from += 3;
 314             break;
 315         case BT_LEAD4:
 316             {
 317                 unsigned long n;
 318                 if (to + 1 == toLim)
 319                     break;
 320                 n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
 321                 n -= 0x10000;
 322                 to[0] = (unsigned short)((n >> 10) | 0xD800);
 323                 to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
 324                 to += 2;
 325                 from += 4;
 326             }
 327             break;
 328         default:
 329             *to++ = *from++;
 330             break;
 331         }
 332     }
 333     *fromP = from;
 334     *toP = to;
 335 }
 336
  /* UTF-8 encoding tables.  Each one pairs the normal_ scanner functions
     (VTABLE1) and the UTF-8 converters with a 256-entry byte type table
     assembled from two included fragments, followed by the function
     pointer slots.  The three integers after the converters initialize the
     remaining ENCODING members (presumably minBytesPerChar, isUtf8 and
     isUtf16 -- confirm against xmltok.h).
     The "_ns" variants exist only when XML_NS is defined and include the
     ASCII fragment as-is; the other variants define BT_COLON as BT_NMSTRT
     while including it.  The "internal_" variants use iasciitab.h instead
     of asciitab.h. */
  337 #ifdef XML_NS
  338 static const struct normal_encoding utf8_encoding_ns = {
  339         { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  340             {
  341 #include "asciitab.h"
  342 #include "utf8tab.h"
  343             },
  344             STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
  345         };
  346 #endif
  347
  348 static const struct normal_encoding utf8_encoding = {
  349         { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  350             {
  351 #define BT_COLON BT_NMSTRT
  352 #include "asciitab.h"
  353 #undef BT_COLON
  354 #include "utf8tab.h"
  355             },
  356             STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
  357         };
  358
  359 #ifdef XML_NS
  360
  361 static const struct normal_encoding internal_utf8_encoding_ns = {
  362         { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  363             {
  364 #include "iasciitab.h"
  365 #include "utf8tab.h"
  366             },
  367             STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
  368         };
  369
  370 #endif
  371
  372 static const struct normal_encoding internal_utf8_encoding = {
  373         { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  374             {
  375 #define BT_COLON BT_NMSTRT
  376 #include "iasciitab.h"
  377 #undef BT_COLON
  378 #include "utf8tab.h"
  379             },
  380             STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
  381         };
 382
 383 static
 384 void latin1_toUtf8(const ENCODING *enc,
 385                    const char **fromP, const char *fromLim,
 386                    char **toP, const char *toLim)
 387 {
 388     for (;;) {
 389         unsigned char c;
 390         if (*fromP == fromLim)
 391             break;
 392         c = (unsigned char)**fromP;
 393         if (c & 0x80) {
 394             if (toLim - *toP < 2)
 395                 break;
 396             *(*toP)++ = ((c >> 6) | UTF8_cval2);
 397             *(*toP)++ = ((c & 0x3f) | 0x80);
 398             (*fromP)++;
 399         }
 400         else {
 401             if (*toP == toLim)
 402                 break;
 403             *(*toP)++ = *(*fromP)++;
 404         }
 405     }
 406 }
 407
 408 static
 409 void latin1_toUtf16(const ENCODING *enc,
 410                     const char **fromP, const char *fromLim,
 411                     unsigned short **toP, const unsigned short *toLim)
 412 {
 413     while (*fromP != fromLim && *toP != toLim)
 414         *(*toP)++ = (unsigned char)*(*fromP)++;
 415 }
 416
  /* Latin-1 encoding tables: normal_ scanner functions, the Latin-1
     converters, and a byte type table built from the ASCII fragment plus
     latin1tab.h.  Only STANDARD_VTABLE is supplied, so the multi-byte
     predicate slots stay zero-initialized.  The "_ns" variant exists only
     when XML_NS is defined; the other variant defines BT_COLON as
     BT_NMSTRT while including asciitab.h. */
  417 #ifdef XML_NS
  418
  419 static const struct normal_encoding latin1_encoding_ns = {
  420         { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
  421             {
  422 #include "asciitab.h"
  423 #include "latin1tab.h"
  424             },
  425             STANDARD_VTABLE(sb_)
  426         };
  427
  428 #endif
  429
  430 static const struct normal_encoding latin1_encoding = {
  431         { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
  432             {
  433 #define BT_COLON BT_NMSTRT
  434 #include "asciitab.h"
  435 #undef BT_COLON
  436 #include "latin1tab.h"
  437             },
  438             STANDARD_VTABLE(sb_)
  439         };
 440
 441 static
 442 void ascii_toUtf8(const ENCODING *enc,
 443                   const char **fromP, const char *fromLim,
 444                   char **toP, const char *toLim)
 445 {
 446     while (*fromP != fromLim && *toP != toLim)
 447         *(*toP)++ = *(*fromP)++;
 448 }
 449
  /* US-ASCII encoding tables: normal_ scanner functions, ascii_toUtf8 and
     latin1_toUtf16, and a byte type table that only lists the ASCII
     fragment.  The upper half of the table is left to zero-initialization,
     which the existing comment notes is BT_NONXML.  The "_ns" variant
     exists only when XML_NS is defined; the other variant defines BT_COLON
     as BT_NMSTRT while including asciitab.h. */
  450 #ifdef XML_NS
  451
  452 static const struct normal_encoding ascii_encoding_ns = {
  453         { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
  454             {
  455 #include "asciitab.h"
  456                 /* BT_NONXML == 0 */
  457             },
  458             STANDARD_VTABLE(sb_)
  459         };
  460
  461 #endif
  462
  463 static const struct normal_encoding ascii_encoding = {
  464         { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
  465             {
  466 #define BT_COLON BT_NMSTRT
  467 #include "asciitab.h"
  468 #undef BT_COLON
  469                 /* BT_NONXML == 0 */
  470             },
  471             STANDARD_VTABLE(sb_)
  472         };
 473
 474 static int unicode_byte_type(char hi, char lo)
 475 {
 476     switch ((unsigned char)hi) {
 477 case 0xD8: case 0xD9: case 0xDA: case 0xDB:
 478         return BT_LEAD4;
 479 case 0xDC: case 0xDD: case 0xDE: case 0xDF:
 480         return BT_TRAIL;
 481     case 0xFF:
 482         switch ((unsigned char)lo) {
 483         case 0xFF:
 484         case 0xFE:
 485             return BT_NONXML;
 486         }
 487         break;
 488     }
 489     return BT_NONASCII;
 490 }
 491
 492 #define DEFINE_UTF16_TO_UTF8(E) \
 493 static \
 494 void E ## toUtf8(const ENCODING *enc, \
 495          const char **fromP, const char *fromLim, \
 496          char **toP, const char *toLim) \
 497 { \
 498   const char *from; \
 499   for (from = *fromP; from != fromLim; from += 2) { \
 500     int plane; \
 501     unsigned char lo2; \
 502     unsigned char lo = GET_LO(from); \
 503     unsigned char hi = GET_HI(from); \
 504     switch (hi) { \
 505     case 0: \
 506       if (lo < 0x80) { \
 507         if (*toP == toLim) { \
 508           *fromP = from; \
 509       return; \
 510         } \
 511         *(*toP)++ = lo; \
 512         break; \
 513       } \
 514       /* fall through */ \
 515     case 0x1: case 0x2: case 0x3: \
 516     case 0x4: case 0x5: case 0x6: case 0x7: \
 517       if (toLim -  *toP < 2) { \
 518         *fromP = from; \
 519     return; \
 520       } \
 521       *(*toP)++ = ((lo >> 6) | (hi << 2) |  UTF8_cval2); \
 522       *(*toP)++ = ((lo & 0x3f) | 0x80); \
 523       break; \
 524     default: \
 525       if (toLim -  *toP < 3)  { \
 526         *fromP = from; \
 527     return; \
 528       } \
 529       /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
 530       *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
 531       *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
 532       *(*toP)++ = ((lo & 0x3f) | 0x80); \
 533       break; \
 534     case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
 535       if (toLim -  *toP < 4) { \
 536     *fromP = from; \
 537     return; \
 538       } \
 539       plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
 540       *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
 541       *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
 542       from += 2; \
 543       lo2 = GET_LO(from); \
 544       *(*toP)++ = (((lo & 0x3) << 4) \
 545                | ((GET_HI(from) & 0x3) << 2) \
 546            | (lo2 >> 6) \
 547            | 0x80); \
 548       *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
 549       break; \
 550     } \
 551   } \
 552   *fromP = from; \
 553 }
 554
 555 #define DEFINE_UTF16_TO_UTF16(E) \
 556 static \
 557 void E ## toUtf16(const ENCODING *enc, \
 558           const char **fromP, const char *fromLim, \
 559           unsigned short **toP, const unsigned short *toLim) \
 560 { \
 561   /* Avoid copying first half only of surrogate */ \
 562   if (fromLim - *fromP > ((toLim - *toP) << 1) \
 563       && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
 564     fromLim -= 2; \
 565   for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
 566     *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
 567 }
 568
  /* Little-endian accessors: byte 0 is the low byte, byte 1 the high
     byte.  With these in place the two templates are expanded into
     little2_toUtf8 and little2_toUtf16. */
  569 #define SET2(ptr, ch) \
  570   (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
  571 #define GET_LO(ptr) ((unsigned char)(ptr)[0])
  572 #define GET_HI(ptr) ((unsigned char)(ptr)[1])
  573
  574 DEFINE_UTF16_TO_UTF8(little2_)
  575 DEFINE_UTF16_TO_UTF16(little2_)
  576
  577 #undef SET2
  578 #undef GET_LO
  579 #undef GET_HI
  580
  /* Big-endian accessors: byte 0 is the high byte, byte 1 the low byte.
     The templates are expanded again into big2_toUtf8 and big2_toUtf16. */
  581 #define SET2(ptr, ch) \
  582   (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
  583 #define GET_LO(ptr) ((unsigned char)(ptr)[1])
  584 #define GET_HI(ptr) ((unsigned char)(ptr)[0])
  585
  586 DEFINE_UTF16_TO_UTF8(big2_)
  587 DEFINE_UTF16_TO_UTF16(big2_)
  588
  589 #undef SET2
  590 #undef GET_LO
  591 #undef GET_HI
 592
 593 #define LITTLE2_BYTE_TYPE(enc, p) \
 594  ((p)[1] == 0 \
 595   ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
 596   : unicode_byte_type((p)[1], (p)[0]))
 597 #define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
 598 #define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)
 599 #define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
 600   UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
 601 #define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
 602   UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])
 603
  /* little2 instantiation.
     XML_MIN_SIZE builds: the LITTLE2_* macros are wrapped in functions
     whose names match what STANDARD_VTABLE(little2_) expects, and VTABLE is
     redefined to pair the normal_ scanners (VTABLE1) with the little2
     converters.
     Other builds: PREFIX and the accessor macros are redefined for 2-byte
     little-endian units and the tokenizer template is included again,
     producing a separate set of little2_* functions.  The multi-byte
     predicates are constant 0 here. */
  604 #ifdef XML_MIN_SIZE
  605
  606 static
  607 int little2_byteType(const ENCODING *enc, const char *p)
  608 {
  609     return LITTLE2_BYTE_TYPE(enc, p);
  610 }
  611
  612 static
  613 int little2_byteToAscii(const ENCODING *enc, const char *p)
  614 {
  615     return LITTLE2_BYTE_TO_ASCII(enc, p);
  616 }
  617
  618 static
  619 int little2_charMatches(const ENCODING *enc, const char *p, int c)
  620 {
  621     return LITTLE2_CHAR_MATCHES(enc, p, c);
  622 }
  623
  624 static
  625 int little2_isNameMin(const ENCODING *enc, const char *p)
  626 {
  627     return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
  628 }
  629
  630 static
  631 int little2_isNmstrtMin(const ENCODING *enc, const char *p)
  632 {
  633     return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
  634 }
  635
  636 #undef VTABLE
  637 #define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
  638
  639 #else /* not XML_MIN_SIZE */
  640
  641 #undef PREFIX
  642 #define PREFIX(ident) little2_ ## ident
  643 #define MINBPC(enc) 2
  644 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
  645 #define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
  646 #define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
  647 #define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
  648 #define IS_NAME_CHAR(enc, p, n) 0
  649 #define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
  650 #define IS_NMSTRT_CHAR(enc, p, n) (0)
  651 #define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
  652
  653 #include "xmltok_impl.c"
  654
  655 #undef MINBPC
  656 #undef BYTE_TYPE
  657 #undef BYTE_TO_ASCII
  658 #undef CHAR_MATCHES
  659 #undef IS_NAME_CHAR
  660 #undef IS_NAME_CHAR_MINBPC
  661 #undef IS_NMSTRT_CHAR
  662 #undef IS_NMSTRT_CHAR_MINBPC
  663 #undef IS_INVALID_CHAR
  664
  665 #endif /* not XML_MIN_SIZE */
 666
  /* Little-endian 2-byte encoding tables.  The ENCODING part is VTABLE
     followed by 2, 0 and a flag that is 1 when XML_BYTE_ORDER == 12 and 0
     otherwise (presumably minBytesPerChar, isUtf8 and isUtf16 -- confirm
     against xmltok.h).  The byte type table is the ASCII fragment plus
     latin1tab.h.  "_ns" variants exist only when XML_NS is defined; the
     other variants define BT_COLON as BT_NMSTRT while including the ASCII
     fragment.  The "internal_" variants use iasciitab.h, always set the
     flag to 1, and are compiled only when XML_BYTE_ORDER != 21. */
  667 #ifdef XML_NS
  668
  669 static const struct normal_encoding little2_encoding_ns = {
  670             { VTABLE, 2, 0,
  671 #if XML_BYTE_ORDER == 12
  672                 1
  673 #else
  674 0
  675 #endif
  676             },
  677             {
  678 #include "asciitab.h"
  679 #include "latin1tab.h"
  680             },
  681             STANDARD_VTABLE(little2_)
  682         };
  683
  684 #endif
  685
  686 static const struct normal_encoding little2_encoding = {
  687             { VTABLE, 2, 0,
  688 #if XML_BYTE_ORDER == 12
  689                 1
  690 #else
  691                 0
  692 #endif
  693             },
  694             {
  695 #define BT_COLON BT_NMSTRT
  696 #include "asciitab.h"
  697 #undef BT_COLON
  698 #include "latin1tab.h"
  699             },
  700             STANDARD_VTABLE(little2_)
  701         };
  702
  703 #if XML_BYTE_ORDER != 21
  704
  705 #ifdef XML_NS
  706
  707 static const struct normal_encoding internal_little2_encoding_ns = {
  708         { VTABLE, 2, 0, 1 },
  709             {
  710 #include "iasciitab.h"
  711 #include "latin1tab.h"
  712             },
  713             STANDARD_VTABLE(little2_)
  714         };
  715
  716 #endif
  717
  718 static const struct normal_encoding internal_little2_encoding = {
  719         { VTABLE, 2, 0, 1 },
  720             {
  721 #define BT_COLON BT_NMSTRT
  722 #include "iasciitab.h"
  723 #undef BT_COLON
  724 #include "latin1tab.h"
  725             },
  726             STANDARD_VTABLE(little2_)
  727         };
  728
  729 #endif
 730
 731
 732 #define BIG2_BYTE_TYPE(enc, p) \
 733  ((p)[0] == 0 \
 734   ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
 735   : unicode_byte_type((p)[0], (p)[1]))
 736 #define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
 737 #define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)
 738 #define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
 739   UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
 740 #define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
 741   UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])
 742
  /* big2 instantiation.
     XML_MIN_SIZE builds: the BIG2_* macros are wrapped in functions whose
     names match what STANDARD_VTABLE(big2_) expects, and VTABLE is
     redefined to pair the normal_ scanners (VTABLE1) with the big2
     converters.
     Other builds: PREFIX and the accessor macros are redefined for 2-byte
     big-endian units and the tokenizer template is included again,
     producing a separate set of big2_* functions.  The multi-byte
     predicates are constant 0 here. */
  743 #ifdef XML_MIN_SIZE
  744
  745 static
  746 int big2_byteType(const ENCODING *enc, const char *p)
  747 {
  748     return BIG2_BYTE_TYPE(enc, p);
  749 }
  750
  751 static
  752 int big2_byteToAscii(const ENCODING *enc, const char *p)
  753 {
  754     return BIG2_BYTE_TO_ASCII(enc, p);
  755 }
  756
  757 static
  758 int big2_charMatches(const ENCODING *enc, const char *p, int c)
  759 {
  760     return BIG2_CHAR_MATCHES(enc, p, c);
  761 }
  762
  763 static
  764 int big2_isNameMin(const ENCODING *enc, const char *p)
  765 {
  766     return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
  767 }
  768
  769 static
  770 int big2_isNmstrtMin(const ENCODING *enc, const char *p)
  771 {
  772     return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
  773 }
  774
  775 #undef VTABLE
  776 #define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
  777
  778 #else /* not XML_MIN_SIZE */
  779
  780 #undef PREFIX
  781 #define PREFIX(ident) big2_ ## ident
  782 #define MINBPC(enc) 2
  783 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
  784 #define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
  785 #define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
  786 #define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
  787 #define IS_NAME_CHAR(enc, p, n) 0
  788 #define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
  789 #define IS_NMSTRT_CHAR(enc, p, n) (0)
  790 #define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
  791
  792 #include "xmltok_impl.c"
  793
  794 #undef MINBPC
  795 #undef BYTE_TYPE
  796 #undef BYTE_TO_ASCII
  797 #undef CHAR_MATCHES
  798 #undef IS_NAME_CHAR
  799 #undef IS_NAME_CHAR_MINBPC
  800 #undef IS_NMSTRT_CHAR
  801 #undef IS_NMSTRT_CHAR_MINBPC
  802 #undef IS_INVALID_CHAR
  803
  804 #endif /* not XML_MIN_SIZE */
 805
  /* Big-endian 2-byte encoding tables.  The ENCODING part is VTABLE
     followed by 2, 0 and a flag that is 1 when XML_BYTE_ORDER == 21 and 0
     otherwise (presumably minBytesPerChar, isUtf8 and isUtf16 -- confirm
     against xmltok.h).  The byte type table is the ASCII fragment plus
     latin1tab.h.  "_ns" variants exist only when XML_NS is defined; the
     other variants define BT_COLON as BT_NMSTRT while including the ASCII
     fragment.  The "internal_" variants use iasciitab.h, always set the
     flag to 1, and are compiled only when XML_BYTE_ORDER != 12.
     PREFIX is undefined once the last table has been emitted. */
  806 #ifdef XML_NS
  807
  808 static const struct normal_encoding big2_encoding_ns = {
  809             { VTABLE, 2, 0,
  810 #if XML_BYTE_ORDER == 21
  811                 1
  812 #else
  813 0
  814 #endif
  815             },
  816             {
  817 #include "asciitab.h"
  818 #include "latin1tab.h"
  819             },
  820             STANDARD_VTABLE(big2_)
  821         };
  822
  823 #endif
  824
  825 static const struct normal_encoding big2_encoding = {
  826             { VTABLE, 2, 0,
  827 #if XML_BYTE_ORDER == 21
  828                 1
  829 #else
  830                 0
  831 #endif
  832             },
  833             {
  834 #define BT_COLON BT_NMSTRT
  835 #include "asciitab.h"
  836 #undef BT_COLON
  837 #include "latin1tab.h"
  838             },
  839             STANDARD_VTABLE(big2_)
  840         };
  841
  842 #if XML_BYTE_ORDER != 12
  843
  844 #ifdef XML_NS
  845
  846 static const struct normal_encoding internal_big2_encoding_ns = {
  847         { VTABLE, 2, 0, 1 },
  848             {
  849 #include "iasciitab.h"
  850 #include "latin1tab.h"
  851             },
  852             STANDARD_VTABLE(big2_)
  853         };
  854
  855 #endif
  856
  857 static const struct normal_encoding internal_big2_encoding = {
  858         { VTABLE, 2, 0, 1 },
  859             {
  860 #define BT_COLON BT_NMSTRT
  861 #include "iasciitab.h"
  862 #undef BT_COLON
  863 #include "latin1tab.h"
  864             },
  865             STANDARD_VTABLE(big2_)
  866         };
  867
  868 #endif
  869
  870 #undef PREFIX
 871
 872 static
 873 int streqci(const char *s1, const char *s2)
 874 {
 875     for (;;) {
 876         char c1 = *s1++;
 877         char c2 = *s2++;
 878         if ('a' <= c1 && c1 <= 'z')
 879             c1 += 'A' - 'a';
 880         if ('a' <= c2 && c2 <= 'z')
 881             c2 += 'A' - 'a';
 882         if (c1 != c2)
 883             return 0;
 884         if (!c1)
 885             break;
 886     }
 887     return 1;
 888 }
 889
 890 static
 891 void initUpdatePosition(const ENCODING *enc, const char *ptr,
 892                         const char *end, POSITION *pos)
 893 {
 894     normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
 895 }
 896
 897 static
 898 int toAscii(const ENCODING *enc, const char *ptr, const char *end)
 899 {
 900     char buf[1];
 901     char *p = buf;
 902     XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
 903     if (p == buf)
 904         return -1;
 905     else
 906         return buf[0];
 907 }
 908
 909 static
 910 int isSpace(int c)
 911 {
 912     switch (c) {
 913     case 0x20:
 914     case 0xD:
 915     case 0xA:
 916     case 0x9:
 917         return 1;
 918     }
 919     return 0;
 920 }
 921
  /* Parse one `name = "value"` pseudo-attribute starting at ptr.
     Characters are examined one at a time through toAscii(), stepping by
     enc->minBytesPerChar.
       - Returns 1 with *namePtr == 0 when only white space (or nothing)
         remains before end; *valPtr and *nextTokPtr are not written then.
       - Returns 1 with *namePtr at the first name character, *valPtr at
         the first value character and *nextTokPtr just past the closing
         quote when a pseudo-attribute was read.
       - Returns 0 with *nextTokPtr at the offending position otherwise. */
  922 /* Return 1 if there's just optional white space
  923 or there's an S followed by name=val. */
  924 static
  925 int parsePseudoAttribute(const ENCODING *enc,
  926                          const char *ptr,
  927                          const char *end,
  928                          const char **namePtr,
  929                          const char **valPtr,
  930                          const char **nextTokPtr)
  931 {
  932     int c;
  933     char open;
  934     if (ptr == end) {
  935         *namePtr = 0;
  936         return 1;
  937     }
      /* anything that follows must be separated by white space */
  938     if (!isSpace(toAscii(enc, ptr, end))) {
  939         *nextTokPtr = ptr;
  940         return 0;
  941     }
  942     do {
  943         ptr += enc->minBytesPerChar;
  944     } while (isSpace(toAscii(enc, ptr, end)));
  945     if (ptr == end) {
  946         *namePtr = 0;
  947         return 1;
  948     }
  949     *namePtr = ptr;
      /* scan the name up to '=', allowing white space before the '=' */
  950     for (;;) {
  951         c = toAscii(enc, ptr, end);
  952         if (c == -1) {
  953             *nextTokPtr = ptr;
  954             return 0;
  955         }
  956         if (c == '=')
  957             break;
  958         if (isSpace(c)) {
  959             do {
  960                 ptr += enc->minBytesPerChar;
  961             } while (isSpace(c = toAscii(enc, ptr, end)));
  962             if (c != '=') {
  963                 *nextTokPtr = ptr;
  964                 return 0;
  965             }
  966             break;
  967         }
  968         ptr += enc->minBytesPerChar;
  969     }
      /* an empty name is an error */
  970     if (ptr == *namePtr) {
  971         *nextTokPtr = ptr;
  972         return 0;
  973     }
      /* step over '=' and any white space, then expect a quote */
  974     ptr += enc->minBytesPerChar;
  975     c = toAscii(enc, ptr, end);
  976     while (isSpace(c)) {
  977         ptr += enc->minBytesPerChar;
  978         c = toAscii(enc, ptr, end);
  979     }
  980     if (c != '"' && c != '\'') {
  981         *nextTokPtr = ptr;
  982         return 0;
  983     }
  984     open = c;
  985     ptr += enc->minBytesPerChar;
  986     *valPtr = ptr;
      /* the value may only contain letters, digits, '.', '-' and '_'
         before the matching closing quote */
  987     for (;; ptr += enc->minBytesPerChar) {
  988         c = toAscii(enc, ptr, end);
  989         if (c == open)
  990             break;
  991         if (!('a' <= c && c <= 'z')
  992                 && !('A' <= c && c <= 'Z')
  993                 && !('0' <= c && c <= '9')
  994                 && c != '.'
  995                 && c != '-'
  996                 && c != '_') {
  997             *nextTokPtr = ptr;
  998             return 0;
  999         }
 1000     }
 1001     *nextTokPtr = ptr + enc->minBytesPerChar;
 1002     return 1;
 1003 }
1004
 /* Parse the pseudo-attributes of an XML or text declaration located
    between ptr and end.  The first five and the last two characters are
    skipped before parsing (presumably the "<?xml" and "?>" delimiters --
    confirm against the caller).
    Accepted order: version, encoding, standalone.  With
    isGeneralTextEntity set, version may be missing, encoding is required
    and standalone is rejected.
    Outputs are written only when the corresponding pointer is non-null:
    *versionPtr and *encodingName receive the start of the respective
    value, *encoding the result of encodingFinder() applied to the encoding
    value, and *standalone 1 or 0 for "yes"/"no".
    Returns 1 on success, or 0 with *badPtr set to the offending
    position. */
 1005 static
 1006 int doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
 1007                    const char *,
 1008                    const char *),
 1009                    int isGeneralTextEntity,
 1010                    const ENCODING *enc,
 1011                    const char *ptr,
 1012                    const char *end,
 1013                    const char **badPtr,
 1014                    const char **versionPtr,
 1015                    const char **encodingName,
 1016                    const ENCODING **encoding,
 1017                    int *standalone)
 1018 {
 1019     const char *val = 0;
 1020     const char *name = 0;
 1021     ptr += 5 * enc->minBytesPerChar;
 1022     end -= 2 * enc->minBytesPerChar;
      /* at least one pseudo-attribute is required */
 1023     if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr) || !name) {
 1024         *badPtr = ptr;
 1025         return 0;
 1026     }
 1027     if (!XmlNameMatchesAscii(enc, name, "version")) {
 1028         if (!isGeneralTextEntity) {
 1029             *badPtr = name;
 1030             return 0;
 1031         }
 1032     }
 1033     else {
 1034         if (versionPtr)
 1035             *versionPtr = val;
 1036         if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr)) {
 1037             *badPtr = ptr;
 1038             return 0;
 1039         }
 1040         if (!name) {
 1041             if (isGeneralTextEntity) {
 1042                 /* a TextDecl must have an EncodingDecl */
 1043                 *badPtr = ptr;
 1044                 return 0;
 1045             }
 1046             return 1;
 1047         }
 1048     }
 1049     if (XmlNameMatchesAscii(enc, name, "encoding")) {
          /* the encoding name must start with an ASCII letter */
 1050         int c = toAscii(enc, val, end);
 1051         if (!('a' <= c && c <= 'z') && !('A' <= c && c <= 'Z')) {
 1052             *badPtr = val;
 1053             return 0;
 1054         }
 1055         if (encodingName)
 1056             *encodingName = val;
          /* ptr is just past the closing quote, so the value ends one
             character earlier */
 1057         if (encoding)
 1058             *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
 1059         if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr)) {
 1060             *badPtr = ptr;
 1061             return 0;
 1062         }
 1063         if (!name)
 1064             return 1;
 1065     }
 1066     if (!XmlNameMatchesAscii(enc, name, "standalone") || isGeneralTextEntity) {
 1067         *badPtr = name;
 1068         return 0;
 1069     }
 1070     if (XmlNameMatchesAscii(enc, val, "yes")) {
 1071         if (standalone)
 1072             *standalone = 1;
 1073     }
 1074     else if (XmlNameMatchesAscii(enc, val, "no")) {
 1075         if (standalone)
 1076             *standalone = 0;
 1077     }
 1078     else {
 1079         *badPtr = val;
 1080         return 0;
 1081     }
      /* only white space may follow the standalone declaration */
 1082     while (isSpace(toAscii(enc, ptr, end)))
 1083         ptr += enc->minBytesPerChar;
 1084     if (ptr != end) {
 1085         *badPtr = ptr;
 1086         return 0;
 1087     }
 1088     return 1;
 1089 }
1090
1091 static
1092 int checkCharRefNumber(int result)
1093 {
1094     switch (result >> 8) {
1095 case 0xD8: case 0xD9: case 0xDA: case 0xDB:
1096 case 0xDC: case 0xDD: case 0xDE: case 0xDF:
1097         return -1;
1098     case 0:
1099         if (latin1_encoding.type[result] == BT_NONXML)
1100             return -1;
1101         break;
1102     case 0xFF:
1103         if (result == 0xFFFE || result == 0xFFFF)
1104             return -1;
1105         break;
1106     }
1107     return result;
1108 }
1109
1110 int XmlUtf8Encode(int c, char *buf)
1111 {
1112     enum {
1113         /* minN is minimum legal resulting value for N byte sequence */
1114         min2 = 0x80,
1115         min3 = 0x800,
1116         min4 = 0x10000
1117     };
1118
1119     if (c < 0)
1120         return 0;
1121     if (c < min2) {
1122         buf[0] = (c | UTF8_cval1);
1123         return 1;
1124     }
1125     if (c < min3) {
1126         buf[0] = ((c >> 6) | UTF8_cval2);
1127         buf[1] = ((c & 0x3f) | 0x80);
1128         return 2;
1129     }
1130     if (c < min4) {
1131         buf[0] = ((c >> 12) | UTF8_cval3);
1132         buf[1] = (((c >> 6) & 0x3f) | 0x80);
1133         buf[2] = ((c & 0x3f) | 0x80);
1134         return 3;
1135     }
1136     if (c < 0x110000) {
1137         buf[0] = ((c >> 18) | UTF8_cval4);
1138         buf[1] = (((c >> 12) & 0x3f) | 0x80);
1139         buf[2] = (((c >> 6) & 0x3f) | 0x80);
1140         buf[3] = ((c & 0x3f) | 0x80);
1141         return 4;
1142     }
1143     return 0;
1144 }
1145
/* Encode the character number charNum as UTF-16 code units into buf.
   Returns the number of units stored: 1 for values below 0x10000, 2 (a
   high/low surrogate pair) for values below 0x110000, and 0 -- storing
   nothing -- for negative or larger values. */
int XmlUtf16Encode(int charNum, unsigned short *buf)
{
    int offset;

    if (charNum < 0 || charNum >= 0x110000)
        return 0;

    if (charNum < 0x10000) {
        buf[0] = (unsigned short)charNum;
        return 1;
    }

    /* Split the 20-bit offset above 0x10000 into two 10-bit halves. */
    offset = charNum - 0x10000;
    buf[0] = (unsigned short)(0xD800 + (offset >> 10));
    buf[1] = (unsigned short)(0xDC00 + (offset & 0x3FF));
    return 2;
}
1162
/* Encoding object for a caller-described encoding; it is filled in by
   XmlInitUnknownEncoding from a 256-entry table and an optional converter. */
struct unknown_encoding {
    /* Kept as the first member: the converters in this file cast the same
       ENCODING pointer to both normal_encoding and unknown_encoding. */
    struct normal_encoding normal;
    /* Caller-supplied converter; it is handed userData and a pointer to the
       current input byte, and its result is used as a character number. */
    int (*convert)(void *userData, const char *p);
    /* Opaque pointer passed back as the first argument of convert. */
    void *userData;
    /* Per input byte: the UTF-16 unit to emit, or 0 when convert must be
       called for that byte. */
    unsigned short utf16[256];
    /* Per input byte: element 0 is the UTF-8 length and the following
       elements are the UTF-8 bytes; a length of 0 means convert must be
       called for that byte. */
    char utf8[256][4];
};
1170
1171 int XmlSizeOfUnknownEncoding()
1172 {
1173     return sizeof(struct unknown_encoding);
1174 }
1175
1176 static
1177 int unknown_isName(const ENCODING *enc, const char *p)
1178 {
1179     int c = ((const struct unknown_encoding *)enc)
1180             ->convert(((const struct unknown_encoding *)enc)->userData, p);
1181     if (c & ~0xFFFF)
1182         return 0;
1183     return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1184 }
1185
1186 static
1187 int unknown_isNmstrt(const ENCODING *enc, const char *p)
1188 {
1189     int c = ((const struct unknown_encoding *)enc)
1190             ->convert(((const struct unknown_encoding *)enc)->userData, p);
1191     if (c & ~0xFFFF)
1192         return 0;
1193     return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1194 }
1195
1196 static
1197 int unknown_isInvalid(const ENCODING *enc, const char *p)
1198 {
1199     int c = ((const struct unknown_encoding *)enc)
1200             ->convert(((const struct unknown_encoding *)enc)->userData, p);
1201     return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1202 }
1203
1204 static
1205 void unknown_toUtf8(const ENCODING *enc,
1206                     const char **fromP, const char *fromLim,
1207                     char **toP, const char *toLim)
1208 {
1209     char buf[XML_UTF8_ENCODE_MAX];
1210     for (;;) {
1211         const char *utf8;
1212         int n;
1213         if (*fromP == fromLim)
1214             break;
1215         utf8 = ((const struct unknown_encoding *)enc)->utf8[(unsigned char)**fromP];
1216         n = *utf8++;
1217         if (n == 0) {
1218             int c = ((const struct unknown_encoding *)enc)
1219                     ->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
1220             n = XmlUtf8Encode(c, buf);
1221             if (n > toLim - *toP)
1222                 break;
1223             utf8 = buf;
1224             *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
1225                       - (BT_LEAD2 - 2);
1226         }
1227         else {
1228             if (n > toLim - *toP)
1229                 break;
1230             (*fromP)++;
1231         }
1232         do {
1233             *(*toP)++ = *utf8++;
1234         } while (--n != 0);
1235     }
1236 }
1237
1238 static
1239 void unknown_toUtf16(const ENCODING *enc,
1240                      const char **fromP, const char *fromLim,
1241                      unsigned short **toP, const unsigned short *toLim)
1242 {
1243     while (*fromP != fromLim && *toP != toLim) {
1244         unsigned short c
1245         = ((const struct unknown_encoding *)enc)->utf16[(unsigned char)**fromP];
1246         if (c == 0) {
1247             c = (unsigned short)((const struct unknown_encoding *)enc)
1248                 ->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
1249             *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
1250                       - (BT_LEAD2 - 2);
1251         }
1252         else
1253             (*fromP)++;
1254         *(*toP)++ = c;
1255     }
1256 }
1257
1258 ENCODING *
1259 XmlInitUnknownEncoding(void *mem,
1260                        int *table,
1261                        int (*convert)(void *userData, const char *p),
1262                        void *userData)
1263 {
1264     int i;
1265     struct unknown_encoding *e = mem;
1266     for (i = 0; i < sizeof(struct normal_encoding); i++)
1267         ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
1268     for (i = 0; i < 128; i++)
1269         if (latin1_encoding.type[i] != BT_OTHER
1270                 && latin1_encoding.type[i] != BT_NONXML
1271                 && table[i] != i)
1272             return 0;
1273     for (i = 0; i < 256; i++) {
1274         int c = table[i];
1275         if (c == -1) {
1276             e->normal.type[i] = BT_MALFORM;
1277             /* This shouldn't really get used. */
1278             e->utf16[i] = 0xFFFF;
1279             e->utf8[i][0] = 1;
1280             e->utf8[i][1] = 0;
1281         }
1282         else if (c < 0) {
1283             if (c < -4)
1284                 return 0;
1285             e->normal.type[i] = BT_LEAD2 - (c + 2);
1286             e->utf8[i][0] = 0;
1287             e->utf16[i] = 0;
1288         }
1289         else if (c < 0x80) {
1290             if (latin1_encoding.type[c] != BT_OTHER
1291                     && latin1_encoding.type[c] != BT_NONXML
1292                     && c != i)
1293                 return 0;
1294             e->normal.type[i] = latin1_encoding.type[c];
1295             e->utf8[i][0] = 1;
1296             e->utf8[i][1] = (char)c;
1297             e->utf16[i] = c == 0 ? 0xFFFF : c;
1298         }
1299         else if (checkCharRefNumber(c) < 0) {
1300             e->normal.type[i] = BT_NONXML;
1301             /* This shouldn't really get used. */
1302             e->utf16[i] = 0xFFFF;
1303             e->utf8[i][0] = 1;
1304             e->utf8[i][1] = 0;
1305         }
1306         else {
1307             if (c > 0xFFFF)
1308                 return 0;
1309             if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1310                 e->normal.type[i] = BT_NMSTRT;
1311             else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1312                 e->normal.type[i] = BT_NAME;
1313             else
1314                 e->normal.type[i] = BT_OTHER;
1315             e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1316             e->utf16[i] = c;
1317         }
1318     }
1319     e->userData = userData;
1320     e->convert = convert;
1321     if (convert) {
1322         e->normal.isName2 = unknown_isName;
1323         e->normal.isName3 = unknown_isName;
1324         e->normal.isName4 = unknown_isName;
1325         e->normal.isNmstrt2 = unknown_isNmstrt;
1326         e->normal.isNmstrt3 = unknown_isNmstrt;
1327         e->normal.isNmstrt4 = unknown_isNmstrt;
1328         e->normal.isInvalid2 = unknown_isInvalid;
1329         e->normal.isInvalid3 = unknown_isInvalid;
1330         e->normal.isInvalid4 = unknown_isInvalid;
1331     }
1332     e->normal.enc.utf8Convert = unknown_toUtf8;
1333     e->normal.enc.utf16Convert = unknown_toUtf16;
1334     return &(e->normal.enc);
1335 }
1336
/* Indices identifying the built-in encodings.
   If this enumeration is changed, getEncodingIndex and encodings
   must also be changed. */
enum {
    /* returned by getEncodingIndex for a name it does not recognise */
    UNKNOWN_ENC = -1,
    /* the following values double as indices into encodingNames */
    ISO_8859_1_ENC = 0,
    US_ASCII_ENC,
    UTF_8_ENC,
    UTF_16_ENC,
    UTF_16BE_ENC,
    UTF_16LE_ENC,
    /* must match encodingNames up to here */
    /* returned by getEncodingIndex when no name is given */
    NO_ENC
};
1350
1351 static
1352 int getEncodingIndex(const char *name)
1353 {
1354     static const char *encodingNames[] = {
1355         "ISO-8859-1",
1356         "US-ASCII",
1357         "UTF-8",
1358         "UTF-16",
1359         "UTF-16BE"
1360         "UTF-16LE",
1361     };
1362     int i;
1363     if (name == 0)
1364         return NO_ENC;
1365     for (i = 0; i < sizeof(encodingNames)/sizeof(encodingNames[0]); i++)
1366         if (streqci(name, encodingNames[i]))
1367             return i;
1368     return UNKNOWN_ENC;
1369 }
1370
/* For binary compatibility, we store the index of the encoding specified
   at initialization in the isUtf16 member.  INIT_ENC_INDEX(enc) reads that
   index back; initScan compares it with the *_ENC enumerators and uses it
   to index its encoding table. */

#define INIT_ENC_INDEX(enc) ((enc)->initEnc.isUtf16)
1375
1376 /* This is what detects the encoding.
1377 encodingTable maps from encoding indices to encodings;
1378 INIT_ENC_INDEX(enc) is the index of the external (protocol) specified encoding;
1379 state is XML_CONTENT_STATE if we're parsing an external text entity,
1380 and XML_PROLOG_STATE otherwise.
1381 */
1382
1383
1384 static
1385 int initScan(const ENCODING **encodingTable,
1386              const INIT_ENCODING *enc,
1387              int state,
1388              const char *ptr,
1389              const char *end,
1390              const char **nextTokPtr)
1391 {
1392     const ENCODING **encPtr;
1393
1394     if (ptr == end)
1395         return XML_TOK_NONE;
1396     encPtr = enc->encPtr;
1397     if (ptr + 1 == end) {
1398         /* only a single byte available for auto-detection */
1399         /* a well-formed document entity must have more than one byte */
1400         if (state != XML_CONTENT_STATE)
1401             return XML_TOK_PARTIAL;
1402         /* so we're parsing an external text entity... */
1403         /* if UTF-16 was externally specified, then we need at least 2 bytes */
1404         switch (INIT_ENC_INDEX(enc)) {
1405         case UTF_16_ENC:
1406         case UTF_16LE_ENC:
1407         case UTF_16BE_ENC:
1408             return XML_TOK_PARTIAL;
1409         }
1410         switch ((unsigned char)*ptr) {
1411         case 0xFE:
1412         case 0xFF:
1413         case 0xEF: /* possibly first byte of UTF-8 BOM */
1414             if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1415                     && state == XML_CONTENT_STATE)
1416                 break;
1417             /* fall through */
1418         case 0x00:
1419         case 0x3C:
1420             return XML_TOK_PARTIAL;
1421         }
1422     }
1423     else {
1424         switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
1425         case 0xFEFF:
1426             if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1427                     && state == XML_CONTENT_STATE)
1428                 break;
1429             *nextTokPtr = ptr + 2;
1430             *encPtr = encodingTable[UTF_16BE_ENC];
1431             return XML_TOK_BOM;
1432             /* 00 3C is handled in the default case */
1433         case 0x3C00:
1434             if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1435                     || INIT_ENC_INDEX(enc) == UTF_16_ENC)
1436                     && state == XML_CONTENT_STATE)
1437                 break;
1438             *encPtr = encodingTable[UTF_16LE_ENC];
1439             return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1440         case 0xFFFE:
1441             if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1442                     && state == XML_CONTENT_STATE)
1443                 break;
1444             *nextTokPtr = ptr + 2;
1445             *encPtr = encodingTable[UTF_16LE_ENC];
1446             return XML_TOK_BOM;
1447         case 0xEFBB:
1448             /* Maybe a UTF-8 BOM (EF BB BF) */
1449             /* If there's an explicitly specified (external) encoding
1450                of ISO-8859-1 or some flavour of UTF-16
1451                and this is an external text entity,
1452             don't look for the BOM,
1453                because it might be a legal data. */
1454             if (state == XML_CONTENT_STATE) {
1455                 int e = INIT_ENC_INDEX(enc);
1456                 if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC || e == UTF_16_ENC)
1457                     break;
1458             }
1459             if (ptr + 2 == end)
1460                 return XML_TOK_PARTIAL;
1461             if ((unsigned char)ptr[2] == 0xBF) {
1462                 *encPtr = encodingTable[UTF_8_ENC];
1463                 return XML_TOK_BOM;
1464             }
1465             break;
1466         default:
1467             if (ptr[0] == '\0') {
1468                 /* 0 isn't a legal data character. Furthermore a document entity can only
1469                    start with ASCII characters.  So the only way this can fail to be big-endian
1470                    UTF-16 if it it's an external parsed general entity that's labelled as
1471                    UTF-16LE. */
1472                 if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1473                     break;
1474                 *encPtr = encodingTable[UTF_16BE_ENC];
1475                 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1476             }
1477             else if (ptr[1] == '\0') {
1478                 /* We could recover here in the case:
1479                     - parsing an external entity
1480                     - second byte is 0
1481                     - no externally specified encoding
1482                     - no encoding declaration
1483                    by assuming UTF-16LE.  But we don't, because this would mean when
1484                    presented just with a single byte, we couldn't reliably determine
1485                    whether we needed further bytes. */
1486                 if (state == XML_CONTENT_STATE)
1487                     break;
1488                 *encPtr = encodingTable[UTF_16LE_ENC];
1489                 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1490             }
1491             break;
1492         }
1493     }
1494     *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1495     return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1496 }
1497
1498
/* xmltok_ns.c is included once with NS() and ns() expanding to their
   argument unchanged. */
#define NS(x) x
#define ns(x) x
#include "xmltok_ns.c"
#undef NS
#undef ns

#ifdef XML_NS

/* When XML_NS is defined it is included a second time with NS() appending
   "NS" and ns() appending "_ns" to the argument -- presumably so that the
   identifiers it wraps in these macros get a second, distinctly named
   set of definitions. */
#define NS(x) x ## NS
#define ns(x) x ## _ns

#include "xmltok_ns.c"

#undef NS
#undef ns
1514
1515 ENCODING *
1516 XmlInitUnknownEncodingNS(void *mem,
1517                          int *table,
1518                          int (*convert)(void *userData, const char *p),
1519                          void *userData)
1520 {
1521     ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1522     if (enc)
1523         ((struct normal_encoding *)enc)->type[':'] = BT_COLON;
1524     return enc;
1525 }
1526
1527 #endif /* XML_NS */