2 The contents of this file are subject to the Mozilla Public License
3 Version 1.1 (the "License"); you may not use this file except in
4 compliance with the License. You may obtain a copy of the License at
5 http://www.mozilla.org/MPL/
7 Software distributed under the License is distributed on an "AS IS"
8 basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
9 License for the specific language governing rights and limitations
12 The Original Code is expat.
14 The Initial Developer of the Original Code is James Clark.
15 Portions created by James Clark are Copyright (C) 1998, 1999
16 James Clark. All Rights Reserved.
20 Alternatively, the contents of this file may be used under the terms
21 of the GNU General Public License (the "GPL"), in which case the
22 provisions of the GPL are applicable instead of those above. If you
23 wish to allow use of your version of this file only under the terms of
24 the GPL and not to allow others to use your version of this file under
25 the MPL, indicate your decision by deleting the provisions above and
26 replace them with the notice and other provisions required by the
27 GPL. If you do not delete the provisions above, a recipient may use
28 your version of this file under either the MPL or the GPL.
36 { PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok) }, \
37 { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
39 PREFIX(nameMatchesAscii), \
43 PREFIX(charRefNumber), \
44 PREFIX(predefinedEntityName), \
45 PREFIX(updatePosition), \
/* Complete vtable initializer list: the VTABLE1 entries followed by the
   PREFIX-qualified UTF-8 and UTF-16 conversion routines. */
48 #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
/* Test the namingBitmap bit for the 16-bit character whose high byte is hi
   and whose low byte is lo; the result is non-zero when the bit is set.
   pages[hi] selects a group of eight bitmap words (hence the << 3), the top
   three bits of lo pick the word inside that group, and the low five bits of
   lo pick the bit inside the word.
   The mask is built from 1u rather than 1: selecting bit 31 with a signed 1
   would shift into the sign bit of an int, which is undefined behaviour. */
#define UCS2_GET_NAMING(pages, hi, lo) \
  (namingBitmap[((pages)[hi] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
/* A 2 byte UTF-8 representation splits the character's 11 bits
   between the bottom 5 and 6 bits of the bytes.
   We need 8 bits to index into pages, 3 bits to add to that index and
   5 bits to generate the mask.
   The result is non-zero when the character's bit is set in namingBitmap.
   The mask uses 1u so that selecting bit 31 never shifts into the sign bit
   of an int (undefined behaviour for a signed 1). */
#define UTF8_GET_NAMING2(pages, byte) \
    (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                  + ((((byte)[0]) & 3) << 1) \
                  + ((((byte)[1]) >> 5) & 1)] \
     & (1u << (((byte)[1]) & 0x1F)))
63 /* A 3 byte UTF-8 representation splits the character's 16 bits
64 between the bottom 4, 6 and 6 bits of the bytes.
65 We need 8 bits to index into pages, 3 bits to add to that index and
66 5 bits to generate the mask. */
67 #define UTF8_GET_NAMING3(pages, byte) \
68 (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
69 + ((((byte)[1]) >> 2) & 0xF)] \
71 + ((((byte)[1]) & 3) << 1) \
72 + ((((byte)[2]) >> 5) & 1)] \
73 & (1 << (((byte)[2]) & 0x1F)))
75 #define UTF8_GET_NAMING(pages, p, n) \
77 ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
79 ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
82 #define UTF8_INVALID3(p) \
84 ? (((p)[1] & 0x20) != 0) \
86 ? ((p)[1] == 0xBF && ((p)[2] == 0xBF || (p)[2] == 0xBE)) \
/* Non-zero when the 4-byte UTF-8 sequence at p has lead byte 0xF4 and a
   second byte with either of the 0x30 bits set, i.e. a sequence that would
   decode to a value above U+10FFFF.  p must point at unsigned bytes.
   The argument is fully parenthesized so that pointer expressions such as
   (buf + 1) are dereferenced as a whole. */
#define UTF8_INVALID4(p) ((*(p)) == 0xF4 && ((p)[1] & 0x30) != 0)
92 int isNever(const ENCODING
*enc
, const char *p
)
/* Name-character test for the 2-byte UTF-8 sequence at p: looks the sequence
   up in namePages through UTF8_GET_NAMING2 and returns the (non-zero when
   set) bitmap result.  enc is not used by the visible body. */
98 int utf8_isName2(const ENCODING
*enc
, const char *p
)
100 return UTF8_GET_NAMING2(namePages
, (const unsigned char *)p
);
/* Name-character test for the 3-byte UTF-8 sequence at p: looks the sequence
   up in namePages through UTF8_GET_NAMING3.  enc is not used by the visible
   body. */
104 int utf8_isName3(const ENCODING
*enc
, const char *p
)
106 return UTF8_GET_NAMING3(namePages
, (const unsigned char *)p
);
/* The 4-byte name test is aliased to isNever (its body is not visible here;
   presumably it always reports "no"). */
109 #define utf8_isName4 isNever
/* Name-start-character test for the 2-byte UTF-8 sequence at p: looks the
   sequence up in nmstrtPages through UTF8_GET_NAMING2.  enc is not used by
   the visible body. */
112 int utf8_isNmstrt2(const ENCODING
*enc
, const char *p
)
114 return UTF8_GET_NAMING2(nmstrtPages
, (const unsigned char *)p
);
/* Name-start-character test for the 3-byte UTF-8 sequence at p: looks the
   sequence up in nmstrtPages through UTF8_GET_NAMING3.  enc is not used by
   the visible body. */
118 int utf8_isNmstrt3(const ENCODING
*enc
, const char *p
)
120 return UTF8_GET_NAMING3(nmstrtPages
, (const unsigned char *)p
);
/* The 4-byte name-start test and the 2-byte validity test are both aliased
   to isNever (body not visible here; presumably it always reports "no"). */
123 #define utf8_isNmstrt4 isNever
125 #define utf8_isInvalid2 isNever
/* Validity test for the 3-byte UTF-8 sequence at p: returns the result of
   UTF8_INVALID3 applied to the bytes viewed as unsigned char.  enc is not
   used by the visible body. */
128 int utf8_isInvalid3(const ENCODING
*enc
, const char *p
)
130 return UTF8_INVALID3((const unsigned char *)p
);
/* Validity test for the 4-byte UTF-8 sequence at p: returns the result of
   UTF8_INVALID4 (lead byte 0xF4 with second-byte bits 0x30 set) applied to
   the bytes viewed as unsigned char.  enc is not used by the visible body. */
134 int utf8_isInvalid4(const ENCODING
*enc
, const char *p
)
136 return UTF8_INVALID4((const unsigned char *)p
);
/* Encoding description used by the byte-oriented tokenizers in this file.
   Code elsewhere in this file casts a const ENCODING * to this struct and
   also reaches an embedded encoding through a member named enc; that member's
   declaration is not among the lines visible here. */
139 struct normal_encoding
{
/* Per-byte classification table, indexed by the unsigned byte value. */
141 unsigned char type
[256];
/* Function-pointer forms of the byte-level operations.  The #endif below
   shows these five belong to an XML_MIN_SIZE conditional whose opening
   directive is not visible in this listing. */
143 int (*byteType
)(const ENCODING
*, const char *);
144 int (*isNameMin
)(const ENCODING
*, const char *);
145 int (*isNmstrtMin
)(const ENCODING
*, const char *);
146 int (*byteToAscii
)(const ENCODING
*, const char *);
147 int (*charMatches
)(const ENCODING
*, const char *, int);
148 #endif /* XML_MIN_SIZE */
/* Name-character predicates for 2-, 3- and 4-byte sequences; dispatched by
   IS_NAME_CHAR(enc, p, n). */
149 int (*isName2
)(const ENCODING
*, const char *);
150 int (*isName3
)(const ENCODING
*, const char *);
151 int (*isName4
)(const ENCODING
*, const char *);
/* Name-start-character predicates; dispatched by IS_NMSTRT_CHAR(enc, p, n). */
152 int (*isNmstrt2
)(const ENCODING
*, const char *);
153 int (*isNmstrt3
)(const ENCODING
*, const char *);
154 int (*isNmstrt4
)(const ENCODING
*, const char *);
/* Invalid-sequence predicates; dispatched by IS_INVALID_CHAR(enc, p, n). */
155 int (*isInvalid2
)(const ENCODING
*, const char *);
156 int (*isInvalid3
)(const ENCODING
*, const char *);
157 int (*isInvalid4
)(const ENCODING
*, const char *);
162 #define STANDARD_VTABLE(E) \
171 #define STANDARD_VTABLE(E) /* as nothing */
175 #define NORMAL_VTABLE(E) \
186 static int checkCharRefNumber(int);
188 #include "xmltok_impl.h"
/* Single-byte encodings use isNever for both minimum-width name tests. */
191 #define sb_isNameMin isNever
192 #define sb_isNmstrtMin isNever
/* Two alternative definitions of MINBPC follow; the conditional directives
   that choose between them are not visible in this listing.  The first reads
   the value from the encoding, the second fixes it at 1. */
196 #define MINBPC(enc) ((enc)->minBytesPerChar)
198 /* minimum bytes per character */
199 #define MINBPC(enc) 1
/* Classify the single byte at p using the per-byte type table of the
   normal_encoding that enc points to.  The byte is converted to unsigned char
   before indexing so values above 0x7F cannot produce a negative index.
   The cast keeps the const qualification of enc, matching the other accessor
   macros in this file; the table is only read. */
#define SB_BYTE_TYPE(enc, p) \
  (((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
/* Function form of SB_BYTE_TYPE: returns the type-table entry for the byte
   at p in the encoding enc. */
207 int sb_byteType(const ENCODING
*enc
, const char *p
)
209 return SB_BYTE_TYPE(enc
, p
);
/* Two alternative definitions of BYTE_TYPE (the selecting conditional is not
   visible here): the first dispatches through the encoding's byteType
   function pointer, the second expands directly to the table lookup. */
211 #define BYTE_TYPE(enc, p) \
212 (((const struct normal_encoding *)(enc))->byteType(enc, p))
214 #define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
/* Function-pointer form of BYTE_TO_ASCII: calls the encoding's byteToAscii
   member with (enc, p). */
218 #define BYTE_TO_ASCII(enc, p) \
219 (((const struct normal_encoding *)(enc))->byteToAscii(enc, p))
221 int sb_byteToAscii(const ENCODING
*enc
, const char *p
)
/* Direct form of BYTE_TO_ASCII for single-byte encodings: the byte at p is
   its own ASCII value.  enc is unused.  The argument is parenthesized so
   that pointer expressions such as (ptr + 1) are dereferenced as a whole. */
#define BYTE_TO_ASCII(enc, p) (*(p))
/* Multi-byte character predicates.  n is the byte length of the sequence at
   p (2, 3 or 4 in this file) and is token-pasted onto the member name, so
   IS_NAME_CHAR(enc, p, 2) calls the encoding's isName2 member, and so on. */
229 #define IS_NAME_CHAR(enc, p, n) \
230 (((const struct normal_encoding *)(enc))->isName ## n(enc, p))
231 #define IS_NMSTRT_CHAR(enc, p, n) \
232 (((const struct normal_encoding *)(enc))->isNmstrt ## n(enc, p))
233 #define IS_INVALID_CHAR(enc, p, n) \
234 (((const struct normal_encoding *)(enc))->isInvalid ## n(enc, p))
/* Minimum-width name tests.  Two alternative definitions follow (the
   selecting conditional is not visible here): the first pair dispatches
   through the encoding's isNameMin / isNmstrtMin members, the second pair is
   the constant 0. */
237 #define IS_NAME_CHAR_MINBPC(enc, p) \
238 (((const struct normal_encoding *)(enc))->isNameMin(enc, p))
239 #define IS_NMSTRT_CHAR_MINBPC(enc, p) \
240 (((const struct normal_encoding *)(enc))->isNmstrtMin(enc, p))
242 #define IS_NAME_CHAR_MINBPC(enc, p) (0)
243 #define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
/* Function-pointer form of CHAR_MATCHES: calls the encoding's charMatches
   member with (enc, p, c). */
247 #define CHAR_MATCHES(enc, p, c) \
248 (((const struct normal_encoding *)(enc))->charMatches(enc, p, c))
250 int sb_charMatches(const ENCODING
*enc
, const char *p
, int c
)
/* c is an ASCII character.
   Direct form for single-byte encodings: true when the byte at p equals c.
   enc is unused.  c is parenthesized so that an expression argument (for
   example a conditional) is compared as a whole rather than being split by
   the precedence of ==. */
#define CHAR_MATCHES(enc, p, c) (*(p) == (c))
259 #define PREFIX(ident) normal_ ## ident
260 #include "xmltok_impl.c"
267 #undef IS_NAME_CHAR_MINBPC
268 #undef IS_NMSTRT_CHAR
269 #undef IS_NMSTRT_CHAR_MINBPC
270 #undef IS_INVALID_CHAR
272 enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */
280 void utf8_toUtf8(const ENCODING
*enc
,
281 const char **fromP
, const char *fromLim
,
282 char **toP
, const char *toLim
)
286 if (fromLim
- *fromP
> toLim
- *toP
) {
287 /* Avoid copying partial characters. */
288 for (fromLim
= *fromP
+ (toLim
- *toP
); fromLim
> *fromP
; fromLim
--)
289 if (((unsigned char)fromLim
[-1] & 0xc0) != 0x80)
292 for (to
= *toP
, from
= *fromP
; from
!= fromLim
; from
++, to
++)
299 void utf8_toUtf16(const ENCODING
*enc
,
300 const char **fromP
, const char *fromLim
,
301 unsigned short **toP
, const unsigned short *toLim
)
303 unsigned short *to
= *toP
;
304 const char *from
= *fromP
;
305 while (from
!= fromLim
&& to
!= toLim
) {
306 switch (((struct normal_encoding
*)enc
)->type
[(unsigned char)*from
]) {
308 *to
++ = ((from
[0] & 0x1f) << 6) | (from
[1] & 0x3f);
312 *to
++ = ((from
[0] & 0xf) << 12) | ((from
[1] & 0x3f) << 6) | (from
[2] & 0x3f);
320 n
= ((from
[0] & 0x7) << 18) | ((from
[1] & 0x3f) << 12) | ((from
[2] & 0x3f) << 6) | (from
[3] & 0x3f);
322 to
[0] = (unsigned short)((n
>> 10) | 0xD800);
323 to
[1] = (unsigned short)((n
& 0x3FF) | 0xDC00);
338 static const struct normal_encoding utf8_encoding_ns
= {
339 { VTABLE1
, utf8_toUtf8
, utf8_toUtf16
, 1, 1, 0 },
341 #include "asciitab.h"
344 STANDARD_VTABLE(sb_
) NORMAL_VTABLE(utf8_
)
348 static const struct normal_encoding utf8_encoding
= {
349 { VTABLE1
, utf8_toUtf8
, utf8_toUtf16
, 1, 1, 0 },
351 #define BT_COLON BT_NMSTRT
352 #include "asciitab.h"
356 STANDARD_VTABLE(sb_
) NORMAL_VTABLE(utf8_
)
361 static const struct normal_encoding internal_utf8_encoding_ns
= {
362 { VTABLE1
, utf8_toUtf8
, utf8_toUtf16
, 1, 1, 0 },
364 #include "iasciitab.h"
367 STANDARD_VTABLE(sb_
) NORMAL_VTABLE(utf8_
)
372 static const struct normal_encoding internal_utf8_encoding
= {
373 { VTABLE1
, utf8_toUtf8
, utf8_toUtf16
, 1, 1, 0 },
375 #define BT_COLON BT_NMSTRT
376 #include "iasciitab.h"
380 STANDARD_VTABLE(sb_
) NORMAL_VTABLE(utf8_
)
384 void latin1_toUtf8(const ENCODING
*enc
,
385 const char **fromP
, const char *fromLim
,
386 char **toP
, const char *toLim
)
390 if (*fromP
== fromLim
)
392 c
= (unsigned char)**fromP
;
394 if (toLim
- *toP
< 2)
396 *(*toP
)++ = ((c
>> 6) | UTF8_cval2
);
397 *(*toP
)++ = ((c
& 0x3f) | 0x80);
403 *(*toP
)++ = *(*fromP
)++;
/* Convert Latin-1 bytes to UTF-16 code units.
   Copies one input byte (widened as unsigned char) to one output unit,
   advancing *fromP and *toP, until the input reaches fromLim or the output
   reaches toLim.  enc is not used by the visible body. */
409 void latin1_toUtf16(const ENCODING
*enc
,
410 const char **fromP
, const char *fromLim
,
411 unsigned short **toP
, const unsigned short *toLim
)
413 while (*fromP
!= fromLim
&& *toP
!= toLim
)
414 *(*toP
)++ = (unsigned char)*(*fromP
)++;
419 static const struct normal_encoding latin1_encoding_ns
= {
420 { VTABLE1
, latin1_toUtf8
, latin1_toUtf16
, 1, 0, 0 },
422 #include "asciitab.h"
423 #include "latin1tab.h"
430 static const struct normal_encoding latin1_encoding
= {
431 { VTABLE1
, latin1_toUtf8
, latin1_toUtf16
, 1, 0, 0 },
433 #define BT_COLON BT_NMSTRT
434 #include "asciitab.h"
436 #include "latin1tab.h"
/* Convert ASCII bytes to UTF-8 by copying them unchanged.
   Copies byte for byte, advancing *fromP and *toP, until the input reaches
   fromLim or the output reaches toLim.  enc is not used by the visible
   body. */
442 void ascii_toUtf8(const ENCODING
*enc
,
443 const char **fromP
, const char *fromLim
,
444 char **toP
, const char *toLim
)
446 while (*fromP
!= fromLim
&& *toP
!= toLim
)
447 *(*toP
)++ = *(*fromP
)++;
452 static const struct normal_encoding ascii_encoding_ns
= {
453 { VTABLE1
, ascii_toUtf8
, latin1_toUtf16
, 1, 1, 0 },
455 #include "asciitab.h"
463 static const struct normal_encoding ascii_encoding
= {
464 { VTABLE1
, ascii_toUtf8
, latin1_toUtf16
, 1, 1, 0 },
466 #define BT_COLON BT_NMSTRT
467 #include "asciitab.h"
474 static int unicode_byte_type(char hi
, char lo
)
476 switch ((unsigned char)hi
) {
477 case 0xD8: case 0xD9: case 0xDA: case 0xDB:
479 case 0xDC: case 0xDD: case 0xDE: case 0xDF:
482 switch ((unsigned char)lo
) {
492 #define DEFINE_UTF16_TO_UTF8(E) \
494 void E ## toUtf8(const ENCODING *enc, \
495 const char **fromP, const char *fromLim, \
496 char **toP, const char *toLim) \
499 for (from = *fromP; from != fromLim; from += 2) { \
502 unsigned char lo = GET_LO(from); \
503 unsigned char hi = GET_HI(from); \
507 if (*toP == toLim) { \
515 case 0x1: case 0x2: case 0x3: \
516 case 0x4: case 0x5: case 0x6: case 0x7: \
517 if (toLim - *toP < 2) { \
521 *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \
522 *(*toP)++ = ((lo & 0x3f) | 0x80); \
525 if (toLim - *toP < 3) { \
529 /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
530 *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
531 *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
532 *(*toP)++ = ((lo & 0x3f) | 0x80); \
534 case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
535 if (toLim - *toP < 4) { \
539 plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
540 *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
541 *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
543 lo2 = GET_LO(from); \
544 *(*toP)++ = (((lo & 0x3) << 4) \
545 | ((GET_HI(from) & 0x3) << 2) \
548 *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
555 #define DEFINE_UTF16_TO_UTF16(E) \
557 void E ## toUtf16(const ENCODING *enc, \
558 const char **fromP, const char *fromLim, \
559 unsigned short **toP, const unsigned short *toLim) \
561 /* Avoid copying first half only of surrogate */ \
562 if (fromLim - *fromP > ((toLim - *toP) << 1) \
563 && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
565 for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
566 *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
/* Little-endian 2-byte accessors: the low byte is stored at index 0 and the
   high byte at index 1.  SET2 writes ch as two bytes; GET_LO / GET_HI read
   the respective byte as unsigned char. */
569 #define SET2(ptr, ch) \
570 (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
571 #define GET_LO(ptr) ((unsigned char)(ptr)[0])
572 #define GET_HI(ptr) ((unsigned char)(ptr)[1])
574 DEFINE_UTF16_TO_UTF8(little2_
)
575 DEFINE_UTF16_TO_UTF16(little2_
)
/* Big-endian 2-byte accessors: the high byte is stored at index 0 and the
   low byte at index 1.  SET2 writes ch as two bytes; GET_LO / GET_HI read
   the respective byte as unsigned char. */
581 #define SET2(ptr, ch) \
582 (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
583 #define GET_LO(ptr) ((unsigned char)(ptr)[1])
584 #define GET_HI(ptr) ((unsigned char)(ptr)[0])
586 DEFINE_UTF16_TO_UTF8(big2_
)
587 DEFINE_UTF16_TO_UTF16(big2_
)
593 #define LITTLE2_BYTE_TYPE(enc, p) \
595 ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
596 : unicode_byte_type((p)[1], (p)[0]))
/* Little-endian 2-byte helpers; p[0] is the low byte and p[1] the high byte.
   enc is unused by all four.
   Every use of p and c is parenthesized so expression arguments such as
   (ptr + 2) or a conditional are handled as a whole. */

/* ASCII value of the character at p, or -1 when its high byte is non-zero. */
#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
/* True when the character at p has a zero high byte and low byte equal to c. */
#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == (c))
/* Name-character bitmap lookup for the character at p. */
#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
/* Name-start-character bitmap lookup for the character at p. */
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
/* Function form of LITTLE2_BYTE_TYPE for the little-endian 2-byte character
   at p. */
607 int little2_byteType(const ENCODING
*enc
, const char *p
)
609 return LITTLE2_BYTE_TYPE(enc
, p
);
/* Function form of LITTLE2_BYTE_TO_ASCII: returns p[0] when the high byte
   p[1] is zero, otherwise -1. */
613 int little2_byteToAscii(const ENCODING
*enc
, const char *p
)
615 return LITTLE2_BYTE_TO_ASCII(enc
, p
);
/* Function form of LITTLE2_CHAR_MATCHES: non-zero when the high byte p[1] is
   zero and the low byte p[0] equals c. */
619 int little2_charMatches(const ENCODING
*enc
, const char *p
, int c
)
621 return LITTLE2_CHAR_MATCHES(enc
, p
, c
);
/* Function form of LITTLE2_IS_NAME_CHAR_MINBPC: namePages bitmap lookup for
   the little-endian 2-byte character at p. */
625 int little2_isNameMin(const ENCODING
*enc
, const char *p
)
627 return LITTLE2_IS_NAME_CHAR_MINBPC(enc
, p
);
/* Function form of LITTLE2_IS_NMSTRT_CHAR_MINBPC: nmstrtPages bitmap lookup
   for the little-endian 2-byte character at p. */
631 int little2_isNmstrtMin(const ENCODING
*enc
, const char *p
)
633 return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc
, p
);
/* VTABLE for the little-endian 2-byte encoding: VTABLE1 followed by the
   little2_ UTF-8 and UTF-16 converters. */
637 #define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
639 #else /* not XML_MIN_SIZE */
/* Bind the generic tokenizer macros to their little-endian 2-byte forms
   ahead of the xmltok_impl.c include that follows.  Characters occupy at
   least 2 bytes, so the multi-byte predicates IS_NAME_CHAR / IS_NMSTRT_CHAR
   are constant 0 and the minimum-width forms do the bitmap lookup. */
642 #define PREFIX(ident) little2_ ## ident
643 #define MINBPC(enc) 2
644 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
645 #define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
646 #define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
647 #define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
648 #define IS_NAME_CHAR(enc, p, n) 0
649 #define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
650 #define IS_NMSTRT_CHAR(enc, p, n) (0)
651 #define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
653 #include "xmltok_impl.c"
660 #undef IS_NAME_CHAR_MINBPC
661 #undef IS_NMSTRT_CHAR
662 #undef IS_NMSTRT_CHAR_MINBPC
663 #undef IS_INVALID_CHAR
665 #endif /* not XML_MIN_SIZE */
669 static const struct normal_encoding little2_encoding_ns
= {
671 #if XML_BYTE_ORDER == 12
678 #include "asciitab.h"
679 #include "latin1tab.h"
681 STANDARD_VTABLE(little2_
)
686 static const struct normal_encoding little2_encoding
= {
688 #if XML_BYTE_ORDER == 12
695 #define BT_COLON BT_NMSTRT
696 #include "asciitab.h"
698 #include "latin1tab.h"
700 STANDARD_VTABLE(little2_
)
703 #if XML_BYTE_ORDER != 21
707 static const struct normal_encoding internal_little2_encoding_ns
= {
710 #include "iasciitab.h"
711 #include "latin1tab.h"
713 STANDARD_VTABLE(little2_
)
718 static const struct normal_encoding internal_little2_encoding
= {
721 #define BT_COLON BT_NMSTRT
722 #include "iasciitab.h"
724 #include "latin1tab.h"
726 STANDARD_VTABLE(little2_
)
732 #define BIG2_BYTE_TYPE(enc, p) \
734 ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
735 : unicode_byte_type((p)[0], (p)[1]))
/* Big-endian 2-byte helpers; p[0] is the high byte and p[1] the low byte.
   enc is unused by all four.
   Every use of p and c is parenthesized so expression arguments such as
   (ptr + 2) or a conditional are handled as a whole. */

/* ASCII value of the character at p, or -1 when its high byte is non-zero. */
#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
/* True when the character at p has a zero high byte and low byte equal to c. */
#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == (c))
/* Name-character bitmap lookup for the character at p. */
#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
/* Name-start-character bitmap lookup for the character at p. */
#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
/* Function form of BIG2_BYTE_TYPE for the big-endian 2-byte character at p. */
746 int big2_byteType(const ENCODING
*enc
, const char *p
)
748 return BIG2_BYTE_TYPE(enc
, p
);
/* Function form of BIG2_BYTE_TO_ASCII: returns p[1] when the high byte p[0]
   is zero, otherwise -1. */
752 int big2_byteToAscii(const ENCODING
*enc
, const char *p
)
754 return BIG2_BYTE_TO_ASCII(enc
, p
);
/* Function form of BIG2_CHAR_MATCHES: non-zero when the high byte p[0] is
   zero and the low byte p[1] equals c. */
758 int big2_charMatches(const ENCODING
*enc
, const char *p
, int c
)
760 return BIG2_CHAR_MATCHES(enc
, p
, c
);
/* Function form of BIG2_IS_NAME_CHAR_MINBPC: namePages bitmap lookup for the
   big-endian 2-byte character at p. */
764 int big2_isNameMin(const ENCODING
*enc
, const char *p
)
766 return BIG2_IS_NAME_CHAR_MINBPC(enc
, p
);
/* Function form of BIG2_IS_NMSTRT_CHAR_MINBPC: nmstrtPages bitmap lookup for
   the big-endian 2-byte character at p. */
770 int big2_isNmstrtMin(const ENCODING
*enc
, const char *p
)
772 return BIG2_IS_NMSTRT_CHAR_MINBPC(enc
, p
);
/* VTABLE for the big-endian 2-byte encoding: VTABLE1 followed by the big2_
   UTF-8 and UTF-16 converters. */
776 #define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
778 #else /* not XML_MIN_SIZE */
/* Bind the generic tokenizer macros to their big-endian 2-byte forms ahead
   of the xmltok_impl.c include that follows.  Characters occupy at least
   2 bytes, so the multi-byte predicates IS_NAME_CHAR / IS_NMSTRT_CHAR are
   constant 0 and the minimum-width forms do the bitmap lookup. */
781 #define PREFIX(ident) big2_ ## ident
782 #define MINBPC(enc) 2
783 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
784 #define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
785 #define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
786 #define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
787 #define IS_NAME_CHAR(enc, p, n) 0
788 #define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
789 #define IS_NMSTRT_CHAR(enc, p, n) (0)
790 #define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
792 #include "xmltok_impl.c"
799 #undef IS_NAME_CHAR_MINBPC
800 #undef IS_NMSTRT_CHAR
801 #undef IS_NMSTRT_CHAR_MINBPC
802 #undef IS_INVALID_CHAR
804 #endif /* not XML_MIN_SIZE */
808 static const struct normal_encoding big2_encoding_ns
= {
810 #if XML_BYTE_ORDER == 21
817 #include "asciitab.h"
818 #include "latin1tab.h"
820 STANDARD_VTABLE(big2_
)
825 static const struct normal_encoding big2_encoding
= {
827 #if XML_BYTE_ORDER == 21
834 #define BT_COLON BT_NMSTRT
835 #include "asciitab.h"
837 #include "latin1tab.h"
839 STANDARD_VTABLE(big2_
)
842 #if XML_BYTE_ORDER != 12
846 static const struct normal_encoding internal_big2_encoding_ns
= {
849 #include "iasciitab.h"
850 #include "latin1tab.h"
852 STANDARD_VTABLE(big2_
)
857 static const struct normal_encoding internal_big2_encoding
= {
860 #define BT_COLON BT_NMSTRT
861 #include "iasciitab.h"
863 #include "latin1tab.h"
865 STANDARD_VTABLE(big2_
)
873 int streqci(const char *s1
, const char *s2
)
878 if ('a' <= c1
&& c1
<= 'z')
880 if ('a' <= c2
&& c2
<= 'z')
/* Update pos for the text between ptr and end.
   The enc argument is ignored: the work is delegated to
   normal_updatePosition using the UTF-8 encoding object instead.
   NOTE(review): presumably this serves the initial, not-yet-detected
   encoding -- confirm against where it is installed. */
891 void initUpdatePosition(const ENCODING
*enc
, const char *ptr
,
892 const char *end
, POSITION
*pos
)
894 normal_updatePosition(&utf8_encoding
.enc
, ptr
, end
, pos
);
898 int toAscii(const ENCODING
*enc
, const char *ptr
, const char *end
)
902 XmlUtf8Convert(enc
, &ptr
, end
, &p
, p
+ 1);
922 /* Return 1 if there's just optional white space
923 or there's an S followed by name=val. */
925 int parsePseudoAttribute(const ENCODING
*enc
,
928 const char **namePtr
,
930 const char **nextTokPtr
)
938 if (!isSpace(toAscii(enc
, ptr
, end
))) {
943 ptr
+= enc
->minBytesPerChar
;
944 } while (isSpace(toAscii(enc
, ptr
, end
)));
951 c
= toAscii(enc
, ptr
, end
);
960 ptr
+= enc
->minBytesPerChar
;
961 } while (isSpace(c
= toAscii(enc
, ptr
, end
)));
968 ptr
+= enc
->minBytesPerChar
;
970 if (ptr
== *namePtr
) {
974 ptr
+= enc
->minBytesPerChar
;
975 c
= toAscii(enc
, ptr
, end
);
977 ptr
+= enc
->minBytesPerChar
;
978 c
= toAscii(enc
, ptr
, end
);
980 if (c
!= '"' && c
!= '\'') {
985 ptr
+= enc
->minBytesPerChar
;
987 for (;; ptr
+= enc
->minBytesPerChar
) {
988 c
= toAscii(enc
, ptr
, end
);
991 if (!('a' <= c
&& c
<= 'z')
992 && !('A' <= c
&& c
<= 'Z')
993 && !('0' <= c
&& c
<= '9')
1001 *nextTokPtr
= ptr
+ enc
->minBytesPerChar
;
1006 int doParseXmlDecl(const ENCODING
*(*encodingFinder
)(const ENCODING
*,
1009 int isGeneralTextEntity
,
1010 const ENCODING
*enc
,
1013 const char **badPtr
,
1014 const char **versionPtr
,
1015 const char **encodingName
,
1016 const ENCODING
**encoding
,
1019 const char *val
= 0;
1020 const char *name
= 0;
1021 ptr
+= 5 * enc
->minBytesPerChar
;
1022 end
-= 2 * enc
->minBytesPerChar
;
1023 if (!parsePseudoAttribute(enc
, ptr
, end
, &name
, &val
, &ptr
) || !name
) {
1027 if (!XmlNameMatchesAscii(enc
, name
, "version")) {
1028 if (!isGeneralTextEntity
) {
1036 if (!parsePseudoAttribute(enc
, ptr
, end
, &name
, &val
, &ptr
)) {
1041 if (isGeneralTextEntity
) {
1042 /* a TextDecl must have an EncodingDecl */
1049 if (XmlNameMatchesAscii(enc
, name
, "encoding")) {
1050 int c
= toAscii(enc
, val
, end
);
1051 if (!('a' <= c
&& c
<= 'z') && !('A' <= c
&& c
<= 'Z')) {
1056 *encodingName
= val
;
1058 *encoding
= encodingFinder(enc
, val
, ptr
- enc
->minBytesPerChar
);
1059 if (!parsePseudoAttribute(enc
, ptr
, end
, &name
, &val
, &ptr
)) {
1066 if (!XmlNameMatchesAscii(enc
, name
, "standalone") || isGeneralTextEntity
) {
1070 if (XmlNameMatchesAscii(enc
, val
, "yes")) {
1074 else if (XmlNameMatchesAscii(enc
, val
, "no")) {
1082 while (isSpace(toAscii(enc
, ptr
, end
)))
1083 ptr
+= enc
->minBytesPerChar
;
1092 int checkCharRefNumber(int result
)
1094 switch (result
>> 8) {
1095 case 0xD8: case 0xD9: case 0xDA: case 0xDB:
1096 case 0xDC: case 0xDD: case 0xDE: case 0xDF:
1099 if (latin1_encoding
.type
[result
] == BT_NONXML
)
1103 if (result
== 0xFFFE || result
== 0xFFFF)
1110 int XmlUtf8Encode(int c
, char *buf
)
1113 /* minN is minimum legal resulting value for N byte sequence */
1122 buf
[0] = (c
| UTF8_cval1
);
1126 buf
[0] = ((c
>> 6) | UTF8_cval2
);
1127 buf
[1] = ((c
& 0x3f) | 0x80);
1131 buf
[0] = ((c
>> 12) | UTF8_cval3
);
1132 buf
[1] = (((c
>> 6) & 0x3f) | 0x80);
1133 buf
[2] = ((c
& 0x3f) | 0x80);
1137 buf
[0] = ((c
>> 18) | UTF8_cval4
);
1138 buf
[1] = (((c
>> 12) & 0x3f) | 0x80);
1139 buf
[2] = (((c
>> 6) & 0x3f) | 0x80);
1140 buf
[3] = ((c
& 0x3f) | 0x80);
1146 int XmlUtf16Encode(int charNum
, unsigned short *buf
)
1150 if (charNum
< 0x10000) {
1154 if (charNum
< 0x110000) {
1156 buf
[0] = (charNum
>> 10) + 0xD800;
1157 buf
[1] = (charNum
& 0x3FF) + 0xDC00;
/* Encoding object for a caller-supplied (unknown) encoding.
   Other code in this file also reads members named userData and utf8 from
   this struct; their declarations are not among the lines visible here. */
1163 struct unknown_encoding
{
/* Embedded normal_encoding; placed first, and the struct is addressed
   through both struct types elsewhere in this file. */
1164 struct normal_encoding normal
;
/* Caller-supplied callback that maps the sequence at p to a character
   number; it receives the stored userData. */
1165 int (*convert
)(void *userData
, const char *p
);
/* Per-byte UTF-16 value, indexed by the unsigned byte value. */
1167 unsigned short utf16
[256];
/* Return the size in bytes of struct unknown_encoding.
   NOTE(review): the empty parameter list is an old-style declarator; a
   (void) prototype would be the modern spelling if the header permits. */
1171 int XmlSizeOfUnknownEncoding()
1173 return sizeof(struct unknown_encoding
);
1177 int unknown_isName(const ENCODING
*enc
, const char *p
)
1179 int c
= ((const struct unknown_encoding
*)enc
)
1180 ->convert(((const struct unknown_encoding
*)enc
)->userData
, p
);
1183 return UCS2_GET_NAMING(namePages
, c
>> 8, c
& 0xFF);
1187 int unknown_isNmstrt(const ENCODING
*enc
, const char *p
)
1189 int c
= ((const struct unknown_encoding
*)enc
)
1190 ->convert(((const struct unknown_encoding
*)enc
)->userData
, p
);
1193 return UCS2_GET_NAMING(nmstrtPages
, c
>> 8, c
& 0xFF);
/* Validity test for a multi-byte sequence in a caller-supplied encoding.
   Runs the encoding's convert callback on p and reports the sequence as
   invalid (non-zero) when the result has any bit set outside the low 16
   bits, or when checkCharRefNumber returns a negative value for it. */
1197 int unknown_isInvalid(const ENCODING
*enc
, const char *p
)
1199 int c
= ((const struct unknown_encoding
*)enc
)
1200 ->convert(((const struct unknown_encoding
*)enc
)->userData
, p
);
1201 return (c
& ~0xFFFF) || checkCharRefNumber(c
) < 0;
1205 void unknown_toUtf8(const ENCODING
*enc
,
1206 const char **fromP
, const char *fromLim
,
1207 char **toP
, const char *toLim
)
1209 char buf
[XML_UTF8_ENCODE_MAX
];
1213 if (*fromP
== fromLim
)
1215 utf8
= ((const struct unknown_encoding
*)enc
)->utf8
[(unsigned char)**fromP
];
1218 int c
= ((const struct unknown_encoding
*)enc
)
1219 ->convert(((const struct unknown_encoding
*)enc
)->userData
, *fromP
);
1220 n
= XmlUtf8Encode(c
, buf
);
1221 if (n
> toLim
- *toP
)
1224 *fromP
+= ((const struct normal_encoding
*)enc
)->type
[(unsigned char)**fromP
]
1228 if (n
> toLim
- *toP
)
1233 *(*toP
)++ = *utf8
++;
1239 void unknown_toUtf16(const ENCODING
*enc
,
1240 const char **fromP
, const char *fromLim
,
1241 unsigned short **toP
, const unsigned short *toLim
)
1243 while (*fromP
!= fromLim
&& *toP
!= toLim
) {
1245 = ((const struct unknown_encoding
*)enc
)->utf16
[(unsigned char)**fromP
];
1247 c
= (unsigned short)((const struct unknown_encoding
*)enc
)
1248 ->convert(((const struct unknown_encoding
*)enc
)->userData
, *fromP
);
1249 *fromP
+= ((const struct normal_encoding
*)enc
)->type
[(unsigned char)**fromP
]
1259 XmlInitUnknownEncoding(void *mem
,
1261 int (*convert
)(void *userData
, const char *p
),
1265 struct unknown_encoding
*e
= mem
;
1266 for (i
= 0; i
< sizeof(struct normal_encoding
); i
++)
1267 ((char *)mem
)[i
] = ((char *)&latin1_encoding
)[i
];
1268 for (i
= 0; i
< 128; i
++)
1269 if (latin1_encoding
.type
[i
] != BT_OTHER
1270 && latin1_encoding
.type
[i
] != BT_NONXML
1273 for (i
= 0; i
< 256; i
++) {
1276 e
->normal
.type
[i
] = BT_MALFORM
;
1277 /* This shouldn't really get used. */
1278 e
->utf16
[i
] = 0xFFFF;
1285 e
->normal
.type
[i
] = BT_LEAD2
- (c
+ 2);
1289 else if (c
< 0x80) {
1290 if (latin1_encoding
.type
[c
] != BT_OTHER
1291 && latin1_encoding
.type
[c
] != BT_NONXML
1294 e
->normal
.type
[i
] = latin1_encoding
.type
[c
];
1296 e
->utf8
[i
][1] = (char)c
;
1297 e
->utf16
[i
] = c
== 0 ? 0xFFFF : c
;
1299 else if (checkCharRefNumber(c
) < 0) {
1300 e
->normal
.type
[i
] = BT_NONXML
;
1301 /* This shouldn't really get used. */
1302 e
->utf16
[i
] = 0xFFFF;
1309 if (UCS2_GET_NAMING(nmstrtPages
, c
>> 8, c
& 0xff))
1310 e
->normal
.type
[i
] = BT_NMSTRT
;
1311 else if (UCS2_GET_NAMING(namePages
, c
>> 8, c
& 0xff))
1312 e
->normal
.type
[i
] = BT_NAME
;
1314 e
->normal
.type
[i
] = BT_OTHER
;
1315 e
->utf8
[i
][0] = (char)XmlUtf8Encode(c
, e
->utf8
[i
] + 1);
1319 e
->userData
= userData
;
1320 e
->convert
= convert
;
1322 e
->normal
.isName2
= unknown_isName
;
1323 e
->normal
.isName3
= unknown_isName
;
1324 e
->normal
.isName4
= unknown_isName
;
1325 e
->normal
.isNmstrt2
= unknown_isNmstrt
;
1326 e
->normal
.isNmstrt3
= unknown_isNmstrt
;
1327 e
->normal
.isNmstrt4
= unknown_isNmstrt
;
1328 e
->normal
.isInvalid2
= unknown_isInvalid
;
1329 e
->normal
.isInvalid3
= unknown_isInvalid
;
1330 e
->normal
.isInvalid4
= unknown_isInvalid
;
1332 e
->normal
.enc
.utf8Convert
= unknown_toUtf8
;
1333 e
->normal
.enc
.utf16Convert
= unknown_toUtf16
;
1334 return &(e
->normal
.enc
);
1337 /* If this enumeration is changed, getEncodingIndex and encodings
1338 must also be changed. */
1347 /* must match encodingNames up to here */
1352 int getEncodingIndex(const char *name
)
1354 static const char *encodingNames
[] = {
1365 for (i
= 0; i
< sizeof(encodingNames
)/sizeof(encodingNames
[0]); i
++)
1366 if (streqci(name
, encodingNames
[i
]))
1371 /* For binary compatibility, we store the index of the encoding specified
1372 at initialization in the isUtf16 member.  INIT_ENC_INDEX(enc) names that
     slot (enc->initEnc.isUtf16) so it can be read as the encoding index. */
1374 #define INIT_ENC_INDEX(enc) ((enc)->initEnc.isUtf16)
1376 /* This is what detects the encoding.
1377 encodingTable maps from encoding indices to encodings;
1378 INIT_ENC_INDEX(enc) is the index of the external (protocol) specified encoding;
1379 state is XML_CONTENT_STATE if we're parsing an external text entity,
1380 and XML_PROLOG_STATE otherwise.
1385 int initScan(const ENCODING
**encodingTable
,
1386 const INIT_ENCODING
*enc
,
1390 const char **nextTokPtr
)
1392 const ENCODING
**encPtr
;
1395 return XML_TOK_NONE
;
1396 encPtr
= enc
->encPtr
;
1397 if (ptr
+ 1 == end
) {
1398 /* only a single byte available for auto-detection */
1399 /* a well-formed document entity must have more than one byte */
1400 if (state
!= XML_CONTENT_STATE
)
1401 return XML_TOK_PARTIAL
;
1402 /* so we're parsing an external text entity... */
1403 /* if UTF-16 was externally specified, then we need at least 2 bytes */
1404 switch (INIT_ENC_INDEX(enc
)) {
1408 return XML_TOK_PARTIAL
;
1410 switch ((unsigned char)*ptr
) {
1413 case 0xEF: /* possibly first byte of UTF-8 BOM */
1414 if (INIT_ENC_INDEX(enc
) == ISO_8859_1_ENC
1415 && state
== XML_CONTENT_STATE
)
1420 return XML_TOK_PARTIAL
;
1424 switch (((unsigned char)ptr
[0] << 8) | (unsigned char)ptr
[1]) {
1426 if (INIT_ENC_INDEX(enc
) == ISO_8859_1_ENC
1427 && state
== XML_CONTENT_STATE
)
1429 *nextTokPtr
= ptr
+ 2;
1430 *encPtr
= encodingTable
[UTF_16BE_ENC
];
1432 /* 00 3C is handled in the default case */
1434 if ((INIT_ENC_INDEX(enc
) == UTF_16BE_ENC
1435 || INIT_ENC_INDEX(enc
) == UTF_16_ENC
)
1436 && state
== XML_CONTENT_STATE
)
1438 *encPtr
= encodingTable
[UTF_16LE_ENC
];
1439 return XmlTok(*encPtr
, state
, ptr
, end
, nextTokPtr
);
1441 if (INIT_ENC_INDEX(enc
) == ISO_8859_1_ENC
1442 && state
== XML_CONTENT_STATE
)
1444 *nextTokPtr
= ptr
+ 2;
1445 *encPtr
= encodingTable
[UTF_16LE_ENC
];
1448 /* Maybe a UTF-8 BOM (EF BB BF) */
1449 /* If there's an explicitly specified (external) encoding
1450 of ISO-8859-1 or some flavour of UTF-16
1451 and this is an external text entity,
1452 don't look for the BOM,
1453 because it might be legal data. */
1454 if (state
== XML_CONTENT_STATE
) {
1455 int e
= INIT_ENC_INDEX(enc
);
1456 if (e
== ISO_8859_1_ENC
|| e
== UTF_16BE_ENC
|| e
== UTF_16LE_ENC
|| e
== UTF_16_ENC
)
1460 return XML_TOK_PARTIAL
;
1461 if ((unsigned char)ptr
[2] == 0xBF) {
1462 *encPtr
= encodingTable
[UTF_8_ENC
];
1467 if (ptr
[0] == '\0') {
1468 /* 0 isn't a legal data character. Furthermore a document entity can only
1469 start with ASCII characters. So the only way this can fail to be big-endian
1470 UTF-16 is if it's an external parsed general entity that's labelled as
1472 if (state
== XML_CONTENT_STATE
&& INIT_ENC_INDEX(enc
) == UTF_16LE_ENC
)
1474 *encPtr
= encodingTable
[UTF_16BE_ENC
];
1475 return XmlTok(*encPtr
, state
, ptr
, end
, nextTokPtr
);
1477 else if (ptr
[1] == '\0') {
1478 /* We could recover here in the case:
1479 - parsing an external entity
1481 - no externally specified encoding
1482 - no encoding declaration
1483 by assuming UTF-16LE. But we don't, because this would mean when
1484 presented just with a single byte, we couldn't reliably determine
1485 whether we needed further bytes. */
1486 if (state
== XML_CONTENT_STATE
)
1488 *encPtr
= encodingTable
[UTF_16LE_ENC
];
1489 return XmlTok(*encPtr
, state
, ptr
, end
, nextTokPtr
);
1494 *encPtr
= encodingTable
[INIT_ENC_INDEX(enc
)];
1495 return XmlTok(*encPtr
, state
, ptr
, end
, nextTokPtr
);
1501 #include "xmltok_ns.c"
/* Name-decoration macros for the include of xmltok_ns.c that follows:
   NS(x) pastes the suffix NS onto x, and ns(x) pastes the suffix _ns. */
1507 #define NS(x) x ## NS
1508 #define ns(x) x ## _ns
1510 #include "xmltok_ns.c"
1516 XmlInitUnknownEncodingNS(void *mem
,
1518 int (*convert
)(void *userData
, const char *p
),
1521 ENCODING
*enc
= XmlInitUnknownEncoding(mem
, table
, convert
, userData
);
1523 ((struct normal_encoding
*)enc
)->type
[':'] = BT_COLON
;