2 The contents of this file are subject to the Mozilla Public License
3 Version 1.1 (the "License"); you may not use this file except in
4 compliance with the License. You may obtain a copy of the License at
5 http://www.mozilla.org/MPL/
7 Software distributed under the License is distributed on an "AS IS"
8 basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
9 License for the specific language governing rights and limitations
12 The Original Code is expat.
14 The Initial Developer of the Original Code is James Clark.
15 Portions created by James Clark are Copyright (C) 1998, 1999
16 James Clark. All Rights Reserved.
20 Alternatively, the contents of this file may be used under the terms
21 of the GNU General Public License (the "GPL"), in which case the
22 provisions of the GPL are applicable instead of those above. If you
23 wish to allow use of your version of this file only under the terms of
24 the GPL and not to allow others to use your version of this file under
25 the MPL, indicate your decision by deleting the provisions above and
26 replace them with the notice and other provisions required by the
27 GPL. If you do not delete the provisions above, a recipient may use
28 your version of this file under either the MPL or the GPL.
31 #ifndef IS_INVALID_CHAR
32 #define IS_INVALID_CHAR(enc, ptr, n) (0)
35 #define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
38 return XML_TOK_PARTIAL_CHAR; \
39 if (IS_INVALID_CHAR(enc, ptr, n)) { \
40 *(nextTokPtr) = (ptr); \
41 return XML_TOK_INVALID; \
46 #define INVALID_CASES(ptr, nextTokPtr) \
47 INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
48 INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
49 INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
53 *(nextTokPtr) = (ptr); \
54 return XML_TOK_INVALID;
56 #define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
59 return XML_TOK_PARTIAL_CHAR; \
60 if (!IS_NAME_CHAR(enc, ptr, n)) { \
62 return XML_TOK_INVALID; \
67 #define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
69 if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
71 return XML_TOK_INVALID; \
80 CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
81 CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
82 CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
84 #define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
87 return XML_TOK_PARTIAL_CHAR; \
88 if (!IS_NMSTRT_CHAR(enc, ptr, n)) { \
90 return XML_TOK_INVALID; \
95 #define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
97 if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
99 return XML_TOK_INVALID; \
103 ptr += MINBPC(enc); \
105 CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
106 CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
107 CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
110 #define PREFIX(ident) ident
113 /* ptr points to character following "<!-" */
116 int PREFIX(scanComment
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
117 const char **nextTokPtr
)
120 if (!CHAR_MATCHES(enc
, ptr
, '-')) {
122 return XML_TOK_INVALID
;
126 switch (BYTE_TYPE(enc
, ptr
)) {
127 INVALID_CASES(ptr
, nextTokPtr
)
129 if ((ptr
+= MINBPC(enc
)) == end
)
130 return XML_TOK_PARTIAL
;
131 if (CHAR_MATCHES(enc
, ptr
, '-')) {
132 if ((ptr
+= MINBPC(enc
)) == end
)
133 return XML_TOK_PARTIAL
;
134 if (!CHAR_MATCHES(enc
, ptr
, '>')) {
136 return XML_TOK_INVALID
;
138 *nextTokPtr
= ptr
+ MINBPC(enc
);
139 return XML_TOK_COMMENT
;
148 return XML_TOK_PARTIAL
;
151 /* ptr points to character following "<!" */
154 int PREFIX(scanDecl
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
155 const char **nextTokPtr
)
158 return XML_TOK_PARTIAL
;
159 switch (BYTE_TYPE(enc
, ptr
)) {
161 return PREFIX(scanComment
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
163 *nextTokPtr
= ptr
+ MINBPC(enc
);
164 return XML_TOK_COND_SECT_OPEN
;
171 return XML_TOK_INVALID
;
174 switch (BYTE_TYPE(enc
, ptr
)) {
176 if (ptr
+ MINBPC(enc
) == end
)
177 return XML_TOK_PARTIAL
;
178 /* don't allow <!ENTITY% foo "whatever"> */
179 switch (BYTE_TYPE(enc
, ptr
+ MINBPC(enc
))) {
180 case BT_S
: case BT_CR
: case BT_LF
: case BT_PERCNT
:
182 return XML_TOK_INVALID
;
185 case BT_S
: case BT_CR
: case BT_LF
:
187 return XML_TOK_DECL_OPEN
;
194 return XML_TOK_INVALID
;
197 return XML_TOK_PARTIAL
;
201 int PREFIX(checkPiTarget
)(const ENCODING
*enc
, const char *ptr
, const char *end
, int *tokPtr
)
204 *tokPtr
= XML_TOK_PI
;
205 if (end
- ptr
!= MINBPC(enc
)*3)
207 switch (BYTE_TO_ASCII(enc
, ptr
)) {
217 switch (BYTE_TO_ASCII(enc
, ptr
)) {
227 switch (BYTE_TO_ASCII(enc
, ptr
)) {
238 *tokPtr
= XML_TOK_XML_DECL
;
242 /* ptr points to character following "<?" */
245 int PREFIX(scanPi
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
246 const char **nextTokPtr
)
249 const char *target
= ptr
;
251 return XML_TOK_PARTIAL
;
252 switch (BYTE_TYPE(enc
, ptr
)) {
253 CHECK_NMSTRT_CASES(enc
, ptr
, end
, nextTokPtr
)
256 return XML_TOK_INVALID
;
259 switch (BYTE_TYPE(enc
, ptr
)) {
260 CHECK_NAME_CASES(enc
, ptr
, end
, nextTokPtr
)
261 case BT_S
: case BT_CR
: case BT_LF
:
262 if (!PREFIX(checkPiTarget
)(enc
, target
, ptr
, &tok
)) {
264 return XML_TOK_INVALID
;
268 switch (BYTE_TYPE(enc
, ptr
)) {
269 INVALID_CASES(ptr
, nextTokPtr
)
273 return XML_TOK_PARTIAL
;
274 if (CHAR_MATCHES(enc
, ptr
, '>')) {
275 *nextTokPtr
= ptr
+ MINBPC(enc
);
284 return XML_TOK_PARTIAL
;
286 if (!PREFIX(checkPiTarget
)(enc
, target
, ptr
, &tok
)) {
288 return XML_TOK_INVALID
;
292 return XML_TOK_PARTIAL
;
293 if (CHAR_MATCHES(enc
, ptr
, '>')) {
294 *nextTokPtr
= ptr
+ MINBPC(enc
);
300 return XML_TOK_INVALID
;
303 return XML_TOK_PARTIAL
;
308 int PREFIX(scanCdataSection
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
309 const char **nextTokPtr
)
313 if (end
- ptr
< 6 * MINBPC(enc
))
314 return XML_TOK_PARTIAL
;
315 for (i
= 0; i
< 6; i
++, ptr
+= MINBPC(enc
)) {
316 if (!CHAR_MATCHES(enc
, ptr
, "CDATA["[i
])) {
318 return XML_TOK_INVALID
;
322 return XML_TOK_CDATA_SECT_OPEN
;
326 int PREFIX(cdataSectionTok
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
327 const char **nextTokPtr
)
331 if (MINBPC(enc
) > 1) {
332 size_t n
= end
- ptr
;
333 if (n
& (MINBPC(enc
) - 1)) {
334 n
&= ~(MINBPC(enc
) - 1);
336 return XML_TOK_PARTIAL
;
340 switch (BYTE_TYPE(enc
, ptr
)) {
344 return XML_TOK_PARTIAL
;
345 if (!CHAR_MATCHES(enc
, ptr
, ']'))
349 return XML_TOK_PARTIAL
;
350 if (!CHAR_MATCHES(enc
, ptr
, '>')) {
354 *nextTokPtr
= ptr
+ MINBPC(enc
);
355 return XML_TOK_CDATA_SECT_CLOSE
;
359 return XML_TOK_PARTIAL
;
360 if (BYTE_TYPE(enc
, ptr
) == BT_LF
)
363 return XML_TOK_DATA_NEWLINE
;
365 *nextTokPtr
= ptr
+ MINBPC(enc
);
366 return XML_TOK_DATA_NEWLINE
;
367 INVALID_CASES(ptr
, nextTokPtr
)
373 switch (BYTE_TYPE(enc
, ptr
)) {
374 #define LEAD_CASE(n) \
376 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
378 return XML_TOK_DATA_CHARS; \
382 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
391 return XML_TOK_DATA_CHARS
;
398 return XML_TOK_DATA_CHARS
;
401 /* ptr points to character following "</" */
404 int PREFIX(scanEndTag
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
405 const char **nextTokPtr
)
408 return XML_TOK_PARTIAL
;
409 switch (BYTE_TYPE(enc
, ptr
)) {
410 CHECK_NMSTRT_CASES(enc
, ptr
, end
, nextTokPtr
)
413 return XML_TOK_INVALID
;
416 switch (BYTE_TYPE(enc
, ptr
)) {
417 CHECK_NAME_CASES(enc
, ptr
, end
, nextTokPtr
)
418 case BT_S
: case BT_CR
: case BT_LF
:
419 for (ptr
+= MINBPC(enc
); ptr
!= end
; ptr
+= MINBPC(enc
)) {
420 switch (BYTE_TYPE(enc
, ptr
)) {
421 case BT_S
: case BT_CR
: case BT_LF
:
424 *nextTokPtr
= ptr
+ MINBPC(enc
);
425 return XML_TOK_END_TAG
;
428 return XML_TOK_INVALID
;
431 return XML_TOK_PARTIAL
;
434 /* no need to check qname syntax here, since end-tag must match exactly */
439 *nextTokPtr
= ptr
+ MINBPC(enc
);
440 return XML_TOK_END_TAG
;
443 return XML_TOK_INVALID
;
446 return XML_TOK_PARTIAL
;
449 /* ptr points to character following "&#X" */
452 int PREFIX(scanHexCharRef
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
453 const char **nextTokPtr
)
456 switch (BYTE_TYPE(enc
, ptr
)) {
462 return XML_TOK_INVALID
;
464 for (ptr
+= MINBPC(enc
); ptr
!= end
; ptr
+= MINBPC(enc
)) {
465 switch (BYTE_TYPE(enc
, ptr
)) {
470 *nextTokPtr
= ptr
+ MINBPC(enc
);
471 return XML_TOK_CHAR_REF
;
474 return XML_TOK_INVALID
;
478 return XML_TOK_PARTIAL
;
481 /* ptr points to character following "&#" */
484 int PREFIX(scanCharRef
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
485 const char **nextTokPtr
)
488 if (CHAR_MATCHES(enc
, ptr
, 'x'))
489 return PREFIX(scanHexCharRef
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
490 switch (BYTE_TYPE(enc
, ptr
)) {
495 return XML_TOK_INVALID
;
497 for (ptr
+= MINBPC(enc
); ptr
!= end
; ptr
+= MINBPC(enc
)) {
498 switch (BYTE_TYPE(enc
, ptr
)) {
502 *nextTokPtr
= ptr
+ MINBPC(enc
);
503 return XML_TOK_CHAR_REF
;
506 return XML_TOK_INVALID
;
510 return XML_TOK_PARTIAL
;
513 /* ptr points to character following "&" */
516 int PREFIX(scanRef
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
517 const char **nextTokPtr
)
520 return XML_TOK_PARTIAL
;
521 switch (BYTE_TYPE(enc
, ptr
)) {
522 CHECK_NMSTRT_CASES(enc
, ptr
, end
, nextTokPtr
)
524 return PREFIX(scanCharRef
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
527 return XML_TOK_INVALID
;
530 switch (BYTE_TYPE(enc
, ptr
)) {
531 CHECK_NAME_CASES(enc
, ptr
, end
, nextTokPtr
)
533 *nextTokPtr
= ptr
+ MINBPC(enc
);
534 return XML_TOK_ENTITY_REF
;
537 return XML_TOK_INVALID
;
540 return XML_TOK_PARTIAL
;
543 /* ptr points to character following first character of attribute name */
546 int PREFIX(scanAtts
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
547 const char **nextTokPtr
)
553 switch (BYTE_TYPE(enc
, ptr
)) {
554 CHECK_NAME_CASES(enc
, ptr
, end
, nextTokPtr
)
559 return XML_TOK_INVALID
;
564 return XML_TOK_PARTIAL
;
565 switch (BYTE_TYPE(enc
, ptr
)) {
566 CHECK_NMSTRT_CASES(enc
, ptr
, end
, nextTokPtr
)
569 return XML_TOK_INVALID
;
573 case BT_S
: case BT_CR
: case BT_LF
:
579 return XML_TOK_PARTIAL
;
580 t
= BYTE_TYPE(enc
, ptr
);
590 return XML_TOK_INVALID
;
604 return XML_TOK_PARTIAL
;
605 open
= BYTE_TYPE(enc
, ptr
);
606 if (open
== BT_QUOT
|| open
== BT_APOS
)
615 return XML_TOK_INVALID
;
619 /* in attribute value */
623 return XML_TOK_PARTIAL
;
624 t
= BYTE_TYPE(enc
, ptr
);
628 INVALID_CASES(ptr
, nextTokPtr
)
631 int tok
= PREFIX(scanRef
)(enc
, ptr
+ MINBPC(enc
), end
, &ptr
);
633 if (tok
== XML_TOK_INVALID
)
641 return XML_TOK_INVALID
;
649 return XML_TOK_PARTIAL
;
650 switch (BYTE_TYPE(enc
, ptr
)) {
661 return XML_TOK_INVALID
;
663 /* ptr points to closing quote */
667 return XML_TOK_PARTIAL
;
668 switch (BYTE_TYPE(enc
, ptr
)) {
669 CHECK_NMSTRT_CASES(enc
, ptr
, end
, nextTokPtr
)
670 case BT_S
: case BT_CR
: case BT_LF
:
674 *nextTokPtr
= ptr
+ MINBPC(enc
);
675 return XML_TOK_START_TAG_WITH_ATTS
;
680 return XML_TOK_PARTIAL
;
681 if (!CHAR_MATCHES(enc
, ptr
, '>')) {
683 return XML_TOK_INVALID
;
685 *nextTokPtr
= ptr
+ MINBPC(enc
);
686 return XML_TOK_EMPTY_ELEMENT_WITH_ATTS
;
689 return XML_TOK_INVALID
;
697 return XML_TOK_INVALID
;
700 return XML_TOK_PARTIAL
;
703 /* ptr points to character following "<" */
706 int PREFIX(scanLt
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
707 const char **nextTokPtr
)
713 return XML_TOK_PARTIAL
;
714 switch (BYTE_TYPE(enc
, ptr
)) {
715 CHECK_NMSTRT_CASES(enc
, ptr
, end
, nextTokPtr
)
717 if ((ptr
+= MINBPC(enc
)) == end
)
718 return XML_TOK_PARTIAL
;
719 switch (BYTE_TYPE(enc
, ptr
)) {
721 return PREFIX(scanComment
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
723 return PREFIX(scanCdataSection
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
726 return XML_TOK_INVALID
;
728 return PREFIX(scanPi
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
730 return PREFIX(scanEndTag
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
733 return XML_TOK_INVALID
;
738 /* we have a start-tag */
740 switch (BYTE_TYPE(enc
, ptr
)) {
741 CHECK_NAME_CASES(enc
, ptr
, end
, nextTokPtr
)
746 return XML_TOK_INVALID
;
751 return XML_TOK_PARTIAL
;
752 switch (BYTE_TYPE(enc
, ptr
)) {
753 CHECK_NMSTRT_CASES(enc
, ptr
, end
, nextTokPtr
)
756 return XML_TOK_INVALID
;
760 case BT_S
: case BT_CR
: case BT_LF
:
764 switch (BYTE_TYPE(enc
, ptr
)) {
765 CHECK_NMSTRT_CASES(enc
, ptr
, end
, nextTokPtr
)
770 case BT_S
: case BT_CR
: case BT_LF
:
775 return XML_TOK_INVALID
;
777 return PREFIX(scanAtts
)(enc
, ptr
, end
, nextTokPtr
);
779 return XML_TOK_PARTIAL
;
783 *nextTokPtr
= ptr
+ MINBPC(enc
);
784 return XML_TOK_START_TAG_NO_ATTS
;
789 return XML_TOK_PARTIAL
;
790 if (!CHAR_MATCHES(enc
, ptr
, '>')) {
792 return XML_TOK_INVALID
;
794 *nextTokPtr
= ptr
+ MINBPC(enc
);
795 return XML_TOK_EMPTY_ELEMENT_NO_ATTS
;
798 return XML_TOK_INVALID
;
801 return XML_TOK_PARTIAL
;
805 int PREFIX(contentTok
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
806 const char **nextTokPtr
)
810 if (MINBPC(enc
) > 1) {
811 size_t n
= end
- ptr
;
812 if (n
& (MINBPC(enc
) - 1)) {
813 n
&= ~(MINBPC(enc
) - 1);
815 return XML_TOK_PARTIAL
;
819 switch (BYTE_TYPE(enc
, ptr
)) {
821 return PREFIX(scanLt
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
823 return PREFIX(scanRef
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
827 return XML_TOK_TRAILING_CR
;
828 if (BYTE_TYPE(enc
, ptr
) == BT_LF
)
831 return XML_TOK_DATA_NEWLINE
;
833 *nextTokPtr
= ptr
+ MINBPC(enc
);
834 return XML_TOK_DATA_NEWLINE
;
838 return XML_TOK_TRAILING_RSQB
;
839 if (!CHAR_MATCHES(enc
, ptr
, ']'))
843 return XML_TOK_TRAILING_RSQB
;
844 if (!CHAR_MATCHES(enc
, ptr
, '>')) {
849 return XML_TOK_INVALID
;
850 INVALID_CASES(ptr
, nextTokPtr
)
856 switch (BYTE_TYPE(enc
, ptr
)) {
857 #define LEAD_CASE(n) \
859 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
861 return XML_TOK_DATA_CHARS; \
865 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
868 if (ptr
+ MINBPC(enc
) != end
) {
869 if (!CHAR_MATCHES(enc
, ptr
+ MINBPC(enc
), ']')) {
873 if (ptr
+ 2*MINBPC(enc
) != end
) {
874 if (!CHAR_MATCHES(enc
, ptr
+ 2*MINBPC(enc
), '>')) {
878 *nextTokPtr
= ptr
+ 2*MINBPC(enc
);
879 return XML_TOK_INVALID
;
891 return XML_TOK_DATA_CHARS
;
898 return XML_TOK_DATA_CHARS
;
901 /* ptr points to character following "%" */
904 int PREFIX(scanPercent
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
905 const char **nextTokPtr
)
908 return XML_TOK_PARTIAL
;
909 switch (BYTE_TYPE(enc
, ptr
)) {
910 CHECK_NMSTRT_CASES(enc
, ptr
, end
, nextTokPtr
)
911 case BT_S
: case BT_LF
: case BT_CR
: case BT_PERCNT
:
913 return XML_TOK_PERCENT
;
916 return XML_TOK_INVALID
;
919 switch (BYTE_TYPE(enc
, ptr
)) {
920 CHECK_NAME_CASES(enc
, ptr
, end
, nextTokPtr
)
922 *nextTokPtr
= ptr
+ MINBPC(enc
);
923 return XML_TOK_PARAM_ENTITY_REF
;
926 return XML_TOK_INVALID
;
929 return XML_TOK_PARTIAL
;
933 int PREFIX(scanPoundName
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
934 const char **nextTokPtr
)
937 return XML_TOK_PARTIAL
;
938 switch (BYTE_TYPE(enc
, ptr
)) {
939 CHECK_NMSTRT_CASES(enc
, ptr
, end
, nextTokPtr
)
942 return XML_TOK_INVALID
;
945 switch (BYTE_TYPE(enc
, ptr
)) {
946 CHECK_NAME_CASES(enc
, ptr
, end
, nextTokPtr
)
947 case BT_CR
: case BT_LF
: case BT_S
:
948 case BT_RPAR
: case BT_GT
: case BT_PERCNT
: case BT_VERBAR
:
950 return XML_TOK_POUND_NAME
;
953 return XML_TOK_INVALID
;
956 return XML_TOK_PARTIAL
;
960 int PREFIX(scanLit
)(int open
, const ENCODING
*enc
,
961 const char *ptr
, const char *end
,
962 const char **nextTokPtr
)
965 int t
= BYTE_TYPE(enc
, ptr
);
967 INVALID_CASES(ptr
, nextTokPtr
)
974 return XML_TOK_PARTIAL
;
976 switch (BYTE_TYPE(enc
, ptr
)) {
977 case BT_S
: case BT_CR
: case BT_LF
:
978 case BT_GT
: case BT_PERCNT
: case BT_LSQB
:
979 return XML_TOK_LITERAL
;
981 return XML_TOK_INVALID
;
988 return XML_TOK_PARTIAL
;
992 int PREFIX(prologTok
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
993 const char **nextTokPtr
)
998 if (MINBPC(enc
) > 1) {
999 size_t n
= end
- ptr
;
1000 if (n
& (MINBPC(enc
) - 1)) {
1001 n
&= ~(MINBPC(enc
) - 1);
1003 return XML_TOK_PARTIAL
;
1007 switch (BYTE_TYPE(enc
, ptr
)) {
1009 return PREFIX(scanLit
)(BT_QUOT
, enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
1011 return PREFIX(scanLit
)(BT_APOS
, enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
1016 return XML_TOK_PARTIAL
;
1017 switch (BYTE_TYPE(enc
, ptr
)) {
1019 return PREFIX(scanDecl
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
1021 return PREFIX(scanPi
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
1028 *nextTokPtr
= ptr
- MINBPC(enc
);
1029 return XML_TOK_INSTANCE_START
;
1032 return XML_TOK_INVALID
;
1035 if (ptr
+ MINBPC(enc
) == end
)
1036 return XML_TOK_TRAILING_CR
;
1038 case BT_S
: case BT_LF
:
1043 switch (BYTE_TYPE(enc
, ptr
)) {
1044 case BT_S
: case BT_LF
:
1047 /* don't split CR/LF pair */
1048 if (ptr
+ MINBPC(enc
) != end
)
1053 return XML_TOK_PROLOG_S
;
1057 return XML_TOK_PROLOG_S
;
1059 return PREFIX(scanPercent
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
1061 *nextTokPtr
= ptr
+ MINBPC(enc
);
1062 return XML_TOK_COMMA
;
1064 *nextTokPtr
= ptr
+ MINBPC(enc
);
1065 return XML_TOK_OPEN_BRACKET
;
1069 return XML_TOK_PARTIAL
;
1070 if (CHAR_MATCHES(enc
, ptr
, ']')) {
1071 if (ptr
+ MINBPC(enc
) == end
)
1072 return XML_TOK_PARTIAL
;
1073 if (CHAR_MATCHES(enc
, ptr
+ MINBPC(enc
), '>')) {
1074 *nextTokPtr
= ptr
+ 2*MINBPC(enc
);
1075 return XML_TOK_COND_SECT_CLOSE
;
1079 return XML_TOK_CLOSE_BRACKET
;
1081 *nextTokPtr
= ptr
+ MINBPC(enc
);
1082 return XML_TOK_OPEN_PAREN
;
1086 return XML_TOK_PARTIAL
;
1087 switch (BYTE_TYPE(enc
, ptr
)) {
1089 *nextTokPtr
= ptr
+ MINBPC(enc
);
1090 return XML_TOK_CLOSE_PAREN_ASTERISK
;
1092 *nextTokPtr
= ptr
+ MINBPC(enc
);
1093 return XML_TOK_CLOSE_PAREN_QUESTION
;
1095 *nextTokPtr
= ptr
+ MINBPC(enc
);
1096 return XML_TOK_CLOSE_PAREN_PLUS
;
1097 case BT_CR
: case BT_LF
: case BT_S
:
1098 case BT_GT
: case BT_COMMA
: case BT_VERBAR
:
1101 return XML_TOK_CLOSE_PAREN
;
1104 return XML_TOK_INVALID
;
1106 *nextTokPtr
= ptr
+ MINBPC(enc
);
1109 *nextTokPtr
= ptr
+ MINBPC(enc
);
1110 return XML_TOK_DECL_CLOSE
;
1112 return PREFIX(scanPoundName
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
1113 #define LEAD_CASE(n) \
1114 case BT_LEAD ## n: \
1115 if (end - ptr < n) \
1116 return XML_TOK_PARTIAL_CHAR; \
1117 if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
1119 tok = XML_TOK_NAME; \
1122 if (IS_NAME_CHAR(enc, ptr, n)) { \
1124 tok = XML_TOK_NMTOKEN; \
1127 *nextTokPtr = ptr; \
1128 return XML_TOK_INVALID;
1129 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1142 tok
= XML_TOK_NMTOKEN
;
1146 if (IS_NMSTRT_CHAR_MINBPC(enc
, ptr
)) {
1151 if (IS_NAME_CHAR_MINBPC(enc
, ptr
)) {
1153 tok
= XML_TOK_NMTOKEN
;
1159 return XML_TOK_INVALID
;
1161 while (ptr
!= end
) {
1162 switch (BYTE_TYPE(enc
, ptr
)) {
1163 CHECK_NAME_CASES(enc
, ptr
, end
, nextTokPtr
)
1164 case BT_GT
: case BT_RPAR
: case BT_COMMA
:
1165 case BT_VERBAR
: case BT_LSQB
: case BT_PERCNT
:
1166 case BT_S
: case BT_CR
: case BT_LF
:
1175 return XML_TOK_PARTIAL
;
1176 tok
= XML_TOK_PREFIXED_NAME
;
1177 switch (BYTE_TYPE(enc
, ptr
)) {
1178 CHECK_NAME_CASES(enc
, ptr
, end
, nextTokPtr
)
1180 tok
= XML_TOK_NMTOKEN
;
1184 case XML_TOK_PREFIXED_NAME
:
1185 tok
= XML_TOK_NMTOKEN
;
1191 if (tok
== XML_TOK_NMTOKEN
) {
1193 return XML_TOK_INVALID
;
1195 *nextTokPtr
= ptr
+ MINBPC(enc
);
1196 return XML_TOK_NAME_PLUS
;
1198 if (tok
== XML_TOK_NMTOKEN
) {
1200 return XML_TOK_INVALID
;
1202 *nextTokPtr
= ptr
+ MINBPC(enc
);
1203 return XML_TOK_NAME_ASTERISK
;
1205 if (tok
== XML_TOK_NMTOKEN
) {
1207 return XML_TOK_INVALID
;
1209 *nextTokPtr
= ptr
+ MINBPC(enc
);
1210 return XML_TOK_NAME_QUESTION
;
1213 return XML_TOK_INVALID
;
1216 return XML_TOK_PARTIAL
;
1220 int PREFIX(attributeValueTok
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
1221 const char **nextTokPtr
)
1225 return XML_TOK_NONE
;
1227 while (ptr
!= end
) {
1228 switch (BYTE_TYPE(enc
, ptr
)) {
1229 #define LEAD_CASE(n) \
1230 case BT_LEAD ## n: ptr += n; break;
1231 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1235 return PREFIX(scanRef
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
1237 return XML_TOK_DATA_CHARS
;
1239 /* this is for inside entity references */
1241 return XML_TOK_INVALID
;
1244 *nextTokPtr
= ptr
+ MINBPC(enc
);
1245 return XML_TOK_DATA_NEWLINE
;
1248 return XML_TOK_DATA_CHARS
;
1253 return XML_TOK_TRAILING_CR
;
1254 if (BYTE_TYPE(enc
, ptr
) == BT_LF
)
1257 return XML_TOK_DATA_NEWLINE
;
1260 return XML_TOK_DATA_CHARS
;
1263 *nextTokPtr
= ptr
+ MINBPC(enc
);
1264 return XML_TOK_ATTRIBUTE_VALUE_S
;
1267 return XML_TOK_DATA_CHARS
;
1274 return XML_TOK_DATA_CHARS
;
1278 int PREFIX(entityValueTok
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
1279 const char **nextTokPtr
)
1283 return XML_TOK_NONE
;
1285 while (ptr
!= end
) {
1286 switch (BYTE_TYPE(enc
, ptr
)) {
1287 #define LEAD_CASE(n) \
1288 case BT_LEAD ## n: ptr += n; break;
1289 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1293 return PREFIX(scanRef
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
1295 return XML_TOK_DATA_CHARS
;
1298 return PREFIX(scanPercent
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
1300 return XML_TOK_DATA_CHARS
;
1303 *nextTokPtr
= ptr
+ MINBPC(enc
);
1304 return XML_TOK_DATA_NEWLINE
;
1307 return XML_TOK_DATA_CHARS
;
1312 return XML_TOK_TRAILING_CR
;
1313 if (BYTE_TYPE(enc
, ptr
) == BT_LF
)
1316 return XML_TOK_DATA_NEWLINE
;
1319 return XML_TOK_DATA_CHARS
;
1326 return XML_TOK_DATA_CHARS
;
1330 int PREFIX(isPublicId
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
1331 const char **badPtr
)
1335 for (; ptr
!= end
; ptr
+= MINBPC(enc
)) {
1336 switch (BYTE_TYPE(enc
, ptr
)) {
1360 if (CHAR_MATCHES(enc
, ptr
, '\t')) {
1367 if (!(BYTE_TO_ASCII(enc
, ptr
) & ~0x7f))
1370 switch (BYTE_TO_ASCII(enc
, ptr
)) {
1384 /* This must only be called for a well-formed start-tag or empty element tag.
1385 Returns the number of attributes. Pointers to the first attsMax attributes
1386 are stored in atts. */
1389 int PREFIX(getAtts
)(const ENCODING
*enc
, const char *ptr
,
1390 int attsMax
, ATTRIBUTE
*atts
)
1392 enum { other
, inName
, inValue
} state
= inName
;
1396 for (ptr
+= MINBPC(enc
);; ptr
+= MINBPC(enc
)) {
1397 switch (BYTE_TYPE(enc
, ptr
)) {
1398 #define START_NAME \
1399 if (state == other) { \
1400 if (nAtts < attsMax) { \
1401 atts[nAtts].name = ptr; \
1402 atts[nAtts].normalized = 1; \
1406 #define LEAD_CASE(n) \
1407 case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
1408 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1417 if (state
!= inValue
) {
1418 if (nAtts
< attsMax
)
1419 atts
[nAtts
].valuePtr
= ptr
+ MINBPC(enc
);
1423 else if (open
== BT_QUOT
) {
1425 if (nAtts
< attsMax
)
1426 atts
[nAtts
].valueEnd
= ptr
;
1431 if (state
!= inValue
) {
1432 if (nAtts
< attsMax
)
1433 atts
[nAtts
].valuePtr
= ptr
+ MINBPC(enc
);
1437 else if (open
== BT_APOS
) {
1439 if (nAtts
< attsMax
)
1440 atts
[nAtts
].valueEnd
= ptr
;
1445 if (nAtts
< attsMax
)
1446 atts
[nAtts
].normalized
= 0;
1449 if (state
== inName
)
1451 else if (state
== inValue
1453 && atts
[nAtts
].normalized
1454 && (ptr
== atts
[nAtts
].valuePtr
1455 || BYTE_TO_ASCII(enc
, ptr
) != ' '
1456 || BYTE_TO_ASCII(enc
, ptr
+ MINBPC(enc
)) == ' '
1457 || BYTE_TYPE(enc
, ptr
+ MINBPC(enc
)) == open
))
1458 atts
[nAtts
].normalized
= 0;
1460 case BT_CR
: case BT_LF
:
1461 /* This case ensures that the first attribute name is counted
1462 Apart from that we could just change state on the quote. */
1463 if (state
== inName
)
1465 else if (state
== inValue
&& nAtts
< attsMax
)
1466 atts
[nAtts
].normalized
= 0;
1470 if (state
!= inValue
)
1481 int PREFIX(charRefNumber
)(const ENCODING
*enc
, const char *ptr
)
1485 ptr
+= 2*MINBPC(enc
);
1486 if (CHAR_MATCHES(enc
, ptr
, 'x')) {
1487 for (ptr
+= MINBPC(enc
); !CHAR_MATCHES(enc
, ptr
, ';'); ptr
+= MINBPC(enc
)) {
1488 int c
= BYTE_TO_ASCII(enc
, ptr
);
1490 case '0': case '1': case '2': case '3': case '4':
1491 case '5': case '6': case '7': case '8': case '9':
1493 result
|= (c
- '0');
1495 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1497 result
+= 10 + (c
- 'A');
1499 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1501 result
+= 10 + (c
- 'a');
1504 if (result
>= 0x110000)
1509 for (; !CHAR_MATCHES(enc
, ptr
, ';'); ptr
+= MINBPC(enc
)) {
1510 int c
= BYTE_TO_ASCII(enc
, ptr
);
1512 result
+= (c
- '0');
1513 if (result
>= 0x110000)
1517 return checkCharRefNumber(result
);
1521 int PREFIX(predefinedEntityName
)(const ENCODING
*enc
, const char *ptr
, const char *end
)
1523 switch ((end
- ptr
)/MINBPC(enc
)) {
1525 if (CHAR_MATCHES(enc
, ptr
+ MINBPC(enc
), 't')) {
1526 switch (BYTE_TO_ASCII(enc
, ptr
)) {
1535 if (CHAR_MATCHES(enc
, ptr
, 'a')) {
1537 if (CHAR_MATCHES(enc
, ptr
, 'm')) {
1539 if (CHAR_MATCHES(enc
, ptr
, 'p'))
1545 switch (BYTE_TO_ASCII(enc
, ptr
)) {
1548 if (CHAR_MATCHES(enc
, ptr
, 'u')) {
1550 if (CHAR_MATCHES(enc
, ptr
, 'o')) {
1552 if (CHAR_MATCHES(enc
, ptr
, 't'))
1559 if (CHAR_MATCHES(enc
, ptr
, 'p')) {
1561 if (CHAR_MATCHES(enc
, ptr
, 'o')) {
1563 if (CHAR_MATCHES(enc
, ptr
, 's'))
1574 int PREFIX(sameName
)(const ENCODING
*enc
, const char *ptr1
, const char *ptr2
)
1577 switch (BYTE_TYPE(enc
, ptr1
)) {
1578 #define LEAD_CASE(n) \
1579 case BT_LEAD ## n: \
1580 if (*ptr1++ != *ptr2++) \
1582 LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
1585 if (*ptr1
++ != *ptr2
++)
1597 if (*ptr2
++ != *ptr1
++)
1599 if (MINBPC(enc
) > 1) {
1600 if (*ptr2
++ != *ptr1
++)
1602 if (MINBPC(enc
) > 2) {
1603 if (*ptr2
++ != *ptr1
++)
1605 if (MINBPC(enc
) > 3) {
1606 if (*ptr2
++ != *ptr1
++)
1613 if (MINBPC(enc
) == 1 && *ptr1
== *ptr2
)
1615 switch (BYTE_TYPE(enc
, ptr2
)) {
1638 int PREFIX(nameMatchesAscii
)(const ENCODING
*enc
, const char *ptr1
, const char *ptr2
)
1640 for (; *ptr2
; ptr1
+= MINBPC(enc
), ptr2
++) {
1641 if (!CHAR_MATCHES(enc
, ptr1
, *ptr2
))
1644 switch (BYTE_TYPE(enc
, ptr1
)) {
1664 int PREFIX(nameLength
)(const ENCODING
*enc
, const char *ptr
)
1666 const char *start
= ptr
;
1668 switch (BYTE_TYPE(enc
, ptr
)) {
1669 #define LEAD_CASE(n) \
1670 case BT_LEAD ## n: ptr += n; break;
1671 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1691 const char *PREFIX(skipS
)(const ENCODING
*enc
, const char *ptr
)
1694 switch (BYTE_TYPE(enc
, ptr
)) {
1707 void PREFIX(updatePosition
)(const ENCODING
*enc
,
1712 while (ptr
!= end
) {
1713 switch (BYTE_TYPE(enc
, ptr
)) {
1714 #define LEAD_CASE(n) \
1715 case BT_LEAD ## n: \
1718 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1721 pos
->columnNumber
= (unsigned)-1;
1728 if (ptr
!= end
&& BYTE_TYPE(enc
, ptr
) == BT_LF
)
1730 pos
->columnNumber
= (unsigned)-1;
1736 pos
->columnNumber
++;
1741 #undef MULTIBYTE_CASES
1742 #undef INVALID_CASES
1743 #undef CHECK_NAME_CASE
1744 #undef CHECK_NAME_CASES
1745 #undef CHECK_NMSTRT_CASE
1746 #undef CHECK_NMSTRT_CASES