2 Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
3 See the file COPYING for copying permission.
6 #ifndef IS_INVALID_CHAR
7 #define IS_INVALID_CHAR(enc, ptr, n) (0)
10 #ifndef INVALID_LEAD_CASE
11 #define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
14 return XML_TOK_PARTIAL_CHAR; \
15 if (IS_INVALID_CHAR(enc, ptr, n)) { \
16 *(nextTokPtr) = (ptr); \
17 return XML_TOK_INVALID; \
23 #define INVALID_CASES(ptr, nextTokPtr) \
24 INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
25 INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
26 INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
30 *(nextTokPtr) = (ptr); \
31 return XML_TOK_INVALID;
33 #define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
36 return XML_TOK_PARTIAL_CHAR; \
37 if (!IS_NAME_CHAR(enc, ptr, n)) { \
39 return XML_TOK_INVALID; \
44 #define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
46 if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
48 return XML_TOK_INVALID; \
57 CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
58 CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
59 CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
61 #define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
64 return XML_TOK_PARTIAL_CHAR; \
65 if (!IS_NMSTRT_CHAR(enc, ptr, n)) { \
67 return XML_TOK_INVALID; \
72 #define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
74 if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
76 return XML_TOK_INVALID; \
82 CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
83 CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
84 CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
87 #define PREFIX(ident) ident
90 /* ptr points to character following "<!-" */
93 int PREFIX(scanComment
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
94 const char **nextTokPtr
)
97 if (!CHAR_MATCHES(enc
, ptr
, ASCII_MINUS
)) {
99 return XML_TOK_INVALID
;
103 switch (BYTE_TYPE(enc
, ptr
)) {
104 INVALID_CASES(ptr
, nextTokPtr
)
106 if ((ptr
+= MINBPC(enc
)) == end
)
107 return XML_TOK_PARTIAL
;
108 if (CHAR_MATCHES(enc
, ptr
, ASCII_MINUS
)) {
109 if ((ptr
+= MINBPC(enc
)) == end
)
110 return XML_TOK_PARTIAL
;
111 if (!CHAR_MATCHES(enc
, ptr
, ASCII_GT
)) {
113 return XML_TOK_INVALID
;
115 *nextTokPtr
= ptr
+ MINBPC(enc
);
116 return XML_TOK_COMMENT
;
125 return XML_TOK_PARTIAL
;
128 /* ptr points to character following "<!" */
131 int PREFIX(scanDecl
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
132 const char **nextTokPtr
)
135 return XML_TOK_PARTIAL
;
136 switch (BYTE_TYPE(enc
, ptr
)) {
138 return PREFIX(scanComment
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
140 *nextTokPtr
= ptr
+ MINBPC(enc
);
141 return XML_TOK_COND_SECT_OPEN
;
148 return XML_TOK_INVALID
;
151 switch (BYTE_TYPE(enc
, ptr
)) {
153 if (ptr
+ MINBPC(enc
) == end
)
154 return XML_TOK_PARTIAL
;
155 /* don't allow <!ENTITY% foo "whatever"> */
156 switch (BYTE_TYPE(enc
, ptr
+ MINBPC(enc
))) {
157 case BT_S
: case BT_CR
: case BT_LF
: case BT_PERCNT
:
159 return XML_TOK_INVALID
;
162 case BT_S
: case BT_CR
: case BT_LF
:
164 return XML_TOK_DECL_OPEN
;
171 return XML_TOK_INVALID
;
174 return XML_TOK_PARTIAL
;
178 int PREFIX(checkPiTarget
)(const ENCODING
*enc
, const char *ptr
, const char *end
, int *tokPtr
)
182 *tokPtr
= XML_TOK_PI
;
183 if (end
- ptr
!= MINBPC(enc
)*3)
185 switch (BYTE_TO_ASCII(enc
, ptr
)) {
195 switch (BYTE_TO_ASCII(enc
, ptr
)) {
205 switch (BYTE_TO_ASCII(enc
, ptr
)) {
216 *tokPtr
= XML_TOK_XML_DECL
;
220 /* ptr points to character following "<?" */
223 int PREFIX(scanPi
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
224 const char **nextTokPtr
)
227 const char *target
= ptr
;
229 return XML_TOK_PARTIAL
;
230 switch (BYTE_TYPE(enc
, ptr
)) {
231 CHECK_NMSTRT_CASES(enc
, ptr
, end
, nextTokPtr
)
234 return XML_TOK_INVALID
;
237 switch (BYTE_TYPE(enc
, ptr
)) {
238 CHECK_NAME_CASES(enc
, ptr
, end
, nextTokPtr
)
239 case BT_S
: case BT_CR
: case BT_LF
:
240 if (!PREFIX(checkPiTarget
)(enc
, target
, ptr
, &tok
)) {
242 return XML_TOK_INVALID
;
246 switch (BYTE_TYPE(enc
, ptr
)) {
247 INVALID_CASES(ptr
, nextTokPtr
)
251 return XML_TOK_PARTIAL
;
252 if (CHAR_MATCHES(enc
, ptr
, ASCII_GT
)) {
253 *nextTokPtr
= ptr
+ MINBPC(enc
);
262 return XML_TOK_PARTIAL
;
264 if (!PREFIX(checkPiTarget
)(enc
, target
, ptr
, &tok
)) {
266 return XML_TOK_INVALID
;
270 return XML_TOK_PARTIAL
;
271 if (CHAR_MATCHES(enc
, ptr
, ASCII_GT
)) {
272 *nextTokPtr
= ptr
+ MINBPC(enc
);
278 return XML_TOK_INVALID
;
281 return XML_TOK_PARTIAL
;
286 int PREFIX(scanCdataSection
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
287 const char **nextTokPtr
)
289 static const char CDATA_LSQB
[] = { ASCII_C
, ASCII_D
, ASCII_A
, ASCII_T
, ASCII_A
, ASCII_LSQB
};
293 if (end
- ptr
< 6 * MINBPC(enc
))
294 return XML_TOK_PARTIAL
;
295 for (i
= 0; i
< 6; i
++, ptr
+= MINBPC(enc
)) {
296 if (!CHAR_MATCHES(enc
, ptr
, CDATA_LSQB
[i
])) {
298 return XML_TOK_INVALID
;
302 return XML_TOK_CDATA_SECT_OPEN
;
306 int PREFIX(cdataSectionTok
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
307 const char **nextTokPtr
)
311 if (MINBPC(enc
) > 1) {
312 size_t n
= end
- ptr
;
313 if (n
& (MINBPC(enc
) - 1)) {
314 n
&= ~(MINBPC(enc
) - 1);
316 return XML_TOK_PARTIAL
;
320 switch (BYTE_TYPE(enc
, ptr
)) {
324 return XML_TOK_PARTIAL
;
325 if (!CHAR_MATCHES(enc
, ptr
, ASCII_RSQB
))
329 return XML_TOK_PARTIAL
;
330 if (!CHAR_MATCHES(enc
, ptr
, ASCII_GT
)) {
334 *nextTokPtr
= ptr
+ MINBPC(enc
);
335 return XML_TOK_CDATA_SECT_CLOSE
;
339 return XML_TOK_PARTIAL
;
340 if (BYTE_TYPE(enc
, ptr
) == BT_LF
)
343 return XML_TOK_DATA_NEWLINE
;
345 *nextTokPtr
= ptr
+ MINBPC(enc
);
346 return XML_TOK_DATA_NEWLINE
;
347 INVALID_CASES(ptr
, nextTokPtr
)
353 switch (BYTE_TYPE(enc
, ptr
)) {
354 #define LEAD_CASE(n) \
356 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
358 return XML_TOK_DATA_CHARS; \
362 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
371 return XML_TOK_DATA_CHARS
;
378 return XML_TOK_DATA_CHARS
;
381 /* ptr points to character following "</" */
384 int PREFIX(scanEndTag
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
385 const char **nextTokPtr
)
388 return XML_TOK_PARTIAL
;
389 switch (BYTE_TYPE(enc
, ptr
)) {
390 CHECK_NMSTRT_CASES(enc
, ptr
, end
, nextTokPtr
)
393 return XML_TOK_INVALID
;
396 switch (BYTE_TYPE(enc
, ptr
)) {
397 CHECK_NAME_CASES(enc
, ptr
, end
, nextTokPtr
)
398 case BT_S
: case BT_CR
: case BT_LF
:
399 for (ptr
+= MINBPC(enc
); ptr
!= end
; ptr
+= MINBPC(enc
)) {
400 switch (BYTE_TYPE(enc
, ptr
)) {
401 case BT_S
: case BT_CR
: case BT_LF
:
404 *nextTokPtr
= ptr
+ MINBPC(enc
);
405 return XML_TOK_END_TAG
;
408 return XML_TOK_INVALID
;
411 return XML_TOK_PARTIAL
;
414 /* no need to check qname syntax here, since end-tag must match exactly */
419 *nextTokPtr
= ptr
+ MINBPC(enc
);
420 return XML_TOK_END_TAG
;
423 return XML_TOK_INVALID
;
426 return XML_TOK_PARTIAL
;
429 /* ptr points to character following "&#X" */
432 int PREFIX(scanHexCharRef
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
433 const char **nextTokPtr
)
436 switch (BYTE_TYPE(enc
, ptr
)) {
442 return XML_TOK_INVALID
;
444 for (ptr
+= MINBPC(enc
); ptr
!= end
; ptr
+= MINBPC(enc
)) {
445 switch (BYTE_TYPE(enc
, ptr
)) {
450 *nextTokPtr
= ptr
+ MINBPC(enc
);
451 return XML_TOK_CHAR_REF
;
454 return XML_TOK_INVALID
;
458 return XML_TOK_PARTIAL
;
461 /* ptr points to character following "&#" */
464 int PREFIX(scanCharRef
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
465 const char **nextTokPtr
)
468 if (CHAR_MATCHES(enc
, ptr
, ASCII_x
))
469 return PREFIX(scanHexCharRef
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
470 switch (BYTE_TYPE(enc
, ptr
)) {
475 return XML_TOK_INVALID
;
477 for (ptr
+= MINBPC(enc
); ptr
!= end
; ptr
+= MINBPC(enc
)) {
478 switch (BYTE_TYPE(enc
, ptr
)) {
482 *nextTokPtr
= ptr
+ MINBPC(enc
);
483 return XML_TOK_CHAR_REF
;
486 return XML_TOK_INVALID
;
490 return XML_TOK_PARTIAL
;
493 /* ptr points to character following "&" */
496 int PREFIX(scanRef
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
497 const char **nextTokPtr
)
500 return XML_TOK_PARTIAL
;
501 switch (BYTE_TYPE(enc
, ptr
)) {
502 CHECK_NMSTRT_CASES(enc
, ptr
, end
, nextTokPtr
)
504 return PREFIX(scanCharRef
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
507 return XML_TOK_INVALID
;
510 switch (BYTE_TYPE(enc
, ptr
)) {
511 CHECK_NAME_CASES(enc
, ptr
, end
, nextTokPtr
)
513 *nextTokPtr
= ptr
+ MINBPC(enc
);
514 return XML_TOK_ENTITY_REF
;
517 return XML_TOK_INVALID
;
520 return XML_TOK_PARTIAL
;
523 /* ptr points to character following first character of attribute name */
526 int PREFIX(scanAtts
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
527 const char **nextTokPtr
)
533 switch (BYTE_TYPE(enc
, ptr
)) {
534 CHECK_NAME_CASES(enc
, ptr
, end
, nextTokPtr
)
539 return XML_TOK_INVALID
;
544 return XML_TOK_PARTIAL
;
545 switch (BYTE_TYPE(enc
, ptr
)) {
546 CHECK_NMSTRT_CASES(enc
, ptr
, end
, nextTokPtr
)
549 return XML_TOK_INVALID
;
553 case BT_S
: case BT_CR
: case BT_LF
:
559 return XML_TOK_PARTIAL
;
560 t
= BYTE_TYPE(enc
, ptr
);
570 return XML_TOK_INVALID
;
584 return XML_TOK_PARTIAL
;
585 open
= BYTE_TYPE(enc
, ptr
);
586 if (open
== BT_QUOT
|| open
== BT_APOS
)
595 return XML_TOK_INVALID
;
599 /* in attribute value */
603 return XML_TOK_PARTIAL
;
604 t
= BYTE_TYPE(enc
, ptr
);
608 INVALID_CASES(ptr
, nextTokPtr
)
611 int tok
= PREFIX(scanRef
)(enc
, ptr
+ MINBPC(enc
), end
, &ptr
);
613 if (tok
== XML_TOK_INVALID
)
621 return XML_TOK_INVALID
;
629 return XML_TOK_PARTIAL
;
630 switch (BYTE_TYPE(enc
, ptr
)) {
641 return XML_TOK_INVALID
;
643 /* ptr points to closing quote */
647 return XML_TOK_PARTIAL
;
648 switch (BYTE_TYPE(enc
, ptr
)) {
649 CHECK_NMSTRT_CASES(enc
, ptr
, end
, nextTokPtr
)
650 case BT_S
: case BT_CR
: case BT_LF
:
654 *nextTokPtr
= ptr
+ MINBPC(enc
);
655 return XML_TOK_START_TAG_WITH_ATTS
;
660 return XML_TOK_PARTIAL
;
661 if (!CHAR_MATCHES(enc
, ptr
, ASCII_GT
)) {
663 return XML_TOK_INVALID
;
665 *nextTokPtr
= ptr
+ MINBPC(enc
);
666 return XML_TOK_EMPTY_ELEMENT_WITH_ATTS
;
669 return XML_TOK_INVALID
;
677 return XML_TOK_INVALID
;
680 return XML_TOK_PARTIAL
;
683 /* ptr points to character following "<" */
686 int PREFIX(scanLt
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
687 const char **nextTokPtr
)
693 return XML_TOK_PARTIAL
;
694 switch (BYTE_TYPE(enc
, ptr
)) {
695 CHECK_NMSTRT_CASES(enc
, ptr
, end
, nextTokPtr
)
697 if ((ptr
+= MINBPC(enc
)) == end
)
698 return XML_TOK_PARTIAL
;
699 switch (BYTE_TYPE(enc
, ptr
)) {
701 return PREFIX(scanComment
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
703 return PREFIX(scanCdataSection
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
706 return XML_TOK_INVALID
;
708 return PREFIX(scanPi
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
710 return PREFIX(scanEndTag
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
713 return XML_TOK_INVALID
;
718 /* we have a start-tag */
720 switch (BYTE_TYPE(enc
, ptr
)) {
721 CHECK_NAME_CASES(enc
, ptr
, end
, nextTokPtr
)
726 return XML_TOK_INVALID
;
731 return XML_TOK_PARTIAL
;
732 switch (BYTE_TYPE(enc
, ptr
)) {
733 CHECK_NMSTRT_CASES(enc
, ptr
, end
, nextTokPtr
)
736 return XML_TOK_INVALID
;
740 case BT_S
: case BT_CR
: case BT_LF
:
744 switch (BYTE_TYPE(enc
, ptr
)) {
745 CHECK_NMSTRT_CASES(enc
, ptr
, end
, nextTokPtr
)
750 case BT_S
: case BT_CR
: case BT_LF
:
755 return XML_TOK_INVALID
;
757 return PREFIX(scanAtts
)(enc
, ptr
, end
, nextTokPtr
);
759 return XML_TOK_PARTIAL
;
763 *nextTokPtr
= ptr
+ MINBPC(enc
);
764 return XML_TOK_START_TAG_NO_ATTS
;
769 return XML_TOK_PARTIAL
;
770 if (!CHAR_MATCHES(enc
, ptr
, ASCII_GT
)) {
772 return XML_TOK_INVALID
;
774 *nextTokPtr
= ptr
+ MINBPC(enc
);
775 return XML_TOK_EMPTY_ELEMENT_NO_ATTS
;
778 return XML_TOK_INVALID
;
781 return XML_TOK_PARTIAL
;
785 int PREFIX(contentTok
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
786 const char **nextTokPtr
)
790 if (MINBPC(enc
) > 1) {
791 size_t n
= end
- ptr
;
792 if (n
& (MINBPC(enc
) - 1)) {
793 n
&= ~(MINBPC(enc
) - 1);
795 return XML_TOK_PARTIAL
;
799 switch (BYTE_TYPE(enc
, ptr
)) {
801 return PREFIX(scanLt
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
803 return PREFIX(scanRef
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
807 return XML_TOK_TRAILING_CR
;
808 if (BYTE_TYPE(enc
, ptr
) == BT_LF
)
811 return XML_TOK_DATA_NEWLINE
;
813 *nextTokPtr
= ptr
+ MINBPC(enc
);
814 return XML_TOK_DATA_NEWLINE
;
818 return XML_TOK_TRAILING_RSQB
;
819 if (!CHAR_MATCHES(enc
, ptr
, ASCII_RSQB
))
823 return XML_TOK_TRAILING_RSQB
;
824 if (!CHAR_MATCHES(enc
, ptr
, ASCII_GT
)) {
829 return XML_TOK_INVALID
;
830 INVALID_CASES(ptr
, nextTokPtr
)
836 switch (BYTE_TYPE(enc
, ptr
)) {
837 #define LEAD_CASE(n) \
839 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
841 return XML_TOK_DATA_CHARS; \
845 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
848 if (ptr
+ MINBPC(enc
) != end
) {
849 if (!CHAR_MATCHES(enc
, ptr
+ MINBPC(enc
), ASCII_RSQB
)) {
853 if (ptr
+ 2*MINBPC(enc
) != end
) {
854 if (!CHAR_MATCHES(enc
, ptr
+ 2*MINBPC(enc
), ASCII_GT
)) {
858 *nextTokPtr
= ptr
+ 2*MINBPC(enc
);
859 return XML_TOK_INVALID
;
871 return XML_TOK_DATA_CHARS
;
878 return XML_TOK_DATA_CHARS
;
881 /* ptr points to character following "%" */
884 int PREFIX(scanPercent
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
885 const char **nextTokPtr
)
888 return XML_TOK_PARTIAL
;
889 switch (BYTE_TYPE(enc
, ptr
)) {
890 CHECK_NMSTRT_CASES(enc
, ptr
, end
, nextTokPtr
)
891 case BT_S
: case BT_LF
: case BT_CR
: case BT_PERCNT
:
893 return XML_TOK_PERCENT
;
896 return XML_TOK_INVALID
;
899 switch (BYTE_TYPE(enc
, ptr
)) {
900 CHECK_NAME_CASES(enc
, ptr
, end
, nextTokPtr
)
902 *nextTokPtr
= ptr
+ MINBPC(enc
);
903 return XML_TOK_PARAM_ENTITY_REF
;
906 return XML_TOK_INVALID
;
909 return XML_TOK_PARTIAL
;
913 int PREFIX(scanPoundName
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
914 const char **nextTokPtr
)
917 return XML_TOK_PARTIAL
;
918 switch (BYTE_TYPE(enc
, ptr
)) {
919 CHECK_NMSTRT_CASES(enc
, ptr
, end
, nextTokPtr
)
922 return XML_TOK_INVALID
;
925 switch (BYTE_TYPE(enc
, ptr
)) {
926 CHECK_NAME_CASES(enc
, ptr
, end
, nextTokPtr
)
927 case BT_CR
: case BT_LF
: case BT_S
:
928 case BT_RPAR
: case BT_GT
: case BT_PERCNT
: case BT_VERBAR
:
930 return XML_TOK_POUND_NAME
;
933 return XML_TOK_INVALID
;
936 return -XML_TOK_POUND_NAME
;
940 int PREFIX(scanLit
)(int open
, const ENCODING
*enc
,
941 const char *ptr
, const char *end
,
942 const char **nextTokPtr
)
945 int t
= BYTE_TYPE(enc
, ptr
);
947 INVALID_CASES(ptr
, nextTokPtr
)
954 return -XML_TOK_LITERAL
;
956 switch (BYTE_TYPE(enc
, ptr
)) {
957 case BT_S
: case BT_CR
: case BT_LF
:
958 case BT_GT
: case BT_PERCNT
: case BT_LSQB
:
959 return XML_TOK_LITERAL
;
961 return XML_TOK_INVALID
;
968 return XML_TOK_PARTIAL
;
972 int PREFIX(prologTok
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
973 const char **nextTokPtr
)
978 if (MINBPC(enc
) > 1) {
979 size_t n
= end
- ptr
;
980 if (n
& (MINBPC(enc
) - 1)) {
981 n
&= ~(MINBPC(enc
) - 1);
983 return XML_TOK_PARTIAL
;
987 switch (BYTE_TYPE(enc
, ptr
)) {
989 return PREFIX(scanLit
)(BT_QUOT
, enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
991 return PREFIX(scanLit
)(BT_APOS
, enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
996 return XML_TOK_PARTIAL
;
997 switch (BYTE_TYPE(enc
, ptr
)) {
999 return PREFIX(scanDecl
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
1001 return PREFIX(scanPi
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
1008 *nextTokPtr
= ptr
- MINBPC(enc
);
1009 return XML_TOK_INSTANCE_START
;
1012 return XML_TOK_INVALID
;
1015 if (ptr
+ MINBPC(enc
) == end
)
1016 return -XML_TOK_PROLOG_S
;
1018 case BT_S
: case BT_LF
:
1023 switch (BYTE_TYPE(enc
, ptr
)) {
1024 case BT_S
: case BT_LF
:
1027 /* don't split CR/LF pair */
1028 if (ptr
+ MINBPC(enc
) != end
)
1033 return XML_TOK_PROLOG_S
;
1037 return XML_TOK_PROLOG_S
;
1039 return PREFIX(scanPercent
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
1041 *nextTokPtr
= ptr
+ MINBPC(enc
);
1042 return XML_TOK_COMMA
;
1044 *nextTokPtr
= ptr
+ MINBPC(enc
);
1045 return XML_TOK_OPEN_BRACKET
;
1049 return -XML_TOK_CLOSE_BRACKET
;
1050 if (CHAR_MATCHES(enc
, ptr
, ASCII_RSQB
)) {
1051 if (ptr
+ MINBPC(enc
) == end
)
1052 return XML_TOK_PARTIAL
;
1053 if (CHAR_MATCHES(enc
, ptr
+ MINBPC(enc
), ASCII_GT
)) {
1054 *nextTokPtr
= ptr
+ 2*MINBPC(enc
);
1055 return XML_TOK_COND_SECT_CLOSE
;
1059 return XML_TOK_CLOSE_BRACKET
;
1061 *nextTokPtr
= ptr
+ MINBPC(enc
);
1062 return XML_TOK_OPEN_PAREN
;
1066 return -XML_TOK_CLOSE_PAREN
;
1067 switch (BYTE_TYPE(enc
, ptr
)) {
1069 *nextTokPtr
= ptr
+ MINBPC(enc
);
1070 return XML_TOK_CLOSE_PAREN_ASTERISK
;
1072 *nextTokPtr
= ptr
+ MINBPC(enc
);
1073 return XML_TOK_CLOSE_PAREN_QUESTION
;
1075 *nextTokPtr
= ptr
+ MINBPC(enc
);
1076 return XML_TOK_CLOSE_PAREN_PLUS
;
1077 case BT_CR
: case BT_LF
: case BT_S
:
1078 case BT_GT
: case BT_COMMA
: case BT_VERBAR
:
1081 return XML_TOK_CLOSE_PAREN
;
1084 return XML_TOK_INVALID
;
1086 *nextTokPtr
= ptr
+ MINBPC(enc
);
1089 *nextTokPtr
= ptr
+ MINBPC(enc
);
1090 return XML_TOK_DECL_CLOSE
;
1092 return PREFIX(scanPoundName
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
1093 #define LEAD_CASE(n) \
1094 case BT_LEAD ## n: \
1095 if (end - ptr < n) \
1096 return XML_TOK_PARTIAL_CHAR; \
1097 if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
1099 tok = XML_TOK_NAME; \
1102 if (IS_NAME_CHAR(enc, ptr, n)) { \
1104 tok = XML_TOK_NMTOKEN; \
1107 *nextTokPtr = ptr; \
1108 return XML_TOK_INVALID;
1109 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1122 tok
= XML_TOK_NMTOKEN
;
1126 if (IS_NMSTRT_CHAR_MINBPC(enc
, ptr
)) {
1131 if (IS_NAME_CHAR_MINBPC(enc
, ptr
)) {
1133 tok
= XML_TOK_NMTOKEN
;
1139 return XML_TOK_INVALID
;
1141 while (ptr
!= end
) {
1142 switch (BYTE_TYPE(enc
, ptr
)) {
1143 CHECK_NAME_CASES(enc
, ptr
, end
, nextTokPtr
)
1144 case BT_GT
: case BT_RPAR
: case BT_COMMA
:
1145 case BT_VERBAR
: case BT_LSQB
: case BT_PERCNT
:
1146 case BT_S
: case BT_CR
: case BT_LF
:
1155 return XML_TOK_PARTIAL
;
1156 tok
= XML_TOK_PREFIXED_NAME
;
1157 switch (BYTE_TYPE(enc
, ptr
)) {
1158 CHECK_NAME_CASES(enc
, ptr
, end
, nextTokPtr
)
1160 tok
= XML_TOK_NMTOKEN
;
1164 case XML_TOK_PREFIXED_NAME
:
1165 tok
= XML_TOK_NMTOKEN
;
1171 if (tok
== XML_TOK_NMTOKEN
) {
1173 return XML_TOK_INVALID
;
1175 *nextTokPtr
= ptr
+ MINBPC(enc
);
1176 return XML_TOK_NAME_PLUS
;
1178 if (tok
== XML_TOK_NMTOKEN
) {
1180 return XML_TOK_INVALID
;
1182 *nextTokPtr
= ptr
+ MINBPC(enc
);
1183 return XML_TOK_NAME_ASTERISK
;
1185 if (tok
== XML_TOK_NMTOKEN
) {
1187 return XML_TOK_INVALID
;
1189 *nextTokPtr
= ptr
+ MINBPC(enc
);
1190 return XML_TOK_NAME_QUESTION
;
1193 return XML_TOK_INVALID
;
1200 int PREFIX(attributeValueTok
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
1201 const char **nextTokPtr
)
1205 return XML_TOK_NONE
;
1207 while (ptr
!= end
) {
1208 switch (BYTE_TYPE(enc
, ptr
)) {
1209 #define LEAD_CASE(n) \
1210 case BT_LEAD ## n: ptr += n; break;
1211 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1215 return PREFIX(scanRef
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
1217 return XML_TOK_DATA_CHARS
;
1219 /* this is for inside entity references */
1221 return XML_TOK_INVALID
;
1224 *nextTokPtr
= ptr
+ MINBPC(enc
);
1225 return XML_TOK_DATA_NEWLINE
;
1228 return XML_TOK_DATA_CHARS
;
1233 return XML_TOK_TRAILING_CR
;
1234 if (BYTE_TYPE(enc
, ptr
) == BT_LF
)
1237 return XML_TOK_DATA_NEWLINE
;
1240 return XML_TOK_DATA_CHARS
;
1243 *nextTokPtr
= ptr
+ MINBPC(enc
);
1244 return XML_TOK_ATTRIBUTE_VALUE_S
;
1247 return XML_TOK_DATA_CHARS
;
1254 return XML_TOK_DATA_CHARS
;
1258 int PREFIX(entityValueTok
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
1259 const char **nextTokPtr
)
1263 return XML_TOK_NONE
;
1265 while (ptr
!= end
) {
1266 switch (BYTE_TYPE(enc
, ptr
)) {
1267 #define LEAD_CASE(n) \
1268 case BT_LEAD ## n: ptr += n; break;
1269 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1273 return PREFIX(scanRef
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
1275 return XML_TOK_DATA_CHARS
;
1278 int tok
= PREFIX(scanPercent
)(enc
, ptr
+ MINBPC(enc
),
1280 return (tok
== XML_TOK_PERCENT
) ? XML_TOK_INVALID
: tok
;
1283 return XML_TOK_DATA_CHARS
;
1286 *nextTokPtr
= ptr
+ MINBPC(enc
);
1287 return XML_TOK_DATA_NEWLINE
;
1290 return XML_TOK_DATA_CHARS
;
1295 return XML_TOK_TRAILING_CR
;
1296 if (BYTE_TYPE(enc
, ptr
) == BT_LF
)
1299 return XML_TOK_DATA_NEWLINE
;
1302 return XML_TOK_DATA_CHARS
;
1309 return XML_TOK_DATA_CHARS
;
1315 int PREFIX(ignoreSectionTok
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
1316 const char **nextTokPtr
)
1319 if (MINBPC(enc
) > 1) {
1320 size_t n
= end
- ptr
;
1321 if (n
& (MINBPC(enc
) - 1)) {
1322 n
&= ~(MINBPC(enc
) - 1);
1326 while (ptr
!= end
) {
1327 switch (BYTE_TYPE(enc
, ptr
)) {
1328 INVALID_CASES(ptr
, nextTokPtr
)
1330 if ((ptr
+= MINBPC(enc
)) == end
)
1331 return XML_TOK_PARTIAL
;
1332 if (CHAR_MATCHES(enc
, ptr
, ASCII_EXCL
)) {
1333 if ((ptr
+= MINBPC(enc
)) == end
)
1334 return XML_TOK_PARTIAL
;
1335 if (CHAR_MATCHES(enc
, ptr
, ASCII_LSQB
)) {
1342 if ((ptr
+= MINBPC(enc
)) == end
)
1343 return XML_TOK_PARTIAL
;
1344 if (CHAR_MATCHES(enc
, ptr
, ASCII_RSQB
)) {
1345 if ((ptr
+= MINBPC(enc
)) == end
)
1346 return XML_TOK_PARTIAL
;
1347 if (CHAR_MATCHES(enc
, ptr
, ASCII_GT
)) {
1351 return XML_TOK_IGNORE_SECT
;
1362 return XML_TOK_PARTIAL
;
1365 #endif /* XML_DTD */
1368 int PREFIX(isPublicId
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
1369 const char **badPtr
)
1373 for (; ptr
!= end
; ptr
+= MINBPC(enc
)) {
1374 switch (BYTE_TYPE(enc
, ptr
)) {
1398 if (CHAR_MATCHES(enc
, ptr
, ASCII_TAB
)) {
1405 if (!(BYTE_TO_ASCII(enc
, ptr
) & ~0x7f))
1408 switch (BYTE_TO_ASCII(enc
, ptr
)) {
1422 /* This must only be called for a well-formed start-tag or empty element tag.
1423 Returns the number of attributes. Pointers to the first attsMax attributes
1424 are stored in atts. */
1427 int PREFIX(getAtts
)(const ENCODING
*enc
, const char *ptr
,
1428 int attsMax
, ATTRIBUTE
*atts
)
1430 enum { other
, inName
, inValue
} state
= inName
;
1432 int open
= 0; /* defined when state == inValue;
1433 initialization just to shut up compilers */
1435 for (ptr
+= MINBPC(enc
);; ptr
+= MINBPC(enc
)) {
1436 switch (BYTE_TYPE(enc
, ptr
)) {
1437 #define START_NAME \
1438 if (state == other) { \
1439 if (nAtts < attsMax) { \
1440 atts[nAtts].name = ptr; \
1441 atts[nAtts].normalized = 1; \
1445 #define LEAD_CASE(n) \
1446 case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
1447 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1456 if (state
!= inValue
) {
1457 if (nAtts
< attsMax
)
1458 atts
[nAtts
].valuePtr
= ptr
+ MINBPC(enc
);
1462 else if (open
== BT_QUOT
) {
1464 if (nAtts
< attsMax
)
1465 atts
[nAtts
].valueEnd
= ptr
;
1470 if (state
!= inValue
) {
1471 if (nAtts
< attsMax
)
1472 atts
[nAtts
].valuePtr
= ptr
+ MINBPC(enc
);
1476 else if (open
== BT_APOS
) {
1478 if (nAtts
< attsMax
)
1479 atts
[nAtts
].valueEnd
= ptr
;
1484 if (nAtts
< attsMax
)
1485 atts
[nAtts
].normalized
= 0;
1488 if (state
== inName
)
1490 else if (state
== inValue
1492 && atts
[nAtts
].normalized
1493 && (ptr
== atts
[nAtts
].valuePtr
1494 || BYTE_TO_ASCII(enc
, ptr
) != ASCII_SPACE
1495 || BYTE_TO_ASCII(enc
, ptr
+ MINBPC(enc
)) == ASCII_SPACE
1496 || BYTE_TYPE(enc
, ptr
+ MINBPC(enc
)) == open
))
1497 atts
[nAtts
].normalized
= 0;
1499 case BT_CR
: case BT_LF
:
1500 /* This case ensures that the first attribute name is counted
1501 Apart from that we could just change state on the quote. */
1502 if (state
== inName
)
1504 else if (state
== inValue
&& nAtts
< attsMax
)
1505 atts
[nAtts
].normalized
= 0;
1509 if (state
!= inValue
)
1520 int PREFIX(charRefNumber
)(const ENCODING
*enc
, const char *ptr
)
1525 ptr
+= 2*MINBPC(enc
);
1526 if (CHAR_MATCHES(enc
, ptr
, ASCII_x
)) {
1527 for (ptr
+= MINBPC(enc
); !CHAR_MATCHES(enc
, ptr
, ASCII_SEMI
); ptr
+= MINBPC(enc
)) {
1528 int c
= BYTE_TO_ASCII(enc
, ptr
);
1530 case ASCII_0
: case ASCII_1
: case ASCII_2
: case ASCII_3
: case ASCII_4
:
1531 case ASCII_5
: case ASCII_6
: case ASCII_7
: case ASCII_8
: case ASCII_9
:
1533 result
|= (c
- ASCII_0
);
1535 case ASCII_A
: case ASCII_B
: case ASCII_C
: case ASCII_D
: case ASCII_E
: case ASCII_F
:
1537 result
+= 10 + (c
- ASCII_A
);
1539 case ASCII_a
: case ASCII_b
: case ASCII_c
: case ASCII_d
: case ASCII_e
: case ASCII_f
:
1541 result
+= 10 + (c
- ASCII_a
);
1544 if (result
>= 0x110000)
1549 for (; !CHAR_MATCHES(enc
, ptr
, ASCII_SEMI
); ptr
+= MINBPC(enc
)) {
1550 int c
= BYTE_TO_ASCII(enc
, ptr
);
1552 result
+= (c
- ASCII_0
);
1553 if (result
>= 0x110000)
1557 return checkCharRefNumber(result
);
1561 int PREFIX(predefinedEntityName
)(const ENCODING
*enc
, const char *ptr
, const char *end
)
1564 switch ((end
- ptr
)/MINBPC(enc
)) {
1566 if (CHAR_MATCHES(enc
, ptr
+ MINBPC(enc
), ASCII_t
)) {
1567 switch (BYTE_TO_ASCII(enc
, ptr
)) {
1576 if (CHAR_MATCHES(enc
, ptr
, ASCII_a
)) {
1578 if (CHAR_MATCHES(enc
, ptr
, ASCII_m
)) {
1580 if (CHAR_MATCHES(enc
, ptr
, ASCII_p
))
1586 switch (BYTE_TO_ASCII(enc
, ptr
)) {
1589 if (CHAR_MATCHES(enc
, ptr
, ASCII_u
)) {
1591 if (CHAR_MATCHES(enc
, ptr
, ASCII_o
)) {
1593 if (CHAR_MATCHES(enc
, ptr
, ASCII_t
))
1600 if (CHAR_MATCHES(enc
, ptr
, ASCII_p
)) {
1602 if (CHAR_MATCHES(enc
, ptr
, ASCII_o
)) {
1604 if (CHAR_MATCHES(enc
, ptr
, ASCII_s
))
1615 int PREFIX(sameName
)(const ENCODING
*enc
, const char *ptr1
, const char *ptr2
)
1618 switch (BYTE_TYPE(enc
, ptr1
)) {
1619 #define LEAD_CASE(n) \
1620 case BT_LEAD ## n: \
1621 if (*ptr1++ != *ptr2++) \
1623 LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
1626 if (*ptr1
++ != *ptr2
++)
1638 if (*ptr2
++ != *ptr1
++)
1640 if (MINBPC(enc
) > 1) {
1641 if (*ptr2
++ != *ptr1
++)
1643 if (MINBPC(enc
) > 2) {
1644 if (*ptr2
++ != *ptr1
++)
1646 if (MINBPC(enc
) > 3) {
1647 if (*ptr2
++ != *ptr1
++)
1654 if (MINBPC(enc
) == 1 && *ptr1
== *ptr2
)
1656 switch (BYTE_TYPE(enc
, ptr2
)) {
1679 int PREFIX(nameMatchesAscii
)(const ENCODING
*enc
, const char *ptr1
,
1680 const char *end1
, const char *ptr2
)
1683 for (; *ptr2
; ptr1
+= MINBPC(enc
), ptr2
++) {
1686 if (!CHAR_MATCHES(enc
, ptr1
, *ptr2
))
1689 return ptr1
== end1
;
1693 int PREFIX(nameLength
)(const ENCODING
*enc
, const char *ptr
)
1695 const char *start
= ptr
;
1697 switch (BYTE_TYPE(enc
, ptr
)) {
1698 #define LEAD_CASE(n) \
1699 case BT_LEAD ## n: ptr += n; break;
1700 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1720 const char *PREFIX(skipS
)(const ENCODING
*enc
, const char *ptr
)
1723 switch (BYTE_TYPE(enc
, ptr
)) {
1736 void PREFIX(updatePosition
)(const ENCODING
*enc
,
1741 while (ptr
!= end
) {
1742 switch (BYTE_TYPE(enc
, ptr
)) {
1743 #define LEAD_CASE(n) \
1744 case BT_LEAD ## n: \
1747 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1750 pos
->columnNumber
= (unsigned)-1;
1757 if (ptr
!= end
&& BYTE_TYPE(enc
, ptr
) == BT_LF
)
1759 pos
->columnNumber
= (unsigned)-1;
1765 pos
->columnNumber
++;
1770 #undef MULTIBYTE_CASES
1771 #undef INVALID_CASES
1772 #undef CHECK_NAME_CASE
1773 #undef CHECK_NAME_CASES
1774 #undef CHECK_NMSTRT_CASE
1775 #undef CHECK_NMSTRT_CASES