[gaim-migrate @ 3063]
[pidgin-git.git] / src / protocols / jabber / xmltok_impl.c
blob7d9c8d9642fc99a63e0f315c317344eb2d0f2962
/*
The contents of this file are subject to the Mozilla Public License
Version 1.1 (the "License"); you may not use this file except in
compliance with the License. You may obtain a copy of the License at
http://www.mozilla.org/MPL/

Software distributed under the License is distributed on an "AS IS"
basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific language governing rights and limitations
under the License.

The Original Code is expat.

The Initial Developer of the Original Code is James Clark.
Portions created by James Clark are Copyright (C) 1998, 1999
James Clark. All Rights Reserved.

Contributor(s):

Alternatively, the contents of this file may be used under the terms
of the GNU General Public License (the "GPL"), in which case the
provisions of the GPL are applicable instead of those above.  If you
wish to allow use of your version of this file only under the terms of
the GPL and not to allow others to use your version of this file under
the MPL, indicate your decision by deleting the provisions above and
replace them with the notice and other provisions required by the
GPL.  If you do not delete the provisions above, a recipient may use
your version of this file under either the MPL or the GPL.
*/
/* Fallback: unless the including file defines IS_INVALID_CHAR, no
   multi-byte sequence is considered invalid. */
#ifndef IS_INVALID_CHAR
#define IS_INVALID_CHAR(enc, ptr, n) (0)
#endif

/* Switch case for an n-byte lead byte.  Returns XML_TOK_PARTIAL_CHAR when
   fewer than n bytes remain, XML_TOK_INVALID (with *nextTokPtr at the
   sequence) when IS_INVALID_CHAR rejects it, and otherwise skips the whole
   sequence.  Uses the identifiers `enc` and `end` from the expansion site;
   they are not macro parameters. */
#define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
    case BT_LEAD ## n: \
      if (end - ptr < n) \
        return XML_TOK_PARTIAL_CHAR; \
      if (IS_INVALID_CHAR(enc, ptr, n)) { \
        *(nextTokPtr) = (ptr); \
        return XML_TOK_INVALID; \
      } \
      ptr += n; \
      break;

/* Switch cases shared by scanners that accept arbitrary characters:
   validates 2-, 3- and 4-byte sequences and rejects BT_NONXML, BT_MALFORM
   and BT_TRAIL bytes with XML_TOK_INVALID. */
#define INVALID_CASES(ptr, nextTokPtr) \
  INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
  case BT_NONXML: \
  case BT_MALFORM: \
  case BT_TRAIL: \
    *(nextTokPtr) = (ptr); \
    return XML_TOK_INVALID;
/* Switch case for an n-byte lead byte inside a name.  Returns
   XML_TOK_PARTIAL_CHAR when fewer than n bytes remain, XML_TOK_INVALID
   (with *nextTokPtr at the sequence) when it is not a name character, and
   otherwise skips the sequence. */
#define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
   case BT_LEAD ## n: \
     if (end - ptr < n) \
       return XML_TOK_PARTIAL_CHAR; \
     if (!IS_NAME_CHAR(enc, ptr, n)) { \
       *nextTokPtr = ptr; \
       return XML_TOK_INVALID; \
     } \
     ptr += n; \
     break;

/* Switch cases that consume one name character.  BT_NONASCII is checked
   with IS_NAME_CHAR_MINBPC and then deliberately falls through to the
   single-unit cases, which advance ptr by MINBPC(enc). */
#define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_INVALID; \
    } \
  case BT_NMSTRT: \
  case BT_HEX: \
  case BT_DIGIT: \
  case BT_NAME: \
  case BT_MINUS: \
    ptr += MINBPC(enc); \
    break; \
  CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
  CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
  CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
/* Switch case for an n-byte lead byte at the start of a name.  Returns
   XML_TOK_PARTIAL_CHAR when fewer than n bytes remain, XML_TOK_INVALID
   (with *nextTokPtr at the sequence) when it is not a name-start
   character, and otherwise skips the sequence. */
#define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
   case BT_LEAD ## n: \
     if (end - ptr < n) \
       return XML_TOK_PARTIAL_CHAR; \
     if (!IS_NMSTRT_CHAR(enc, ptr, n)) { \
       *nextTokPtr = ptr; \
       return XML_TOK_INVALID; \
     } \
     ptr += n; \
     break;

/* Switch cases that consume one name-start character.  BT_NONASCII is
   checked with IS_NMSTRT_CHAR_MINBPC and then deliberately falls through
   to the single-unit cases, which advance ptr by MINBPC(enc). */
#define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_INVALID; \
    } \
  case BT_NMSTRT: \
  case BT_HEX: \
    ptr += MINBPC(enc); \
    break; \
  CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
  CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
  CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
/* Name-mangling hook: the including file may define PREFIX to give each
   function in this file a distinct name; by default names are unchanged. */
#ifndef PREFIX
#define PREFIX(ident) ident
#endif
113 /* ptr points to character following "<!-" */
115 static
116 int PREFIX(scanComment)(const ENCODING *enc, const char *ptr, const char *end,
117 const char **nextTokPtr)
119 if (ptr != end) {
120 if (!CHAR_MATCHES(enc, ptr, '-')) {
121 *nextTokPtr = ptr;
122 return XML_TOK_INVALID;
124 ptr += MINBPC(enc);
125 while (ptr != end) {
126 switch (BYTE_TYPE(enc, ptr)) {
127 INVALID_CASES(ptr, nextTokPtr)
128 case BT_MINUS:
129 if ((ptr += MINBPC(enc)) == end)
130 return XML_TOK_PARTIAL;
131 if (CHAR_MATCHES(enc, ptr, '-')) {
132 if ((ptr += MINBPC(enc)) == end)
133 return XML_TOK_PARTIAL;
134 if (!CHAR_MATCHES(enc, ptr, '>')) {
135 *nextTokPtr = ptr;
136 return XML_TOK_INVALID;
138 *nextTokPtr = ptr + MINBPC(enc);
139 return XML_TOK_COMMENT;
141 break;
142 default:
143 ptr += MINBPC(enc);
144 break;
148 return XML_TOK_PARTIAL;
151 /* ptr points to character following "<!" */
153 static
154 int PREFIX(scanDecl)(const ENCODING *enc, const char *ptr, const char *end,
155 const char **nextTokPtr)
157 if (ptr == end)
158 return XML_TOK_PARTIAL;
159 switch (BYTE_TYPE(enc, ptr)) {
160 case BT_MINUS:
161 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
162 case BT_LSQB:
163 *nextTokPtr = ptr + MINBPC(enc);
164 return XML_TOK_COND_SECT_OPEN;
165 case BT_NMSTRT:
166 case BT_HEX:
167 ptr += MINBPC(enc);
168 break;
169 default:
170 *nextTokPtr = ptr;
171 return XML_TOK_INVALID;
173 while (ptr != end) {
174 switch (BYTE_TYPE(enc, ptr)) {
175 case BT_PERCNT:
176 if (ptr + MINBPC(enc) == end)
177 return XML_TOK_PARTIAL;
178 /* don't allow <!ENTITY% foo "whatever"> */
179 switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
180 case BT_S: case BT_CR: case BT_LF: case BT_PERCNT:
181 *nextTokPtr = ptr;
182 return XML_TOK_INVALID;
184 /* fall through */
185 case BT_S: case BT_CR: case BT_LF:
186 *nextTokPtr = ptr;
187 return XML_TOK_DECL_OPEN;
188 case BT_NMSTRT:
189 case BT_HEX:
190 ptr += MINBPC(enc);
191 break;
192 default:
193 *nextTokPtr = ptr;
194 return XML_TOK_INVALID;
197 return XML_TOK_PARTIAL;
200 static
201 int PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr, const char *end, int *tokPtr)
203 int upper = 0;
204 *tokPtr = XML_TOK_PI;
205 if (end - ptr != MINBPC(enc)*3)
206 return 1;
207 switch (BYTE_TO_ASCII(enc, ptr)) {
208 case 'x':
209 break;
210 case 'X':
211 upper = 1;
212 break;
213 default:
214 return 1;
216 ptr += MINBPC(enc);
217 switch (BYTE_TO_ASCII(enc, ptr)) {
218 case 'm':
219 break;
220 case 'M':
221 upper = 1;
222 break;
223 default:
224 return 1;
226 ptr += MINBPC(enc);
227 switch (BYTE_TO_ASCII(enc, ptr)) {
228 case 'l':
229 break;
230 case 'L':
231 upper = 1;
232 break;
233 default:
234 return 1;
236 if (upper)
237 return 0;
238 *tokPtr = XML_TOK_XML_DECL;
239 return 1;
242 /* ptr points to character following "<?" */
244 static
245 int PREFIX(scanPi)(const ENCODING *enc, const char *ptr, const char *end,
246 const char **nextTokPtr)
248 int tok;
249 const char *target = ptr;
250 if (ptr == end)
251 return XML_TOK_PARTIAL;
252 switch (BYTE_TYPE(enc, ptr)) {
253 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
254 default:
255 *nextTokPtr = ptr;
256 return XML_TOK_INVALID;
258 while (ptr != end) {
259 switch (BYTE_TYPE(enc, ptr)) {
260 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
261 case BT_S: case BT_CR: case BT_LF:
262 if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
263 *nextTokPtr = ptr;
264 return XML_TOK_INVALID;
266 ptr += MINBPC(enc);
267 while (ptr != end) {
268 switch (BYTE_TYPE(enc, ptr)) {
269 INVALID_CASES(ptr, nextTokPtr)
270 case BT_QUEST:
271 ptr += MINBPC(enc);
272 if (ptr == end)
273 return XML_TOK_PARTIAL;
274 if (CHAR_MATCHES(enc, ptr, '>')) {
275 *nextTokPtr = ptr + MINBPC(enc);
276 return tok;
278 break;
279 default:
280 ptr += MINBPC(enc);
281 break;
284 return XML_TOK_PARTIAL;
285 case BT_QUEST:
286 if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
287 *nextTokPtr = ptr;
288 return XML_TOK_INVALID;
290 ptr += MINBPC(enc);
291 if (ptr == end)
292 return XML_TOK_PARTIAL;
293 if (CHAR_MATCHES(enc, ptr, '>')) {
294 *nextTokPtr = ptr + MINBPC(enc);
295 return tok;
297 /* fall through */
298 default:
299 *nextTokPtr = ptr;
300 return XML_TOK_INVALID;
303 return XML_TOK_PARTIAL;
307 static
308 int PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *end,
309 const char **nextTokPtr)
311 int i;
312 /* CDATA[ */
313 if (end - ptr < 6 * MINBPC(enc))
314 return XML_TOK_PARTIAL;
315 for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
316 if (!CHAR_MATCHES(enc, ptr, "CDATA["[i])) {
317 *nextTokPtr = ptr;
318 return XML_TOK_INVALID;
321 *nextTokPtr = ptr;
322 return XML_TOK_CDATA_SECT_OPEN;
325 static
326 int PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
327 const char **nextTokPtr)
329 if (ptr == end)
330 return XML_TOK_NONE;
331 if (MINBPC(enc) > 1) {
332 size_t n = end - ptr;
333 if (n & (MINBPC(enc) - 1)) {
334 n &= ~(MINBPC(enc) - 1);
335 if (n == 0)
336 return XML_TOK_PARTIAL;
337 end = ptr + n;
340 switch (BYTE_TYPE(enc, ptr)) {
341 case BT_RSQB:
342 ptr += MINBPC(enc);
343 if (ptr == end)
344 return XML_TOK_PARTIAL;
345 if (!CHAR_MATCHES(enc, ptr, ']'))
346 break;
347 ptr += MINBPC(enc);
348 if (ptr == end)
349 return XML_TOK_PARTIAL;
350 if (!CHAR_MATCHES(enc, ptr, '>')) {
351 ptr -= MINBPC(enc);
352 break;
354 *nextTokPtr = ptr + MINBPC(enc);
355 return XML_TOK_CDATA_SECT_CLOSE;
356 case BT_CR:
357 ptr += MINBPC(enc);
358 if (ptr == end)
359 return XML_TOK_PARTIAL;
360 if (BYTE_TYPE(enc, ptr) == BT_LF)
361 ptr += MINBPC(enc);
362 *nextTokPtr = ptr;
363 return XML_TOK_DATA_NEWLINE;
364 case BT_LF:
365 *nextTokPtr = ptr + MINBPC(enc);
366 return XML_TOK_DATA_NEWLINE;
367 INVALID_CASES(ptr, nextTokPtr)
368 default:
369 ptr += MINBPC(enc);
370 break;
372 while (ptr != end) {
373 switch (BYTE_TYPE(enc, ptr)) {
374 #define LEAD_CASE(n) \
375 case BT_LEAD ## n: \
376 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
377 *nextTokPtr = ptr; \
378 return XML_TOK_DATA_CHARS; \
380 ptr += n; \
381 break;
382 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
383 #undef LEAD_CASE
384 case BT_NONXML:
385 case BT_MALFORM:
386 case BT_TRAIL:
387 case BT_CR:
388 case BT_LF:
389 case BT_RSQB:
390 *nextTokPtr = ptr;
391 return XML_TOK_DATA_CHARS;
392 default:
393 ptr += MINBPC(enc);
394 break;
397 *nextTokPtr = ptr;
398 return XML_TOK_DATA_CHARS;
401 /* ptr points to character following "</" */
403 static
404 int PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr, const char *end,
405 const char **nextTokPtr)
407 if (ptr == end)
408 return XML_TOK_PARTIAL;
409 switch (BYTE_TYPE(enc, ptr)) {
410 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
411 default:
412 *nextTokPtr = ptr;
413 return XML_TOK_INVALID;
415 while (ptr != end) {
416 switch (BYTE_TYPE(enc, ptr)) {
417 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
418 case BT_S: case BT_CR: case BT_LF:
419 for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
420 switch (BYTE_TYPE(enc, ptr)) {
421 case BT_S: case BT_CR: case BT_LF:
422 break;
423 case BT_GT:
424 *nextTokPtr = ptr + MINBPC(enc);
425 return XML_TOK_END_TAG;
426 default:
427 *nextTokPtr = ptr;
428 return XML_TOK_INVALID;
431 return XML_TOK_PARTIAL;
432 #ifdef XML_NS
433 case BT_COLON:
434 /* no need to check qname syntax here, since end-tag must match exactly */
435 ptr += MINBPC(enc);
436 break;
437 #endif
438 case BT_GT:
439 *nextTokPtr = ptr + MINBPC(enc);
440 return XML_TOK_END_TAG;
441 default:
442 *nextTokPtr = ptr;
443 return XML_TOK_INVALID;
446 return XML_TOK_PARTIAL;
449 /* ptr points to character following "&#X" */
451 static
452 int PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr, const char *end,
453 const char **nextTokPtr)
455 if (ptr != end) {
456 switch (BYTE_TYPE(enc, ptr)) {
457 case BT_DIGIT:
458 case BT_HEX:
459 break;
460 default:
461 *nextTokPtr = ptr;
462 return XML_TOK_INVALID;
464 for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
465 switch (BYTE_TYPE(enc, ptr)) {
466 case BT_DIGIT:
467 case BT_HEX:
468 break;
469 case BT_SEMI:
470 *nextTokPtr = ptr + MINBPC(enc);
471 return XML_TOK_CHAR_REF;
472 default:
473 *nextTokPtr = ptr;
474 return XML_TOK_INVALID;
478 return XML_TOK_PARTIAL;
481 /* ptr points to character following "&#" */
483 static
484 int PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr, const char *end,
485 const char **nextTokPtr)
487 if (ptr != end) {
488 if (CHAR_MATCHES(enc, ptr, 'x'))
489 return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
490 switch (BYTE_TYPE(enc, ptr)) {
491 case BT_DIGIT:
492 break;
493 default:
494 *nextTokPtr = ptr;
495 return XML_TOK_INVALID;
497 for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
498 switch (BYTE_TYPE(enc, ptr)) {
499 case BT_DIGIT:
500 break;
501 case BT_SEMI:
502 *nextTokPtr = ptr + MINBPC(enc);
503 return XML_TOK_CHAR_REF;
504 default:
505 *nextTokPtr = ptr;
506 return XML_TOK_INVALID;
510 return XML_TOK_PARTIAL;
513 /* ptr points to character following "&" */
515 static
516 int PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
517 const char **nextTokPtr)
519 if (ptr == end)
520 return XML_TOK_PARTIAL;
521 switch (BYTE_TYPE(enc, ptr)) {
522 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
523 case BT_NUM:
524 return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
525 default:
526 *nextTokPtr = ptr;
527 return XML_TOK_INVALID;
529 while (ptr != end) {
530 switch (BYTE_TYPE(enc, ptr)) {
531 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
532 case BT_SEMI:
533 *nextTokPtr = ptr + MINBPC(enc);
534 return XML_TOK_ENTITY_REF;
535 default:
536 *nextTokPtr = ptr;
537 return XML_TOK_INVALID;
540 return XML_TOK_PARTIAL;
543 /* ptr points to character following first character of attribute name */
545 static
546 int PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
547 const char **nextTokPtr)
549 #ifdef XML_NS
550 int hadColon = 0;
551 #endif
552 while (ptr != end) {
553 switch (BYTE_TYPE(enc, ptr)) {
554 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
555 #ifdef XML_NS
556 case BT_COLON:
557 if (hadColon) {
558 *nextTokPtr = ptr;
559 return XML_TOK_INVALID;
561 hadColon = 1;
562 ptr += MINBPC(enc);
563 if (ptr == end)
564 return XML_TOK_PARTIAL;
565 switch (BYTE_TYPE(enc, ptr)) {
566 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
567 default:
568 *nextTokPtr = ptr;
569 return XML_TOK_INVALID;
571 break;
572 #endif
573 case BT_S: case BT_CR: case BT_LF:
574 for (;;) {
575 int t;
577 ptr += MINBPC(enc);
578 if (ptr == end)
579 return XML_TOK_PARTIAL;
580 t = BYTE_TYPE(enc, ptr);
581 if (t == BT_EQUALS)
582 break;
583 switch (t) {
584 case BT_S:
585 case BT_LF:
586 case BT_CR:
587 break;
588 default:
589 *nextTokPtr = ptr;
590 return XML_TOK_INVALID;
593 /* fall through */
594 case BT_EQUALS:
596 int open;
597 #ifdef XML_NS
598 hadColon = 0;
599 #endif
600 for (;;) {
602 ptr += MINBPC(enc);
603 if (ptr == end)
604 return XML_TOK_PARTIAL;
605 open = BYTE_TYPE(enc, ptr);
606 if (open == BT_QUOT || open == BT_APOS)
607 break;
608 switch (open) {
609 case BT_S:
610 case BT_LF:
611 case BT_CR:
612 break;
613 default:
614 *nextTokPtr = ptr;
615 return XML_TOK_INVALID;
618 ptr += MINBPC(enc);
619 /* in attribute value */
620 for (;;) {
621 int t;
622 if (ptr == end)
623 return XML_TOK_PARTIAL;
624 t = BYTE_TYPE(enc, ptr);
625 if (t == open)
626 break;
627 switch (t) {
628 INVALID_CASES(ptr, nextTokPtr)
629 case BT_AMP:
631 int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
632 if (tok <= 0) {
633 if (tok == XML_TOK_INVALID)
634 *nextTokPtr = ptr;
635 return tok;
637 break;
639 case BT_LT:
640 *nextTokPtr = ptr;
641 return XML_TOK_INVALID;
642 default:
643 ptr += MINBPC(enc);
644 break;
647 ptr += MINBPC(enc);
648 if (ptr == end)
649 return XML_TOK_PARTIAL;
650 switch (BYTE_TYPE(enc, ptr)) {
651 case BT_S:
652 case BT_CR:
653 case BT_LF:
654 break;
655 case BT_SOL:
656 goto sol;
657 case BT_GT:
658 goto gt;
659 default:
660 *nextTokPtr = ptr;
661 return XML_TOK_INVALID;
663 /* ptr points to closing quote */
664 for (;;) {
665 ptr += MINBPC(enc);
666 if (ptr == end)
667 return XML_TOK_PARTIAL;
668 switch (BYTE_TYPE(enc, ptr)) {
669 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
670 case BT_S: case BT_CR: case BT_LF:
671 continue;
672 case BT_GT:
674 *nextTokPtr = ptr + MINBPC(enc);
675 return XML_TOK_START_TAG_WITH_ATTS;
676 case BT_SOL:
677 sol:
678 ptr += MINBPC(enc);
679 if (ptr == end)
680 return XML_TOK_PARTIAL;
681 if (!CHAR_MATCHES(enc, ptr, '>')) {
682 *nextTokPtr = ptr;
683 return XML_TOK_INVALID;
685 *nextTokPtr = ptr + MINBPC(enc);
686 return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
687 default:
688 *nextTokPtr = ptr;
689 return XML_TOK_INVALID;
691 break;
693 break;
695 default:
696 *nextTokPtr = ptr;
697 return XML_TOK_INVALID;
700 return XML_TOK_PARTIAL;
703 /* ptr points to character following "<" */
705 static
706 int PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
707 const char **nextTokPtr)
709 #ifdef XML_NS
710 int hadColon;
711 #endif
712 if (ptr == end)
713 return XML_TOK_PARTIAL;
714 switch (BYTE_TYPE(enc, ptr)) {
715 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
716 case BT_EXCL:
717 if ((ptr += MINBPC(enc)) == end)
718 return XML_TOK_PARTIAL;
719 switch (BYTE_TYPE(enc, ptr)) {
720 case BT_MINUS:
721 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
722 case BT_LSQB:
723 return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc), end, nextTokPtr);
725 *nextTokPtr = ptr;
726 return XML_TOK_INVALID;
727 case BT_QUEST:
728 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
729 case BT_SOL:
730 return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
731 default:
732 *nextTokPtr = ptr;
733 return XML_TOK_INVALID;
735 #ifdef XML_NS
736 hadColon = 0;
737 #endif
738 /* we have a start-tag */
739 while (ptr != end) {
740 switch (BYTE_TYPE(enc, ptr)) {
741 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
742 #ifdef XML_NS
743 case BT_COLON:
744 if (hadColon) {
745 *nextTokPtr = ptr;
746 return XML_TOK_INVALID;
748 hadColon = 1;
749 ptr += MINBPC(enc);
750 if (ptr == end)
751 return XML_TOK_PARTIAL;
752 switch (BYTE_TYPE(enc, ptr)) {
753 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
754 default:
755 *nextTokPtr = ptr;
756 return XML_TOK_INVALID;
758 break;
759 #endif
760 case BT_S: case BT_CR: case BT_LF:
762 ptr += MINBPC(enc);
763 while (ptr != end) {
764 switch (BYTE_TYPE(enc, ptr)) {
765 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
766 case BT_GT:
767 goto gt;
768 case BT_SOL:
769 goto sol;
770 case BT_S: case BT_CR: case BT_LF:
771 ptr += MINBPC(enc);
772 continue;
773 default:
774 *nextTokPtr = ptr;
775 return XML_TOK_INVALID;
777 return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
779 return XML_TOK_PARTIAL;
781 case BT_GT:
783 *nextTokPtr = ptr + MINBPC(enc);
784 return XML_TOK_START_TAG_NO_ATTS;
785 case BT_SOL:
786 sol:
787 ptr += MINBPC(enc);
788 if (ptr == end)
789 return XML_TOK_PARTIAL;
790 if (!CHAR_MATCHES(enc, ptr, '>')) {
791 *nextTokPtr = ptr;
792 return XML_TOK_INVALID;
794 *nextTokPtr = ptr + MINBPC(enc);
795 return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
796 default:
797 *nextTokPtr = ptr;
798 return XML_TOK_INVALID;
801 return XML_TOK_PARTIAL;
804 static
805 int PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
806 const char **nextTokPtr)
808 if (ptr == end)
809 return XML_TOK_NONE;
810 if (MINBPC(enc) > 1) {
811 size_t n = end - ptr;
812 if (n & (MINBPC(enc) - 1)) {
813 n &= ~(MINBPC(enc) - 1);
814 if (n == 0)
815 return XML_TOK_PARTIAL;
816 end = ptr + n;
819 switch (BYTE_TYPE(enc, ptr)) {
820 case BT_LT:
821 return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
822 case BT_AMP:
823 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
824 case BT_CR:
825 ptr += MINBPC(enc);
826 if (ptr == end)
827 return XML_TOK_TRAILING_CR;
828 if (BYTE_TYPE(enc, ptr) == BT_LF)
829 ptr += MINBPC(enc);
830 *nextTokPtr = ptr;
831 return XML_TOK_DATA_NEWLINE;
832 case BT_LF:
833 *nextTokPtr = ptr + MINBPC(enc);
834 return XML_TOK_DATA_NEWLINE;
835 case BT_RSQB:
836 ptr += MINBPC(enc);
837 if (ptr == end)
838 return XML_TOK_TRAILING_RSQB;
839 if (!CHAR_MATCHES(enc, ptr, ']'))
840 break;
841 ptr += MINBPC(enc);
842 if (ptr == end)
843 return XML_TOK_TRAILING_RSQB;
844 if (!CHAR_MATCHES(enc, ptr, '>')) {
845 ptr -= MINBPC(enc);
846 break;
848 *nextTokPtr = ptr;
849 return XML_TOK_INVALID;
850 INVALID_CASES(ptr, nextTokPtr)
851 default:
852 ptr += MINBPC(enc);
853 break;
855 while (ptr != end) {
856 switch (BYTE_TYPE(enc, ptr)) {
857 #define LEAD_CASE(n) \
858 case BT_LEAD ## n: \
859 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
860 *nextTokPtr = ptr; \
861 return XML_TOK_DATA_CHARS; \
863 ptr += n; \
864 break;
865 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
866 #undef LEAD_CASE
867 case BT_RSQB:
868 if (ptr + MINBPC(enc) != end) {
869 if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ']')) {
870 ptr += MINBPC(enc);
871 break;
873 if (ptr + 2*MINBPC(enc) != end) {
874 if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), '>')) {
875 ptr += MINBPC(enc);
876 break;
878 *nextTokPtr = ptr + 2*MINBPC(enc);
879 return XML_TOK_INVALID;
882 /* fall through */
883 case BT_AMP:
884 case BT_LT:
885 case BT_NONXML:
886 case BT_MALFORM:
887 case BT_TRAIL:
888 case BT_CR:
889 case BT_LF:
890 *nextTokPtr = ptr;
891 return XML_TOK_DATA_CHARS;
892 default:
893 ptr += MINBPC(enc);
894 break;
897 *nextTokPtr = ptr;
898 return XML_TOK_DATA_CHARS;
901 /* ptr points to character following "%" */
903 static
904 int PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
905 const char **nextTokPtr)
907 if (ptr == end)
908 return XML_TOK_PARTIAL;
909 switch (BYTE_TYPE(enc, ptr)) {
910 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
911 case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
912 *nextTokPtr = ptr;
913 return XML_TOK_PERCENT;
914 default:
915 *nextTokPtr = ptr;
916 return XML_TOK_INVALID;
918 while (ptr != end) {
919 switch (BYTE_TYPE(enc, ptr)) {
920 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
921 case BT_SEMI:
922 *nextTokPtr = ptr + MINBPC(enc);
923 return XML_TOK_PARAM_ENTITY_REF;
924 default:
925 *nextTokPtr = ptr;
926 return XML_TOK_INVALID;
929 return XML_TOK_PARTIAL;
932 static
933 int PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
934 const char **nextTokPtr)
936 if (ptr == end)
937 return XML_TOK_PARTIAL;
938 switch (BYTE_TYPE(enc, ptr)) {
939 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
940 default:
941 *nextTokPtr = ptr;
942 return XML_TOK_INVALID;
944 while (ptr != end) {
945 switch (BYTE_TYPE(enc, ptr)) {
946 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
947 case BT_CR: case BT_LF: case BT_S:
948 case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR:
949 *nextTokPtr = ptr;
950 return XML_TOK_POUND_NAME;
951 default:
952 *nextTokPtr = ptr;
953 return XML_TOK_INVALID;
956 return XML_TOK_PARTIAL;
959 static
960 int PREFIX(scanLit)(int open, const ENCODING *enc,
961 const char *ptr, const char *end,
962 const char **nextTokPtr)
964 while (ptr != end) {
965 int t = BYTE_TYPE(enc, ptr);
966 switch (t) {
967 INVALID_CASES(ptr, nextTokPtr)
968 case BT_QUOT:
969 case BT_APOS:
970 ptr += MINBPC(enc);
971 if (t != open)
972 break;
973 if (ptr == end)
974 return XML_TOK_PARTIAL;
975 *nextTokPtr = ptr;
976 switch (BYTE_TYPE(enc, ptr)) {
977 case BT_S: case BT_CR: case BT_LF:
978 case BT_GT: case BT_PERCNT: case BT_LSQB:
979 return XML_TOK_LITERAL;
980 default:
981 return XML_TOK_INVALID;
983 default:
984 ptr += MINBPC(enc);
985 break;
988 return XML_TOK_PARTIAL;
991 static
992 int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
993 const char **nextTokPtr)
995 int tok;
996 if (ptr == end)
997 return XML_TOK_NONE;
998 if (MINBPC(enc) > 1) {
999 size_t n = end - ptr;
1000 if (n & (MINBPC(enc) - 1)) {
1001 n &= ~(MINBPC(enc) - 1);
1002 if (n == 0)
1003 return XML_TOK_PARTIAL;
1004 end = ptr + n;
1007 switch (BYTE_TYPE(enc, ptr)) {
1008 case BT_QUOT:
1009 return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
1010 case BT_APOS:
1011 return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
1012 case BT_LT:
1014 ptr += MINBPC(enc);
1015 if (ptr == end)
1016 return XML_TOK_PARTIAL;
1017 switch (BYTE_TYPE(enc, ptr)) {
1018 case BT_EXCL:
1019 return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1020 case BT_QUEST:
1021 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1022 case BT_NMSTRT:
1023 case BT_HEX:
1024 case BT_NONASCII:
1025 case BT_LEAD2:
1026 case BT_LEAD3:
1027 case BT_LEAD4:
1028 *nextTokPtr = ptr - MINBPC(enc);
1029 return XML_TOK_INSTANCE_START;
1031 *nextTokPtr = ptr;
1032 return XML_TOK_INVALID;
1034 case BT_CR:
1035 if (ptr + MINBPC(enc) == end)
1036 return XML_TOK_TRAILING_CR;
1037 /* fall through */
1038 case BT_S: case BT_LF:
1039 for (;;) {
1040 ptr += MINBPC(enc);
1041 if (ptr == end)
1042 break;
1043 switch (BYTE_TYPE(enc, ptr)) {
1044 case BT_S: case BT_LF:
1045 break;
1046 case BT_CR:
1047 /* don't split CR/LF pair */
1048 if (ptr + MINBPC(enc) != end)
1049 break;
1050 /* fall through */
1051 default:
1052 *nextTokPtr = ptr;
1053 return XML_TOK_PROLOG_S;
1056 *nextTokPtr = ptr;
1057 return XML_TOK_PROLOG_S;
1058 case BT_PERCNT:
1059 return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1060 case BT_COMMA:
1061 *nextTokPtr = ptr + MINBPC(enc);
1062 return XML_TOK_COMMA;
1063 case BT_LSQB:
1064 *nextTokPtr = ptr + MINBPC(enc);
1065 return XML_TOK_OPEN_BRACKET;
1066 case BT_RSQB:
1067 ptr += MINBPC(enc);
1068 if (ptr == end)
1069 return XML_TOK_PARTIAL;
1070 if (CHAR_MATCHES(enc, ptr, ']')) {
1071 if (ptr + MINBPC(enc) == end)
1072 return XML_TOK_PARTIAL;
1073 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), '>')) {
1074 *nextTokPtr = ptr + 2*MINBPC(enc);
1075 return XML_TOK_COND_SECT_CLOSE;
1078 *nextTokPtr = ptr;
1079 return XML_TOK_CLOSE_BRACKET;
1080 case BT_LPAR:
1081 *nextTokPtr = ptr + MINBPC(enc);
1082 return XML_TOK_OPEN_PAREN;
1083 case BT_RPAR:
1084 ptr += MINBPC(enc);
1085 if (ptr == end)
1086 return XML_TOK_PARTIAL;
1087 switch (BYTE_TYPE(enc, ptr)) {
1088 case BT_AST:
1089 *nextTokPtr = ptr + MINBPC(enc);
1090 return XML_TOK_CLOSE_PAREN_ASTERISK;
1091 case BT_QUEST:
1092 *nextTokPtr = ptr + MINBPC(enc);
1093 return XML_TOK_CLOSE_PAREN_QUESTION;
1094 case BT_PLUS:
1095 *nextTokPtr = ptr + MINBPC(enc);
1096 return XML_TOK_CLOSE_PAREN_PLUS;
1097 case BT_CR: case BT_LF: case BT_S:
1098 case BT_GT: case BT_COMMA: case BT_VERBAR:
1099 case BT_RPAR:
1100 *nextTokPtr = ptr;
1101 return XML_TOK_CLOSE_PAREN;
1103 *nextTokPtr = ptr;
1104 return XML_TOK_INVALID;
1105 case BT_VERBAR:
1106 *nextTokPtr = ptr + MINBPC(enc);
1107 return XML_TOK_OR;
1108 case BT_GT:
1109 *nextTokPtr = ptr + MINBPC(enc);
1110 return XML_TOK_DECL_CLOSE;
1111 case BT_NUM:
1112 return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1113 #define LEAD_CASE(n) \
1114 case BT_LEAD ## n: \
1115 if (end - ptr < n) \
1116 return XML_TOK_PARTIAL_CHAR; \
1117 if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
1118 ptr += n; \
1119 tok = XML_TOK_NAME; \
1120 break; \
1122 if (IS_NAME_CHAR(enc, ptr, n)) { \
1123 ptr += n; \
1124 tok = XML_TOK_NMTOKEN; \
1125 break; \
1127 *nextTokPtr = ptr; \
1128 return XML_TOK_INVALID;
1129 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1130 #undef LEAD_CASE
1131 case BT_NMSTRT:
1132 case BT_HEX:
1133 tok = XML_TOK_NAME;
1134 ptr += MINBPC(enc);
1135 break;
1136 case BT_DIGIT:
1137 case BT_NAME:
1138 case BT_MINUS:
1139 #ifdef XML_NS
1140 case BT_COLON:
1141 #endif
1142 tok = XML_TOK_NMTOKEN;
1143 ptr += MINBPC(enc);
1144 break;
1145 case BT_NONASCII:
1146 if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
1147 ptr += MINBPC(enc);
1148 tok = XML_TOK_NAME;
1149 break;
1151 if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
1152 ptr += MINBPC(enc);
1153 tok = XML_TOK_NMTOKEN;
1154 break;
1156 /* fall through */
1157 default:
1158 *nextTokPtr = ptr;
1159 return XML_TOK_INVALID;
1161 while (ptr != end) {
1162 switch (BYTE_TYPE(enc, ptr)) {
1163 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1164 case BT_GT: case BT_RPAR: case BT_COMMA:
1165 case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
1166 case BT_S: case BT_CR: case BT_LF:
1167 *nextTokPtr = ptr;
1168 return tok;
1169 #ifdef XML_NS
1170 case BT_COLON:
1171 ptr += MINBPC(enc);
1172 switch (tok) {
1173 case XML_TOK_NAME:
1174 if (ptr == end)
1175 return XML_TOK_PARTIAL;
1176 tok = XML_TOK_PREFIXED_NAME;
1177 switch (BYTE_TYPE(enc, ptr)) {
1178 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1179 default:
1180 tok = XML_TOK_NMTOKEN;
1181 break;
1183 break;
1184 case XML_TOK_PREFIXED_NAME:
1185 tok = XML_TOK_NMTOKEN;
1186 break;
1188 break;
1189 #endif
1190 case BT_PLUS:
1191 if (tok == XML_TOK_NMTOKEN) {
1192 *nextTokPtr = ptr;
1193 return XML_TOK_INVALID;
1195 *nextTokPtr = ptr + MINBPC(enc);
1196 return XML_TOK_NAME_PLUS;
1197 case BT_AST:
1198 if (tok == XML_TOK_NMTOKEN) {
1199 *nextTokPtr = ptr;
1200 return XML_TOK_INVALID;
1202 *nextTokPtr = ptr + MINBPC(enc);
1203 return XML_TOK_NAME_ASTERISK;
1204 case BT_QUEST:
1205 if (tok == XML_TOK_NMTOKEN) {
1206 *nextTokPtr = ptr;
1207 return XML_TOK_INVALID;
1209 *nextTokPtr = ptr + MINBPC(enc);
1210 return XML_TOK_NAME_QUESTION;
1211 default:
1212 *nextTokPtr = ptr;
1213 return XML_TOK_INVALID;
1216 return XML_TOK_PARTIAL;
1219 static
1220 int PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, const char *end,
1221 const char **nextTokPtr)
1223 const char *start;
1224 if (ptr == end)
1225 return XML_TOK_NONE;
1226 start = ptr;
1227 while (ptr != end) {
1228 switch (BYTE_TYPE(enc, ptr)) {
1229 #define LEAD_CASE(n) \
1230 case BT_LEAD ## n: ptr += n; break;
1231 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1232 #undef LEAD_CASE
1233 case BT_AMP:
1234 if (ptr == start)
1235 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1236 *nextTokPtr = ptr;
1237 return XML_TOK_DATA_CHARS;
1238 case BT_LT:
1239 /* this is for inside entity references */
1240 *nextTokPtr = ptr;
1241 return XML_TOK_INVALID;
1242 case BT_LF:
1243 if (ptr == start) {
1244 *nextTokPtr = ptr + MINBPC(enc);
1245 return XML_TOK_DATA_NEWLINE;
1247 *nextTokPtr = ptr;
1248 return XML_TOK_DATA_CHARS;
1249 case BT_CR:
1250 if (ptr == start) {
1251 ptr += MINBPC(enc);
1252 if (ptr == end)
1253 return XML_TOK_TRAILING_CR;
1254 if (BYTE_TYPE(enc, ptr) == BT_LF)
1255 ptr += MINBPC(enc);
1256 *nextTokPtr = ptr;
1257 return XML_TOK_DATA_NEWLINE;
1259 *nextTokPtr = ptr;
1260 return XML_TOK_DATA_CHARS;
1261 case BT_S:
1262 if (ptr == start) {
1263 *nextTokPtr = ptr + MINBPC(enc);
1264 return XML_TOK_ATTRIBUTE_VALUE_S;
1266 *nextTokPtr = ptr;
1267 return XML_TOK_DATA_CHARS;
1268 default:
1269 ptr += MINBPC(enc);
1270 break;
1273 *nextTokPtr = ptr;
1274 return XML_TOK_DATA_CHARS;
1277 static
1278 int PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, const char *end,
1279 const char **nextTokPtr)
1281 const char *start;
1282 if (ptr == end)
1283 return XML_TOK_NONE;
1284 start = ptr;
1285 while (ptr != end) {
1286 switch (BYTE_TYPE(enc, ptr)) {
1287 #define LEAD_CASE(n) \
1288 case BT_LEAD ## n: ptr += n; break;
1289 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1290 #undef LEAD_CASE
1291 case BT_AMP:
1292 if (ptr == start)
1293 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1294 *nextTokPtr = ptr;
1295 return XML_TOK_DATA_CHARS;
1296 case BT_PERCNT:
1297 if (ptr == start)
1298 return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1299 *nextTokPtr = ptr;
1300 return XML_TOK_DATA_CHARS;
1301 case BT_LF:
1302 if (ptr == start) {
1303 *nextTokPtr = ptr + MINBPC(enc);
1304 return XML_TOK_DATA_NEWLINE;
1306 *nextTokPtr = ptr;
1307 return XML_TOK_DATA_CHARS;
1308 case BT_CR:
1309 if (ptr == start) {
1310 ptr += MINBPC(enc);
1311 if (ptr == end)
1312 return XML_TOK_TRAILING_CR;
1313 if (BYTE_TYPE(enc, ptr) == BT_LF)
1314 ptr += MINBPC(enc);
1315 *nextTokPtr = ptr;
1316 return XML_TOK_DATA_NEWLINE;
1318 *nextTokPtr = ptr;
1319 return XML_TOK_DATA_CHARS;
1320 default:
1321 ptr += MINBPC(enc);
1322 break;
1325 *nextTokPtr = ptr;
1326 return XML_TOK_DATA_CHARS;
1329 static
1330 int PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
1331 const char **badPtr)
1333 ptr += MINBPC(enc);
1334 end -= MINBPC(enc);
1335 for (; ptr != end; ptr += MINBPC(enc)) {
1336 switch (BYTE_TYPE(enc, ptr)) {
1337 case BT_DIGIT:
1338 case BT_HEX:
1339 case BT_MINUS:
1340 case BT_APOS:
1341 case BT_LPAR:
1342 case BT_RPAR:
1343 case BT_PLUS:
1344 case BT_COMMA:
1345 case BT_SOL:
1346 case BT_EQUALS:
1347 case BT_QUEST:
1348 case BT_CR:
1349 case BT_LF:
1350 case BT_SEMI:
1351 case BT_EXCL:
1352 case BT_AST:
1353 case BT_PERCNT:
1354 case BT_NUM:
1355 #ifdef XML_NS
1356 case BT_COLON:
1357 #endif
1358 break;
1359 case BT_S:
1360 if (CHAR_MATCHES(enc, ptr, '\t')) {
1361 *badPtr = ptr;
1362 return 0;
1364 break;
1365 case BT_NAME:
1366 case BT_NMSTRT:
1367 if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f))
1368 break;
1369 default:
1370 switch (BYTE_TO_ASCII(enc, ptr)) {
1371 case 0x24: /* $ */
1372 case 0x40: /* @ */
1373 break;
1374 default:
1375 *badPtr = ptr;
1376 return 0;
1378 break;
1381 return 1;
1384 /* This must only be called for a well-formed start-tag or empty element tag.
1385 Returns the number of attributes. Pointers to the first attsMax attributes
1386 are stored in atts. */
1388 static
1389 int PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
1390 int attsMax, ATTRIBUTE *atts)
1392 enum { other, inName, inValue } state = inName;
1393 int nAtts = 0;
1394 int open;
1396 for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
1397 switch (BYTE_TYPE(enc, ptr)) {
1398 #define START_NAME \
1399 if (state == other) { \
1400 if (nAtts < attsMax) { \
1401 atts[nAtts].name = ptr; \
1402 atts[nAtts].normalized = 1; \
1404 state = inName; \
1406 #define LEAD_CASE(n) \
1407 case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
1408 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1409 #undef LEAD_CASE
1410 case BT_NONASCII:
1411 case BT_NMSTRT:
1412 case BT_HEX:
1413 START_NAME
1414 break;
1415 #undef START_NAME
1416 case BT_QUOT:
1417 if (state != inValue) {
1418 if (nAtts < attsMax)
1419 atts[nAtts].valuePtr = ptr + MINBPC(enc);
1420 state = inValue;
1421 open = BT_QUOT;
1423 else if (open == BT_QUOT) {
1424 state = other;
1425 if (nAtts < attsMax)
1426 atts[nAtts].valueEnd = ptr;
1427 nAtts++;
1429 break;
1430 case BT_APOS:
1431 if (state != inValue) {
1432 if (nAtts < attsMax)
1433 atts[nAtts].valuePtr = ptr + MINBPC(enc);
1434 state = inValue;
1435 open = BT_APOS;
1437 else if (open == BT_APOS) {
1438 state = other;
1439 if (nAtts < attsMax)
1440 atts[nAtts].valueEnd = ptr;
1441 nAtts++;
1443 break;
1444 case BT_AMP:
1445 if (nAtts < attsMax)
1446 atts[nAtts].normalized = 0;
1447 break;
1448 case BT_S:
1449 if (state == inName)
1450 state = other;
1451 else if (state == inValue
1452 && nAtts < attsMax
1453 && atts[nAtts].normalized
1454 && (ptr == atts[nAtts].valuePtr
1455 || BYTE_TO_ASCII(enc, ptr) != ' '
1456 || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ' '
1457 || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
1458 atts[nAtts].normalized = 0;
1459 break;
1460 case BT_CR: case BT_LF:
1461 /* This case ensures that the first attribute name is counted
1462 Apart from that we could just change state on the quote. */
1463 if (state == inName)
1464 state = other;
1465 else if (state == inValue && nAtts < attsMax)
1466 atts[nAtts].normalized = 0;
1467 break;
1468 case BT_GT:
1469 case BT_SOL:
1470 if (state != inValue)
1471 return nAtts;
1472 break;
1473 default:
1474 break;
1477 /* not reached */
1480 static
1481 int PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr)
1483 int result = 0;
1484 /* skip &# */
1485 ptr += 2*MINBPC(enc);
1486 if (CHAR_MATCHES(enc, ptr, 'x')) {
1487 for (ptr += MINBPC(enc); !CHAR_MATCHES(enc, ptr, ';'); ptr += MINBPC(enc)) {
1488 int c = BYTE_TO_ASCII(enc, ptr);
1489 switch (c) {
1490 case '0': case '1': case '2': case '3': case '4':
1491 case '5': case '6': case '7': case '8': case '9':
1492 result <<= 4;
1493 result |= (c - '0');
1494 break;
1495 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1496 result <<= 4;
1497 result += 10 + (c - 'A');
1498 break;
1499 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1500 result <<= 4;
1501 result += 10 + (c - 'a');
1502 break;
1504 if (result >= 0x110000)
1505 return -1;
1508 else {
1509 for (; !CHAR_MATCHES(enc, ptr, ';'); ptr += MINBPC(enc)) {
1510 int c = BYTE_TO_ASCII(enc, ptr);
1511 result *= 10;
1512 result += (c - '0');
1513 if (result >= 0x110000)
1514 return -1;
1517 return checkCharRefNumber(result);
1520 static
1521 int PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr, const char *end)
1523 switch ((end - ptr)/MINBPC(enc)) {
1524 case 2:
1525 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), 't')) {
1526 switch (BYTE_TO_ASCII(enc, ptr)) {
1527 case 'l':
1528 return '<';
1529 case 'g':
1530 return '>';
1533 break;
1534 case 3:
1535 if (CHAR_MATCHES(enc, ptr, 'a')) {
1536 ptr += MINBPC(enc);
1537 if (CHAR_MATCHES(enc, ptr, 'm')) {
1538 ptr += MINBPC(enc);
1539 if (CHAR_MATCHES(enc, ptr, 'p'))
1540 return '&';
1543 break;
1544 case 4:
1545 switch (BYTE_TO_ASCII(enc, ptr)) {
1546 case 'q':
1547 ptr += MINBPC(enc);
1548 if (CHAR_MATCHES(enc, ptr, 'u')) {
1549 ptr += MINBPC(enc);
1550 if (CHAR_MATCHES(enc, ptr, 'o')) {
1551 ptr += MINBPC(enc);
1552 if (CHAR_MATCHES(enc, ptr, 't'))
1553 return '"';
1556 break;
1557 case 'a':
1558 ptr += MINBPC(enc);
1559 if (CHAR_MATCHES(enc, ptr, 'p')) {
1560 ptr += MINBPC(enc);
1561 if (CHAR_MATCHES(enc, ptr, 'o')) {
1562 ptr += MINBPC(enc);
1563 if (CHAR_MATCHES(enc, ptr, 's'))
1564 return '\'';
1567 break;
1570 return 0;
1573 static
1574 int PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
1576 for (;;) {
1577 switch (BYTE_TYPE(enc, ptr1)) {
1578 #define LEAD_CASE(n) \
1579 case BT_LEAD ## n: \
1580 if (*ptr1++ != *ptr2++) \
1581 return 0;
1582 LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
1583 #undef LEAD_CASE
1584 /* fall through */
1585 if (*ptr1++ != *ptr2++)
1586 return 0;
1587 break;
1588 case BT_NONASCII:
1589 case BT_NMSTRT:
1590 #ifdef XML_NS
1591 case BT_COLON:
1592 #endif
1593 case BT_HEX:
1594 case BT_DIGIT:
1595 case BT_NAME:
1596 case BT_MINUS:
1597 if (*ptr2++ != *ptr1++)
1598 return 0;
1599 if (MINBPC(enc) > 1) {
1600 if (*ptr2++ != *ptr1++)
1601 return 0;
1602 if (MINBPC(enc) > 2) {
1603 if (*ptr2++ != *ptr1++)
1604 return 0;
1605 if (MINBPC(enc) > 3) {
1606 if (*ptr2++ != *ptr1++)
1607 return 0;
1611 break;
1612 default:
1613 if (MINBPC(enc) == 1 && *ptr1 == *ptr2)
1614 return 1;
1615 switch (BYTE_TYPE(enc, ptr2)) {
1616 case BT_LEAD2:
1617 case BT_LEAD3:
1618 case BT_LEAD4:
1619 case BT_NONASCII:
1620 case BT_NMSTRT:
1621 #ifdef XML_NS
1622 case BT_COLON:
1623 #endif
1624 case BT_HEX:
1625 case BT_DIGIT:
1626 case BT_NAME:
1627 case BT_MINUS:
1628 return 0;
1629 default:
1630 return 1;
1634 /* not reached */
1637 static
1638 int PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1, const char *ptr2)
1640 for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
1641 if (!CHAR_MATCHES(enc, ptr1, *ptr2))
1642 return 0;
1644 switch (BYTE_TYPE(enc, ptr1)) {
1645 case BT_LEAD2:
1646 case BT_LEAD3:
1647 case BT_LEAD4:
1648 case BT_NONASCII:
1649 case BT_NMSTRT:
1650 #ifdef XML_NS
1651 case BT_COLON:
1652 #endif
1653 case BT_HEX:
1654 case BT_DIGIT:
1655 case BT_NAME:
1656 case BT_MINUS:
1657 return 0;
1658 default:
1659 return 1;
1663 static
1664 int PREFIX(nameLength)(const ENCODING *enc, const char *ptr)
1666 const char *start = ptr;
1667 for (;;) {
1668 switch (BYTE_TYPE(enc, ptr)) {
1669 #define LEAD_CASE(n) \
1670 case BT_LEAD ## n: ptr += n; break;
1671 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1672 #undef LEAD_CASE
1673 case BT_NONASCII:
1674 case BT_NMSTRT:
1675 #ifdef XML_NS
1676 case BT_COLON:
1677 #endif
1678 case BT_HEX:
1679 case BT_DIGIT:
1680 case BT_NAME:
1681 case BT_MINUS:
1682 ptr += MINBPC(enc);
1683 break;
1684 default:
1685 return ptr - start;
1690 static
1691 const char *PREFIX(skipS)(const ENCODING *enc, const char *ptr)
1693 for (;;) {
1694 switch (BYTE_TYPE(enc, ptr)) {
1695 case BT_LF:
1696 case BT_CR:
1697 case BT_S:
1698 ptr += MINBPC(enc);
1699 break;
1700 default:
1701 return ptr;
1706 static
1707 void PREFIX(updatePosition)(const ENCODING *enc,
1708 const char *ptr,
1709 const char *end,
1710 POSITION *pos)
1712 while (ptr != end) {
1713 switch (BYTE_TYPE(enc, ptr)) {
1714 #define LEAD_CASE(n) \
1715 case BT_LEAD ## n: \
1716 ptr += n; \
1717 break;
1718 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1719 #undef LEAD_CASE
1720 case BT_LF:
1721 pos->columnNumber = (unsigned)-1;
1722 pos->lineNumber++;
1723 ptr += MINBPC(enc);
1724 break;
1725 case BT_CR:
1726 pos->lineNumber++;
1727 ptr += MINBPC(enc);
1728 if (ptr != end && BYTE_TYPE(enc, ptr) == BT_LF)
1729 ptr += MINBPC(enc);
1730 pos->columnNumber = (unsigned)-1;
1731 break;
1732 default:
1733 ptr += MINBPC(enc);
1734 break;
1736 pos->columnNumber++;
1740 #undef DO_LEAD_CASE
1741 #undef MULTIBYTE_CASES
1742 #undef INVALID_CASES
1743 #undef CHECK_NAME_CASE
1744 #undef CHECK_NAME_CASES
1745 #undef CHECK_NMSTRT_CASE
1746 #undef CHECK_NMSTRT_CASES