iconv: Bail out of the loop when an illegal sequence of bytes occurs.
[elinks/elinks-j605.git] / src / dom / sgml / scanner.c
blob635229468cf41792a5b0760b4b778e211238ed4b
1 /* SGML token scanner utilities */
3 #ifdef HAVE_CONFIG_H
4 #include "config.h"
5 #endif
7 #include <stdio.h>
8 #include <string.h>
10 #include "elinks.h"
12 #include "dom/scanner.h"
13 #include "dom/sgml/scanner.h"
14 #include "dom/string.h"
15 #include "util/error.h"
/* Character classes recorded in the SGML scan table. Every class owns one
 * bit, so a single table entry can describe a character that belongs to
 * several classes at once. */
enum sgml_char_group {
	SGML_CHAR_ENTITY	= 0x02,	/* Scanned as part of an entity name */
	SGML_CHAR_IDENT		= 0x04,	/* Scanned as part of an identifier */
	SGML_CHAR_NEWLINE	= 0x08,	/* Bumps the line counter */
	SGML_CHAR_WHITESPACE	= 0x10,	/* Skipped by skip_sgml_space() */
	SGML_CHAR_NOT_TEXT	= 0x20,	/* Terminates a text token */
	SGML_CHAR_NOT_ATTRIBUTE	= 0x40,	/* Terminates an attribute token */
};
29 static struct dom_scan_table_info sgml_scan_table_info[] = {
30 DOM_SCAN_TABLE_RANGE("0", '9', SGML_CHAR_IDENT | SGML_CHAR_ENTITY),
31 DOM_SCAN_TABLE_RANGE("A", 'Z', SGML_CHAR_IDENT | SGML_CHAR_ENTITY),
32 DOM_SCAN_TABLE_RANGE("a", 'z', SGML_CHAR_IDENT | SGML_CHAR_ENTITY),
33 /* For the octal number impared (me including) \241 is 161 --jonas */
34 DOM_SCAN_TABLE_RANGE("\241", 255, SGML_CHAR_IDENT | SGML_CHAR_ENTITY),
36 DOM_SCAN_TABLE_STRING("-_:.", SGML_CHAR_IDENT | SGML_CHAR_ENTITY),
37 DOM_SCAN_TABLE_STRING("#", SGML_CHAR_ENTITY),
38 DOM_SCAN_TABLE_STRING(" \f\n\r\t\v", SGML_CHAR_WHITESPACE),
39 DOM_SCAN_TABLE_STRING("\f\n", SGML_CHAR_NEWLINE),
40 DOM_SCAN_TABLE_STRING("<&", SGML_CHAR_NOT_TEXT),
41 DOM_SCAN_TABLE_STRING("<=>", SGML_CHAR_NOT_ATTRIBUTE),
43 DOM_SCAN_TABLE_END,
46 #define SGML_STRING_MAP(str, type, family) \
47 { STATIC_DOM_STRING(str), SGML_TOKEN_##type, SGML_TOKEN_##family }
49 static struct dom_scanner_string_mapping sgml_string_mappings[] = {
50 SGML_STRING_MAP("--", NOTATION_COMMENT, NOTATION),
51 SGML_STRING_MAP("ATTLIST", NOTATION_ATTLIST, NOTATION),
52 SGML_STRING_MAP("DOCTYPE", NOTATION_DOCTYPE, NOTATION),
53 SGML_STRING_MAP("ELEMENT", NOTATION_ELEMENT, NOTATION),
54 SGML_STRING_MAP("ENTITY", NOTATION_ENTITY, NOTATION),
56 SGML_STRING_MAP("xml", PROCESS_XML, PROCESS),
57 SGML_STRING_MAP("xml-stylesheet", PROCESS_XML_STYLESHEET, PROCESS),
59 DOM_STRING_MAP_END,
62 static struct dom_scanner_token *scan_sgml_tokens(struct dom_scanner *scanner);
64 struct dom_scanner_info sgml_scanner_info = {
65 sgml_string_mappings,
66 sgml_scan_table_info,
67 scan_sgml_tokens,
/* Look up character @c in the SGML scan table and test it against the class
 * bit(s) in @bit. Evaluates to non-zero if any of the requested bits is set
 * for @c. */
#define check_sgml_table(c, bit) (sgml_scanner_info.scan_table[(c)] & (bit))

/* Advance the pointer @s for as long as it is inside the scanner buffer and
 * points at a character of class @bit. Wrapped in do/while so the macro
 * behaves as a single statement (safe in unbraced if/else) and requires the
 * terminating ';' at the call site. Note that @s is evaluated several times. */
#define scan_sgml(scanner, s, bit) \
	do { \
		while ((s) < (scanner)->end && check_sgml_table(*(s), bit)) \
			(s)++; \
	} while (0)

/* Character class predicates. The negated ones are fully parenthesized so
 * they can be combined with other operators without surprises. */
#define is_sgml_ident(c)	check_sgml_table(c, SGML_CHAR_IDENT)
#define is_sgml_entity(c)	check_sgml_table(c, SGML_CHAR_ENTITY)
#define is_sgml_space(c)	check_sgml_table(c, SGML_CHAR_WHITESPACE)
#define is_sgml_newline(c)	check_sgml_table(c, SGML_CHAR_NEWLINE)
#define is_sgml_text(c)		(!check_sgml_table(c, SGML_CHAR_NOT_TEXT))
/* NOTE(review): SGML_CHAR_TOKEN_START is not a member of enum
 * sgml_char_group as visible in this file, so this macro cannot be expanded
 * as is. It appears unused here; confirm before relying on it. */
#define is_sgml_token_start(c)	check_sgml_table(c, SGML_CHAR_TOKEN_START)
#define is_sgml_attribute(c)	(!check_sgml_table(c, SGML_CHAR_NOT_ATTRIBUTE | SGML_CHAR_WHITESPACE))
83 static inline void
84 skip_sgml_space(struct dom_scanner *scanner, unsigned char **string)
86 unsigned char *pos = *string;
88 if (!scanner->count_lines) {
89 scan_sgml(scanner, pos, SGML_CHAR_WHITESPACE);
90 } else {
91 while (pos < scanner->end && is_sgml_space(*pos)) {
92 if (is_sgml_newline(*pos))
93 scanner->lineno++;
94 pos++;
98 *string = pos;
/* True when completeness checking is enabled, the scanner's input is flagged
 * as incomplete, and the scan position @string has reached the very end of
 * the buffer. */
#define check_sgml_incomplete(scanner, string) \
	((scanner)->check_complete && (scanner)->incomplete \
	 && (string) == (scanner)->end)
106 static void
107 set_sgml_incomplete(struct dom_scanner *scanner, struct dom_scanner_token *token)
109 size_t left = scanner->end - scanner->position;
111 assert(left > 0);
113 token->type = SGML_TOKEN_INCOMPLETE;
114 set_dom_string(&token->string, scanner->position, left);
116 /* Stop the scanning. */
117 scanner->position = scanner->end;
121 static inline int
122 check_sgml_error(struct dom_scanner *scanner)
124 unsigned int found_error = scanner->found_error;
126 /* Toggle if we found an error previously. */
127 scanner->found_error = 0;
129 return scanner->detect_errors && !found_error;
132 static unsigned char *
133 get_sgml_error_end(struct dom_scanner *scanner, enum sgml_token_type type,
134 unsigned char *end)
136 switch (type) {
137 case SGML_TOKEN_CDATA_SECTION:
138 case SGML_TOKEN_NOTATION_ATTLIST:
139 case SGML_TOKEN_NOTATION_DOCTYPE:
140 case SGML_TOKEN_NOTATION_ELEMENT:
141 if (scanner->position + 9 < end)
142 end = scanner->position + 9;
143 break;
145 case SGML_TOKEN_NOTATION_COMMENT:
146 /* Just include the '<!--' part. */
147 if (scanner->position + 4 < end)
148 end = scanner->position + 4;
149 break;
151 case SGML_TOKEN_NOTATION_ENTITY:
152 if (scanner->position + 6 < end)
153 end = scanner->position + 6;
154 break;
156 case SGML_TOKEN_PROCESS_XML:
157 if (scanner->position + 5 < end)
158 end = scanner->position + 5;
159 break;
161 case SGML_TOKEN_PROCESS_XML_STYLESHEET:
162 if (scanner->position + 16 < end)
163 end = scanner->position + 16;
164 break;
166 default:
167 break;
170 return end;
174 static struct dom_scanner_token *
175 set_sgml_error(struct dom_scanner *scanner, unsigned char *end)
177 struct dom_scanner_token *token = scanner->current;
178 struct dom_scanner_token *next;
180 assert(!scanner->found_error);
182 if (scanner->current >= scanner->table + DOM_SCANNER_TOKENS) {
183 scanner->found_error = 1;
184 next = NULL;
186 } else {
187 scanner->current++;
188 next = scanner->current;
189 copy_struct(next, token);
192 token->type = SGML_TOKEN_ERROR;
193 token->lineno = scanner->lineno;
194 set_dom_string(&token->string, scanner->position, end - scanner->position);
196 return next;
/* Text token scanning */

/* Loop header that advances @str until it reaches the end of the scanner
 * buffer or one of the characters '<' and '&'. */
/* I think it is faster to not check the table here --jonas */
#define foreach_sgml_cdata(scanner, str) \
	for (; (str) < (scanner)->end && !(*(str) == '<' || *(str) == '&'); (str)++)
206 static inline void
207 scan_sgml_text_token(struct dom_scanner *scanner, struct dom_scanner_token *token)
209 unsigned char *string = scanner->position;
210 unsigned char first_char = *string;
211 enum sgml_token_type type = SGML_TOKEN_GARBAGE;
212 int real_length = -1;
214 /* In scan_sgml_tokens() we check that first_char != '<' */
215 assert(first_char != '<' && scanner->state == SGML_STATE_TEXT);
217 token->string.string = string++;
219 if (first_char == '&') {
220 int complete = 0;
222 if (is_sgml_entity(*string)) {
223 scan_sgml(scanner, string, SGML_CHAR_ENTITY);
224 type = SGML_TOKEN_ENTITY;
225 token->string.string++;
226 real_length = string - token->string.string;
229 foreach_sgml_cdata (scanner, string) {
230 if (*string == ';') {
231 complete = 1;
232 string++;
233 break;
237 /* We want the biggest possible text token. */
238 if (!complete) {
239 if (check_sgml_incomplete(scanner, string)) {
240 set_sgml_incomplete(scanner, token);
241 return;
244 if (check_sgml_error(scanner)) {
245 token = set_sgml_error(scanner, string);
246 if (!token)
247 return;
251 } else {
252 if (is_sgml_space(first_char)) {
253 if (scanner->count_lines
254 && is_sgml_newline(first_char))
255 scanner->lineno++;
257 skip_sgml_space(scanner, &string);
258 type = string < scanner->end && is_sgml_text(*string)
259 ? SGML_TOKEN_TEXT : SGML_TOKEN_SPACE;
260 } else {
261 type = SGML_TOKEN_TEXT;
264 if (scanner->count_lines) {
265 foreach_sgml_cdata (scanner, string) {
266 if (is_sgml_newline(*string))
267 scanner->lineno++;
269 } else {
270 foreach_sgml_cdata (scanner, string) {
271 /* m33p */;
275 /* We want the biggest possible text token. */
276 if (check_sgml_incomplete(scanner, string)) {
277 set_sgml_incomplete(scanner, token);
278 return;
282 token->type = type;
283 token->string.length = real_length >= 0 ? real_length : string - token->string.string;
284 token->precedence = get_sgml_precedence(type);
285 scanner->position = string;
/* Element scanning */

/* Check whether it is safe to skip the @token when looking for @skipto, that
 * is, whether the precedence of @type does not exceed that of @skipto. */
static inline int
check_sgml_precedence(int type, int skipto)
{
	return !(get_sgml_precedence(type) > get_sgml_precedence(skipto));
}
298 /* Skip until @skipto is found, without taking precedence into account. */
299 static inline unsigned char *
300 skip_sgml_chars(struct dom_scanner *scanner, unsigned char *string,
301 unsigned char skipto)
303 int newlines;
305 assert(string >= scanner->position && string <= scanner->end);
307 if (!scanner->count_lines) {
308 size_t length = scanner->end - string;
310 return memchr(string, skipto, length);
313 for (newlines = 0; string < scanner->end; string++) {
314 if (is_sgml_newline(*string))
315 newlines++;
316 if (*string == skipto) {
317 /* Only count newlines if we actually find the
318 * requested char. Else callers are assumed to discard
319 * the scanning. */
320 scanner->lineno += newlines;
321 return string;
325 return NULL;
328 /* XXX: Only element or ``in tag'' precedence is handled correctly however
329 * using this function for CDATA or text would be overkill. */
330 static inline unsigned char *
331 skip_sgml(struct dom_scanner *scanner, unsigned char **string, unsigned char skipto,
332 int check_quoting)
334 unsigned char *pos = *string;
336 for (; pos < scanner->end; pos++) {
337 if (*pos == skipto) {
338 *string = pos + 1;
339 return pos;
342 if (!check_sgml_precedence(*pos, skipto))
343 break;
345 if (check_quoting && isquote(*pos)) {
346 unsigned char *end;
348 end = skip_sgml_chars(scanner, pos + 1, *pos);
349 if (end) pos = end;
351 } else if (scanner->count_lines && is_sgml_newline(*pos)) {
352 scanner->lineno++;
356 *string = pos;
357 return NULL;
360 static inline int
361 skip_sgml_comment(struct dom_scanner *scanner, unsigned char **string,
362 int *possibly_incomplete)
364 unsigned char *pos = *string;
365 int length = 0;
367 for ( ; (pos = skip_sgml_chars(scanner, pos, '>')); pos++) {
368 /* It is always safe to access index -2 and -1 here since we
369 * are supposed to have '<!--' before this is called. We do
370 * however need to check that the '-->' are not overlapping any
371 * preceeding '-'. Additionally also handle the quirky '--!>'
372 * end sometimes found. */
373 if (pos[-2] == '-') {
374 if (pos[-1] == '-' && &pos[-2] >= *string) {
375 length = pos - *string - 2;
376 *possibly_incomplete = 0;
377 pos++;
378 break;
379 } else if (pos[-1] == '!' && pos[-3] == '-' && &pos[-3] >= *string) {
380 length = pos - *string - 3;
381 *possibly_incomplete = 0;
382 pos++;
383 break;
388 if (!pos) {
389 pos = scanner->end;
390 /* The token is incomplete but set the length to handle tag
391 * tag soup graciously. */
392 *possibly_incomplete = 1;
393 length = pos - *string;
396 *string = pos;
397 return length;
400 static inline int
401 skip_sgml_cdata_section(struct dom_scanner *scanner, unsigned char **string,
402 int *possibly_incomplete)
404 unsigned char *pos = *string;
405 int length = 0;
407 for ( ; (pos = skip_sgml_chars(scanner, pos, '>')); pos++) {
408 /* It is always safe to access index -2 and -1 here since we
409 * are supposed to have '<![CDATA[' before this is called. */
410 if (pos[-2] == ']' && pos[-1] == ']') {
411 length = pos - *string - 2;
412 *possibly_incomplete = 0;
413 pos++;
414 break;
418 if (!pos) {
419 pos = scanner->end;
420 /* The token is incomplete but set the length to handle tag
421 * soup graciously. */
422 *possibly_incomplete = 1;
423 length = pos - *string;
426 *string = pos;
427 return length;
/* Advance the pointer @str past every attribute character, stopping at the
 * end of the scanner buffer. Wrapped in do/while so the macro acts as a
 * single statement and needs the terminating ';' at the call site. Note that
 * @str is evaluated several times. */
#define scan_sgml_attribute(scanner, str) \
	do { \
		while ((str) < (scanner)->end && is_sgml_attribute(*(str))) \
			(str)++; \
	} while (0)
434 static inline void
435 scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *token)
437 unsigned char *string = scanner->position;
438 unsigned char first_char = *string;
439 enum sgml_token_type type = SGML_TOKEN_GARBAGE;
440 int real_length = -1;
441 int possibly_incomplete = 1;
442 enum sgml_scanner_state scanner_state = scanner->state;
444 token->string.string = string++;
446 if (first_char == '<') {
447 skip_sgml_space(scanner, &string);
449 if (scanner->state == SGML_STATE_ELEMENT) {
450 /* Already inside an element so insert a tag end token
451 * and continue scanning in next iteration. */
452 type = SGML_TOKEN_TAG_END;
453 scanner_state = SGML_STATE_TEXT;
455 /* We are creating a 'virtual' that has no source. */
456 possibly_incomplete = 0;
457 string = token->string.string;
458 real_length = 0;
460 } else if (string == scanner->end) {
461 /* It is incomplete so prevent out of bound acess to
462 * the scanned string. */
464 } else if (is_sgml_ident(*string)) {
465 token->string.string = string;
466 scan_sgml(scanner, string, SGML_CHAR_IDENT);
468 real_length = string - token->string.string;
470 skip_sgml_space(scanner, &string);
471 if (string < scanner->end && *string == '>') {
472 type = SGML_TOKEN_ELEMENT;
473 string++;
475 /* We found the end. */
476 possibly_incomplete = 0;
478 } else {
479 /* Was any space skipped? */
480 if (is_sgml_space(string[-1])) {
481 /* We found the end. */
482 possibly_incomplete = 0;
484 type = SGML_TOKEN_ELEMENT_BEGIN;
485 scanner_state = SGML_STATE_ELEMENT;
488 } else if (*string == '!') {
489 unsigned char *ident;
490 enum sgml_token_type base = SGML_TOKEN_NOTATION;
492 string++;
493 skip_sgml_space(scanner, &string);
494 token->string.string = ident = string;
496 if (string + 1 < scanner->end
497 && string[0] == '-' && string[1] == '-') {
498 string += 2;
499 type = SGML_TOKEN_NOTATION_COMMENT;
500 token->string.string = string;
501 real_length = skip_sgml_comment(scanner, &string,
502 &possibly_incomplete);
503 assert(real_length >= 0);
505 } else if (string + 6 < scanner->end
506 && !memcmp(string, "[CDATA[", 7)) {
508 string += 7;
509 type = SGML_TOKEN_CDATA_SECTION;
510 token->string.string = string;
511 real_length = skip_sgml_cdata_section(scanner, &string,
512 &possibly_incomplete);
513 assert(real_length >= 0);
515 } else {
516 scan_sgml(scanner, string, SGML_CHAR_IDENT);
517 type = map_dom_scanner_string(scanner, ident, string, base);
518 if (skip_sgml(scanner, &string, '>', 0)) {
519 /* We found the end. */
520 possibly_incomplete = 0;
524 } else if (*string == '?') {
525 unsigned char *pos;
526 enum sgml_token_type base = SGML_TOKEN_PROCESS;
528 string++;
529 skip_sgml_space(scanner, &string);
530 token->string.string = pos = string;
531 scan_sgml(scanner, string, SGML_CHAR_IDENT);
533 type = map_dom_scanner_string(scanner, pos, string, base);
535 scanner_state = SGML_STATE_PROC_INST;
537 real_length = string - token->string.string;
538 skip_sgml_space(scanner, &string);
540 /* Make '<?xml ' cause the right kind of error. */
541 if (is_sgml_space(string[-1])
542 && string < scanner->end) {
543 /* We found the end. */
544 possibly_incomplete = 0;
547 if (scanner->check_complete && scanner->incomplete) {
548 /* We need to fit both the process target token
549 * and the process data token into the scanner
550 * table. */
551 if (token + 1 >= scanner->table + DOM_SCANNER_TOKENS) {
552 possibly_incomplete = 1;
554 } else if (!possibly_incomplete) {
555 /* FIXME: We do this twice. */
556 for (pos = string + 1;
557 (pos = skip_sgml_chars(scanner, pos, '>'));
558 pos++) {
559 if (pos[-1] == '?')
560 break;
562 if (!pos)
563 possibly_incomplete = 1;
566 if (possibly_incomplete)
567 string = scanner->end;
570 } else if (*string == '/') {
571 string++;
572 skip_sgml_space(scanner, &string);
574 if (string == scanner->end) {
575 /* Prevent out of bound access. */
577 } else if (is_sgml_ident(*string)) {
578 token->string.string = string;
579 scan_sgml(scanner, string, SGML_CHAR_IDENT);
580 real_length = string - token->string.string;
582 type = SGML_TOKEN_ELEMENT_END;
583 if (skip_sgml(scanner, &string, '>', 1)) {
584 /* We found the end. */
585 possibly_incomplete = 0;
588 } else if (*string == '>') {
589 string++;
590 real_length = 0;
591 type = SGML_TOKEN_ELEMENT_END;
593 /* We found the end. */
594 possibly_incomplete = 0;
597 if (type != SGML_TOKEN_GARBAGE) {
598 scanner_state = SGML_STATE_TEXT;
601 } else {
602 /* Alien < > stuff so ignore it */
603 if (skip_sgml(scanner, &string, '>', 0)) {
604 /* We found the end. */
605 possibly_incomplete = 0;
609 } else if (first_char == '=') {
610 type = '=';
611 /* We found the end. */
612 possibly_incomplete = 0;
614 } else if (first_char == '?' || first_char == '>') {
615 if (first_char == '?') {
616 if (skip_sgml(scanner, &string, '>', 0)) {
617 /* We found the end. */
618 possibly_incomplete = 0;
620 } else {
621 assert(first_char == '>');
623 /* We found the end. */
624 possibly_incomplete = 0;
627 type = SGML_TOKEN_TAG_END;
628 assert(scanner->state == SGML_STATE_ELEMENT);
629 scanner_state = SGML_STATE_TEXT;
631 } else if (first_char == '/') {
632 /* We allow '/' inside elements and only consider it as an end
633 * tag if immediately preceeds the '>' char. This is to allow
635 * '<form action=/ >' where '/' is part of a path and
636 * '<form action=a />' where '/>' is truely a tag end
638 * For stricter parsing we should always require attribute
639 * values to be quoted.
641 if (string == scanner->end) {
642 /* Prevent out of bound access. */
644 } else if (*string == '>') {
645 string++;
646 real_length = 0;
647 type = SGML_TOKEN_ELEMENT_EMPTY_END;
648 assert(scanner->state == SGML_STATE_ELEMENT);
649 scanner_state = SGML_STATE_TEXT;
651 /* We found the end. */
652 possibly_incomplete = 0;
654 } else if (is_sgml_attribute(*string)) {
655 scan_sgml_attribute(scanner, string);
656 type = SGML_TOKEN_ATTRIBUTE;
657 if (string[-1] == '/' && string[0] == '>') {
658 string--;
659 /* We found the end. */
660 possibly_incomplete = 0;
664 } else if (isquote(first_char)) {
665 unsigned char *string_end = skip_sgml_chars(scanner, string, first_char);
667 if (string_end) {
668 /* We don't want the delimiters in the token */
669 token->string.string++;
670 real_length = string_end - token->string.string;
671 string = string_end + 1;
672 type = SGML_TOKEN_STRING;
674 /* We found the end. */
675 possibly_incomplete = 0;
677 } else if (scanner->check_complete && scanner->incomplete) {
678 /* Force an incomplete token. */
679 string = scanner->end;
681 } else if (string < scanner->end
682 && is_sgml_attribute(*string)) {
683 token->string.string++;
684 scan_sgml_attribute(scanner, string);
685 type = SGML_TOKEN_ATTRIBUTE;
688 } else if (is_sgml_attribute(first_char)) {
689 if (is_sgml_ident(first_char)) {
690 scan_sgml(scanner, string, SGML_CHAR_IDENT);
691 type = SGML_TOKEN_IDENT;
694 if (string < scanner->end
695 && is_sgml_attribute(*string)) {
696 scan_sgml_attribute(scanner, string);
697 type = SGML_TOKEN_ATTRIBUTE;
698 if (string[-1] == '/' && string[0] == '>') {
699 /* We found the end. */
700 possibly_incomplete = 0;
701 string--;
706 if (possibly_incomplete) {
707 if (check_sgml_incomplete(scanner, string)) {
708 set_sgml_incomplete(scanner, token);
709 return;
712 if (check_sgml_error(scanner) && string == scanner->end) {
713 unsigned char *end;
715 end = get_sgml_error_end(scanner, type, string);
716 token = set_sgml_error(scanner, end);
717 if (!token)
718 return;
722 /* Only apply the state change if the token was not abandoned because
723 * it was incomplete. */
724 scanner->state = scanner_state;
726 token->type = type;
727 token->string.length = real_length >= 0 ? real_length : string - token->string.string;
728 token->precedence = get_sgml_precedence(type);
729 scanner->position = string;
733 /* Processing instruction data scanning */
735 static inline void
736 scan_sgml_proc_inst_token(struct dom_scanner *scanner, struct dom_scanner_token *token)
738 unsigned char *string = scanner->position;
739 /* The length can be empty for '<??>'. */
740 ssize_t length = -1;
742 token->string.string = string++;
744 /* Figure out where the processing instruction ends. This doesn't use
745 * skip_sgml() since we MUST ignore precedence here to allow '<' inside
746 * the data part to be skipped correctly. */
747 for ( ; (string = skip_sgml_chars(scanner, string, '>')); string++) {
748 if (string[-1] == '?') {
749 string++;
750 length = string - token->string.string - 2;
751 break;
755 if (!string) {
756 /* Makes the next succeed when checking for incompletion, and
757 * puts the rest of the text within the token. */
758 string = scanner->end;
760 if (check_sgml_incomplete(scanner, string)) {
761 set_sgml_incomplete(scanner, token);
762 return;
765 if (check_sgml_error(scanner)) {
766 token = set_sgml_error(scanner, string);
767 if (!token)
768 return;
772 token->type = SGML_TOKEN_PROCESS_DATA;
773 token->string.length = length >= 0 ? length : string - token->string.string;
774 token->precedence = get_sgml_precedence(token->type);
775 scanner->position = string;
776 scanner->state = SGML_STATE_TEXT;
780 /* Scanner multiplexor */
782 static struct dom_scanner_token *
783 scan_sgml_tokens(struct dom_scanner *scanner)
785 struct dom_scanner_token *table_end = scanner->table + DOM_SCANNER_TOKENS;
787 if (!begin_dom_token_scanning(scanner))
788 return get_dom_scanner_token(scanner);
790 /* Scan tokens until we fill the table */
791 for (scanner->current = scanner->table + scanner->tokens;
792 scanner->current < table_end && scanner->position < scanner->end;
793 scanner->current++) {
794 if (scanner->state == SGML_STATE_ELEMENT
795 || (*scanner->position == '<'
796 && scanner->state != SGML_STATE_PROC_INST)) {
797 skip_sgml_space(scanner, &scanner->position);
798 if (scanner->position >= scanner->end) break;
800 scan_sgml_element_token(scanner, scanner->current);
802 /* Shall we scratch this token? */
803 if (scanner->current->type == SGML_TOKEN_SKIP) {
804 scanner->current--;
807 } else if (scanner->state == SGML_STATE_TEXT) {
808 scan_sgml_text_token(scanner, scanner->current);
810 } else {
811 scan_sgml_proc_inst_token(scanner, scanner->current);
815 return end_dom_token_scanning(scanner, scanner->current);