1 /* SGML token scanner utilities */
12 #include "dom/scanner.h"
13 #include "dom/sgml/scanner.h"
14 #include "dom/string.h"
15 #include "util/error.h"
18 /* Bitmap entries for the SGML character groups used in the scanner table */
20 enum sgml_char_group
{
21 SGML_CHAR_ENTITY
= (1 << 1),
22 SGML_CHAR_IDENT
= (1 << 2),
23 SGML_CHAR_NEWLINE
= (1 << 3),
24 SGML_CHAR_WHITESPACE
= (1 << 4),
25 SGML_CHAR_NOT_TEXT
= (1 << 5),
26 SGML_CHAR_NOT_ATTRIBUTE
= (1 << 6),
29 static struct dom_scan_table_info sgml_scan_table_info
[] = {
30 DOM_SCAN_TABLE_RANGE("0", '9', SGML_CHAR_IDENT
| SGML_CHAR_ENTITY
),
31 DOM_SCAN_TABLE_RANGE("A", 'Z', SGML_CHAR_IDENT
| SGML_CHAR_ENTITY
),
32 DOM_SCAN_TABLE_RANGE("a", 'z', SGML_CHAR_IDENT
| SGML_CHAR_ENTITY
),
33 /* For the octally impaired (me included): \241 is 161 decimal --jonas */
34 DOM_SCAN_TABLE_RANGE("\241", 255, SGML_CHAR_IDENT
| SGML_CHAR_ENTITY
),
36 DOM_SCAN_TABLE_STRING("-_:.", SGML_CHAR_IDENT
| SGML_CHAR_ENTITY
),
37 DOM_SCAN_TABLE_STRING("#", SGML_CHAR_ENTITY
),
38 DOM_SCAN_TABLE_STRING(" \f\n\r\t\v", SGML_CHAR_WHITESPACE
),
39 DOM_SCAN_TABLE_STRING("\f\n", SGML_CHAR_NEWLINE
),
40 DOM_SCAN_TABLE_STRING("<&", SGML_CHAR_NOT_TEXT
),
41 DOM_SCAN_TABLE_STRING("<=>", SGML_CHAR_NOT_ATTRIBUTE
),
/*
 * Build one brace-initialized entry for a string mapping table.
 *
 * @str    is the string literal to match.
 * @type   is pasted onto SGML_TOKEN_ to name the token type for the match.
 * @family is pasted onto SGML_TOKEN_ to name the base (family) token type.
 *
 * NOTE(review): the -1 handed to INIT_DOM_STRING presumably asks it to derive
 * the length from the literal itself -- confirm against dom/string.h.
 */
#define SGML_STRING_MAP(str, type, family)	\
	{					\
		INIT_DOM_STRING(str, -1),	\
		SGML_TOKEN_##type,		\
		SGML_TOKEN_##family		\
	}
49 static struct dom_scanner_string_mapping sgml_string_mappings
[] = {
50 SGML_STRING_MAP("--", NOTATION_COMMENT
, NOTATION
),
51 SGML_STRING_MAP("ATTLIST", NOTATION_ATTLIST
, NOTATION
),
52 SGML_STRING_MAP("DOCTYPE", NOTATION_DOCTYPE
, NOTATION
),
53 SGML_STRING_MAP("ELEMENT", NOTATION_ELEMENT
, NOTATION
),
54 SGML_STRING_MAP("ENTITY", NOTATION_ENTITY
, NOTATION
),
56 SGML_STRING_MAP("xml", PROCESS_XML
, PROCESS
),
57 SGML_STRING_MAP("xml-stylesheet", PROCESS_XML_STYLESHEET
, PROCESS
),
62 static struct dom_scanner_token
*scan_sgml_tokens(struct dom_scanner
*scanner
);
64 struct dom_scanner_info sgml_scanner_info
= {
/*
 * Character classification helpers.
 *
 * Every macro below looks a character up in sgml_scanner_info.scan_table and
 * tests the entry against one or more SGML_CHAR_* bits.
 */

/* Non-zero when the table entry for @c has any of the bits in @bit set.
 * @c is converted to unsigned char before indexing so that a plain (possibly
 * signed) char holding a value above 127 cannot yield a negative index. */
#define check_sgml_table(c, bit) \
	(sgml_scanner_info.scan_table[(unsigned char) (c)] & (bit))

/* Advance @s for as long as it is below (scanner)->end and points at a
 * character belonging to group @bit.
 *
 * Wrapped in do { } while (0) so the expansion is exactly one statement and
 * stays correct inside an unbraced if/else; the caller supplies the closing
 * semicolon. @s is evaluated repeatedly, so it must be a side-effect free
 * lvalue. */
#define scan_sgml(scanner, s, bit) \
	do { \
		while ((s) < (scanner)->end && check_sgml_table(*(s), (bit))) \
			(s)++; \
	} while (0)

/* Single-group membership tests. */
#define is_sgml_ident(c)	check_sgml_table(c, SGML_CHAR_IDENT)
#define is_sgml_entity(c)	check_sgml_table(c, SGML_CHAR_ENTITY)
#define is_sgml_space(c)	check_sgml_table(c, SGML_CHAR_WHITESPACE)
#define is_sgml_newline(c)	check_sgml_table(c, SGML_CHAR_NEWLINE)

/* True for any character not flagged SGML_CHAR_NOT_TEXT. The negation is
 * parenthesized so the macro can be embedded in larger expressions. */
#define is_sgml_text(c)		(!check_sgml_table(c, SGML_CHAR_NOT_TEXT))

/* NOTE(review): SGML_CHAR_TOKEN_START is not among the sgml_char_group
 * constants visible in this file, so any use of this macro would fail to
 * compile unless the constant is provided elsewhere -- confirm, then either
 * define the bit or drop the macro. */
#define is_sgml_token_start(c)	check_sgml_table(c, SGML_CHAR_TOKEN_START)

/* True for a character that is neither whitespace nor flagged
 * SGML_CHAR_NOT_ATTRIBUTE. */
#define is_sgml_attribute(c) \
	(!check_sgml_table(c, SGML_CHAR_NOT_ATTRIBUTE | SGML_CHAR_WHITESPACE))
84 skip_sgml_space(struct dom_scanner
*scanner
, unsigned char **string
)
86 unsigned char *pos
= *string
;
88 if (!scanner
->count_lines
) {
89 scan_sgml(scanner
, pos
, SGML_CHAR_WHITESPACE
);
91 while (pos
< scanner
->end
&& is_sgml_space(*pos
)) {
92 if (is_sgml_newline(*pos
))
102 /* Text token scanning */
/*
 * Loop header that walks @str forward until it reaches (scanner)->end or
 * lands on a '<' or '&'. The caller provides the loop body.
 *
 * The two delimiters are compared directly instead of going through the
 * scan table; the original author (jonas) believed this to be faster.
 */
#define foreach_sgml_cdata(scanner, str)			\
	for (;							\
	     (str) < (scanner)->end				\
	     && !(*(str) == '<' || *(str) == '&');		\
	     (str)++)
109 scan_sgml_text_token(struct dom_scanner
*scanner
, struct dom_scanner_token
*token
)
111 unsigned char *string
= scanner
->position
;
112 unsigned char first_char
= *string
;
113 enum sgml_token_type type
= SGML_TOKEN_GARBAGE
;
114 int real_length
= -1;
116 /* In scan_sgml_tokens() we check that first_char != '<' */
117 assert(first_char
!= '<' && scanner
->state
== SGML_STATE_TEXT
);
119 token
->string
.string
= string
++;
121 if (first_char
== '&') {
122 if (is_sgml_entity(*string
)) {
123 scan_sgml(scanner
, string
, SGML_CHAR_ENTITY
);
124 type
= SGML_TOKEN_ENTITY
;
125 token
->string
.string
++;
126 real_length
= string
- token
->string
.string
;
129 foreach_sgml_cdata (scanner
, string
) {
130 if (*string
== ';') {
137 if (is_sgml_space(first_char
)) {
138 skip_sgml_space(scanner
, &string
);
139 type
= string
< scanner
->end
&& is_sgml_text(*string
)
140 ? SGML_TOKEN_TEXT
: SGML_TOKEN_SPACE
;
142 type
= SGML_TOKEN_TEXT
;
145 foreach_sgml_cdata (scanner
, string
) {
151 token
->string
.length
= real_length
>= 0 ? real_length
: string
- token
->string
.string
;
152 token
->precedence
= get_sgml_precedence(type
);
153 scanner
->position
= string
;
157 /* Element scanning */
159 /* Check whether a token of @type may safely be skipped while looking for @skipto. */
161 check_sgml_precedence(int type
, int skipto
)
163 return get_sgml_precedence(type
) <= get_sgml_precedence(skipto
);
166 /* Skip until @skipto is found, without taking precedence into account. */
167 static inline unsigned char *
168 skip_sgml_chars(struct dom_scanner
*scanner
, unsigned char *string
,
169 unsigned char skipto
)
173 assert(string
>= scanner
->position
&& string
<= scanner
->end
);
175 if (!scanner
->count_lines
) {
176 size_t length
= scanner
->end
- string
;
178 return memchr(string
, skipto
, length
);
181 for (newlines
= 0; string
< scanner
->end
; string
++) {
182 if (is_sgml_newline(*string
))
184 if (*string
== skipto
) {
185 /* Only count newlines if we actually find the
186 * requested char. Else callers are assumed to discard
188 scanner
->lineno
+= newlines
;
196 /* XXX: Only element or ``in tag'' precedence is handled correctly; however,
197 * using this function for CDATA or text would be overkill. */
198 static inline unsigned char *
199 skip_sgml(struct dom_scanner
*scanner
, unsigned char **string
, unsigned char skipto
,
202 unsigned char *pos
= *string
;
204 for (; pos
< scanner
->end
; pos
++) {
205 if (*pos
== skipto
) {
210 if (!check_sgml_precedence(*pos
, skipto
))
213 if (check_quoting
&& isquote(*pos
)) {
216 end
= skip_sgml_chars(scanner
, pos
+ 1, *pos
);
219 } else if (scanner
->count_lines
&& is_sgml_newline(*pos
)) {
229 skip_sgml_comment(struct dom_scanner
*scanner
, unsigned char **string
)
231 unsigned char *pos
= *string
;
234 for ( ; (pos
= skip_sgml_chars(scanner
, pos
, '>')); pos
++) {
235 /* It is always safe to access index -2 and -1 here since we
236 * are supposed to have '<!--' before this is called. We do
237 * however need to check that the '-->' are not overlapping any
239 if (pos
[-2] == '-' && pos
[-1] == '-' && &pos
[-2] >= *string
) {
240 length
= pos
- *string
- 2;
248 length
= pos
- *string
;
256 skip_sgml_cdata_section(struct dom_scanner
*scanner
, unsigned char **string
)
258 unsigned char *pos
= *string
;
261 for ( ; (pos
= skip_sgml_chars(scanner
, pos
, '>')); pos
++) {
262 /* It is always safe to access index -2 and -1 here since we
263 * are supposed to have '<![CDATA[' before this is called. */
264 if (pos
[-2] == ']' && pos
[-1] == ']') {
265 length
= pos
- *string
- 2;
273 length
= pos
- *string
;
280 #define scan_sgml_attribute(scanner, str) \
281 while ((str) < (scanner)->end && is_sgml_attribute(*(str))) \
285 scan_sgml_element_token(struct dom_scanner
*scanner
, struct dom_scanner_token
*token
)
287 unsigned char *string
= scanner
->position
;
288 unsigned char first_char
= *string
;
289 enum sgml_token_type type
= SGML_TOKEN_GARBAGE
;
290 int real_length
= -1;
292 token
->string
.string
= string
++;
294 if (first_char
== '<') {
295 skip_sgml_space(scanner
, &string
);
297 if (string
== scanner
->end
) {
298 /* Prevent out of bound access. */
300 } else if (scanner
->state
== SGML_STATE_ELEMENT
) {
301 /* Already inside an element so insert a tag end token
302 * and continue scanning in next iteration. */
305 type
= SGML_TOKEN_TAG_END
;
306 scanner
->state
= SGML_STATE_TEXT
;
308 } else if (is_sgml_ident(*string
)) {
309 token
->string
.string
= string
;
310 scan_sgml(scanner
, string
, SGML_CHAR_IDENT
);
312 real_length
= string
- token
->string
.string
;
314 skip_sgml_space(scanner
, &string
);
315 if (string
< scanner
->end
&& *string
== '>') {
316 type
= SGML_TOKEN_ELEMENT
;
319 scanner
->state
= SGML_STATE_ELEMENT
;
320 type
= SGML_TOKEN_ELEMENT_BEGIN
;
323 } else if (*string
== '!') {
324 unsigned char *ident
;
325 enum sgml_token_type base
= SGML_TOKEN_NOTATION
;
328 skip_sgml_space(scanner
, &string
);
329 token
->string
.string
= ident
= string
;
331 if (string
+ 1 < scanner
->end
332 && string
[0] == '-' && string
[1] == '-') {
334 type
= SGML_TOKEN_NOTATION_COMMENT
;
335 token
->string
.string
= string
;
336 real_length
= skip_sgml_comment(scanner
, &string
);
337 assert(real_length
>= 0);
339 } else if (string
+ 6 < scanner
->end
340 && !memcmp(string
, "[CDATA[", 7)) {
343 type
= SGML_TOKEN_CDATA_SECTION
;
344 token
->string
.string
= string
;
345 real_length
= skip_sgml_cdata_section(scanner
, &string
);
346 assert(real_length
>= 0);
349 skip_sgml_space(scanner
, &string
);
350 type
= map_dom_scanner_string(scanner
, ident
, string
, base
);
351 skip_sgml(scanner
, &string
, '>', 0);
354 } else if (*string
== '?') {
356 enum sgml_token_type base
= SGML_TOKEN_PROCESS
;
359 skip_sgml_space(scanner
, &string
);
360 token
->string
.string
= pos
= string
;
361 scan_sgml(scanner
, string
, SGML_CHAR_IDENT
);
363 type
= map_dom_scanner_string(scanner
, pos
, string
, base
);
365 scanner
->state
= SGML_STATE_PROC_INST
;
367 } else if (*string
== '/') {
369 skip_sgml_space(scanner
, &string
);
371 if (string
== scanner
->end
) {
372 /* Prevent out of bound access. */
374 } else if (is_sgml_ident(*string
)) {
375 token
->string
.string
= string
;
376 scan_sgml(scanner
, string
, SGML_CHAR_IDENT
);
377 real_length
= string
- token
->string
.string
;
379 type
= SGML_TOKEN_ELEMENT_END
;
380 skip_sgml(scanner
, &string
, '>', 1);
382 } else if (*string
== '>') {
385 type
= SGML_TOKEN_ELEMENT_END
;
388 if (type
!= SGML_TOKEN_GARBAGE
)
389 scanner
->state
= SGML_STATE_TEXT
;
392 /* Alien < > stuff so ignore it */
393 skip_sgml(scanner
, &string
, '>', 0);
396 } else if (first_char
== '=') {
399 } else if (first_char
== '?' || first_char
== '>') {
400 if (first_char
== '?') {
401 skip_sgml(scanner
, &string
, '>', 0);
404 type
= SGML_TOKEN_TAG_END
;
405 assert(scanner
->state
== SGML_STATE_ELEMENT
);
406 scanner
->state
= SGML_STATE_TEXT
;
408 } else if (first_char
== '/') {
409 if (string
== scanner
->end
) {
410 /* Prevent out of bound access. */
412 } else if (*string
== '>') {
415 type
= SGML_TOKEN_ELEMENT_EMPTY_END
;
416 assert(scanner
->state
== SGML_STATE_ELEMENT
);
417 scanner
->state
= SGML_STATE_TEXT
;
418 } else if (is_sgml_attribute(*string
)) {
419 scan_sgml_attribute(scanner
, string
);
420 type
= SGML_TOKEN_ATTRIBUTE
;
421 if (string
[-1] == '/' && string
[0] == '>')
425 } else if (isquote(first_char
)) {
426 unsigned char *string_end
= skip_sgml_chars(scanner
, string
, first_char
);
429 /* We don't want the delimiters in the token */
430 token
->string
.string
++;
431 real_length
= string_end
- token
->string
.string
;
432 string
= string_end
+ 1;
433 type
= SGML_TOKEN_STRING
;
435 } else if (string
< scanner
->end
436 && is_sgml_attribute(*string
)) {
438 token
->string
.string
++;
439 scan_sgml_attribute(scanner
, string
);
440 type
= SGML_TOKEN_ATTRIBUTE
;
443 } else if (is_sgml_attribute(first_char
)) {
444 if (is_sgml_ident(first_char
)) {
445 scan_sgml(scanner
, string
, SGML_CHAR_IDENT
);
446 type
= SGML_TOKEN_IDENT
;
449 if (string
< scanner
->end
450 && is_sgml_attribute(*string
)) {
451 scan_sgml_attribute(scanner
, string
);
452 type
= SGML_TOKEN_ATTRIBUTE
;
453 if (string
[-1] == '/' && string
[0] == '>')
459 token
->string
.length
= real_length
>= 0 ? real_length
: string
- token
->string
.string
;
460 token
->precedence
= get_sgml_precedence(type
);
461 scanner
->position
= string
;
465 /* Processing instruction data scanning */
468 scan_sgml_proc_inst_token(struct dom_scanner
*scanner
, struct dom_scanner_token
*token
)
470 unsigned char *string
= scanner
->position
;
472 token
->string
.string
= string
;
474 /* Figure out where the processing instruction ends. This doesn't use
475 * skip_sgml() since we MUST ignore precedence here to allow '<' inside
476 * the data part to be skipped correctly. */
477 for ( ; (string
= skip_sgml_chars(scanner
, string
, '>')); string
++) {
478 if (string
[-1] == '?') {
484 if (!string
) string
= scanner
->end
;
486 token
->type
= SGML_TOKEN_PROCESS_DATA
;
487 token
->string
.length
= string
- token
->string
.string
- 2;
488 token
->precedence
= get_sgml_precedence(token
->type
);
489 scanner
->position
= string
;
490 scanner
->state
= SGML_STATE_TEXT
;
494 /* Scanner multiplexor */
496 static struct dom_scanner_token
*
497 scan_sgml_tokens(struct dom_scanner
*scanner
)
499 struct dom_scanner_token
*table_end
= scanner
->table
+ DOM_SCANNER_TOKENS
;
500 struct dom_scanner_token
*current
;
502 if (!begin_dom_token_scanning(scanner
))
503 return get_dom_scanner_token(scanner
);
505 /* Scan tokens until we fill the table */
506 for (current
= scanner
->table
+ scanner
->tokens
;
507 current
< table_end
&& scanner
->position
< scanner
->end
;
509 if (scanner
->state
== SGML_STATE_ELEMENT
510 || (*scanner
->position
== '<'
511 && scanner
->state
!= SGML_STATE_PROC_INST
)) {
512 skip_sgml_space(scanner
, &scanner
->position
);
513 if (scanner
->position
>= scanner
->end
) break;
515 scan_sgml_element_token(scanner
, current
);
517 /* Shall we scratch this token? */
518 if (current
->type
== SGML_TOKEN_SKIP
) {
522 } else if (scanner
->state
== SGML_STATE_TEXT
) {
523 scan_sgml_text_token(scanner
, current
);
526 skip_sgml_space(scanner
, &scanner
->position
);
527 scan_sgml_proc_inst_token(scanner
, current
);
531 return end_dom_token_scanning(scanner
, current
);