1 /* SGML node handling */
13 #include "dom/sgml/parser.h"
14 #include "dom/sgml/scanner.h"
15 #include "dom/sgml/sgml.h"
16 #include "dom/stack.h"
17 #include "dom/string.h"
18 #include "util/error.h"
19 #include "util/memory.h"
22 /* When getting the sgml_parser struct it is _always_ assumed that the parser
23 * is the first to add its context, which it is since it initializes the stack. */
/* Fetch the sgml_parser that owns @stack. The parser registers its context
 * first, so its data pointer always lives in context slot 0. */
#define get_sgml_parser(stack) ((stack)->contexts[0]->data)

/* Fetch the SGML parser's per-state data for @state on @stack, again relying
 * on the parser context occupying slot 0. Both arguments are parenthesized so
 * that expressions such as `&parser->stack` expand correctly. */
#define get_sgml_parser_state(stack, state) \
	get_dom_stack_state_data((stack)->contexts[0], (state))
32 /* Functions for adding new nodes to the DOM tree: */
34 /* They wrap init_dom_node() and add_dom_*() and set up of additional
35 * information like node subtypes and SGML parser state information. */
37 static inline struct dom_node
*
38 add_sgml_document(struct sgml_parser
*parser
)
40 int allocated
= parser
->flags
& SGML_PARSER_INCREMENTAL
;
41 struct dom_node
*node
;
43 node
= init_dom_node(DOM_NODE_DOCUMENT
, &parser
->uri
, allocated
);
44 if (node
&& push_dom_node(&parser
->stack
, node
) == DOM_CODE_OK
)
50 static inline struct dom_node
*
51 add_sgml_element(struct dom_stack
*stack
, struct dom_scanner_token
*token
)
53 struct sgml_parser
*parser
= get_sgml_parser(stack
);
54 struct dom_node
*parent
= get_dom_stack_top(stack
)->node
;
55 struct dom_stack_state
*state
;
56 struct sgml_parser_state
*pstate
;
57 struct dom_node
*node
;
58 struct sgml_node_info
*node_info
;
60 node
= add_dom_element(parent
, &token
->string
);
61 if (!node
) return NULL
;
63 node_info
= get_sgml_node_info(parser
->info
->elements
, node
);
64 node
->data
.element
.type
= node_info
->type
;
66 if (push_dom_node(stack
, node
) != DOM_CODE_OK
)
69 state
= get_dom_stack_top(stack
);
70 assert(node
== state
->node
);
72 pstate
= get_sgml_parser_state(stack
, state
);
73 pstate
->info
= node_info
;
79 static inline struct dom_node
*
80 add_sgml_attribute(struct dom_stack
*stack
,
81 struct dom_scanner_token
*token
, struct dom_scanner_token
*valtoken
)
83 struct sgml_parser
*parser
= get_sgml_parser(stack
);
84 struct dom_node
*parent
= get_dom_stack_top(stack
)->node
;
85 struct dom_string
*value
= valtoken
? &valtoken
->string
: NULL
;
86 struct sgml_node_info
*info
;
87 struct dom_node
*node
;
89 node
= add_dom_attribute(parent
, &token
->string
, value
);
91 info
= get_sgml_node_info(parser
->info
->attributes
, node
);
93 node
->data
.attribute
.type
= info
->type
;
94 node
->data
.attribute
.id
= !!(info
->flags
& SGML_ATTRIBUTE_IDENTIFIER
);
95 node
->data
.attribute
.reference
= !!(info
->flags
& SGML_ATTRIBUTE_REFERENCE
);
97 if (valtoken
&& valtoken
->type
== SGML_TOKEN_STRING
)
98 node
->data
.attribute
.quoted
= valtoken
->string
.string
[-1];
100 if (!node
|| push_dom_node(stack
, node
) != DOM_CODE_OK
)
108 static inline struct dom_node
*
109 add_sgml_proc_instruction(struct dom_stack
*stack
, struct dom_scanner_token
*target
,
110 struct dom_scanner_token
*data
)
112 struct dom_node
*parent
= get_dom_stack_top(stack
)->node
;
113 struct dom_string
*data_str
= data
? &data
->string
: NULL
;
114 struct dom_node
*node
;
116 node
= add_dom_proc_instruction(parent
, &target
->string
, data_str
);
117 if (!node
) return NULL
;
119 switch (target
->type
) {
120 case SGML_TOKEN_PROCESS_XML
:
121 node
->data
.proc_instruction
.type
= DOM_PROC_INSTRUCTION_XML
;
124 case SGML_TOKEN_PROCESS_XML_STYLESHEET
:
125 node
->data
.proc_instruction
.type
= DOM_PROC_INSTRUCTION_XML_STYLESHEET
;
128 case SGML_TOKEN_PROCESS
:
130 node
->data
.proc_instruction
.type
= DOM_PROC_INSTRUCTION
;
133 if (push_dom_node(stack
, node
) == DOM_CODE_OK
)
139 static inline struct dom_node
*
140 add_sgml_node(struct dom_stack
*stack
, enum dom_node_type type
, struct dom_scanner_token
*token
)
142 struct dom_node
*parent
= get_dom_stack_top(stack
)->node
;
143 struct dom_node
*node
= add_dom_node(parent
, type
, &token
->string
);
145 if (!node
) return NULL
;
147 if (token
->type
== SGML_TOKEN_SPACE
)
148 node
->data
.text
.only_space
= 1;
150 if (push_dom_node(stack
, node
) == DOM_CODE_OK
)
157 /* SGML parser main handling: */
/* Report an error token to the parser's error callback.
 *
 * Looks up the parser that owns @stack, asks it for the current line number
 * and forwards the parser, the token's string and that line number to
 * parser->error_func. The callback must be set (asserted).
 *
 * Returns whatever the error callback returns. */
160 call_sgml_error_function(struct dom_stack
*stack
, struct dom_scanner_token
*token
)
162 struct sgml_parser
*parser
= get_sgml_parser(stack
);
163 unsigned int line
= get_sgml_parser_line_number(parser
);
165 assert(parser
->error_func
);
167 return parser
->error_func(parser
, &token
->string
, line
);
170 /* Appends to or 'creates' an incomplete token. This can be used to
171 * force tokens back into the 'stream' if they require that later tokens
174 * NOTE: You can only do this for tokens that are not stripped of markup such
/* Extend or create an incomplete token that starts at @start.
 *
 * @scanner is the scanner being read, @start the token where the construct
 * began and @token the token reached so far (may be NULL).
 *
 * Two cases are handled:
 * - @token is an SGML_TOKEN_INCOMPLETE token: it is stretched backwards so
 *   that it begins at @start's string.
 * - @token is NULL while the scanner has both check_complete and incomplete
 *   set: the scanner is rewound to its first table entry, which is turned
 *   into an incomplete token covering @start up to the scanner's end. */
177 check_sgml_incomplete(struct dom_scanner
*scanner
,
178 struct dom_scanner_token
*start
,
179 struct dom_scanner_token
*token
)
181 if (token
&& token
->type
== SGML_TOKEN_INCOMPLETE
) {
/* Grow the length by the distance between the two string starts, then
 * move the start back to @start. */
182 token
->string
.length
+= token
->string
.string
- start
->string
.string
;
183 token
->string
.string
= start
->string
.string
;
186 } else if (!token
&& scanner
->check_complete
&& scanner
->incomplete
) {
/* Everything from @start to the end of the scanner's input. */
187 size_t left
= scanner
->end
- start
->string
.string
;
/* Reuse the first token table slot as the current token. */
191 token
= scanner
->current
= scanner
->table
;
193 token
->type
= SGML_TOKEN_INCOMPLETE
;
194 set_dom_string(&token
->string
, start
->string
.string
, left
);
/* Consume attribute tokens from @scanner and add them as attributes of the
 * node currently on top of @stack.
 *
 * Handling per token type, as far as it is visible here:
 * - SGML_TOKEN_TAG_END: the token is skipped.
 * - SGML_TOKEN_IDENT: an attribute name, optionally followed by '=' and a
 *   value token; the attribute is added with add_sgml_attribute().
 * - SGML_TOKEN_INCOMPLETE: parsing stops with DOM_CODE_INCOMPLETE.
 * - SGML_TOKEN_ERROR: the token is handed to call_sgml_error_function().
 *
 * Returns DOM_CODE_INCOMPLETE when the input ends in the middle of an
 * attribute and DOM_CODE_ALLOC_ERR when an attribute node cannot be added. */
201 static inline enum dom_code
202 parse_sgml_attributes(struct dom_stack
*stack
, struct dom_scanner
*scanner
)
/* Copy of the attribute name token, kept while the value is scanned. */
204 struct dom_scanner_token name
;
206 while (dom_scanner_has_tokens(scanner
)) {
207 struct dom_scanner_token
*token
= get_dom_scanner_token(scanner
);
211 switch (token
->type
) {
212 case SGML_TOKEN_TAG_END
:
213 skip_dom_scanner_token(scanner
);
215 case SGML_TOKEN_ELEMENT
:
216 case SGML_TOKEN_ELEMENT_BEGIN
:
217 case SGML_TOKEN_ELEMENT_END
:
218 case SGML_TOKEN_ELEMENT_EMPTY_END
:
221 case SGML_TOKEN_IDENT
:
/* Save the name token so it stays available after `token` advances. */
222 copy_struct(&name
, token
);
224 /* Skip the attribute name token */
225 token
= get_next_dom_scanner_token(scanner
);
227 if (token
&& token
->type
== '=') {
228 /* If the token is not a valid value token
230 token
= get_next_dom_scanner_token(scanner
);
231 if (check_sgml_incomplete(scanner
, &name
, token
))
232 return DOM_CODE_INCOMPLETE
;
235 && token
->type
!= SGML_TOKEN_IDENT
236 && token
->type
!= SGML_TOKEN_ATTRIBUTE
237 && token
->type
!= SGML_TOKEN_STRING
)
240 } else if (check_sgml_incomplete(scanner
, &name
, token
)) {
241 return DOM_CODE_INCOMPLETE
;
247 if (!add_sgml_attribute(stack
, &name
, token
))
248 return DOM_CODE_ALLOC_ERR
;
250 /* Skip the value token */
252 skip_dom_scanner_token(scanner
);
255 case SGML_TOKEN_INCOMPLETE
:
256 return DOM_CODE_INCOMPLETE
;
258 case SGML_TOKEN_ERROR
:
/* Let the parser's error callback look at the error token. */
262 code
= call_sgml_error_function(stack
, token
);
263 if (code
!= DOM_CODE_OK
)
266 skip_dom_scanner_token(scanner
);
270 skip_dom_scanner_token(scanner
);
/* Main token loop: consume tokens from @scanner and build DOM nodes on
 * @stack.
 *
 * Handling per token type, as far as it is visible here:
 * - element / element-begin tokens add an element; for element-begin tokens
 *   the attributes are parsed with parse_sgml_attributes().
 * - element-end tokens search the stack for a matching DOM_NODE_ELEMENT
 *   state, store the end token in its parser state and pop that state.
 * - comment and CDATA section tokens become nodes of the matching type.
 * - notation tokens are skipped.
 * - processing instruction tokens are followed by a data token; for the xml
 *   and xml-stylesheet targets non-empty data is scanned again as
 *   attributes.
 * - entity tokens become entity reference nodes; space and text tokens
 *   become text nodes.
 * - incomplete tokens end parsing with DOM_CODE_INCOMPLETE; error tokens are
 *   handed to call_sgml_error_function().
 *
 * Returns DOM_CODE_ALLOC_ERR when a node cannot be added. */
278 parse_sgml_plain(struct dom_stack
*stack
, struct dom_scanner
*scanner
)
/* Copy of a processing instruction's target token. */
280 struct dom_scanner_token target
;
282 while (dom_scanner_has_tokens(scanner
)) {
283 struct dom_scanner_token
*token
= get_dom_scanner_token(scanner
);
285 switch (token
->type
) {
286 case SGML_TOKEN_ELEMENT
:
287 case SGML_TOKEN_ELEMENT_BEGIN
:
288 if (!add_sgml_element(stack
, token
))
289 return DOM_CODE_ALLOC_ERR
;
291 if (token
->type
== SGML_TOKEN_ELEMENT_BEGIN
) {
294 skip_dom_scanner_token(scanner
);
296 code
= parse_sgml_attributes(stack
, scanner
);
297 if (code
!= DOM_CODE_OK
)
301 skip_dom_scanner_token(scanner
);
306 case SGML_TOKEN_ELEMENT_EMPTY_END
:
308 skip_dom_scanner_token(scanner
);
311 case SGML_TOKEN_ELEMENT_END
:
312 if (!token
->string
.length
) {
315 struct dom_string string
;
316 struct dom_stack_state
*state
;
318 set_dom_string(&string
, token
->string
.string
, token
->string
.length
);
319 state
= search_dom_stack(stack
, DOM_NODE_ELEMENT
,
322 struct sgml_parser_state
*pstate
;
/* Remember the end token in the element's parser state before the
 * state is popped. */
324 pstate
= get_sgml_parser_state(stack
, state
);
325 copy_struct(&pstate
->end_token
, token
);
327 pop_dom_state(stack
, state
);
330 skip_dom_scanner_token(scanner
);
333 case SGML_TOKEN_NOTATION_COMMENT
:
334 if (!add_sgml_node(stack
, DOM_NODE_COMMENT
, token
))
335 return DOM_CODE_ALLOC_ERR
;
336 skip_dom_scanner_token(scanner
);
339 case SGML_TOKEN_NOTATION_ATTLIST
:
340 case SGML_TOKEN_NOTATION_DOCTYPE
:
341 case SGML_TOKEN_NOTATION_ELEMENT
:
342 case SGML_TOKEN_NOTATION_ENTITY
:
343 case SGML_TOKEN_NOTATION
:
344 skip_dom_scanner_token(scanner
);
347 case SGML_TOKEN_CDATA_SECTION
:
348 if (!add_sgml_node(stack
, DOM_NODE_CDATA_SECTION
, token
))
349 return DOM_CODE_ALLOC_ERR
;
350 skip_dom_scanner_token(scanner
);
353 case SGML_TOKEN_PROCESS_XML_STYLESHEET
:
354 case SGML_TOKEN_PROCESS_XML
:
355 case SGML_TOKEN_PROCESS
:
/* Save the target token so it stays available after `token` advances. */
356 copy_struct(&target
, token
);
358 /* Skip the target token */
359 token
= get_next_dom_scanner_token(scanner
);
360 if (!token
|| token
->type
== SGML_TOKEN_INCOMPLETE
)
361 return DOM_CODE_INCOMPLETE
;
363 if (token
->type
== SGML_TOKEN_ERROR
)
366 assert(token
->type
== SGML_TOKEN_PROCESS_DATA
);
369 if (!add_sgml_proc_instruction(stack
, &target
, token
))
370 return DOM_CODE_ALLOC_ERR
;
371 if ((target
.type
== SGML_TOKEN_PROCESS_XML
372 || target
.type
== SGML_TOKEN_PROCESS_XML_STYLESHEET
)
373 && token
->string
.length
> 0) {
374 /* Parse the <?xml data="attributes"?>. */
375 struct dom_scanner attr_scanner
;
377 /* The attribute source is complete. */
378 init_dom_scanner(&attr_scanner
, &sgml_scanner_info
,
379 &token
->string
, SGML_STATE_ELEMENT
,
380 scanner
->count_lines
, 1, 0, 0);
382 if (dom_scanner_has_tokens(&attr_scanner
)) {
383 /* Ignore parser codes from this
384 * enhanced parsing of attributes. It
385 * is really just a simple way to try
386 * and support xml and xml-stylesheet
388 parse_sgml_attributes(stack
, &attr_scanner
);
393 skip_dom_scanner_token(scanner
);
396 case SGML_TOKEN_ENTITY
:
397 add_sgml_node(stack
, DOM_NODE_ENTITY_REFERENCE
, token
);
398 skip_dom_scanner_token(scanner
);
401 case SGML_TOKEN_INCOMPLETE
:
402 return DOM_CODE_INCOMPLETE
;
404 case SGML_TOKEN_ERROR
:
408 code
= call_sgml_error_function(stack
, token
);
409 if (code
!= DOM_CODE_OK
)
412 skip_dom_scanner_token(scanner
);
415 case SGML_TOKEN_SPACE
:
416 case SGML_TOKEN_TEXT
:
418 add_sgml_node(stack
, DOM_NODE_TEXT
, token
);
419 skip_dom_scanner_token(scanner
);
/* Feed the @bufsize bytes at @buf to @parser.
 *
 * The buffer is wrapped in a dom_string and carried by a temporary
 * DOM_NODE_TEXT node that is pushed on the parser's `parsing` stack; the
 * parsing context registered on that stack reacts to text nodes with
 * sgml_parsing_push(), which does the actual scanning.
 *
 * The document root is created with add_sgml_document() and its stack state
 * is marked immutable.
 *
 * Returns DOM_CODE_ALLOC_ERR if the root or the text node cannot be set up.
 *
 * NOTE(review): the conditions guarding the SGML_PARSER_COMPLETE flag and
 * the root creation are not visible here -- confirm against the full
 * definition. */
427 parse_sgml(struct sgml_parser
*parser
, unsigned char *buf
, size_t bufsize
,
430 struct dom_string source
= INIT_DOM_STRING(buf
, bufsize
);
431 struct dom_node
*node
;
434 parser
->flags
|= SGML_PARSER_COMPLETE
;
437 parser
->root
= add_sgml_document(parser
);
439 return DOM_CODE_ALLOC_ERR
;
/* Protect the document root's stack state. */
440 get_dom_stack_top(&parser
->stack
)->immutable
= 1;
/* Wrap the source in a text node; pushing it starts the parsing. */
443 node
= init_dom_node(DOM_NODE_TEXT
, &source
, 0);
444 if (!node
|| push_dom_node(&parser
->parsing
, node
) != DOM_CODE_OK
)
445 return DOM_CODE_ALLOC_ERR
;
451 /* Parsing state management: */
453 /* The SGML parser can handle nested calls to parse_sgml(). This can be used to
454 * handle output of external processing of data in the document tree. For
455 * example, this allows the output of document.write() from the DOM scripting
456 * interface to be parsed. */
458 /* This holds info about a chunk of text being parsed. One object exists per
 * state on the parser's `parsing` stack. */
459 struct sgml_parsing_state
{
/* Scanner that tokenizes this chunk; set up in sgml_parsing_push(). */
460 struct dom_scanner scanner
;
/* NOTE(review): presumably the text node carrying the chunk -- no
 * assignment is visible here, confirm. */
461 struct dom_node
*node
;
/* Private copy of the text of a trailing incomplete token, kept so that a
 * later chunk can be appended to it. */
462 struct dom_string incomplete
;
/* Checked by a nested parsing state to decide whether to continue from
 * this state's incomplete data and scanner state. */
464 unsigned int resume
:1;
/* Push callback of the parsing context for DOM_NODE_TEXT nodes: scans and
 * parses the text carried by @node.
 *
 * @stack is the parser's `parsing` stack, @node the text node holding the
 * source chunk and @data this state's struct sgml_parsing_state.
 *
 * A scanner is set up over the node's string or, when the previous parsing
 * state asks to be resumed and has incomplete data stored, over that stored
 * data. The scanner starts in text state unless the parent's scanner state
 * is taken over. The result of the parse is stored in parser->code.
 *
 * When parsing stops on an incomplete token, a private copy of that token's
 * text is stored in the state's `incomplete` string. */
468 sgml_parsing_push(struct dom_stack
*stack
, struct dom_node
*node
, void *data
)
470 struct sgml_parser
*parser
= get_sgml_parser(stack
);
471 struct sgml_parsing_state
*parsing
= data
;
/* Normalize the parser flags to 0/1 for init_dom_scanner(). */
472 int count_lines
= !!(parser
->flags
& SGML_PARSER_COUNT_LINES
);
473 int complete
= !!(parser
->flags
& SGML_PARSER_COMPLETE
);
474 int incremental
= !!(parser
->flags
& SGML_PARSER_INCREMENTAL
);
475 int detect_errors
= !!(parser
->flags
& SGML_PARSER_DETECT_ERRORS
);
476 struct dom_string
*string
= &node
->string
;
477 struct dom_scanner_token
*token
;
478 struct dom_string incomplete
;
479 enum sgml_scanner_state scanner_state
= SGML_STATE_TEXT
;
/* Remember how deep the main stack was when this chunk started;
 * sgml_parsing_pop() unwinds back to this depth. */
481 parsing
->depth
= parser
->stack
.depth
;
483 if (stack
->depth
> 1) {
/* Nested parsing: the previous state's object sits right before this
 * one. */
484 struct sgml_parsing_state
*parent
= &parsing
[-1];
486 if (parent
->resume
) {
487 if (is_dom_string_set(&parent
->incomplete
)) {
489 if (!add_to_dom_string(&parent
->incomplete
,
493 parser
->code
= DOM_CODE_ALLOC_ERR
;
/* Scan the parent's stored incomplete data instead of the node's
 * own string. */
497 string
= &parent
->incomplete
;
500 scanner_state
= parent
->scanner
.state
;
502 /* Pop down to the parent. */
509 init_dom_scanner(&parsing
->scanner
, &sgml_scanner_info
, string
,
510 scanner_state
, count_lines
, complete
, incremental
,
/* When starting inside an element, finish its attributes first and only
 * go on with plain parsing if that worked. */
513 if (scanner_state
== SGML_STATE_ELEMENT
) {
514 parser
->code
= parse_sgml_attributes(&parser
->stack
, &parsing
->scanner
);
515 if (parser
->code
== DOM_CODE_OK
)
516 parser
->code
= parse_sgml_plain(&parser
->stack
, &parsing
->scanner
);
518 parser
->code
= parse_sgml_plain(&parser
->stack
, &parsing
->scanner
);
522 pop_dom_node(&parser
->parsing
);
526 if (parser
->code
!= DOM_CODE_INCOMPLETE
) {
527 /* No need to preserve the default scanner state. */
528 if (parsing
->scanner
.state
== SGML_STATE_TEXT
) {
529 pop_dom_node(&parser
->parsing
);
533 done_dom_string(&parsing
->incomplete
);
/* Parsing stopped on an incomplete token: take a private copy of its text
 * before replacing whatever incomplete data was stored earlier. */
538 token
= get_dom_scanner_token(&parsing
->scanner
);
539 assert(token
&& token
->type
== SGML_TOKEN_INCOMPLETE
);
541 string
= &token
->string
;
543 set_dom_string(&incomplete
, NULL
, 0);
545 if (!init_dom_string(&incomplete
, string
->string
, string
->length
)) {
546 parser
->code
= DOM_CODE_ALLOC_ERR
;
550 done_dom_string(&parsing
->incomplete
);
551 set_dom_string(&parsing
->incomplete
, incomplete
.string
, incomplete
.length
);
/* Pop callback of the parsing context for DOM_NODE_TEXT nodes: cleans up
 * after a parsed chunk.
 *
 * @stack is the parser's `parsing` stack and @data this state's struct
 * sgml_parsing_state. If the parser has SGML_PARSER_COMPLETE set, the main
 * DOM stack is unwound to the depth recorded when the chunk was pushed. The
 * state's stored incomplete string is released. */
558 sgml_parsing_pop(struct dom_stack
*stack
, struct dom_node
*node
, void *data
)
560 struct sgml_parser
*parser
= get_sgml_parser(stack
);
561 struct sgml_parsing_state
*parsing
= data
;
563 /* Only clean up the stack if complete so that we get proper nesting. */
564 if (parser
->flags
& SGML_PARSER_COMPLETE
) {
565 /* Pop the stack back to the state it was in. This includes cleaning
566 * away even immutable states left on the stack. */
567 while (parsing
->depth
< parser
->stack
.depth
) {
/* Clear the immutable mark first so the state can be popped. */
568 get_dom_stack_top(&parser
->stack
)->immutable
= 0;
569 pop_dom_node(&parser
->stack
);
571 /* It's bigger than when calling done_sgml_parser() in the middle of an
572 * incomplete parsing. */
573 assert(parsing
->depth
>= parser
->stack
.depth
);
576 done_dom_string(&parsing
->incomplete
);
/* Context description for the parser's `parsing` stack (registered in
 * init_sgml_parser()). Each stack state carries a struct sgml_parsing_state.
 * Only DOM_NODE_TEXT has callbacks: sgml_parsing_push in the first table and
 * sgml_parsing_pop in the second -- presumably the push and pop callback
 * tables, indexed by node type. */
581 static struct dom_stack_context_info sgml_parsing_context_info
= {
582 /* Object size: */ sizeof(struct sgml_parsing_state
),
586 /* DOM_NODE_ELEMENT */ NULL
,
587 /* DOM_NODE_ATTRIBUTE */ NULL
,
588 /* DOM_NODE_TEXT */ sgml_parsing_push
,
589 /* DOM_NODE_CDATA_SECTION */ NULL
,
590 /* DOM_NODE_ENTITY_REFERENCE */ NULL
,
591 /* DOM_NODE_ENTITY */ NULL
,
592 /* DOM_NODE_PROC_INSTRUCTION */ NULL
,
593 /* DOM_NODE_COMMENT */ NULL
,
594 /* DOM_NODE_DOCUMENT */ NULL
,
595 /* DOM_NODE_DOCUMENT_TYPE */ NULL
,
596 /* DOM_NODE_DOCUMENT_FRAGMENT */ NULL
,
597 /* DOM_NODE_NOTATION */ NULL
,
602 /* DOM_NODE_ELEMENT */ NULL
,
603 /* DOM_NODE_ATTRIBUTE */ NULL
,
604 /* DOM_NODE_TEXT */ sgml_parsing_pop
,
605 /* DOM_NODE_CDATA_SECTION */ NULL
,
606 /* DOM_NODE_ENTITY_REFERENCE */ NULL
,
607 /* DOM_NODE_ENTITY */ NULL
,
608 /* DOM_NODE_PROC_INSTRUCTION */ NULL
,
609 /* DOM_NODE_COMMENT */ NULL
,
610 /* DOM_NODE_DOCUMENT */ NULL
,
611 /* DOM_NODE_DOCUMENT_TYPE */ NULL
,
612 /* DOM_NODE_DOCUMENT_FRAGMENT */ NULL
,
613 /* DOM_NODE_NOTATION */ NULL
,
/* Return the line number the parser is currently at.
 *
 * Requires SGML_PARSER_COUNT_LINES to be set on @parser (asserted). The
 * number comes from the scanner of the state on top of the `parsing` stack:
 * if the scanner's current token lies inside its token table and is an
 * SGML_TOKEN_ERROR token, that token's own line number is returned,
 * otherwise the scanner's line counter. */
618 get_sgml_parser_line_number(struct sgml_parser
*parser
)
620 struct dom_stack_state
*state
;
621 struct sgml_parsing_state
*pstate
;
623 assert(parser
->flags
& SGML_PARSER_COUNT_LINES
);
/* An empty parsing stack is handled separately. */
625 if (dom_stack_is_empty(&parser
->parsing
))
628 state
= get_dom_stack_top(&parser
->parsing
);
629 pstate
= get_dom_stack_state_data(parser
->parsing
.contexts
[0], state
);
631 assert(pstate
->scanner
.count_lines
&& pstate
->scanner
.lineno
);
/* Prefer the line recorded on an error token over the scanner's running
 * line counter. */
633 if (pstate
->scanner
.current
634 && pstate
->scanner
.current
< pstate
->scanner
.table
+ DOM_SCANNER_TOKENS
635 && pstate
->scanner
.current
->type
== SGML_TOKEN_ERROR
)
636 return pstate
->scanner
.current
->lineno
;
638 return pstate
->scanner
.lineno
;
642 /* Parser creation and destruction: */
644 /* FIXME: For now the main SGML parser context doesn't do much other than
645 * declaring the sgml_parser_state object. */
/* Context description for the parser's main DOM stack (registered in
 * init_sgml_parser()). Each stack state carries a struct sgml_parser_state;
 * no callbacks are set for any node type. */
646 static struct dom_stack_context_info sgml_parser_context_info
= {
647 /* Object size: */ sizeof(struct sgml_parser_state
),
651 /* DOM_NODE_ELEMENT */ NULL
,
652 /* DOM_NODE_ATTRIBUTE */ NULL
,
653 /* DOM_NODE_TEXT */ NULL
,
654 /* DOM_NODE_CDATA_SECTION */ NULL
,
655 /* DOM_NODE_ENTITY_REFERENCE */ NULL
,
656 /* DOM_NODE_ENTITY */ NULL
,
657 /* DOM_NODE_PROC_INSTRUCTION */ NULL
,
658 /* DOM_NODE_COMMENT */ NULL
,
659 /* DOM_NODE_DOCUMENT */ NULL
,
660 /* DOM_NODE_DOCUMENT_TYPE */ NULL
,
661 /* DOM_NODE_DOCUMENT_FRAGMENT */ NULL
,
662 /* DOM_NODE_NOTATION */ NULL
,
667 /* DOM_NODE_ELEMENT */ NULL
,
668 /* DOM_NODE_ATTRIBUTE */ NULL
,
669 /* DOM_NODE_TEXT */ NULL
,
670 /* DOM_NODE_CDATA_SECTION */ NULL
,
671 /* DOM_NODE_ENTITY_REFERENCE */ NULL
,
672 /* DOM_NODE_ENTITY */ NULL
,
673 /* DOM_NODE_PROC_INSTRUCTION */ NULL
,
674 /* DOM_NODE_COMMENT */ NULL
,
675 /* DOM_NODE_DOCUMENT */ NULL
,
676 /* DOM_NODE_DOCUMENT_TYPE */ NULL
,
677 /* DOM_NODE_DOCUMENT_FRAGMENT */ NULL
,
678 /* DOM_NODE_NOTATION */ NULL
,
/* Allocate and set up a new SGML parser.
 *
 * @type selects the parser kind; for SGML_PARSER_STREAM the main stack is
 * created with DOM_STACK_FLAG_FREE_NODES. @doctype selects the SGML info
 * stored in parser->info. @uri is copied into the parser. @flags are stored
 * on the parser; SGML_PARSER_DETECT_ERRORS implies SGML_PARSER_COUNT_LINES.
 *
 * Two stacks are initialized: the main DOM stack with the parser context and
 * the `parsing` stack with the parsing context. In both cases the parser
 * itself is registered as the context data.
 *
 * Returns NULL if the parser cannot be allocated. */
683 init_sgml_parser(enum sgml_parser_type type
, enum sgml_document_type doctype
,
684 struct dom_string
*uri
, enum sgml_parser_flag flags
)
686 struct sgml_parser
*parser
;
687 enum dom_stack_flag stack_flags
= 0;
/* Zero-initialized allocation. */
689 parser
= mem_calloc(1, sizeof(*parser
));
690 if (!parser
) return NULL
;
/* The parser keeps its own copy of the URI. */
692 if (!init_dom_string(&parser
->uri
, uri
->string
, uri
->length
)) {
/* Error detection needs line numbers. */
697 if (flags
& SGML_PARSER_DETECT_ERRORS
)
698 flags
|= SGML_PARSER_COUNT_LINES
;
701 parser
->flags
= flags
;
702 parser
->info
= get_sgml_info(doctype
);
704 if (type
== SGML_PARSER_STREAM
)
705 stack_flags
|= DOM_STACK_FLAG_FREE_NODES
;
707 init_dom_stack(&parser
->stack
, stack_flags
);
708 /* FIXME: Some sgml backend specific callbacks? Handle HTML script tags,
709 * and feed document.write() data back to the parser. */
710 add_dom_stack_context(&parser
->stack
, parser
, &sgml_parser_context_info
);
712 /* Don't keep the 'fake' text nodes that hold the parsing data. */
713 init_dom_stack(&parser
->parsing
, DOM_STACK_FLAG_FREE_NODES
);
714 add_dom_stack_context(&parser
->parsing
, parser
, &sgml_parsing_context_info
);
720 done_sgml_parser(struct sgml_parser
*parser
)
722 while (!dom_stack_is_empty(&parser
->parsing
))
723 pop_dom_node(&parser
->parsing
);
724 done_dom_stack(&parser
->parsing
);
725 done_dom_stack(&parser
->stack
);
726 done_dom_string(&parser
->uri
);