1 /* SGML node handling */
#include <stddef.h>

#include "dom/sgml/parser.h"
#include "dom/sgml/scanner.h"
#include "dom/sgml/sgml.h"
#include "dom/stack.h"
#include "dom/string.h"
#include "util/error.h"
#include "util/memory.h"
22 /* This holds info about a chunk of text being parsed. The SGML parser uses
23 * these to keep track of possible nested calls to parse_sgml(). This can be
24 * used to feed output of stuff like ECMAScripts document.write() from
25 * <script>-elements back to the SGML parser. */
26 struct sgml_parsing_state
{
27 struct dom_scanner scanner
;
28 struct dom_node
*node
;
/* Forward declaration; parse_sgml() calls this before it is defined. */
static struct sgml_parsing_state *
init_sgml_parsing_state(struct sgml_parser *parser, struct dom_string *buffer);
/* When getting the sgml_parser struct it is _always_ assumed that the parser
 * is the first to add its context, which it is since it initializes the
 * stack. */
#define get_sgml_parser(stack) ((stack)->contexts[0]->data)

/* Get the per-state data of the first (the parser's) context for @state.
 * The stack argument is parenthesized so that any pointer expression, such
 * as &parser->stack, can be passed safely. */
#define get_sgml_parser_state(stack, state) \
	get_dom_stack_state_data((stack)->contexts[0], state)
/* Functions for adding new nodes to the DOM tree: */

/* They wrap init_dom_node() and add_dom_*() and set up additional
 * information such as node subtypes and SGML parser state information. */
51 static inline struct dom_node
*
52 add_sgml_document(struct dom_stack
*stack
, struct dom_string
*string
)
54 struct dom_node
*node
= init_dom_node(DOM_NODE_DOCUMENT
, string
);
56 return node
? push_dom_node(stack
, node
) : NULL
;
59 static inline struct dom_node
*
60 add_sgml_element(struct dom_stack
*stack
, struct dom_scanner_token
*token
)
62 struct sgml_parser
*parser
= get_sgml_parser(stack
);
63 struct dom_node
*parent
= get_dom_stack_top(stack
)->node
;
64 struct dom_stack_state
*state
;
65 struct sgml_parser_state
*pstate
;
66 struct dom_node
*node
;
67 struct sgml_node_info
*node_info
;
69 node
= add_dom_element(parent
, &token
->string
);
70 if (!node
) return NULL
;
72 node_info
= get_sgml_node_info(parser
->info
->elements
, node
);
73 node
->data
.element
.type
= node_info
->type
;
75 if (!push_dom_node(stack
, node
))
78 state
= get_dom_stack_top(stack
);
79 assert(node
== state
->node
);
81 pstate
= get_sgml_parser_state(stack
, state
);
82 pstate
->info
= node_info
;
89 add_sgml_attribute(struct dom_stack
*stack
,
90 struct dom_scanner_token
*token
, struct dom_scanner_token
*valtoken
)
92 struct sgml_parser
*parser
= get_sgml_parser(stack
);
93 struct dom_node
*parent
= get_dom_stack_top(stack
)->node
;
94 struct dom_string
*value
= valtoken
? &valtoken
->string
: NULL
;
95 struct sgml_node_info
*info
;
96 struct dom_node
*node
;
98 node
= add_dom_attribute(parent
, &token
->string
, value
);
100 info
= get_sgml_node_info(parser
->info
->attributes
, node
);
102 node
->data
.attribute
.type
= info
->type
;
103 node
->data
.attribute
.id
= !!(info
->flags
& SGML_ATTRIBUTE_IDENTIFIER
);
104 node
->data
.attribute
.reference
= !!(info
->flags
& SGML_ATTRIBUTE_REFERENCE
);
106 if (valtoken
&& valtoken
->type
== SGML_TOKEN_STRING
)
107 node
->data
.attribute
.quoted
= 1;
109 if (!node
|| !push_dom_node(stack
, node
))
115 static inline struct dom_node
*
116 add_sgml_proc_instruction(struct dom_stack
*stack
, struct dom_scanner_token
*target
,
117 struct dom_scanner_token
*data
)
119 struct dom_node
*parent
= get_dom_stack_top(stack
)->node
;
120 struct dom_string
*data_str
= data
? &data
->string
: NULL
;
121 struct dom_node
*node
;
123 node
= add_dom_proc_instruction(parent
, &target
->string
, data_str
);
124 if (!node
) return NULL
;
126 switch (target
->type
) {
127 case SGML_TOKEN_PROCESS_XML
:
128 node
->data
.proc_instruction
.type
= DOM_PROC_INSTRUCTION_XML
;
131 case SGML_TOKEN_PROCESS
:
133 node
->data
.proc_instruction
.type
= DOM_PROC_INSTRUCTION
;
136 return push_dom_node(stack
, node
);
140 add_sgml_node(struct dom_stack
*stack
, enum dom_node_type type
, struct dom_scanner_token
*token
)
142 struct dom_node
*parent
= get_dom_stack_top(stack
)->node
;
143 struct dom_node
*node
= add_dom_node(parent
, type
, &token
->string
);
147 if (token
->type
== SGML_TOKEN_SPACE
)
148 node
->data
.text
.only_space
= 1;
150 if (push_dom_node(stack
, node
))
155 /* SGML parser main handling: */
158 parse_sgml_attributes(struct dom_stack
*stack
, struct dom_scanner
*scanner
)
160 struct dom_scanner_token name
;
162 assert(dom_scanner_has_tokens(scanner
)
163 && (get_dom_scanner_token(scanner
)->type
== SGML_TOKEN_ELEMENT_BEGIN
164 || (get_dom_stack_top(stack
)->node
->type
== DOM_NODE_PROCESSING_INSTRUCTION
)));
166 if (get_dom_scanner_token(scanner
)->type
== SGML_TOKEN_ELEMENT_BEGIN
)
167 skip_dom_scanner_token(scanner
);
169 while (dom_scanner_has_tokens(scanner
)) {
170 struct dom_scanner_token
*token
= get_dom_scanner_token(scanner
);
174 switch (token
->type
) {
175 case SGML_TOKEN_TAG_END
:
176 skip_dom_scanner_token(scanner
);
178 case SGML_TOKEN_ELEMENT
:
179 case SGML_TOKEN_ELEMENT_BEGIN
:
180 case SGML_TOKEN_ELEMENT_END
:
181 case SGML_TOKEN_ELEMENT_EMPTY_END
:
184 case SGML_TOKEN_IDENT
:
185 copy_struct(&name
, token
);
187 /* Skip the attribute name token */
188 token
= get_next_dom_scanner_token(scanner
);
189 if (token
&& token
->type
== '=') {
190 /* If the token is not a valid value token
192 token
= get_next_dom_scanner_token(scanner
);
194 && token
->type
!= SGML_TOKEN_IDENT
195 && token
->type
!= SGML_TOKEN_ATTRIBUTE
196 && token
->type
!= SGML_TOKEN_STRING
)
202 add_sgml_attribute(stack
, &name
, token
);
204 /* Skip the value token */
206 skip_dom_scanner_token(scanner
);
210 skip_dom_scanner_token(scanner
);
217 parse_sgml_plain(struct dom_stack
*stack
, struct dom_scanner
*scanner
)
219 struct dom_scanner_token target
;
221 while (dom_scanner_has_tokens(scanner
)) {
222 struct dom_scanner_token
*token
= get_dom_scanner_token(scanner
);
224 switch (token
->type
) {
225 case SGML_TOKEN_ELEMENT
:
226 case SGML_TOKEN_ELEMENT_BEGIN
:
227 if (!add_sgml_element(stack
, token
)) {
228 if (token
->type
== SGML_TOKEN_ELEMENT
) {
229 skip_dom_scanner_token(scanner
);
233 skip_sgml_tokens(scanner
, SGML_TOKEN_TAG_END
);
237 if (token
->type
== SGML_TOKEN_ELEMENT_BEGIN
) {
238 parse_sgml_attributes(stack
, scanner
);
240 skip_dom_scanner_token(scanner
);
245 case SGML_TOKEN_ELEMENT_EMPTY_END
:
247 skip_dom_scanner_token(scanner
);
250 case SGML_TOKEN_ELEMENT_END
:
251 if (!token
->string
.length
) {
254 struct dom_string string
;
255 struct dom_stack_state
*state
;
257 set_dom_string(&string
, token
->string
.string
, token
->string
.length
);
258 state
= search_dom_stack(stack
, DOM_NODE_ELEMENT
,
261 struct sgml_parser_state
*pstate
;
263 pstate
= get_sgml_parser_state(stack
, state
);
264 copy_struct(&pstate
->end_token
, token
);
266 pop_dom_state(stack
, state
);
269 skip_dom_scanner_token(scanner
);
272 case SGML_TOKEN_NOTATION_COMMENT
:
273 add_sgml_node(stack
, DOM_NODE_COMMENT
, token
);
274 skip_dom_scanner_token(scanner
);
277 case SGML_TOKEN_NOTATION_ATTLIST
:
278 case SGML_TOKEN_NOTATION_DOCTYPE
:
279 case SGML_TOKEN_NOTATION_ELEMENT
:
280 case SGML_TOKEN_NOTATION_ENTITY
:
281 case SGML_TOKEN_NOTATION
:
282 skip_dom_scanner_token(scanner
);
285 case SGML_TOKEN_CDATA_SECTION
:
286 add_sgml_node(stack
, DOM_NODE_CDATA_SECTION
, token
);
287 skip_dom_scanner_token(scanner
);
290 case SGML_TOKEN_PROCESS_XML_STYLESHEET
:
291 case SGML_TOKEN_PROCESS_XML
:
292 case SGML_TOKEN_PROCESS
:
293 copy_struct(&target
, token
);
295 /* Skip the target token */
296 token
= get_next_dom_scanner_token(scanner
);
299 assert(token
->type
== SGML_TOKEN_PROCESS_DATA
);
301 if (add_sgml_proc_instruction(stack
, &target
, token
)
302 && (target
.type
== SGML_TOKEN_PROCESS_XML
303 || target
.type
== SGML_TOKEN_PROCESS_XML_STYLESHEET
)
304 && token
->string
.length
> 0) {
305 /* Parse the <?xml data="attributes"?>. */
306 struct dom_scanner attr_scanner
;
308 init_dom_scanner(&attr_scanner
, &sgml_scanner_info
,
309 &token
->string
, SGML_STATE_ELEMENT
,
310 scanner
->count_lines
);
312 if (dom_scanner_has_tokens(&attr_scanner
))
313 parse_sgml_attributes(stack
, &attr_scanner
);
317 skip_dom_scanner_token(scanner
);
320 case SGML_TOKEN_ENTITY
:
321 add_sgml_node(stack
, DOM_NODE_ENTITY_REFERENCE
, token
);
322 skip_dom_scanner_token(scanner
);
325 case SGML_TOKEN_SPACE
:
326 case SGML_TOKEN_TEXT
:
328 add_sgml_node(stack
, DOM_NODE_TEXT
, token
);
329 skip_dom_scanner_token(scanner
);
335 parse_sgml(struct sgml_parser
*parser
, struct dom_string
*buffer
)
337 struct sgml_parsing_state
*parsing
;
340 parser
->root
= add_sgml_document(&parser
->stack
, &parser
->uri
);
343 get_dom_stack_top(&parser
->stack
)->immutable
= 1;
346 parsing
= init_sgml_parsing_state(parser
, buffer
);
347 if (!parsing
) return NULL
;
349 /* FIXME: Make parse_sgml_plain() return something (error code or if
350 * can be guarenteed a root node). */
351 parse_sgml_plain(&parser
->stack
, &parsing
->scanner
);
353 pop_dom_node(&parser
->parsing
);
359 /* Parsing state management: */
/* The SGML parser can handle nested calls to parse_sgml(). This can be used to
 * handle output of external processing of data in the document tree. For
 * example, this allows output of document.write() from the DOM scripting
 * interface to be parsed. */
367 sgml_parsing_push(struct dom_stack
*stack
, struct dom_node
*node
, void *data
)
369 struct sgml_parser
*parser
= get_sgml_parser(stack
);
370 struct sgml_parsing_state
*parsing
= data
;
372 parsing
->depth
= parser
->stack
.depth
;
373 get_dom_stack_top(&parser
->stack
)->immutable
= 1;
374 init_dom_scanner(&parsing
->scanner
, &sgml_scanner_info
, &node
->string
,
379 sgml_parsing_pop(struct dom_stack
*stack
, struct dom_node
*node
, void *data
)
381 struct sgml_parser
*parser
= get_sgml_parser(stack
);
382 struct sgml_parsing_state
*parsing
= data
;
384 /* Pop the stack back to the state it was in. This includes cleaning
385 * away even immutable states left on the stack. */
386 while (parsing
->depth
< parser
->stack
.depth
) {
387 get_dom_stack_top(&parser
->stack
)->immutable
= 0;
388 pop_dom_node(&parser
->stack
);
391 assert(parsing
->depth
== parser
->stack
.depth
);
394 static struct dom_stack_context_info sgml_parsing_context_info
= {
395 /* Object size: */ sizeof(struct sgml_parsing_state
),
399 /* DOM_NODE_ELEMENT */ NULL
,
400 /* DOM_NODE_ATTRIBUTE */ NULL
,
401 /* DOM_NODE_TEXT */ sgml_parsing_push
,
402 /* DOM_NODE_CDATA_SECTION */ NULL
,
403 /* DOM_NODE_ENTITY_REFERENCE */ NULL
,
404 /* DOM_NODE_ENTITY */ NULL
,
405 /* DOM_NODE_PROC_INSTRUCTION */ NULL
,
406 /* DOM_NODE_COMMENT */ NULL
,
407 /* DOM_NODE_DOCUMENT */ NULL
,
408 /* DOM_NODE_DOCUMENT_TYPE */ NULL
,
409 /* DOM_NODE_DOCUMENT_FRAGMENT */ NULL
,
410 /* DOM_NODE_NOTATION */ NULL
,
415 /* DOM_NODE_ELEMENT */ NULL
,
416 /* DOM_NODE_ATTRIBUTE */ NULL
,
417 /* DOM_NODE_TEXT */ sgml_parsing_pop
,
418 /* DOM_NODE_CDATA_SECTION */ NULL
,
419 /* DOM_NODE_ENTITY_REFERENCE */ NULL
,
420 /* DOM_NODE_ENTITY */ NULL
,
421 /* DOM_NODE_PROC_INSTRUCTION */ NULL
,
422 /* DOM_NODE_COMMENT */ NULL
,
423 /* DOM_NODE_DOCUMENT */ NULL
,
424 /* DOM_NODE_DOCUMENT_TYPE */ NULL
,
425 /* DOM_NODE_DOCUMENT_FRAGMENT */ NULL
,
426 /* DOM_NODE_NOTATION */ NULL
,
430 /* Create a new parsing state by pushing a new text node containing the*/
431 static struct sgml_parsing_state
*
432 init_sgml_parsing_state(struct sgml_parser
*parser
, struct dom_string
*buffer
)
434 struct dom_stack_state
*state
;
435 struct dom_node
*node
;
437 node
= init_dom_node(DOM_NODE_TEXT
, buffer
);
438 if (!node
|| !push_dom_node(&parser
->parsing
, node
))
441 state
= get_dom_stack_top(&parser
->parsing
);
443 return get_dom_stack_state_data(parser
->parsing
.contexts
[0], state
);
447 /* Parser creation and destruction: */
449 /* FIXME: For now the main SGML parser context doesn't do much other than
450 * declaring the sgml_parser_state object. */
451 static struct dom_stack_context_info sgml_parser_context_info
= {
452 /* Object size: */ sizeof(struct sgml_parser_state
),
456 /* DOM_NODE_ELEMENT */ NULL
,
457 /* DOM_NODE_ATTRIBUTE */ NULL
,
458 /* DOM_NODE_TEXT */ NULL
,
459 /* DOM_NODE_CDATA_SECTION */ NULL
,
460 /* DOM_NODE_ENTITY_REFERENCE */ NULL
,
461 /* DOM_NODE_ENTITY */ NULL
,
462 /* DOM_NODE_PROC_INSTRUCTION */ NULL
,
463 /* DOM_NODE_COMMENT */ NULL
,
464 /* DOM_NODE_DOCUMENT */ NULL
,
465 /* DOM_NODE_DOCUMENT_TYPE */ NULL
,
466 /* DOM_NODE_DOCUMENT_FRAGMENT */ NULL
,
467 /* DOM_NODE_NOTATION */ NULL
,
472 /* DOM_NODE_ELEMENT */ NULL
,
473 /* DOM_NODE_ATTRIBUTE */ NULL
,
474 /* DOM_NODE_TEXT */ NULL
,
475 /* DOM_NODE_CDATA_SECTION */ NULL
,
476 /* DOM_NODE_ENTITY_REFERENCE */ NULL
,
477 /* DOM_NODE_ENTITY */ NULL
,
478 /* DOM_NODE_PROC_INSTRUCTION */ NULL
,
479 /* DOM_NODE_COMMENT */ NULL
,
480 /* DOM_NODE_DOCUMENT */ NULL
,
481 /* DOM_NODE_DOCUMENT_TYPE */ NULL
,
482 /* DOM_NODE_DOCUMENT_FRAGMENT */ NULL
,
483 /* DOM_NODE_NOTATION */ NULL
,
488 init_sgml_parser(enum sgml_parser_type type
, enum sgml_document_type doctype
,
489 struct dom_string
*uri
)
491 struct sgml_parser
*parser
;
492 enum dom_stack_flag flags
= 0;
494 parser
= mem_calloc(1, sizeof(*parser
));
495 if (!parser
) return NULL
;
497 if (!init_dom_string(&parser
->uri
, uri
->string
, uri
->length
)) {
503 parser
->info
= get_sgml_info(doctype
);
505 if (type
== SGML_PARSER_TREE
)
506 flags
|= DOM_STACK_KEEP_NODES
;
508 init_dom_stack(&parser
->stack
, flags
);
509 /* FIXME: Some sgml backend specific callbacks? Handle HTML script tags,
510 * and feed document.write() data back to the parser. */
511 add_dom_stack_context(&parser
->stack
, parser
, &sgml_parser_context_info
);
513 /* Don't keep the 'fake' text nodes that holds the parsing data. */
514 init_dom_stack(&parser
->parsing
, 0);
515 add_dom_stack_context(&parser
->parsing
, parser
, &sgml_parsing_context_info
);
521 done_sgml_parser(struct sgml_parser
*parser
)
523 done_dom_stack(&parser
->stack
);
524 done_dom_stack(&parser
->parsing
);
525 done_dom_string(&parser
->uri
);