config: Access OPT_MUST_SAVE in the real option, not alias.
[elinks/elinks-j605.git] / src / dom / sgml / parser.c
bloba0abe226a54826a86de67a2aed9a4b97e2327d5f
1 /* SGML node handling */
3 #ifdef HAVE_CONFIG_H
4 #include "config.h"
5 #endif
7 #include <stdlib.h>
8 #include <string.h>
10 #include "elinks.h"
12 #include "dom/node.h"
13 #include "dom/sgml/parser.h"
14 #include "dom/sgml/scanner.h"
15 #include "dom/sgml/sgml.h"
16 #include "dom/stack.h"
17 #include "dom/string.h"
18 #include "util/error.h"
19 #include "util/memory.h"
22 /* This holds info about a chunk of text being parsed. The SGML parser uses
23 * these to keep track of possible nested calls to parse_sgml(). This can be
24 * used to feed output of stuff like ECMAScripts document.write() from
25 * <script>-elements back to the SGML parser. */
26 struct sgml_parsing_state {
27 struct dom_scanner scanner;
28 struct dom_node *node;
29 size_t depth;
32 static struct sgml_parsing_state *
33 init_sgml_parsing_state(struct sgml_parser *parser, struct dom_string *buffer);
/* When getting the sgml_parser struct it is _always_ assumed that the parser
 * is the first to add its context, which it is since it initializes the
 * stack. The parser's context is therefore always looked up in slot 0. */

/* Get the data pointer stored in the first context of @stack. */
#define get_sgml_parser(stack) ((stack)->contexts[0]->data)

/* Get the data that the first context of @stack keeps for @state.
 * The @stack argument is parenthesized so that any pointer expression
 * (e.g. `&parser->stack`) expands correctly. */
#define get_sgml_parser_state(stack, state) \
	get_dom_stack_state_data((stack)->contexts[0], state)
46 /* Functions for adding new nodes to the DOM tree: */
48 /* They wrap init_dom_node() and add_dom_*() and set up additional
49 * information like node subtypes and SGML parser state information. */
51 static inline struct dom_node *
52 add_sgml_document(struct dom_stack *stack, struct dom_string *string)
54 struct dom_node *node = init_dom_node(DOM_NODE_DOCUMENT, string);
56 return node ? push_dom_node(stack, node) : NULL;
59 static inline struct dom_node *
60 add_sgml_element(struct dom_stack *stack, struct dom_scanner_token *token)
62 struct sgml_parser *parser = get_sgml_parser(stack);
63 struct dom_node *parent = get_dom_stack_top(stack)->node;
64 struct dom_stack_state *state;
65 struct sgml_parser_state *pstate;
66 struct dom_node *node;
67 struct sgml_node_info *node_info;
69 node = add_dom_element(parent, &token->string);
70 if (!node) return NULL;
72 node_info = get_sgml_node_info(parser->info->elements, node);
73 node->data.element.type = node_info->type;
75 if (!push_dom_node(stack, node))
76 return NULL;
78 state = get_dom_stack_top(stack);
79 assert(node == state->node);
81 pstate = get_sgml_parser_state(stack, state);
82 pstate->info = node_info;
84 return node;
88 static inline void
89 add_sgml_attribute(struct dom_stack *stack,
90 struct dom_scanner_token *token, struct dom_scanner_token *valtoken)
92 struct sgml_parser *parser = get_sgml_parser(stack);
93 struct dom_node *parent = get_dom_stack_top(stack)->node;
94 struct dom_string *value = valtoken ? &valtoken->string : NULL;
95 struct sgml_node_info *info;
96 struct dom_node *node;
98 node = add_dom_attribute(parent, &token->string, value);
100 info = get_sgml_node_info(parser->info->attributes, node);
102 node->data.attribute.type = info->type;
103 node->data.attribute.id = !!(info->flags & SGML_ATTRIBUTE_IDENTIFIER);
104 node->data.attribute.reference = !!(info->flags & SGML_ATTRIBUTE_REFERENCE);
106 if (valtoken && valtoken->type == SGML_TOKEN_STRING)
107 node->data.attribute.quoted = 1;
109 if (!node || !push_dom_node(stack, node))
110 return;
112 pop_dom_node(stack);
115 static inline struct dom_node *
116 add_sgml_proc_instruction(struct dom_stack *stack, struct dom_scanner_token *target,
117 struct dom_scanner_token *data)
119 struct dom_node *parent = get_dom_stack_top(stack)->node;
120 struct dom_string *data_str = data ? &data->string : NULL;
121 struct dom_node *node;
123 node = add_dom_proc_instruction(parent, &target->string, data_str);
124 if (!node) return NULL;
126 switch (target->type) {
127 case SGML_TOKEN_PROCESS_XML:
128 node->data.proc_instruction.type = DOM_PROC_INSTRUCTION_XML;
129 break;
131 case SGML_TOKEN_PROCESS:
132 default:
133 node->data.proc_instruction.type = DOM_PROC_INSTRUCTION;
136 return push_dom_node(stack, node);
139 static inline void
140 add_sgml_node(struct dom_stack *stack, enum dom_node_type type, struct dom_scanner_token *token)
142 struct dom_node *parent = get_dom_stack_top(stack)->node;
143 struct dom_node *node = add_dom_node(parent, type, &token->string);
145 if (!node) return;
147 if (token->type == SGML_TOKEN_SPACE)
148 node->data.text.only_space = 1;
150 if (push_dom_node(stack, node))
151 pop_dom_node(stack);
155 /* SGML parser main handling: */
157 static inline void
158 parse_sgml_attributes(struct dom_stack *stack, struct dom_scanner *scanner)
160 struct dom_scanner_token name;
162 assert(dom_scanner_has_tokens(scanner)
163 && (get_dom_scanner_token(scanner)->type == SGML_TOKEN_ELEMENT_BEGIN
164 || (get_dom_stack_top(stack)->node->type == DOM_NODE_PROCESSING_INSTRUCTION)));
166 if (get_dom_scanner_token(scanner)->type == SGML_TOKEN_ELEMENT_BEGIN)
167 skip_dom_scanner_token(scanner);
169 while (dom_scanner_has_tokens(scanner)) {
170 struct dom_scanner_token *token = get_dom_scanner_token(scanner);
172 assert(token);
174 switch (token->type) {
175 case SGML_TOKEN_TAG_END:
176 skip_dom_scanner_token(scanner);
177 /* and return */
178 case SGML_TOKEN_ELEMENT:
179 case SGML_TOKEN_ELEMENT_BEGIN:
180 case SGML_TOKEN_ELEMENT_END:
181 case SGML_TOKEN_ELEMENT_EMPTY_END:
182 return;
184 case SGML_TOKEN_IDENT:
185 copy_struct(&name, token);
187 /* Skip the attribute name token */
188 token = get_next_dom_scanner_token(scanner);
189 if (token && token->type == '=') {
190 /* If the token is not a valid value token
191 * ignore it. */
192 token = get_next_dom_scanner_token(scanner);
193 if (token
194 && token->type != SGML_TOKEN_IDENT
195 && token->type != SGML_TOKEN_ATTRIBUTE
196 && token->type != SGML_TOKEN_STRING)
197 token = NULL;
198 } else {
199 token = NULL;
202 add_sgml_attribute(stack, &name, token);
204 /* Skip the value token */
205 if (token)
206 skip_dom_scanner_token(scanner);
207 break;
209 default:
210 skip_dom_scanner_token(scanner);
216 static void
217 parse_sgml_plain(struct dom_stack *stack, struct dom_scanner *scanner)
219 struct dom_scanner_token target;
221 while (dom_scanner_has_tokens(scanner)) {
222 struct dom_scanner_token *token = get_dom_scanner_token(scanner);
224 switch (token->type) {
225 case SGML_TOKEN_ELEMENT:
226 case SGML_TOKEN_ELEMENT_BEGIN:
227 if (!add_sgml_element(stack, token)) {
228 if (token->type == SGML_TOKEN_ELEMENT) {
229 skip_dom_scanner_token(scanner);
230 break;
233 skip_sgml_tokens(scanner, SGML_TOKEN_TAG_END);
234 break;
237 if (token->type == SGML_TOKEN_ELEMENT_BEGIN) {
238 parse_sgml_attributes(stack, scanner);
239 } else {
240 skip_dom_scanner_token(scanner);
243 break;
245 case SGML_TOKEN_ELEMENT_EMPTY_END:
246 pop_dom_node(stack);
247 skip_dom_scanner_token(scanner);
248 break;
250 case SGML_TOKEN_ELEMENT_END:
251 if (!token->string.length) {
252 pop_dom_node(stack);
253 } else {
254 struct dom_string string;
255 struct dom_stack_state *state;
257 set_dom_string(&string, token->string.string, token->string.length);
258 state = search_dom_stack(stack, DOM_NODE_ELEMENT,
259 &string);
260 if (state) {
261 struct sgml_parser_state *pstate;
263 pstate = get_sgml_parser_state(stack, state);
264 copy_struct(&pstate->end_token, token);
266 pop_dom_state(stack, state);
269 skip_dom_scanner_token(scanner);
270 break;
272 case SGML_TOKEN_NOTATION_COMMENT:
273 add_sgml_node(stack, DOM_NODE_COMMENT, token);
274 skip_dom_scanner_token(scanner);
275 break;
277 case SGML_TOKEN_NOTATION_ATTLIST:
278 case SGML_TOKEN_NOTATION_DOCTYPE:
279 case SGML_TOKEN_NOTATION_ELEMENT:
280 case SGML_TOKEN_NOTATION_ENTITY:
281 case SGML_TOKEN_NOTATION:
282 skip_dom_scanner_token(scanner);
283 break;
285 case SGML_TOKEN_CDATA_SECTION:
286 add_sgml_node(stack, DOM_NODE_CDATA_SECTION, token);
287 skip_dom_scanner_token(scanner);
288 break;
290 case SGML_TOKEN_PROCESS_XML_STYLESHEET:
291 case SGML_TOKEN_PROCESS_XML:
292 case SGML_TOKEN_PROCESS:
293 copy_struct(&target, token);
295 /* Skip the target token */
296 token = get_next_dom_scanner_token(scanner);
297 if (!token) break;
299 assert(token->type == SGML_TOKEN_PROCESS_DATA);
301 if (add_sgml_proc_instruction(stack, &target, token)
302 && (target.type == SGML_TOKEN_PROCESS_XML
303 || target.type == SGML_TOKEN_PROCESS_XML_STYLESHEET)
304 && token->string.length > 0) {
305 /* Parse the <?xml data="attributes"?>. */
306 struct dom_scanner attr_scanner;
308 init_dom_scanner(&attr_scanner, &sgml_scanner_info,
309 &token->string, SGML_STATE_ELEMENT,
310 scanner->count_lines);
312 if (dom_scanner_has_tokens(&attr_scanner))
313 parse_sgml_attributes(stack, &attr_scanner);
316 pop_dom_node(stack);
317 skip_dom_scanner_token(scanner);
318 break;
320 case SGML_TOKEN_ENTITY:
321 add_sgml_node(stack, DOM_NODE_ENTITY_REFERENCE, token);
322 skip_dom_scanner_token(scanner);
323 break;
325 case SGML_TOKEN_SPACE:
326 case SGML_TOKEN_TEXT:
327 default:
328 add_sgml_node(stack, DOM_NODE_TEXT, token);
329 skip_dom_scanner_token(scanner);
334 struct dom_node *
335 parse_sgml(struct sgml_parser *parser, struct dom_string *buffer)
337 struct sgml_parsing_state *parsing;
339 if (!parser->root) {
340 parser->root = add_sgml_document(&parser->stack, &parser->uri);
341 if (!parser->root)
342 return NULL;
343 get_dom_stack_top(&parser->stack)->immutable = 1;
346 parsing = init_sgml_parsing_state(parser, buffer);
347 if (!parsing) return NULL;
349 /* FIXME: Make parse_sgml_plain() return something (error code or if
350 * can be guarenteed a root node). */
351 parse_sgml_plain(&parser->stack, &parsing->scanner);
353 pop_dom_node(&parser->parsing);
355 return parser->root;
359 /* Parsing state management: */
361 /* The SGML parser can handle nested calls to parse_sgml(). This can be used to
362 * handle output of external processing of data in the document tree. For
363 * example, this allows output of document.write() from the DOM scripting
364 * interface to be parsed. */
366 static void
367 sgml_parsing_push(struct dom_stack *stack, struct dom_node *node, void *data)
369 struct sgml_parser *parser = get_sgml_parser(stack);
370 struct sgml_parsing_state *parsing = data;
372 parsing->depth = parser->stack.depth;
373 get_dom_stack_top(&parser->stack)->immutable = 1;
374 init_dom_scanner(&parsing->scanner, &sgml_scanner_info, &node->string,
375 SGML_STATE_TEXT, 0);
378 static void
379 sgml_parsing_pop(struct dom_stack *stack, struct dom_node *node, void *data)
381 struct sgml_parser *parser = get_sgml_parser(stack);
382 struct sgml_parsing_state *parsing = data;
384 /* Pop the stack back to the state it was in. This includes cleaning
385 * away even immutable states left on the stack. */
386 while (parsing->depth < parser->stack.depth) {
387 get_dom_stack_top(&parser->stack)->immutable = 0;
388 pop_dom_node(&parser->stack);
391 assert(parsing->depth == parser->stack.depth);
394 static struct dom_stack_context_info sgml_parsing_context_info = {
395 /* Object size: */ sizeof(struct sgml_parsing_state),
396 /* Push: */
398 /* */ NULL,
399 /* DOM_NODE_ELEMENT */ NULL,
400 /* DOM_NODE_ATTRIBUTE */ NULL,
401 /* DOM_NODE_TEXT */ sgml_parsing_push,
402 /* DOM_NODE_CDATA_SECTION */ NULL,
403 /* DOM_NODE_ENTITY_REFERENCE */ NULL,
404 /* DOM_NODE_ENTITY */ NULL,
405 /* DOM_NODE_PROC_INSTRUCTION */ NULL,
406 /* DOM_NODE_COMMENT */ NULL,
407 /* DOM_NODE_DOCUMENT */ NULL,
408 /* DOM_NODE_DOCUMENT_TYPE */ NULL,
409 /* DOM_NODE_DOCUMENT_FRAGMENT */ NULL,
410 /* DOM_NODE_NOTATION */ NULL,
412 /* Pop: */
414 /* */ NULL,
415 /* DOM_NODE_ELEMENT */ NULL,
416 /* DOM_NODE_ATTRIBUTE */ NULL,
417 /* DOM_NODE_TEXT */ sgml_parsing_pop,
418 /* DOM_NODE_CDATA_SECTION */ NULL,
419 /* DOM_NODE_ENTITY_REFERENCE */ NULL,
420 /* DOM_NODE_ENTITY */ NULL,
421 /* DOM_NODE_PROC_INSTRUCTION */ NULL,
422 /* DOM_NODE_COMMENT */ NULL,
423 /* DOM_NODE_DOCUMENT */ NULL,
424 /* DOM_NODE_DOCUMENT_TYPE */ NULL,
425 /* DOM_NODE_DOCUMENT_FRAGMENT */ NULL,
426 /* DOM_NODE_NOTATION */ NULL,
430 /* Create a new parsing state by pushing a new text node containing the*/
431 static struct sgml_parsing_state *
432 init_sgml_parsing_state(struct sgml_parser *parser, struct dom_string *buffer)
434 struct dom_stack_state *state;
435 struct dom_node *node;
437 node = init_dom_node(DOM_NODE_TEXT, buffer);
438 if (!node || !push_dom_node(&parser->parsing, node))
439 return NULL;
441 state = get_dom_stack_top(&parser->parsing);
443 return get_dom_stack_state_data(parser->parsing.contexts[0], state);
447 /* Parser creation and destruction: */
449 /* FIXME: For now the main SGML parser context doesn't do much other than
450 * declaring the sgml_parser_state object. */
451 static struct dom_stack_context_info sgml_parser_context_info = {
452 /* Object size: */ sizeof(struct sgml_parser_state),
453 /* Push: */
455 /* */ NULL,
456 /* DOM_NODE_ELEMENT */ NULL,
457 /* DOM_NODE_ATTRIBUTE */ NULL,
458 /* DOM_NODE_TEXT */ NULL,
459 /* DOM_NODE_CDATA_SECTION */ NULL,
460 /* DOM_NODE_ENTITY_REFERENCE */ NULL,
461 /* DOM_NODE_ENTITY */ NULL,
462 /* DOM_NODE_PROC_INSTRUCTION */ NULL,
463 /* DOM_NODE_COMMENT */ NULL,
464 /* DOM_NODE_DOCUMENT */ NULL,
465 /* DOM_NODE_DOCUMENT_TYPE */ NULL,
466 /* DOM_NODE_DOCUMENT_FRAGMENT */ NULL,
467 /* DOM_NODE_NOTATION */ NULL,
469 /* Pop: */
471 /* */ NULL,
472 /* DOM_NODE_ELEMENT */ NULL,
473 /* DOM_NODE_ATTRIBUTE */ NULL,
474 /* DOM_NODE_TEXT */ NULL,
475 /* DOM_NODE_CDATA_SECTION */ NULL,
476 /* DOM_NODE_ENTITY_REFERENCE */ NULL,
477 /* DOM_NODE_ENTITY */ NULL,
478 /* DOM_NODE_PROC_INSTRUCTION */ NULL,
479 /* DOM_NODE_COMMENT */ NULL,
480 /* DOM_NODE_DOCUMENT */ NULL,
481 /* DOM_NODE_DOCUMENT_TYPE */ NULL,
482 /* DOM_NODE_DOCUMENT_FRAGMENT */ NULL,
483 /* DOM_NODE_NOTATION */ NULL,
487 struct sgml_parser *
488 init_sgml_parser(enum sgml_parser_type type, enum sgml_document_type doctype,
489 struct dom_string *uri)
491 struct sgml_parser *parser;
492 enum dom_stack_flag flags = 0;
494 parser = mem_calloc(1, sizeof(*parser));
495 if (!parser) return NULL;
497 if (!init_dom_string(&parser->uri, uri->string, uri->length)) {
498 mem_free(parser);
499 return NULL;
502 parser->type = type;
503 parser->info = get_sgml_info(doctype);
505 if (type == SGML_PARSER_TREE)
506 flags |= DOM_STACK_KEEP_NODES;
508 init_dom_stack(&parser->stack, flags);
509 /* FIXME: Some sgml backend specific callbacks? Handle HTML script tags,
510 * and feed document.write() data back to the parser. */
511 add_dom_stack_context(&parser->stack, parser, &sgml_parser_context_info);
513 /* Don't keep the 'fake' text nodes that holds the parsing data. */
514 init_dom_stack(&parser->parsing, 0);
515 add_dom_stack_context(&parser->parsing, parser, &sgml_parsing_context_info);
517 return parser;
520 void
521 done_sgml_parser(struct sgml_parser *parser)
523 done_dom_stack(&parser->stack);
524 done_dom_stack(&parser->parsing);
525 done_dom_string(&parser->uri);
526 mem_free(parser);