iconv: Bail out of the loop when an illegal sequence of bytes occurs.
[elinks/elinks-j605.git] / src / document / css / parser.c
blobee2bfe721b22a92970618ca22ca662df44fa8cab
1 /** CSS main parser
2 * @file */
4 #ifdef HAVE_CONFIG_H
5 #include "config.h"
6 #endif
8 #include <stdlib.h>
9 #include <string.h>
11 #include "elinks.h"
13 #include "config/options.h"
14 #include "document/css/css.h"
15 #include "document/css/parser.h"
16 #include "document/css/property.h"
17 #include "document/css/scanner.h"
18 #include "document/css/stylesheet.h"
19 #include "document/css/value.h"
20 #include "util/color.h"
21 #include "util/lists.h"
22 #include "util/error.h"
23 #include "util/memory.h"
24 #include "util/string.h"
26 static void css_parse_ruleset(struct css_stylesheet *css,
27 struct scanner *scanner);
30 void
31 css_parse_properties(LIST_OF(struct css_property) *props,
32 struct scanner *scanner)
34 assert(props && scanner);
36 while (scanner_has_tokens(scanner)) {
37 struct css_property_info *property_info = NULL;
38 struct css_property *prop;
39 struct scanner_token *token = get_scanner_token(scanner);
40 int i;
42 if (!token || token->type == '}') break;
44 /* Extract property name. */
46 if (token->type != CSS_TOKEN_IDENT
47 || !check_next_scanner_token(scanner, ':')) {
48 /* Some use style="{ properties }" so we have to be
49 * check what to skip to. */
50 if (token->type == '{') {
51 skip_scanner_token(scanner);
52 } else {
53 skip_css_tokens(scanner, ';');
55 continue;
58 for (i = 0; css_property_info[i].name; i++) {
59 struct css_property_info *info = &css_property_info[i];
61 if (scanner_token_strlcasecmp(token, info->name, -1)) {
62 property_info = info;
63 break;
67 /* Skip property name and separator and check for expression */
68 if (!skip_css_tokens(scanner, ':')) {
69 assert(!scanner_has_tokens(scanner));
70 break;
73 if (!property_info) {
74 /* Unknown property, check the next one. */
75 goto ride_on;
78 /* We might be on track of something, cook up the struct. */
80 prop = mem_calloc(1, sizeof(*prop));
81 if (!prop) {
82 goto ride_on;
84 prop->type = property_info->type;
85 prop->value_type = property_info->value_type;
86 if (!css_parse_value(property_info, &prop->value, scanner)) {
87 mem_free(prop);
88 goto ride_on;
90 add_to_list(*props, prop);
92 /* Maybe we have something else to go yet? */
94 ride_on:
95 skip_css_tokens(scanner, ';');
99 static void
100 skip_css_block(struct scanner *scanner)
102 if (skip_css_tokens(scanner, '{')) {
103 const int preclimit = get_css_precedence('}');
104 int depth = 1;
105 struct scanner_token *token = get_scanner_token(scanner);
107 while (token && token->precedence <= preclimit && depth > 0) {
108 if (token->type == '{')
109 ++depth;
110 else if (token->type == '}')
111 --depth;
112 token = get_next_scanner_token(scanner);
117 /* Parse a list of media types.
119 * Media types grammar:
121 * @verbatim
122 * media_types:
123 * <empty>
124 * | <ident>
125 * | media_types ',' <ident>
126 * @endverbatim
128 * This does not entirely match appendix D of CSS2: ELinks allows any
129 * list of media types to be empty, whereas CSS2 allows that only in
130 * @@import and not in @@media.
132 * @return nonzero if the directive containing this list should take
133 * effect, zero if not.
135 static int
136 css_parse_media_types(struct scanner *scanner)
138 int matched = 0;
139 int empty = 1;
140 const unsigned char *const optstr = get_opt_str("document.css.media", NULL);
141 struct scanner_token *token = get_scanner_token(scanner);
143 while (token && token->type == CSS_TOKEN_IDENT) {
144 empty = 0;
145 if (!matched) /* Skip string ops if already matched. */
146 matched = supports_css_media_type(
147 optstr, token->string, token->length);
149 token = get_next_scanner_token(scanner);
150 if (!token || token->type != ',')
151 break;
153 token = get_next_scanner_token(scanner);
156 return matched || empty;
159 /** Parse an atrule from @a scanner and update @a css accordingly.
161 * Atrules grammar:
163 * @verbatim
164 * atrule:
165 * '@charset' <string> ';'
166 * | '@import' <string> media_types ';'
167 * | '@import' <uri> media_types ';'
168 * | '@media' media_types '{' ruleset* '}'
169 * | '@page' <ident>? [':' <ident>]? '{' properties '}'
170 * | '@font-face' '{' properties '}'
171 * @endverbatim
173 static void
174 css_parse_atrule(struct css_stylesheet *css, struct scanner *scanner,
175 struct uri *base_uri)
177 struct scanner_token *token = get_scanner_token(scanner);
178 struct string import_uri;
180 /* Skip skip skip that code */
181 switch (token->type) {
182 case CSS_TOKEN_AT_IMPORT:
183 token = get_next_scanner_token(scanner);
184 if (!token) break;
185 if (token->type != CSS_TOKEN_STRING
186 && token->type != CSS_TOKEN_URL)
187 goto skip_rest_of_atrule;
189 /* As of 2007-07, token->string points into the
190 * original CSS text, so the pointer will remain
191 * valid even if we parse more tokens. But this
192 * may have to change when backslash escapes are
193 * properly supported. So play it safe and make
194 * a copy of the string. */
195 if (!init_string(&import_uri))
196 goto skip_rest_of_atrule;
197 if (!add_bytes_to_string(&import_uri,
198 token->string,
199 token->length)) {
200 done_string(&import_uri);
201 goto skip_rest_of_atrule;
204 skip_scanner_token(scanner);
205 if (!css_parse_media_types(scanner)) {
206 done_string(&import_uri);
207 goto skip_rest_of_atrule;
210 token = get_scanner_token(scanner);
211 if (!token || token->type != ';') {
212 done_string(&import_uri);
213 goto skip_rest_of_atrule;
215 skip_scanner_token(scanner);
217 assert(css->import);
218 css->import(css, base_uri,
219 import_uri.source, import_uri.length);
220 done_string(&import_uri);
221 break;
223 case CSS_TOKEN_AT_CHARSET:
224 skip_css_tokens(scanner, ';');
225 break;
227 case CSS_TOKEN_AT_MEDIA:
228 skip_scanner_token(scanner);
229 if (!css_parse_media_types(scanner))
230 goto skip_rest_of_atrule;
231 token = get_scanner_token(scanner);
232 if (!token || token->type != '{')
233 goto skip_rest_of_atrule;
234 token = get_next_scanner_token(scanner);
235 while (token && token->type != '}') {
236 css_parse_ruleset(css, scanner);
237 token = get_scanner_token(scanner);
239 if (token)
240 skip_scanner_token(scanner);
241 break;
243 case CSS_TOKEN_AT_FONT_FACE:
244 case CSS_TOKEN_AT_PAGE:
245 skip_css_block(scanner);
246 break;
248 skip_rest_of_atrule:
249 case CSS_TOKEN_AT_KEYWORD:
250 /* TODO: Unkown @-rule so either skip til ';' or next block. */
251 token = get_scanner_token(scanner);
252 while (token) {
253 if (token->type == ';') {
254 skip_scanner_token(scanner);
255 break;
257 } else if (token->type == '{') {
258 skip_css_block(scanner);
259 break;
262 token = get_next_scanner_token(scanner);
264 break;
265 default:
266 INTERNAL("@-rule parser called without atrule.");
271 struct selector_pkg {
272 LIST_HEAD(struct selector_pkg);
273 struct css_selector *selector;
276 /** Move a CSS selector and its leaves into a new set. If a similar
277 * selector already exists in the set, merge them.
279 * @param sels
280 * The set to which @a selector should be moved. Must not be NULL.
281 * @param selector
282 * The selector that should be moved. Must not be NULL. If it is
283 * already in some set, this function removes it from there.
284 * @param watch
285 * This function updates @a *watch if it merges that selector into
286 * another one. @a watch must not be NULL but @a *watch may be.
288 * @returns @a selector or the one into which it was merged. */
289 static struct css_selector *
290 reparent_selector(struct css_selector_set *sels,
291 struct css_selector *selector,
292 struct css_selector **watch)
294 struct css_selector *twin = find_css_selector(sels, selector->type,
295 selector->relation,
296 selector->name, -1);
298 if (twin) {
299 merge_css_selectors(twin, selector);
300 /* Reparent leaves. */
301 while (!css_selector_set_empty(&selector->leaves)) {
302 struct css_selector *leaf = css_selector_set_front(&selector->leaves);
304 reparent_selector(&twin->leaves, leaf, watch);
306 if (*watch == selector)
307 *watch = twin;
308 done_css_selector(selector);
309 } else {
310 if (css_selector_is_in_set(selector))
311 del_css_selector_from_set(selector);
312 add_css_selector_to_set(selector, sels);
315 return twin ? twin : selector;
318 /** Parse a comma-separated list of CSS selectors from @a scanner.
319 * Register the selectors in @a css so that get_css_selector_for_element()
320 * will find them, and add them to @a selectors so that the caller can
321 * attach properties to them.
323 * Our selector grammar:
325 * @verbatim
326 * selector:
327 * element_name? ('#' id)? ('.' class)? (':' pseudo_class)? \
328 * ((' ' | '>') selector)?
329 * @endverbatim
331 static void
332 css_parse_selector(struct css_stylesheet *css, struct scanner *scanner,
333 LIST_OF(struct selector_pkg) *selectors)
335 /* Shell for the last selector (the whole selector chain, that is). */
336 struct selector_pkg *pkg = NULL;
337 /* In 'p#x.y i.z', it's NULL for 'p', 'p' for '#x', '.y' and 'i', and
338 * 'i' for '.z'. */
339 struct css_selector *prev_element_selector = NULL;
340 /* In 'p#x.y:q i', it's NULL for 'p' and '#x', '#x' for '.y', and '.y'
341 * for ':q', and again NULL for 'i'. */
342 struct css_selector *prev_specific_selector = NULL;
343 /* In 'p#x.y div.z:a' it is NULL for 'p#x.y' and 'div', and 'p' for
344 * '.z' and ':a'. So the difference from @prev_element_selector is that
345 * it is changed after the current selector fragment is finished, not
346 * right after the base selector is loaded. So it is set differently
347 * for the '#x.y' and '.z:a' parts of selector. */
348 struct css_selector *last_chained_selector = NULL;
349 /* In 'p#x.y div.z:a, i.b {}', it's set for ':a' and '.b'. */
350 int last_fragment = 0;
351 /* In 'p#x .y', it's set for 'p' and '.y'. Note that it is always set in
352 * the previous iteration so it's valid for the current token only
353 * before "saving" the token. */
354 int selector_start = 1;
356 /* FIXME: element can be even '*' --pasky */
358 while (scanner_has_tokens(scanner)) {
359 struct scanner_token *token = get_scanner_token(scanner);
360 struct scanner_token last_token;
361 struct css_selector *selector;
362 enum css_selector_relation reltype = CSR_ROOT;
363 enum css_selector_type seltype = CST_ELEMENT;
365 assert(token);
366 assert(!last_fragment);
369 if (token->type == '{'
370 || token->type == '}'
371 || token->type == ';')
372 break;
375 /* Examine the selector fragment */
377 if (token->type != CSS_TOKEN_IDENT) {
378 switch (token->type) {
379 case CSS_TOKEN_HASH:
380 case CSS_TOKEN_HEX_COLOR:
381 seltype = CST_ID;
382 reltype = selector_start ? CSR_ANCESTOR : CSR_SPECIFITY;
383 break;
385 case '.':
386 seltype = CST_CLASS;
387 reltype = selector_start ? CSR_ANCESTOR : CSR_SPECIFITY;
388 break;
390 case ':':
391 seltype = CST_PSEUDO;
392 reltype = selector_start ? CSR_ANCESTOR : CSR_SPECIFITY;
393 break;
395 case '>':
396 seltype = CST_ELEMENT;
397 reltype = CSR_PARENT;
398 break;
400 default:
401 /* FIXME: Temporary fix for this weird CSS
402 * precedence thing. ')' has higher than ','
403 * and it can cause problems when skipping
404 * here. The reason is for the function()
405 * parsing. Hmm... --jonas */
406 if (!skip_css_tokens(scanner, ','))
407 skip_scanner_token(scanner);
408 seltype = CST_INVALID;
409 break;
412 if (seltype == CST_INVALID)
413 continue;
415 /* Hexcolor and hash already contains the ident
416 * inside. */
417 if (token->type != CSS_TOKEN_HEX_COLOR
418 && token->type != CSS_TOKEN_HASH) {
419 token = get_next_scanner_token(scanner);
420 if (!token) break;
421 if (token->type != CSS_TOKEN_IDENT) /* wtf */
422 continue;
423 } else {
424 /* Skip the leading '#'. */
425 token->string++, token->length--;
428 } else {
429 if (pkg) reltype = CSR_ANCESTOR;
433 /* Look ahead at what's coming next */
435 copy_struct(&last_token, token);
436 /* Detect whether upcoming tokens are separated by
437 * whitespace or not (that's important for determining
438 * whether it's a combinator or specificitier). */
439 if (last_token.string + last_token.length < scanner->end) {
440 selector_start = last_token.string[last_token.length];
441 selector_start = (selector_start != '#'
442 && selector_start != '.'
443 && selector_start != ':');
444 } /* else it doesn't matter as we are gonna bail out anyway. */
446 token = get_next_scanner_token(scanner);
447 if (!token) break;
448 last_fragment = (token->type == ',' || token->type == '{');
451 /* Register the selector */
453 if (!pkg) {
454 selector = get_css_base_selector(
455 last_fragment ? css : NULL, seltype,
456 CSR_ROOT,
457 last_token.string, last_token.length);
458 if (!selector) continue;
460 pkg = mem_calloc(1, sizeof(*pkg));
461 if (!pkg) continue;
462 add_to_list(*selectors, pkg);
463 pkg->selector = selector;
465 } else if (reltype == CSR_SPECIFITY) {
466 /* We append under the last fragment. */
467 struct css_selector *base_sel = prev_specific_selector;
469 if (!base_sel) base_sel = prev_element_selector;
470 assert(base_sel);
472 selector = get_css_selector(&base_sel->leaves,
473 seltype, reltype,
474 last_token.string,
475 last_token.length);
476 if (!selector) continue;
478 if (last_chained_selector) {
479 /* The situation is like: 'div p#x', now it was
480 * 'p -> div', but we need to redo that as
481 * '(p ->) #x -> div'. */
482 del_css_selector_from_set(last_chained_selector);
483 add_css_selector_to_set(last_chained_selector,
484 &selector->leaves);
487 if (pkg->selector == base_sel) {
488 /* This is still just specificitying offspring
489 * of the previous pkg->selector. */
490 pkg->selector = selector;
493 if (last_fragment) {
494 /* This is the last fragment of the selector
495 * chain, that means the last base fragment
496 * wasn't marked so and thus wasn't bound to
497 * the stylesheet. Let's do that now. */
498 assert(prev_element_selector);
499 set_css_selector_relation(prev_element_selector, CSR_ROOT);
500 prev_element_selector =
501 reparent_selector(&css->selectors,
502 prev_element_selector,
503 &pkg->selector);
506 } else /* CSR_PARENT || CSR_ANCESTOR */ {
507 /* We - in the perlish speak - unshift in front
508 * of the previous selector fragment and reparent
509 * it to the upcoming one. */
510 selector = get_css_base_selector(
511 last_fragment ? css : NULL, seltype,
512 CSR_ROOT,
513 last_token.string, last_token.length);
514 if (!selector) continue;
516 assert(prev_element_selector);
517 set_css_selector_relation(prev_element_selector, reltype);
518 add_css_selector_to_set(prev_element_selector,
519 &selector->leaves);
520 last_chained_selector = prev_element_selector;
525 /* Record the selector fragment for future generations */
527 if (reltype == CSR_SPECIFITY) {
528 prev_specific_selector = selector;
529 } else {
530 prev_element_selector = selector;
531 prev_specific_selector = NULL;
535 /* What to do next */
537 if (last_fragment) {
538 /* Next selector coming, clean up. */
539 pkg = NULL; last_fragment = 0; selector_start = 1;
540 prev_element_selector = NULL;
541 prev_specific_selector = NULL;
542 last_chained_selector = NULL;
545 if (token->type == ',') {
546 /* Another selector hooked to these properties. */
547 skip_scanner_token(scanner);
549 } else if (token->type == '{') {
550 /* End of selector list. */
551 break;
553 } /* else Another selector fragment probably coming up. */
556 /* Wipe the selector we were currently composing, if any. */
557 if (pkg) {
558 if (prev_element_selector)
559 done_css_selector(prev_element_selector);
560 del_from_list(pkg);
561 mem_free(pkg);
566 /** Parse a ruleset from @a scanner to @a css.
568 * Ruleset grammar:
570 * @verbatim
571 * ruleset:
572 * selector [ ',' selector ]* '{' properties '}'
573 * @endverbatim
575 static void
576 css_parse_ruleset(struct css_stylesheet *css, struct scanner *scanner)
578 INIT_LIST_OF(struct selector_pkg, selectors);
579 INIT_LIST_OF(struct css_property, properties);
580 struct selector_pkg *pkg;
582 css_parse_selector(css, scanner, &selectors);
583 if (list_empty(selectors)
584 || !skip_css_tokens(scanner, '{')) {
585 if (!list_empty(selectors)) free_list(selectors);
586 skip_css_tokens(scanner, '}');
587 return;
591 /* We don't handle the case where a property has already been added to
592 * a selector. That doesn't matter though, because the best one will be
593 * always the last one (FIXME: 'important!'), therefore the applier
594 * will take it last and it will have the "final" effect.
596 * So it's only a little waste and no real harm. The thing is, what do
597 * you do when you have 'background: #fff' and then 'background:
598 * x-repeat'? It would require yet another logic to handle merging of
599 * these etc and the induced overhead would in most cases mean more
600 * waste that having the property multiple times in a selector, I
601 * believe. --pasky */
603 pkg = selectors.next;
604 css_parse_properties(&properties, scanner);
606 skip_css_tokens(scanner, '}');
608 /* Mirror the properties to all the selectors. */
609 foreach (pkg, selectors) {
610 #ifdef DEBUG_CSS
611 /* Cannot use list_empty() inside the arglist of DBG()
612 * because GCC 4.1 "warning: operation on `errfile'
613 * may be undefined" breaks the build with -Werror. */
614 int dbg_has_properties = !list_empty(properties);
615 int dbg_has_leaves = !css_selector_set_empty(&pkg->selector->leaves);
617 DBG("Binding properties (!!%d) to selector %s (type %d, relation %d, children %d)",
618 dbg_has_properties,
619 pkg->selector->name, pkg->selector->type,
620 pkg->selector->relation,
621 dbg_has_leaves);
622 #endif
623 add_selector_properties(pkg->selector, &properties);
625 free_list(selectors);
626 free_list(properties);
630 void
631 css_parse_stylesheet(struct css_stylesheet *css, struct uri *base_uri,
632 const unsigned char *string, const unsigned char *end)
634 struct scanner scanner;
636 init_scanner(&scanner, &css_scanner_info, string, end);
638 while (scanner_has_tokens(&scanner)) {
639 struct scanner_token *token = get_scanner_token(&scanner);
641 assert(token);
643 switch (token->type) {
644 case CSS_TOKEN_AT_KEYWORD:
645 case CSS_TOKEN_AT_CHARSET:
646 case CSS_TOKEN_AT_FONT_FACE:
647 case CSS_TOKEN_AT_IMPORT:
648 case CSS_TOKEN_AT_MEDIA:
649 case CSS_TOKEN_AT_PAGE:
650 css_parse_atrule(css, &scanner, base_uri);
651 break;
653 default:
654 /* And WHAT ELSE could it be?! */
655 css_parse_ruleset(css, &scanner);
658 #ifdef DEBUG_CSS
659 dump_css_selector_tree(&css->selectors);
660 WDBG("That's it.");
661 #endif