iconv: Bail out of the loop when an illegal sequence of bytes occurs.
[elinks/elinks-j605.git] / src / document / html / parser.c
blob c10fd9d09b3f32b283c4435fc0da848f43ad38a6
1 /* HTML parser */
3 #ifndef _GNU_SOURCE
4 #define _GNU_SOURCE /* XXX: we _WANT_ strcasestr() ! */
5 #endif
7 #ifdef HAVE_CONFIG_H
8 #include "config.h"
9 #endif
11 #include <errno.h>
12 #include <stdarg.h>
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <string.h>
17 #include "elinks.h"
19 #include "bfu/listmenu.h"
20 #include "bfu/menu.h"
21 #include "document/css/apply.h"
22 #include "document/css/css.h"
23 #include "document/css/stylesheet.h"
24 #include "document/html/frames.h"
25 #include "document/html/parse-meta-refresh.h"
26 #include "document/html/parser/link.h"
27 #include "document/html/parser/stack.h"
28 #include "document/html/parser/parse.h"
29 #include "document/html/parser.h"
30 #include "document/html/renderer.h"
31 #include "document/options.h"
32 #include "document/renderer.h"
33 #include "intl/charsets.h"
34 #include "protocol/date.h"
35 #include "protocol/header.h"
36 #include "protocol/uri.h"
37 #include "session/task.h"
38 #include "terminal/draw.h"
39 #include "util/align.h"
40 #include "util/box.h"
41 #include "util/color.h"
42 #include "util/conv.h"
43 #include "util/error.h"
44 #include "util/memdebug.h"
45 #include "util/memlist.h"
46 #include "util/memory.h"
47 #include "util/string.h"
49 /* Unsafe macros */
50 #include "document/html/internal.h"
52 /* TODO: This needs rewrite. Yes, no kidding. */
54 static int
55 extract_color(struct html_context *html_context, unsigned char *a,
56 unsigned char *attribute, color_T *rgb)
58 unsigned char *value;
59 int retval;
61 value = get_attr_val(a, attribute, html_context->doc_cp);
62 if (!value) return -1;
64 retval = decode_color(value, strlen(value), rgb);
65 mem_free(value);
67 return retval;
70 int
71 get_color(struct html_context *html_context, unsigned char *a,
72 unsigned char *attribute, color_T *rgb)
74 if (!use_document_fg_colors(html_context->options))
75 return -1;
77 return extract_color(html_context, a, attribute, rgb);
80 int
81 get_bgcolor(struct html_context *html_context, unsigned char *a, color_T *rgb)
83 if (!use_document_bg_colors(html_context->options))
84 return -1;
86 return extract_color(html_context, a, "bgcolor", rgb);
89 unsigned char *
90 get_target(struct document_options *options, unsigned char *a)
92 /* FIXME (bug 784): options->cp is the terminal charset;
93 * should use the document charset instead. */
94 unsigned char *v = get_attr_val(a, "target", options->cp);
96 if (!v) return NULL;
98 if (!*v || !c_strcasecmp(v, "_self")) {
99 mem_free_set(&v, stracpy(options->framename));
102 return v;
106 void
107 ln_break(struct html_context *html_context, int n)
109 if (!n || html_top->invisible) return;
110 while (n > html_context->line_breax) {
111 html_context->line_breax++;
112 html_context->line_break_f(html_context);
114 html_context->position = 0;
115 html_context->putsp = HTML_SPACE_SUPPRESS;
118 void
119 put_chrs(struct html_context *html_context, unsigned char *start, int len)
121 if (html_is_preformatted())
122 html_context->putsp = HTML_SPACE_NORMAL;
124 if (!len || html_top->invisible)
125 return;
127 switch (html_context->putsp) {
128 case HTML_SPACE_NORMAL:
129 break;
131 case HTML_SPACE_ADD:
132 html_context->put_chars_f(html_context, " ", 1);
133 html_context->position++;
134 html_context->putsp = HTML_SPACE_SUPPRESS;
136 /* Fall thru. */
138 case HTML_SPACE_SUPPRESS:
139 html_context->putsp = HTML_SPACE_NORMAL;
140 if (isspace(start[0])) {
141 start++, len--;
143 if (!len) {
144 html_context->putsp = HTML_SPACE_SUPPRESS;
145 return;
149 break;
152 if (isspace(start[len - 1]) && !html_is_preformatted())
153 html_context->putsp = HTML_SPACE_SUPPRESS;
154 html_context->was_br = 0;
156 html_context->put_chars_f(html_context, start, len);
158 html_context->position += len;
159 html_context->line_breax = 0;
160 if (html_context->was_li > 0)
161 html_context->was_li--;
164 void
165 set_fragment_identifier(struct html_context *html_context,
166 unsigned char *attr_name, unsigned char *attr)
168 unsigned char *id_attr;
170 id_attr = get_attr_val(attr_name, attr, html_context->doc_cp);
172 if (id_attr) {
173 html_context->special_f(html_context, SP_TAG, id_attr);
174 mem_free(id_attr);
178 void
179 add_fragment_identifier(struct html_context *html_context,
180 struct part *part, unsigned char *attr)
182 struct part *saved_part = html_context->part;
184 html_context->part = part;
185 html_context->special_f(html_context, SP_TAG, attr);
186 html_context->part = saved_part;
#ifdef CONFIG_CSS
/** Import callback for a CSS stylesheet.
 *
 * Resolves the @a len bytes at @a unterminated_url against @a base_uri,
 * announces the resulting URI through special_f (SP_STYLESHEET) and then
 * calls import_css() for it. Does nothing unless both the css_enable and
 * css_import options are set; allocation or URI failures are silently
 * ignored.
 *
 * The HTML context is taken from @c css->import_data. */
void
import_css_stylesheet(struct css_stylesheet *css, struct uri *base_uri,
		      const unsigned char *unterminated_url, int len)
{
	struct html_context *html_context = css->import_data;
	unsigned char *terminated;
	unsigned char *absolute;
	struct uri *target;

	assert(html_context);
	assert(base_uri);

	if (!(html_context->options->css_enable
	      && html_context->options->css_import))
		return;

	/* unterminated_url might not end with '\0', but join_urls
	 * requires that, so work on a copy. */
	terminated = memacpy(unterminated_url, len);
	if (!terminated) return;

	/* HTML <head> urls should already be fine but we can't detect them. */
	absolute = join_urls(base_uri, terminated);
	mem_free(terminated);
	if (!absolute) return;

	target = get_uri(absolute, URI_BASE);
	mem_free(absolute);
	if (!target) return;

	/* Request the imported stylesheet as part of the document ... */
	html_context->special_f(html_context, SP_STYLESHEET, target);

	/* ... and then attempt to import from the cache. */
	import_css(css, target);

	done_uri(target);
}
#endif
232 /* Extract the extra information that is available for elements which can
233 * receive focus. Call this from each element which supports tabindex or
234 * accesskey. */
235 /* Note that in ELinks, we support those attributes (I mean, we call this
236 * function) while processing any focusable element (otherwise it'd have zero
237 * tabindex, thus messing up navigation between links), thus we support these
238 * attributes even near tags where we're not supposed to (like IFRAME, FRAME or
239 * LINK). I think this doesn't make any harm ;). --pasky */
240 void
241 html_focusable(struct html_context *html_context, unsigned char *a)
243 unsigned char *accesskey;
244 int cp;
245 int tabindex;
247 format.accesskey = 0;
248 format.tabindex = 0x80000000;
250 if (!a) return;
252 cp = html_context->doc_cp;
254 accesskey = get_attr_val(a, "accesskey", cp);
255 if (accesskey) {
256 format.accesskey = accesskey_string_to_unicode(accesskey);
257 mem_free(accesskey);
260 tabindex = get_num(a, "tabindex", cp);
261 if (0 < tabindex && tabindex < 32767) {
262 format.tabindex = (tabindex & 0x7fff) << 16;
265 mem_free_set(&format.onclick, get_attr_val(a, "onclick", cp));
266 mem_free_set(&format.ondblclick, get_attr_val(a, "ondblclick", cp));
267 mem_free_set(&format.onmouseover, get_attr_val(a, "onmouseover", cp));
268 mem_free_set(&format.onhover, get_attr_val(a, "onhover", cp));
269 mem_free_set(&format.onfocus, get_attr_val(a, "onfocus", cp));
270 mem_free_set(&format.onmouseout, get_attr_val(a, "onmouseout", cp));
271 mem_free_set(&format.onblur, get_attr_val(a, "onblur", cp));
274 void
275 html_skip(struct html_context *html_context, unsigned char *a)
277 html_top->invisible = 1;
278 html_top->type = ELEMENT_DONT_KILL;
281 static void
282 check_head_for_refresh(struct html_context *html_context, unsigned char *head)
284 unsigned char *refresh;
285 unsigned char *url = NULL;
286 unsigned char *joined_url = NULL;
287 unsigned long seconds;
289 refresh = parse_header(head, "Refresh", NULL);
290 if (!refresh) return;
292 if (html_parse_meta_refresh(refresh, &seconds, &url) == 0) {
293 if (!url) {
294 /* If the URL parameter is missing assume that the
295 * document being processed should be refreshed. */
296 url = get_uri_string(html_context->base_href,
297 URI_ORIGINAL);
301 if (url)
302 joined_url = join_urls(html_context->base_href, url);
304 if (joined_url) {
305 if (seconds > HTTP_REFRESH_MAX_DELAY)
306 seconds = HTTP_REFRESH_MAX_DELAY;
308 html_focusable(html_context, NULL);
310 put_link_line("Refresh: ", url, joined_url,
311 html_context->options->framename, html_context);
312 html_context->special_f(html_context, SP_REFRESH, seconds, joined_url);
315 mem_free_if(joined_url);
316 mem_free_if(url);
317 mem_free(refresh);
320 static void
321 check_head_for_cache_control(struct html_context *html_context,
322 unsigned char *head)
324 unsigned char *d;
325 int no_cache = 0;
326 time_t expires = 0;
328 if (get_opt_bool("document.cache.ignore_cache_control", NULL))
329 return;
331 /* XXX: Code duplication with HTTP protocol backend. */
332 /* I am not entirely sure in what order we should process these
333 * headers and if we should still process Cache-Control max-age
334 * if we already set max age to date mentioned in Expires.
335 * --jonas */
336 if ((d = parse_header(head, "Pragma", NULL))) {
337 if (strstr(d, "no-cache")) {
338 no_cache = 1;
340 mem_free(d);
343 if (!no_cache && (d = parse_header(head, "Cache-Control", NULL))) {
344 if (strstr(d, "no-cache") || strstr(d, "must-revalidate")) {
345 no_cache = 1;
347 } else {
348 unsigned char *pos = strstr(d, "max-age=");
350 assert(!no_cache);
352 if (pos) {
353 /* Grab the number of seconds. */
354 timeval_T max_age, seconds;
356 timeval_from_seconds(&seconds, atol(pos + 8));
357 timeval_now(&max_age);
358 timeval_add_interval(&max_age, &seconds);
360 expires = timeval_to_seconds(&max_age);
364 mem_free(d);
367 if (!no_cache && (d = parse_header(head, "Expires", NULL))) {
368 /* Convert date to seconds. */
369 if (strstr(d, "now")) {
370 timeval_T now;
372 timeval_now(&now);
373 expires = timeval_to_seconds(&now);
374 } else {
375 expires = parse_date(&d, NULL, 0, 1);
378 mem_free(d);
381 if (no_cache)
382 html_context->special_f(html_context, SP_CACHE_CONTROL);
383 else if (expires)
384 html_context->special_f(html_context,
385 SP_CACHE_EXPIRES, expires);
/** Act on the headers collected in @a head: first the Refresh header,
 * then the cache-related ones. */
void
process_head(struct html_context *html_context, unsigned char *head)
{
	/* Refresh handling comes first, then cache control. */
	check_head_for_refresh(html_context, head);
	check_head_for_cache_control(html_context, head);
}
399 static int
400 look_for_map(unsigned char **pos, unsigned char *eof, struct uri *uri,
401 struct document_options *options)
403 unsigned char *al, *attr, *name;
404 int namelen;
406 while (*pos < eof && **pos != '<') {
407 (*pos)++;
410 if (*pos >= eof) return 0;
412 if (*pos + 2 <= eof && ((*pos)[1] == '!' || (*pos)[1] == '?')) {
413 *pos = skip_comment(*pos, eof);
414 return 1;
417 if (parse_element(*pos, eof, &name, &namelen, &attr, pos)) {
418 (*pos)++;
419 return 1;
422 if (c_strlcasecmp(name, namelen, "MAP", 3)) return 1;
424 if (uri && uri->fragment) {
425 /* FIXME (bug 784): options->cp is the terminal charset;
426 * should use the document charset instead. */
427 al = get_attr_val(attr, "name", options->cp);
428 if (!al) return 1;
430 if (c_strlcasecmp(al, -1, uri->fragment, uri->fragmentlen)) {
431 mem_free(al);
432 return 1;
435 mem_free(al);
438 return 0;
441 static int
442 look_for_tag(unsigned char **pos, unsigned char *eof,
443 unsigned char *name, int namelen, unsigned char **label)
445 unsigned char *pos2;
446 struct string str;
448 if (!init_string(&str)) {
449 /* Is this the right way to bail out? --jonas */
450 *pos = eof;
451 return 0;
454 pos2 = *pos;
455 while (pos2 < eof && *pos2 != '<') {
456 pos2++;
459 if (pos2 >= eof) {
460 done_string(&str);
461 *pos = eof;
462 return 0;
464 if (pos2 - *pos)
465 add_bytes_to_string(&str, *pos, pos2 - *pos);
466 *label = str.source;
468 *pos = pos2;
470 if (*pos + 2 <= eof && ((*pos)[1] == '!' || (*pos)[1] == '?')) {
471 *pos = skip_comment(*pos, eof);
472 return 1;
475 if (parse_element(*pos, eof, NULL, NULL, NULL, &pos2)) return 1;
477 if (c_strlcasecmp(name, namelen, "A", 1)
478 && c_strlcasecmp(name, namelen, "/A", 2)
479 && c_strlcasecmp(name, namelen, "MAP", 3)
480 && c_strlcasecmp(name, namelen, "/MAP", 4)
481 && c_strlcasecmp(name, namelen, "AREA", 4)
482 && c_strlcasecmp(name, namelen, "/AREA", 5)) {
483 *pos = pos2;
484 return 1;
487 return 0;
490 /** @return -1 if EOF is hit without the closing tag; 0 if the closing
491 * tag is found (in which case this also adds *@a menu to *@a ml); or
492 * 1 if this should be called again. */
493 static int
494 look_for_link(unsigned char **pos, unsigned char *eof, struct menu_item **menu,
495 struct memory_list **ml, struct uri *href_base,
496 unsigned char *target_base, struct conv_table *ct,
497 struct document_options *options)
499 unsigned char *attr, *href, *name, *target;
500 unsigned char *label = NULL; /* shut up warning */
501 struct link_def *ld;
502 struct menu_item *nm;
503 int nmenu;
504 int namelen;
506 while (*pos < eof && **pos != '<') {
507 (*pos)++;
510 if (*pos >= eof) return -1;
512 if (*pos + 2 <= eof && ((*pos)[1] == '!' || (*pos)[1] == '?')) {
513 *pos = skip_comment(*pos, eof);
514 return 1;
517 if (parse_element(*pos, eof, &name, &namelen, &attr, pos)) {
518 (*pos)++;
519 return 1;
522 if (!c_strlcasecmp(name, namelen, "A", 1)) {
523 while (look_for_tag(pos, eof, name, namelen, &label));
525 if (*pos >= eof) return -1;
527 } else if (!c_strlcasecmp(name, namelen, "AREA", 4)) {
528 /* FIXME (bug 784): options->cp is the terminal charset;
529 * should use the document charset instead. */
530 unsigned char *alt = get_attr_val(attr, "alt", options->cp);
532 if (alt) {
533 /* CSM_NONE because get_attr_val() already
534 * decoded entities. */
535 label = convert_string(ct, alt, strlen(alt),
536 options->cp, CSM_NONE,
537 NULL, NULL, NULL);
538 mem_free(alt);
539 } else {
540 label = NULL;
543 } else if (!c_strlcasecmp(name, namelen, "/MAP", 4)) {
544 /* This is the only successful return from here! */
545 add_to_ml(ml, (void *) *menu, (void *) NULL);
546 return 0;
548 } else {
549 return 1;
552 target = get_target(options, attr);
553 if (!target) target = stracpy(empty_string_or_(target_base));
554 if (!target) {
555 mem_free_if(label);
556 return 1;
559 ld = mem_alloc(sizeof(*ld));
560 if (!ld) {
561 mem_free_if(label);
562 mem_free(target);
563 return 1;
566 /* FIXME (bug 784): options->cp is the terminal charset;
567 * should use the document charset instead. */
568 href = get_url_val(attr, "href", options->cp);
569 if (!href) {
570 mem_free_if(label);
571 mem_free(target);
572 mem_free(ld);
573 return 1;
577 ld->link = join_urls(href_base, href);
578 mem_free(href);
579 if (!ld->link) {
580 mem_free_if(label);
581 mem_free(target);
582 mem_free(ld);
583 return 1;
587 ld->target = target;
588 for (nmenu = 0; !mi_is_end_of_menu(&(*menu)[nmenu]); nmenu++) {
589 struct link_def *ll = (*menu)[nmenu].data;
591 if (!strcmp(ll->link, ld->link) &&
592 !strcmp(ll->target, ld->target)) {
593 mem_free(ld->link);
594 mem_free(ld->target);
595 mem_free(ld);
596 mem_free_if(label);
597 return 1;
601 if (label) {
602 clr_spaces(label);
604 if (!*label) {
605 mem_free(label);
606 label = NULL;
610 if (!label) {
611 label = stracpy(ld->link);
612 if (!label) {
613 mem_free(target);
614 mem_free(ld->link);
615 mem_free(ld);
616 return 1;
620 nm = mem_realloc(*menu, (nmenu + 2) * sizeof(*nm));
621 if (nm) {
622 *menu = nm;
623 memset(&nm[nmenu], 0, 2 * sizeof(*nm));
624 nm[nmenu].text = label;
625 nm[nmenu].func = map_selected;
626 nm[nmenu].data = ld;
627 nm[nmenu].flags = NO_INTL;
630 add_to_ml(ml, (void *) ld, (void *) ld->link, (void *) ld->target,
631 (void *) label, (void *) NULL);
633 return 1;
638 get_image_map(unsigned char *head, unsigned char *pos, unsigned char *eof,
639 struct menu_item **menu, struct memory_list **ml, struct uri *uri,
640 struct document_options *options, unsigned char *target_base,
641 int to, int def, int hdef)
643 struct conv_table *ct;
644 struct string hd;
645 int look_result;
647 if (!init_string(&hd)) return -1;
649 if (head) add_to_string(&hd, head);
650 /* FIXME (bug 784): cp is the terminal charset;
651 * should use the document charset instead. */
652 scan_http_equiv(pos, eof, &hd, NULL, options->cp);
653 ct = get_convert_table(hd.source, to, def, NULL, NULL, hdef);
654 done_string(&hd);
656 *menu = mem_calloc(1, sizeof(**menu));
657 if (!*menu) return -1;
659 while (look_for_map(&pos, eof, uri, options));
661 if (pos >= eof) {
662 mem_free(*menu);
663 return -1;
666 *ml = NULL;
668 do {
669 /* This call can modify both *ml and *menu. */
670 look_result = look_for_link(&pos, eof, menu, ml, uri,
671 target_base, ct, options);
672 } while (look_result > 0);
674 if (look_result < 0) {
675 freeml(*ml);
676 mem_free(*menu);
677 return -1;
680 return 0;
686 void *
687 init_html_parser_state(struct html_context *html_context,
688 enum html_element_mortality_type type,
689 int align, int margin, int width)
691 html_stack_dup(html_context, type);
693 par_format.align = align;
695 if (type <= ELEMENT_IMMORTAL) {
696 par_format.leftmargin = margin;
697 par_format.rightmargin = margin;
698 par_format.width = width;
699 par_format.list_level = 0;
700 par_format.list_number = 0;
701 par_format.dd_margin = 0;
702 html_top->namelen = 0;
705 return html_top;
710 void
711 done_html_parser_state(struct html_context *html_context,
712 void *state)
714 struct html_element *element = state;
716 html_context->line_breax = 1;
718 while (html_top != element) {
719 pop_html_element(html_context);
720 #if 0
721 /* I've preserved this bit to show an example of the Old Code
722 * of the Mikulas days (I _HOPE_ it's by Mikulas, at least ;-).
723 * I think this assert() can never fail, for one. --pasky */
724 assertm(html_top && (void *) html_top != (void *) &html_stack,
725 "html stack trashed");
726 if_assert_failed break;
727 #endif
730 html_top->type = ELEMENT_KILLABLE;
731 pop_html_element(html_context);
735 /* This function does not set html_context.doc_cp = document.cp,
736 * because it does not know the document, and because the codepage has
737 * not even been decided when it is called.
739 * @param[out] title
740 * The title of the document. This is in the document charset,
741 * and entities have not been decoded. */
742 struct html_context *
743 init_html_parser(struct uri *uri, struct document_options *options,
744 unsigned char *start, unsigned char *end,
745 struct string *head, struct string *title,
746 void (*put_chars)(struct html_context *, unsigned char *, int),
747 void (*line_break)(struct html_context *),
748 void *(*special)(struct html_context *, enum html_special_type, ...))
750 struct html_context *html_context;
751 struct html_element *e;
753 assert(uri && options);
754 if_assert_failed return NULL;
756 html_context = mem_calloc(1, sizeof(*html_context));
757 if (!html_context) return NULL;
759 #ifdef CONFIG_CSS
760 html_context->css_styles.import = import_css_stylesheet;
761 init_css_selector_set(&html_context->css_styles.selectors);
762 #endif
764 init_list(html_context->stack);
766 html_context->startf = start;
767 html_context->put_chars_f = put_chars;
768 html_context->line_break_f = line_break;
769 html_context->special_f = special;
771 html_context->base_href = get_uri_reference(uri);
772 html_context->base_target = null_or_stracpy(options->framename);
774 html_context->options = options;
776 /* FIXME (bug 784): cp is the terminal charset;
777 * should use the document charset instead. */
778 scan_http_equiv(start, end, head, title, options->cp);
780 e = mem_calloc(1, sizeof(*e));
781 if (!e) return NULL;
782 add_to_list(html_context->stack, e);
784 format.style.attr = 0;
785 format.fontsize = 3;
786 format.link = format.target = format.image = NULL;
787 format.onclick = format.ondblclick = format.onmouseover = format.onhover
788 = format.onfocus = format.onmouseout = format.onblur = NULL;
789 format.select = NULL;
790 format.form = NULL;
791 format.title = NULL;
793 format.style = options->default_style;
794 format.color.clink = options->default_color.link;
795 format.color.vlink = options->default_color.vlink;
796 #ifdef CONFIG_BOOKMARKS
797 format.color.bookmark_link = options->default_color.bookmark_link;
798 #endif
799 format.color.image_link = options->default_color.image_link;
801 par_format.align = ALIGN_LEFT;
802 par_format.leftmargin = options->margin;
803 par_format.rightmargin = options->margin;
805 par_format.width = options->box.width;
806 par_format.list_level = par_format.list_number = 0;
807 par_format.dd_margin = options->margin;
808 par_format.flags = P_NONE;
810 par_format.color.background = options->default_style.color.background;
812 html_top->invisible = 0;
813 html_top->name = NULL;
814 html_top->namelen = 0;
815 html_top->options = NULL;
816 html_top->linebreak = 1;
817 html_top->type = ELEMENT_DONT_KILL;
819 html_context->has_link_lines = 0;
820 html_context->table_level = 0;
822 #ifdef CONFIG_CSS
823 html_context->css_styles.import_data = html_context;
825 if (options->css_enable)
826 mirror_css_stylesheet(&default_stylesheet,
827 &html_context->css_styles);
828 #endif
830 return html_context;
833 void
834 done_html_parser(struct html_context *html_context)
836 #ifdef CONFIG_CSS
837 if (html_context->options->css_enable)
838 done_css_stylesheet(&html_context->css_styles);
839 #endif
841 mem_free(html_context->base_target);
842 done_uri(html_context->base_href);
844 kill_html_stack_item(html_context, html_context->stack.next);
846 assertm(list_empty(html_context->stack),
847 "html stack not empty after operation");
848 if_assert_failed init_list(html_context->stack);
850 mem_free(html_context);