2 * Copyright (c) 2002, Adam Dunkels.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following
12 * disclaimer in the documentation and/or other materials provided
13 * with the distribution.
14 * 3. The name of the author may not be used to endorse or promote
15 * products derived from this software without specific prior
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
19 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
22 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
24 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
27 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 * This file is part of the Contiki desktop environment
32 * $Id: htmlparser.c,v 1.8 2007/11/30 21:53:50 oliverschmidt Exp $
38 * Implements a very simplistic HTML parser. It recognizes HTML links
39 * (<a href>-tags), HTML img alt tags, a few text flow break tags
40 * (<br>, <p>, <h>), the <li> tag (but does not even try to
41 * distinguish between <ol> or <ul>) as well as HTML comment tags
44 * To save memory, the HTML parser is state machine driven, which
45 * means that it will shave off one character from the HTML page,
46 * process that character, and return to the next. Another way of
47 * doing it would be to buffer a number of characters and process them
50 * The main function in this file is the htmlparser_parse() function
51 * which takes a htmlparser_state structure and a part of an HTML file
52 * as an argument. The htmlparser_parse() function will call the
53 * helper functions parse_char() and parse_tag(). Those functions will
54 * in turn call the two callback functions htmlparser_char() and
55 * htmlparser_tag(). Those functions must be implemented by the using
56 * module (e.g., a web browser program).
58 * htmlparser_char() will be called for every non-tag character.
60 * htmlparser_tag() will be called whenever a full tag has been found.
67 #include "html-strings.h"
70 #include "htmlparser.h"
76 #define PRINTF(x) printf x
80 /*-----------------------------------------------------------------------------------*/
/* Lower-case letter codes. Each one is the matching upper-case ISO_x
   constant with bit 0x20 set. */
94 #define ISO_a (ISO_A | 0x20)
95 #define ISO_b (ISO_B | 0x20)
96 #define ISO_e (ISO_E | 0x20)
97 #define ISO_f (ISO_F | 0x20)
98 #define ISO_g (ISO_G | 0x20)
99 #define ISO_h (ISO_H | 0x20)
100 #define ISO_i (ISO_I | 0x20)
101 #define ISO_l (ISO_L | 0x20)
102 #define ISO_m (ISO_M | 0x20)
103 #define ISO_p (ISO_P | 0x20)
104 #define ISO_r (ISO_R | 0x20)
105 #define ISO_t (ISO_T | 0x20)
/* Numeric codes of the punctuation characters the parser tests for or
   emits.
   NOTE(review): ISO_rbrack is 0x5b, the code of the opening square
   bracket, and ISO_lbrack is 0x5d, the code of the closing square
   bracket. The names read opposite to the usual left/right convention;
   the frame handling emits ISO_rbrack first and ISO_lbrack last, so the
   values are used consistently. Rely on the values, not the names. */
110 #define ISO_space 0x20
111 #define ISO_bang 0x21
112 #define ISO_citation 0x22
113 #define ISO_ampersand 0x26
114 #define ISO_citation2 0x27
115 #define ISO_asterisk 0x2a
116 #define ISO_dash 0x2d
117 #define ISO_slash 0x2f
118 #define ISO_semicolon 0x3b
123 #define ISO_rbrack 0x5b
124 #define ISO_lbrack 0x5d
126 #define MINORSTATE_NONE 0
127 #define MINORSTATE_TEXT 1 /* Parse normal text */
128 #define MINORSTATE_EXTCHAR 2 /* Check for semi-colon */
129 #define MINORSTATE_TAG 3 /* Check for name of tag. */
130 #define MINORSTATE_TAGEND 4 /* Scan for end of tag. */
131 #define MINORSTATE_TAGATTR 5 /* Parse tag attr. */
132 #define MINORSTATE_TAGATTRSPACE 6 /* Parse optional space after tag attr. */
134 #define MINORSTATE_TAGATTRPARAM 7 /* Parse tag attr parameter. */
135 #define MINORSTATE_TAGATTRPARAMNQ 8 /* Parse tag attr parameter without quotation marks. */
137 #define MINORSTATE_HTMLCOMMENT 9 /* Scan for HTML comment end */
139 #define MAJORSTATE_NONE 0
140 #define MAJORSTATE_BODY 1
141 #define MAJORSTATE_LINK 2
142 #define MAJORSTATE_FORM 3
143 #define MAJORSTATE_DISCARD 4
/* Complete state of the HTML parser. The file keeps a single static
   instance of this structure, named s, which every function reads and
   updates. */
146 struct htmlparser_state
{
/* Current MINORSTATE_* value; parse_word() switches on it. */
148 unsigned char minorstate
;
/* Write index used when storing characters into s.tag.
   NOTE(review): the tag and tagattr buffers referenced elsewhere in the
   file are not declared in the lines visible here. */
150 unsigned char tagptr
;
/* Write index used when storing characters into s.tagattr. */
152 unsigned char tagattrptr
;
/* Buffer receiving the parameter (value) of the attribute being
   parsed. */
153 char tagattrparam
[WWW_CONF_MAX_URLLEN
];
/* Write index into tagattrparam. */
154 unsigned char tagattrparamptr
;
/* quotechar is compared with each input character in
   MINORSTATE_TAGATTRPARAM to detect the end of a quoted parameter.
   NOTE(review): the use of lastchar is not visible in this excerpt. */
155 unsigned char lastchar
, quotechar
;
/* Current MAJORSTATE_* value and the value it had before the most
   recent change made by switch_majorstate(). */
156 unsigned char majorstate
, lastmajorstate
;
/* Receives a copy of tagattrparam when an href attribute with a
   non-empty parameter is handled. */
157 char linkurl
[WWW_CONF_MAX_URLLEN
];
/* Characters collected by add_char() and the number collected so far. */
159 char word
[WWW_CONF_WEBPAGE_WIDTH
];
160 unsigned char wordlen
;
/* Form support: action and name of the current form, then type, name,
   value and size of the input element being collected.
   NOTE(review): the visible strncpy() calls that fill inputname and
   inputvalue pass the full array size as the limit, so a parameter of
   that length or longer would be copied without a terminating NUL --
   confirm whether a terminator is written elsewhere. */
163 char formaction
[WWW_CONF_MAX_FORMACTIONLEN
];
164 char formname
[WWW_CONF_MAX_FORMNAMELEN
];
165 unsigned char inputtype
;
166 char inputname
[WWW_CONF_MAX_INPUTNAMELEN
];
167 char inputvalue
[WWW_CONF_MAX_INPUTVALUELEN
];
168 unsigned char inputvaluesize
;
169 #endif /* WWW_CONF_FORMS */
172 static struct htmlparser_state s
;
174 /*-----------------------------------------------------------------------------------*/
/* One-byte array holding the value 0xff.
   NOTE(review): its use is not visible in this excerpt; presumably it
   serves as an end marker for the table below -- confirm. */
175 static char last
[1] = {(char)0xff};
/* Table of tag-name strings searched by the tag lookup code, together
   with the TAG_* constants that the lookup result is switched on.
   NOTE(review): the table entries themselves are not visible here.
   Presumably each TAG_* value is the index of the matching entry, and
   the entries are kept in ascending order because the lookup advances
   while the input character is greater than the table character --
   confirm before adding or reordering tags. */
177 static const char *tags
[] = {
181 #define TAG_SLASHCENTER 1
183 #define TAG_SLASHFORM 2
187 #define TAG_SLASHSCRIPT 4
189 #define TAG_SLASHSELECT 5
191 #define TAG_SLASHSTYLE 6
199 #define TAG_CENTER 10
221 #define TAG_SCRIPT 21
223 #define TAG_SELECT 22
233 /*-----------------------------------------------------------------------------------*/
234 static unsigned char CC_FASTCALL
237 return (c
== ISO_space
||
242 /*-----------------------------------------------------------------------------------*/
/* Initialise the parser state. The visible assignments set both the
   current and the previous major state to MAJORSTATE_DISCARD and the
   minor state to MINORSTATE_TEXT. Takes no arguments. */
244 htmlparser_init(void)
246 s
.majorstate
= s
.lastmajorstate
= MAJORSTATE_DISCARD
;
247 s
.minorstate
= MINORSTATE_TEXT
;
250 /*-----------------------------------------------------------------------------------*/
251 static char CC_FASTCALL
254 /* XXX: This is a *brute force* approach to lower-case
255 converting and should *not* be used anywhere else! It
256 works for our purposes, however (i.e., HTML tags). */
258 return (c
& 0x1f) | 0x60;
263 /*-----------------------------------------------------------------------------------*/
268 s
.tagattr
[s
.tagattrptr
] = 0;
269 s
.tagattrparam
[s
.tagattrparamptr
] = 0;
271 /*-----------------------------------------------------------------------------------*/
/* Change the major state of the parser to newstate.
   When newstate differs from the state in effect, the old state is
   saved in s.lastmajorstate before s.majorstate is overwritten, which
   lets callers return to it with switch_majorstate(s.lastmajorstate).
   A call that names the state already in effect leaves both fields
   untouched. The transition is also reported through PRINTF. */
272 static void CC_FASTCALL
273 switch_majorstate(unsigned char newstate
)
275 if(s
.majorstate
!= newstate
) {
276 PRINTF(("Switching state from %d to %d (%d)\n", s
.majorstate
, newstate
, s
.lastmajorstate
));
277 s
.lastmajorstate
= s
.majorstate
;
278 s
.majorstate
= newstate
;
281 /*-----------------------------------------------------------------------------------*/
/* Store character c in the word buffer at index s.wordlen.
   The store is made only while s.wordlen is below
   WWW_CONF_WEBPAGE_WIDTH - 1, which keeps the last array element free
   (other code writes a terminator at s.word[s.wordlen]), and only for
   character codes below 0x80.
   NOTE(review): the statement that advances s.wordlen is not among the
   lines visible here. */
282 static void CC_FASTCALL
283 add_char(unsigned char c
)
285 if(s
.wordlen
< WWW_CONF_WEBPAGE_WIDTH
- 1 && c
< 0x80) {
286 s
.word
[s
.wordlen
] = c
;
290 /*-----------------------------------------------------------------------------------*/
295 if(s
.majorstate
== MAJORSTATE_LINK
) {
296 if(s
.word
[s
.wordlen
] != ISO_space
) {
299 } else if(s
.majorstate
== MAJORSTATE_DISCARD
) {
302 s
.word
[s
.wordlen
] = '\0';
303 htmlparser_word(s
.word
, s
.wordlen
);
308 /*-----------------------------------------------------------------------------------*/
313 htmlparser_newline();
315 /*-----------------------------------------------------------------------------------*/
316 static unsigned char CC_FASTCALL
319 static unsigned char first
, last
, i
, tabi
;
330 tags
[first
][i
] == 0) {
336 /* First, find first matching tag from table. */
337 while(tagc
> (tags
[tabi
])[i
] &&
343 /* Second, find last matching tag from table. */
344 while(tagc
== (tags
[tabi
])[i
] &&
350 /* If first and last matching tags are equal, we have a non-match
351 and return. Else we continue with the next character. */
354 } while(last
!= first
);
357 /*-----------------------------------------------------------------------------------*/
361 static char *tagattrparam
;
362 static unsigned char size
;
366 PRINTF(("Parsing tag '%s' '%s' '%s'\n",
367 s
.tag
, s
.tagattr
, s
.tagattrparam
));
369 switch(find_tag(s
.tag
)) {
375 /* parse_char(ISO_nl);*/
381 /* parse_char(ISO_nl);*/
387 add_char(ISO_asterisk
);
393 switch_majorstate(MAJORSTATE_DISCARD
);
395 case TAG_SLASHSCRIPT
:
397 case TAG_SLASHSELECT
:
399 switch_majorstate(s
.lastmajorstate
);
402 s
.majorstate
= s
.lastmajorstate
= MAJORSTATE_BODY
;
405 if(strncmp(s
.tagattr
, html_src
, sizeof(html_src
)) == 0 &&
406 s
.tagattrparam
[0] != 0) {
407 switch_majorstate(MAJORSTATE_BODY
);
409 add_char(ISO_rbrack
);
411 htmlparser_link((char *)html_frame
, (unsigned char)strlen(html_frame
), s
.tagattrparam
);
412 PRINTF(("Frame [%s]\n", s
.tagattrparam
));
413 add_char(ISO_lbrack
);
418 if(strncmp(s
.tagattr
, html_alt
, sizeof(html_alt
)) == 0 &&
419 s
.tagattrparam
[0] != 0) {
420 /* parse_char(ISO_lt);*/
422 tagattrparam
= &s
.tagattrparam
[0];
423 while(*tagattrparam
) {
424 /* parse_char(*tagattrparam);*/
425 add_char(*tagattrparam
);
428 /* parse_char(ISO_gt);*/
434 PRINTF(("A %s %s\n", s
.tagattr
, s
.tagattrparam
));
435 if(strncmp(s
.tagattr
, html_href
, sizeof(html_href
)) == 0 &&
436 s
.tagattrparam
[0] != 0) {
437 strcpy(s
.linkurl
, s
.tagattrparam
);
439 switch_majorstate(MAJORSTATE_LINK
);
443 if(s
.majorstate
== MAJORSTATE_LINK
) {
444 switch_majorstate(s
.lastmajorstate
);
445 s
.word
[s
.wordlen
] = 0;
446 htmlparser_link(s
.word
, s
.wordlen
, s
.linkurl
);
452 PRINTF(("Form tag\n"));
453 switch_majorstate(MAJORSTATE_FORM
);
454 if(strncmp(s
.tagattr
, html_action
, sizeof(html_action
)) == 0) {
455 PRINTF(("Form action '%s'\n", s
.tagattrparam
));
456 strncpy(s
.formaction
, s
.tagattrparam
, WWW_CONF_MAX_FORMACTIONLEN
- 1);
457 } else if(strncmp(s
.tagattr
, html_name
, sizeof(html_name
)) == 0) {
458 PRINTF(("Form name '%s'\n", s
.tagattrparam
));
459 strncpy(s
.formname
, s
.tagattrparam
, WWW_CONF_MAX_FORMNAMELEN
- 1);
461 s
.inputname
[0] = s
.inputvalue
[0] = 0;
464 switch_majorstate(MAJORSTATE_BODY
);
465 s
.formaction
[0] = s
.formname
[0] = 0;
468 if(s
.majorstate
== MAJORSTATE_FORM
) {
469 /* First check if we are called at the end of an input tag. If
470 so, we should render the input widget. */
471 if(s
.tagattr
[0] == 0 &&
472 s
.inputname
[0] != 0) {
473 PRINTF(("Render input type %d\n", s
.inputtype
));
474 switch(s
.inputtype
) {
475 case HTMLPARSER_INPUTTYPE_NONE
:
476 case HTMLPARSER_INPUTTYPE_TEXT
:
477 s
.inputvalue
[s
.inputvaluesize
] = 0;
478 htmlparser_inputfield(s
.inputvaluesize
, s
.inputvalue
, s
.inputname
,
479 s
.formname
, s
.formaction
);
481 case HTMLPARSER_INPUTTYPE_SUBMIT
:
482 case HTMLPARSER_INPUTTYPE_IMAGE
:
483 htmlparser_submitbutton(s
.inputvalue
, s
.inputname
,
484 s
.formname
, s
.formaction
);
487 s
.inputtype
= HTMLPARSER_INPUTTYPE_NONE
;
489 PRINTF(("Input '%s' '%s'\n", s
.tagattr
, s
.tagattrparam
));
490 if(strncmp(s
.tagattr
, html_type
, sizeof(html_type
)) == 0) {
491 if(strncmp(s
.tagattrparam
, html_submit
,
492 sizeof(html_submit
)) == 0) {
493 s
.inputtype
= HTMLPARSER_INPUTTYPE_SUBMIT
;
494 } else if(strncmp(s
.tagattrparam
, html_image
,
495 sizeof(html_image
)) == 0) {
496 s
.inputtype
= HTMLPARSER_INPUTTYPE_IMAGE
;
497 } else if(strncmp(s
.tagattrparam
, html_text
,
498 sizeof(html_text
)) == 0) {
499 s
.inputtype
= HTMLPARSER_INPUTTYPE_TEXT
;
501 s
.inputtype
= HTMLPARSER_INPUTTYPE_OTHER
;
503 } else if(strncmp(s
.tagattr
, html_name
,
504 sizeof(html_name
)) == 0) {
505 strncpy(s
.inputname
, s
.tagattrparam
,
506 WWW_CONF_MAX_INPUTNAMELEN
);
507 } else if(strncmp(s
.tagattr
, html_alt
,
508 sizeof(html_alt
)) == 0 &&
509 s
.inputtype
== HTMLPARSER_INPUTTYPE_IMAGE
) {
510 strncpy(s
.inputvalue
, s
.tagattrparam
,
511 WWW_CONF_MAX_INPUTVALUELEN
);
512 } else if(strncmp(s
.tagattr
, html_value
,
513 sizeof(html_value
)) == 0) {
514 strncpy(s
.inputvalue
, s
.tagattrparam
,
515 WWW_CONF_MAX_INPUTVALUELEN
);
516 } else if(strncmp(s
.tagattr
, html_size
,
517 sizeof(html_size
)) == 0) {
519 if(s
.tagattrparam
[0] >= '0' &&
520 s
.tagattrparam
[0] <= '9') {
521 size
= s
.tagattrparam
[0] - '0';
522 if(s
.tagattrparam
[1] >= '0' &&
523 s
.tagattrparam
[1] <= '9') {
524 size
= size
* 10 + (s
.tagattrparam
[1] - '0');
527 if(size
>= WWW_CONF_MAX_INPUTVALUELEN
) {
528 size
= WWW_CONF_MAX_INPUTVALUELEN
- 1;
530 s
.inputvaluesize
= size
;
531 /* strncpy(s.inputvalue, s.tagattrparam,
532 WWW_CONF_MAX_INPUTVALUELEN);*/
538 #endif /* WWW_CONF_FORMS */
539 #if WWW_CONF_RENDERSTATE
541 /* parse_char(ISO_nl); */
543 htmlparser_renderstate(HTMLPARSER_RENDERSTATE_BEGIN
|
544 HTMLPARSER_RENDERSTATE_CENTER
);
546 case TAG_SLASHCENTER
:
547 /* parse_char(ISO_nl);*/
549 htmlparser_renderstate(HTMLPARSER_RENDERSTATE_END
|
550 HTMLPARSER_RENDERSTATE_CENTER
);
552 #endif /* WWW_CONF_RENDERSTATE */
555 /*-----------------------------------------------------------------------------------*/
/*
 * parse_word(): run the minor-state machine over a buffer of HTML input.
 *
 * data  points at the input characters.
 * dlen  is the number of characters offered; being u8_t it cannot
 *       exceed 255.
 *
 * The function switches on s.minorstate and, inside each case, walks the
 * input with index i, moving s.minorstate to another MINORSTATE_* value
 * when a delimiter is recognised.
 *
 * NOTE(review): the loops are bounded by len rather than dlen, and the
 * declarations of len, i and c as well as the return statement are not
 * among the lines visible here. htmlparser_parse() stores the result in
 * plen, so the return value is presumably the number of characters
 * consumed -- confirm.
 *
 * NOTE(review): the limit checks for s.tag and s.tagattr compare the
 * index with == sizeof(...), while the checks for s.tagattrparam use
 * >= sizeof(...) - 1. Other code in this file stores a terminator at
 * s.tagattr[s.tagattrptr]; if the index can reach sizeof(...) first,
 * that store would land one element past the array -- confirm.
 */
557 parse_word(char *data
, u8_t dlen
)
565 switch(s
.minorstate
) {
/* Plain text: whitespace is tested first; ISO_lt switches to
   MINORSTATE_TAG and ISO_ampersand switches to MINORSTATE_EXTCHAR. */
566 case MINORSTATE_TEXT
:
567 for(i
= 0; i
< len
; ++i
) {
569 if(iswhitespace(c
)) {
571 } else if(c
== ISO_lt
) {
572 s
.minorstate
= MINORSTATE_TAG
;
576 } else if(c
== ISO_ampersand
) {
577 s
.minorstate
= MINORSTATE_EXTCHAR
;
/* Character entity: either ISO_semicolon or whitespace ends it and
   returns to MINORSTATE_TEXT. */
584 case MINORSTATE_EXTCHAR
:
585 for(i
= 0; i
< len
; ++i
) {
587 if(c
== ISO_semicolon
) {
588 s
.minorstate
= MINORSTATE_TEXT
;
591 } else if(iswhitespace(c
)) {
592 s
.minorstate
= MINORSTATE_TEXT
;
600 /* We are currently parsing within the name of a tag. We check
601 for the end of a tag (the '>' character) or whitespace (which
602 indicates that we should parse a tag attr argument
604 for(i
= 0; i
< len
; ++i
) {
607 /* Full tag found. We continue parsing regular text. */
608 s
.minorstate
= MINORSTATE_TEXT
;
609 s
.tagattrptr
= s
.tagattrparamptr
= 0;
613 } else if(iswhitespace(c
)) {
614 /* The name of the tag found. We continue parsing the tag
616 s
.minorstate
= MINORSTATE_TAGATTR
;
621 /* Keep track of the name of the tag, but convert it to
624 s
.tag
[s
.tagptr
] = lowercase(c
);
626 /* Check if the ->tag field is full. If so, we just eat up
627 any data left in the tag. */
628 if(s
.tagptr
== sizeof(s
.tag
)) {
629 s
.minorstate
= MINORSTATE_TAGEND
;
634 /* Check for HTML comment, indicated by <!-- */
636 s
.tag
[0] == ISO_bang
&&
637 s
.tag
[1] == ISO_dash
&&
638 s
.tag
[2] == ISO_dash
) {
639 PRINTF(("Starting comment...\n"));
640 s
.minorstate
= MINORSTATE_HTMLCOMMENT
;
/* Attribute name: whitespace after at least one stored character moves
   to MINORSTATE_TAGATTRSPACE, ISO_eq moves to MINORSTATE_TAGATTRPARAMNQ,
   and other characters are stored lower-cased in s.tagattr. A full
   s.tagattr buffer switches to MINORSTATE_TAGEND. */
647 case MINORSTATE_TAGATTR
:
648 /* We parse the "tag attr", i.e., the "href" in <a
650 for(i
= 0; i
< len
; ++i
) {
653 /* Full tag found. */
654 s
.minorstate
= MINORSTATE_TEXT
;
655 s
.tagattrparamptr
= 0;
662 } else if(iswhitespace(c
)) {
663 if(s
.tagattrptr
== 0) {
664 /* Discard leading spaces. */
666 /* A non-leading space is the end of the attribute. */
667 s
.tagattrparamptr
= 0;
670 s
.minorstate
= MINORSTATE_TAGATTRSPACE
;
675 } else if(c
== ISO_eq
) {
676 s
.minorstate
= MINORSTATE_TAGATTRPARAMNQ
;
677 s
.tagattrparamptr
= 0;
681 s
.tagattr
[s
.tagattrptr
] = lowercase(c
);
683 /* Check if the "tagattr" field is full. If so, we just eat
684 up any data left in the tag. */
685 if(s
.tagattrptr
== sizeof(s
.tagattr
)) {
686 s
.minorstate
= MINORSTATE_TAGEND
;
/* Space after an attribute name: further whitespace is skipped, ISO_eq
   starts the parameter (MINORSTATE_TAGATTRPARAMNQ), and any other
   character is stored as the first character of a new attribute name
   before returning to MINORSTATE_TAGATTR. */
692 case MINORSTATE_TAGATTRSPACE
:
693 for(i
= 0; i
< len
; ++i
) {
695 if(iswhitespace(c
)) {
696 /* Discard spaces. */
697 } else if(c
== ISO_eq
) {
698 s
.minorstate
= MINORSTATE_TAGATTRPARAMNQ
;
699 s
.tagattrparamptr
= 0;
704 s
.tagattr
[0] = lowercase(c
);
706 s
.minorstate
= MINORSTATE_TAGATTR
;
/* Parameter not (yet) known to be quoted: a quotation character seen
   before anything was stored switches to MINORSTATE_TAGATTRPARAM;
   whitespace after stored characters ends the parameter and returns to
   MINORSTATE_TAGATTR. */
711 case MINORSTATE_TAGATTRPARAMNQ
:
712 /* We are parsing the "tag attr parameter", i.e., the link part
713 in <a href="link">. */
714 for(i
= 0; i
< len
; ++i
) {
717 /* Full tag found. */
720 s
.minorstate
= MINORSTATE_TEXT
;
727 } else if(iswhitespace(c
) &&
728 s
.tagattrparamptr
== 0) {
729 /* Discard leading spaces. */
730 } else if((c
== ISO_citation
||
731 c
== ISO_citation2
) &&
732 s
.tagattrparamptr
== 0) {
733 s
.minorstate
= MINORSTATE_TAGATTRPARAM
;
735 PRINTF(("tag attr param q found\n"));
737 } else if(iswhitespace(c
)) {
738 PRINTF(("Non-leading space found at %d\n",
740 /* Stop parsing if a non-leading space was found */
744 s
.minorstate
= MINORSTATE_TAGATTR
;
749 s
.tagattrparam
[s
.tagattrparamptr
] = c
;
751 /* Check if the "tagattr" field is full. If so, we just eat
752 up any data left in the tag. */
753 if(s
.tagattrparamptr
>= sizeof(s
.tagattrparam
) - 1) {
754 s
.minorstate
= MINORSTATE_TAGEND
;
/* Quoted parameter: the character equal to s.quotechar ends it and
   returns to MINORSTATE_TAGATTR; whitespace inside the quotes is stored
   as ISO_space, other characters are stored unchanged. */
760 case MINORSTATE_TAGATTRPARAM
:
761 /* We are parsing the "tag attr parameter", i.e., the link
762 part in <a href="link">. */
763 for(i
= 0; i
< len
; ++i
) {
765 if(c
== s
.quotechar
) {
766 /* Found end of tag attr parameter. */
770 s
.minorstate
= MINORSTATE_TAGATTR
;
775 if(iswhitespace(c
)) {
776 s
.tagattrparam
[s
.tagattrparamptr
] = ISO_space
;
778 s
.tagattrparam
[s
.tagattrparamptr
] = c
;
782 /* Check if the "tagattr" field is full. If so, we just eat
783 up any data left in the tag. */
784 if(s
.tagattrparamptr
>= sizeof(s
.tagattrparam
) - 1) {
785 s
.minorstate
= MINORSTATE_TAGEND
;
/* Inside an HTML comment: ISO_gt seen while s.tagptr is non-zero ends
   the comment and returns to MINORSTATE_TEXT.
   NOTE(review): how s.tagptr is maintained in this state is not visible
   in this excerpt. */
791 case MINORSTATE_HTMLCOMMENT
:
792 for(i
= 0; i
< len
; ++i
) {
796 } else if(c
== ISO_gt
&& s
.tagptr
> 0) {
797 PRINTF(("Comment done.\n"));
798 s
.minorstate
= MINORSTATE_TEXT
;
805 case MINORSTATE_TAGEND
:
806 /* Discard characters until a '>' is seen. */
807 for(i
= 0; i
< len
; ++i
) {
808 if(data
[i
] == ISO_gt
) {
809 s
.minorstate
= MINORSTATE_TEXT
;
826 /*-----------------------------------------------------------------------------------*/
/* Entry point of the parser: hand datalen characters of HTML, starting
   at data, to parse_word().
   parse_word() takes its length as u8_t, so it is called either with
   the constant 255 or with datalen cast to u8_t; the result of each
   call is stored in plen.
   NOTE(review): the condition that selects between the two calls and
   the code that uses plen are not among the lines visible here. */
828 htmlparser_parse(char *data
, u16_t datalen
)
834 plen
= parse_word(data
, 255);
836 plen
= parse_word(data
, (u8_t
)datalen
);
842 /*-----------------------------------------------------------------------------------*/