6 #include "../hunspell/csutil.hxx"
7 #include "htmlparser.hxx"
// Tokenizer states; HTMLParser::next_token() has one `case` per state.
14 enum { ST_NON_WORD // between words: looks for pattern openers, word chars, '&'
, ST_WORD // inside a run of word characters
, ST_TAG // inside a PATTERN region: looks for its closer, attributes, quotes
, ST_CHAR_ENTITY // after '&': runs until ';'
, ST_OTHER_TAG // NOTE(review): no use visible in the next_token() cases in view - confirm still needed
, ST_ATTRIB // inside quoted text: runs until the character stored in quotmark
};
// Delimiter pairs, one per row: column 0 is the opening string, column 1
// the closing string. next_token() looks up column 0 from its ST_NON_WORD
// case and column 1 from its ST_TAG case, both via look_pattern().
// Entries are written in lowercase because look_pattern() lowercases each
// input character before comparing it with the table.
16 static const char * PATTERN
[][2] = {
17 { "<script", "</script>" },
18 { "<style", "</style>" },
19 { "<code", "</code>" },
20 { "<samp", "</samp>" },
23 { "<listing", "</listing>" },
24 { "<address", "</address>" },
27 { "<[cdata[", "]]>" }, // CDATA-like section, not a comment. NOTE(review): XML spells the opener "<![CDATA[" (with '!') - confirm the missing '!' is intended
// Number of rows in PATTERN: total array size divided by the size of one
// row (two char pointers).
31 #define PATTERN_LEN (sizeof(PATTERN) / (sizeof(char *) * 2))
// Tag/attribute pairs, one per row: column 0 is a tag opener, column 1 an
// attribute prefix. next_token() looks up column 0 from its ST_NON_WORD case
// and column 1 from its ST_TAG case, where a column-1 hit is accepted only
// if its row names the same tag as the row remembered in pattern2_num.
33 static const char * PATTERN2
[][2] = {
34 { "<img", "alt=" }, // ALT and TITLE attrib handled spec.
// Number of rows in PATTERN2: total array size divided by the size of one
// row (two char pointers).
39 #define PATTERN_LEN2 (sizeof(PATTERN2) / (sizeof(char *) * 2))
41 HTMLParser::HTMLParser(const char * wordchars
)
46 HTMLParser::HTMLParser(unsigned short * wordchars
, int len
)
51 HTMLParser::~HTMLParser()
// Compares the text at the current position (line[actual] + head) with
// column `column` of each of the `len` rows of table `p`, in ascending row
// order.
//   p      - table of string pairs (PATTERN or PATTERN2 at the call sites)
//   len    - number of rows in p
//   column - which string of each pair to test (call sites pass 0 or 1)
// Returns the index of the first row whose string was matched through to its
// terminating '\0'. Callers compare the result with -1 to detect "no match".
// Each input character is lowercased before the comparison, so only
// lowercase table entries can match.
// NOTE(review): tolower() receives a plain char. For negative values (bytes
// >= 0x80 where char is signed) that is undefined behaviour - consider
// casting to unsigned char first.
56 int HTMLParser::look_pattern(const char * p
[][2], unsigned int len
, int column
)
58 for (unsigned int i
= 0; i
< len
; i
++) {
// j: cursor into the input text, starting at the current position
59 char * j
= line
[actual
] + head
;
// k: cursor into the candidate string of row i
60 const char * k
= p
[i
][column
];
// continue while the candidate is not exhausted and the characters agree
61 while ((*k
!= '\0') && (tolower(*j
) == *k
)) {
// candidate exhausted: the whole string matched, report its row
65 if (*k
== '\0') return i
;
// Tokenizer step function: a state machine that dispatches on the ST_*
// states and inspects the character at line[actual][head].
// Returns NULL when next_char() reports a nonzero result.
// NOTE(review): the ST_WORD case obtains a token from alloc_token();
// presumably that pointer is what gets returned to the caller - confirm.
76 char * HTMLParser::next_token()
81 //fprintf(stderr, "%d:%c:%s\n", state, line[actual][head], line[actual]);
85 case ST_NON_WORD
: // non word chars
// remember this state (presumably so ST_CHAR_ENTITY can return to it - confirm)
86 prevstate
= ST_NON_WORD
;
// does a PATTERN opener (column 0) start here? remember which row
87 if ((pattern_num
= look_pattern(PATTERN
, PATTERN_LEN
, 0)) != -1) {
// does a PATTERN2 tag opener (column 0) start here? remember which row
89 if ((pattern2_num
= look_pattern(PATTERN2
, PATTERN_LEN2
, 0)) != -1) {
93 } else if (is_wordchar(line
[actual
] + head
)) {
// get_latin1() returned a string: advance head by that string's length
96 } else if ((latin1
= get_latin1(line
[actual
] + head
))) {
99 head
+= strlen(latin1
);
// '&' starts a character entity
100 } else if (line
[actual
][head
] == '&') {
101 state
= ST_CHAR_ENTITY
;
104 case ST_WORD
: // wordchar
// same get_latin1() handling as in ST_NON_WORD: advance by the returned string's length
105 if ((latin1
= get_latin1(line
[actual
] + head
))) {
106 head
+= strlen(latin1
);
// first non-word character: a token is allocated via alloc_token()
107 } else if (! is_wordchar(line
[actual
] + head
)) {
109 char * t
= alloc_token(token
, &head
);
113 case ST_TAG
: // comment, labels, etc
// With checkattr == 1, look for a PATTERN2 attribute prefix (column 1) and
// accept it only when its row names the same tag (column 0) as the row
// recorded in pattern2_num.
115 if ((checkattr
== 1) && ((i
= look_pattern(PATTERN2
, PATTERN_LEN2
, 1)) != -1)
116 && (strcmp(PATTERN2
[i
][0],PATTERN2
[pattern2_num
][0]) == 0)) {
// '>' while checkattr is nonzero
118 } else if ((checkattr
> 0) && (line
[actual
][head
] == '>')) {
// A PATTERN closer (column 1) matches here and it equals the closer of the
// row recorded in pattern_num.
120 } else if (((i
= look_pattern(PATTERN
, PATTERN_LEN
, 1)) != -1) &&
121 (strcmp(PATTERN
[i
][1],PATTERN
[pattern_num
][1]) == 0)) {
// Skip the closer except for its last character.
// NOTE(review): the "- 1" presumably leaves the final advance to next_char() - confirm.
123 head
+= strlen(PATTERN
[pattern_num
][1]) - 1;
// The recorded opener is exactly "<" and the current character is a
// quote: remember which quote character was used.
124 } else if ( (strcmp(PATTERN
[pattern_num
][0], "<") == 0) &&
125 ((line
[actual
][head
] == '"') || (line
[actual
][head
] == '\''))) {
126 quotmark
= line
[actual
][head
];
130 case ST_ATTRIB
: // quoted text, up to the character stored in quotmark
131 prevstate
= ST_ATTRIB
;
// the character equals quotmark: checkattr drops from 2 back to 1
132 if (line
[actual
][head
] == quotmark
) {
134 if (checkattr
== 2) checkattr
= 1;
// word characters are only considered here while checkattr == 2
136 } else if (is_wordchar(line
[actual
] + head
) && (checkattr
== 2)) {
// '&' starts a character entity
139 } else if (line
[actual
][head
] == '&') {
140 state
= ST_CHAR_ENTITY
;
143 case ST_CHAR_ENTITY
: // character entity (entered on '&'), runs until ';'
// NOTE(review): tolower() cannot change whether a character equals ';', and it
// is passed a plain char (undefined for negative values) - a direct
// comparison would be equivalent for valid input.
144 if ((tolower(line
[actual
][head
]) == ';')) {
// a nonzero result from next_char() ends this call with NULL
149 if (next_char(line
[actual
], &head
)) return NULL
;