scriptindex: Fix weird error cases
[xapian.git] / xapian-applications / omega / htmlparse.cc
blobd1ebaa810ca70a91d915693700f1bc69d4195ee6
1 /* htmlparse.cc: simple HTML parser for omega indexer
3 * Copyright 1999,2000,2001 BrightStation PLC
4 * Copyright 2001 Ananova Ltd
5 * Copyright 2002,2006,2007,2008,2009,2010,2011,2012,2015,2016,2018,2020 Olly Betts
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
20 * USA
23 #include <config.h>
25 #include "htmlparse.h"
27 #include <xapian.h>
29 #include "keyword.h"
30 #include "namedents.h"
31 #include "stringutils.h"
32 #include "utf8convert.h"
34 #include <algorithm>
36 #include <cctype>
37 #include <cstring>
38 #include <cstdio>
39 #include <cstdlib>
41 using namespace std;
43 // HTML5 legacy compatibility doctype.
44 #define HTML5_LEGACY_COMPAT "about:legacy-compat"
45 #define HTML5_LEGACY_COMPAT_LEN CONST_STRLEN(HTML5_LEGACY_COMPAT)
47 static inline void
48 lowercase_string(string &str)
50 for (string::iterator i = str.begin(); i != str.end(); ++i) {
51 *i = C_tolower(*i);
55 static inline bool
56 p_nottag(char c)
58 // ':' for XML namespaces.
59 return !C_isalnum(c) && c != '.' && c != '-' && c != ':';
62 static inline bool
63 p_whitespacegt(char c)
65 return C_isspace(c) || c == '>';
68 static inline bool
69 p_whitespaceeqgt(char c)
71 return C_isspace(c) || c == '=' || c == '>';
74 bool
75 HtmlParser::get_parameter(const string & param, string & value) const
77 map<string, string>::const_iterator i = parameters.find(param);
78 if (i == parameters.end()) return false;
79 value = i->second;
80 return true;
83 // UTF-8 encoded entity is always <= the entity itself in length, even if the
84 // trailing ';' is missing - for numeric (decimal and hex) entities:
86 // <= UTF-8 &#<..> &#x<..>
87 // U+007F 1 5 5
88 // U+07FF 2 6 6
89 // U+FFFF 3 7 7
90 // U+1FFFFF 4 9 9
91 // U+3FFFFFF 5 10 10
92 // U+7FFFFFFF 6 12 11
94 // Also true for named entities. This means we can work in-place within the
95 // string.
97 void
98 HtmlParser::decode_entities(string &s)
100 string::iterator out = s.begin();
101 string::iterator in = out;
102 string::iterator amp = in;
103 while ((amp = find(amp, s.end(), '&')) != s.end()) {
104 unsigned int val = 0;
105 string::iterator end, p = amp + 1;
106 if (p != s.end() && *p == '#') {
107 ++p;
108 if (p != s.end() && (*p == 'x' || *p == 'X')) {
109 // hex
110 while (++p != s.end() && C_isxdigit(*p)) {
111 val = (val << 4) | hex_digit(*p);
113 end = p;
114 } else {
115 // number
116 while (p != s.end() && C_isdigit(*p)) {
117 val = val * 10 + (*p - '0');
118 ++p;
120 end = p;
122 } else {
123 end = find_if(p, s.end(), C_isnotalnum);
124 int k = keyword2(tab, s.data() + (p - s.begin()), end - p);
125 if (k >= 0) val = named_ent_codepoint[k];
127 if (end != s.end() && *end == ';') ++end;
128 if (val) {
129 if (in != out) {
130 out = copy(in, amp, out);
131 } else {
132 out = amp;
134 in = end;
135 if (val < 0x80) {
136 *out++ = char(val);
137 } else {
138 // Convert unicode value val to UTF-8.
139 char seq[4];
140 unsigned len = Xapian::Unicode::nonascii_to_utf8(val, seq);
141 out = copy(seq, seq + len, out);
144 amp = end;
147 if (in != out) {
148 s.erase(out, in);
152 void
153 HtmlParser::parse(const string& body)
155 // Check for BOM.
156 string::const_iterator begin_after_bom = body.begin();
157 if (body.size() >= 3) {
158 switch (body[0]) {
159 case '\xef':
160 if (body[1] == '\xbb' && body[2] == '\xbf') {
161 charset = "utf-8";
162 begin_after_bom += 3;
164 break;
165 case '\xfe':
166 case '\xff':
167 // Match either \xfe\xff or \xff\xfe.
168 if ((body[1] ^ body[0]) == 1) {
169 // Convert to "utf-16" which will remove the BOM for us.
170 string utf8_body;
171 convert_to_utf8(body, "utf-16", utf8_body);
172 charset = "utf-8";
173 parse(utf8_body);
174 return;
176 break;
180 in_script = false;
182 parameters.clear();
183 string::const_iterator start = begin_after_bom;
185 while (true) {
186 // Skip through until we find an HTML tag, a comment, or the end of
187 // document. Ignore isolated occurrences of '<' which don't start
188 // a tag or comment.
189 string::const_iterator p = start;
190 while (true) {
191 p = find(p, body.end(), '<');
192 if (p == body.end()) break;
193 unsigned char ch = *(p + 1);
195 // Tag, closing tag, or comment (or SGML declaration).
196 if ((!in_script && C_isalpha(ch)) || ch == '/' || ch == '!') break;
198 if (ch == '?') {
199 // PHP code or XML declaration.
200 // XML declaration is only valid at the start of the first line.
201 if (p != begin_after_bom || body.size() < 20) break;
203 // XML declaration looks something like this:
204 // <?xml version="1.0" encoding="UTF-8"?>
205 if (p[2] != 'x' || p[3] != 'm' || p[4] != 'l') break;
206 if (strchr(" \t\r\n", p[5]) == NULL) break;
208 string::const_iterator decl_end = find(p + 6, body.end(), '?');
209 if (decl_end == body.end()) break;
211 // Default charset for XML is UTF-8.
212 charset = "utf-8";
214 string decl(p + 6, decl_end);
215 size_t enc = decl.find("encoding");
216 if (enc == string::npos) break;
218 enc = decl.find_first_not_of(" \t\r\n", enc + 8);
219 if (enc == string::npos || enc == decl.size()) break;
221 if (decl[enc] != '=') break;
223 enc = decl.find_first_not_of(" \t\r\n", enc + 1);
224 if (enc == string::npos || enc == decl.size()) break;
226 if (decl[enc] != '"' && decl[enc] != '\'') break;
228 char quote = decl[enc++];
229 size_t enc_end = decl.find(quote, enc);
231 if (enc != string::npos)
232 charset.assign(decl, enc, enc_end - enc);
234 break;
236 ++p;
239 // Process text up to start of tag.
240 if (p > start) {
241 string text(body, start - body.begin(), p - start);
242 convert_to_utf8(text, charset);
243 decode_entities(text);
244 process_text(text);
247 if (p == body.end()) break;
249 start = p + 1;
251 if (start == body.end()) break;
253 if (*start == '!') {
254 if (++start == body.end()) break;
256 // Comment, SGML declaration, or HTML5 DTD.
257 char first_ch = *start;
258 if (++start == body.end()) break;
259 if (first_ch == '-' && *start == '-') {
260 ++start;
261 string::const_iterator close = find(start, body.end(), '>');
262 // An unterminated comment swallows rest of document
263 // (like Netscape, but unlike MSIE IIRC)
264 if (close == body.end()) break;
266 p = close;
267 // look for -->
268 while (p != body.end() && (*(p - 1) != '-' || *(p - 2) != '-'))
269 p = find(p + 1, body.end(), '>');
271 if (p != body.end()) {
272 // Check for htdig's "ignore this bit" comments.
273 if (p - start == CONST_STRLEN("htdig_noindex") + 2 &&
274 memcmp(&*start, "htdig_noindex",
275 CONST_STRLEN("htdig_noindex")) == 0) {
276 auto i = body.find("<!--/htdig_noindex-->",
277 p + 1 - body.begin());
278 if (i == string::npos) break;
279 start = body.begin() + i +
280 CONST_STRLEN("<!--/htdig_noindex-->");
281 continue;
283 // Check for udmcomment (similar to htdig's)
284 if (p - start == CONST_STRLEN("UdmComment") + 2 &&
285 memcmp(&*start, "UdmComment",
286 CONST_STRLEN("UdmComment")) == 0) {
287 auto i = body.find("<!--/UdmComment-->",
288 p + 1 - body.begin());
289 if (i == string::npos) break;
290 start = body.begin() + i +
291 CONST_STRLEN("<!--/UdmComment-->");
292 continue;
294 // If we found --> skip to there.
295 start = p;
296 } else {
297 // Otherwise skip to the first > we found (as Netscape does).
298 start = close;
300 } else if (first_ch == '[' &&
301 body.size() - (start - body.begin()) > 6 &&
302 body.compare(start - body.begin(), 6, "CDATA[", 6) == 0) {
303 start += 6;
304 string::size_type b = start - body.begin();
305 string::size_type i;
306 i = body.find("]]>", b);
307 string text(body, b, i - b);
308 convert_to_utf8(text, charset);
309 process_text(text);
310 if (i == string::npos) break;
311 start = body.begin() + i + 2;
312 } else if (C_tolower(first_ch) == 'd' &&
313 body.end() - start > 6 &&
314 C_tolower(start[0]) == 'o' &&
315 C_tolower(start[1]) == 'c' &&
316 C_tolower(start[2]) == 't' &&
317 C_tolower(start[3]) == 'y' &&
318 C_tolower(start[4]) == 'p' &&
319 C_tolower(start[5]) == 'e' &&
320 C_isspace(start[6])) {
321 // DOCTYPE declaration.
322 start += 7;
323 while (start != body.end() && C_isspace(*start)) {
324 ++start;
326 if (start == body.end()) break;
327 if (body.end() - start >= 5 &&
328 C_tolower(start[0]) == 'h' &&
329 C_tolower(start[1]) == 't' &&
330 C_tolower(start[2]) == 'm' &&
331 C_tolower(start[3]) == 'l' &&
332 (start[4] == '>' || C_isspace(start[4]))) {
333 start += 4;
335 // HTML doctype.
336 while (start != body.end() && C_isspace(*start)) {
337 ++start;
339 if (start == body.end()) break;
341 if (*start == '>') {
342 // <!DOCTYPE html>
343 // Default charset for HTML5 is UTF-8.
344 charset = "utf-8";
346 } else if (body.end() - start >= 29 &&
347 C_tolower(start[0]) == 's' &&
348 C_tolower(start[1]) == 'y' &&
349 C_tolower(start[2]) == 's' &&
350 C_tolower(start[3]) == 't' &&
351 C_tolower(start[4]) == 'e' &&
352 C_tolower(start[5]) == 'm' &&
353 C_isspace(start[6])) {
354 start += 7;
355 while (start != body.end() && C_isspace(*start)) {
356 ++start;
358 size_t left = body.end() - start;
359 if (left >= HTML5_LEGACY_COMPAT_LEN + 3 &&
360 (*start == '\'' || *start == '"') &&
361 start[HTML5_LEGACY_COMPAT_LEN + 1] == *start &&
362 body.compare(start - body.begin() + 1,
363 HTML5_LEGACY_COMPAT_LEN,
364 HTML5_LEGACY_COMPAT,
365 HTML5_LEGACY_COMPAT_LEN) == 0) {
366 // HTML5 legacy compatibility doctype:
367 // <!DOCTYPE html SYSTEM "about:legacy-compat">
368 start += HTML5_LEGACY_COMPAT_LEN + 2;
369 // Default charset for HTML5 is UTF-8.
370 charset = "utf-8";
373 start = find(start - 1, body.end(), '>');
374 if (start == body.end()) break;
375 } else {
376 // Some other SGML declaration - ignore it.
377 start = find(start - 1, body.end(), '>');
378 if (start == body.end()) break;
380 ++start;
381 } else if (*start == '?') {
382 if (++start == body.end()) break;
383 // PHP - swallow until ?> or EOF
384 start = find(start + 1, body.end(), '>');
386 // look for ?>
387 while (start != body.end() && *(start - 1) != '?')
388 start = find(start + 1, body.end(), '>');
390 if (start == body.end()) {
391 // The closing ?> at the end of a file is optional so ignore
392 // the rest of the document if there isn't one:
393 // https://www.php.net/basic-syntax.instruction-separation
394 } else {
395 // PHP ignores an immediately trailing newline after the
396 // closing tag:
397 // https://www.php.net/basic-syntax.instruction-separation
398 // Testing shows \n, \r and \r\n are skipped.
399 ++start;
400 if (*start == '\r') ++start;
401 if (*start == '\n') ++start;
403 } else {
404 // opening or closing tag
405 int closing = 0;
407 if (*start == '/') {
408 closing = 1;
409 start = find_if(start + 1, body.end(), C_isnotspace);
412 p = start;
413 start = find_if(start, body.end(), p_nottag);
414 string tag(body, p - body.begin(), start - p);
415 // convert tagname to lowercase
416 lowercase_string(tag);
418 if (closing) {
419 if (!closing_tag(tag))
420 return;
421 if (in_script && tag == "script") in_script = false;
423 /* ignore any bogus parameters on closing tags */
424 p = find(start, body.end(), '>');
425 if (p == body.end()) break;
426 start = p + 1;
427 } else {
428 bool empty_element = false;
429 // FIXME: parse parameters lazily.
430 while (start < body.end() && *start != '>') {
431 string name, value;
433 p = find_if(start, body.end(), p_whitespaceeqgt);
435 size_t name_len = p - start;
436 if (name_len == 1) {
437 if (*start == '/' && p < body.end() && *p == '>') {
438 // E.g. <tag foo="bar" />
439 start = p;
440 empty_element = true;
441 break;
445 name.assign(body, start - body.begin(), name_len);
447 p = find_if(p, body.end(), C_isnotspace);
449 start = p;
450 if (start != body.end() && *start == '=') {
451 start = find_if(start + 1, body.end(), C_isnotspace);
453 p = body.end();
455 int quote = *start;
456 if (quote == '"' || quote == '\'') {
457 ++start;
458 p = find(start, body.end(), quote);
461 if (p != body.end()) {
462 // quoted
463 value.assign(body, start - body.begin(), p - start);
464 ++p;
465 } else {
466 // unquoted or no closing quote
467 p = find_if(start, body.end(), p_whitespacegt);
468 value.assign(body, start - body.begin(), p - start);
470 start = find_if(p, body.end(), C_isnotspace);
472 if (!name.empty()) {
473 // convert parameter name to lowercase
474 lowercase_string(name);
475 // in case of multiple entries, use the first
476 // (as Netscape does)
477 parameters.insert(make_pair(name, value));
481 #if 0
482 cout << "<" << tag;
483 map<string, string>::const_iterator x;
484 for (x = parameters.begin(); x != parameters.end(); ++x) {
485 cout << " " << x->first << "=\"" << x->second << "\"";
487 cout << ">\n";
488 #endif
489 if (!opening_tag(tag))
490 return;
491 parameters.clear();
493 if (empty_element) {
494 if (!closing_tag(tag))
495 return;
498 // In <script> tags we ignore opening tags to avoid problems
499 // with "a<b".
500 if (tag == "script") in_script = true;
502 if (start != body.end() && *start == '>') ++start;