1 /* htmlparse.cc: simple HTML parser for omega indexer
3 * Copyright 1999,2000,2001 BrightStation PLC
4 * Copyright 2001 Ananova Ltd
5 * Copyright 2002,2006,2007,2008,2009,2010,2011,2012,2015,2016,2018,2020 Olly Betts
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
25 #include "htmlparse.h"
30 #include "namedents.h"
31 #include "stringutils.h"
32 #include "utf8convert.h"
43 // HTML5 legacy compatibility doctype.
44 #define HTML5_LEGACY_COMPAT "about:legacy-compat"
45 #define HTML5_LEGACY_COMPAT_LEN CONST_STRLEN(HTML5_LEGACY_COMPAT)
// Walks every character of str through a mutable iterator. parse() calls this
// on tag names and parameter names to convert them to lowercase. The loop
// body is not visible in this listing.
48 lowercase_string(string
&str
)
50 for (string::iterator i
= str
.begin(); i
!= str
.end(); ++i
) {
// Character test: true when c is neither alphanumeric nor one of '.', '-' or
// ':'. NOTE(review): the signature is not visible in this listing; this is
// presumably the body of p_nottag, which parse() passes to find_if to locate
// the end of a tag name.
58 // ':' for XML namespaces.
59 return !C_isalnum(c
) && c
!= '.' && c
!= '-' && c
!= ':';
// Predicate: true when c is whitespace (per C_isspace) or '>'. parse() uses it
// with find_if to find where an unquoted parameter value ends.
63 p_whitespacegt(char c
)
65 return C_isspace(c
) || c
== '>';
// Predicate: true when c is whitespace (per C_isspace), '=' or '>'. parse()
// uses it with find_if to find where a parameter name ends.
69 p_whitespaceeqgt(char c
)
71 return C_isspace(c
) || c
== '=' || c
== '>';
// Looks up param in the parameters map filled in by parse() (keys are the
// lowercased parameter names; when a name is repeated the first value is the
// one kept). Returns false when there is no entry for param. The handling of
// the found case (presumably storing the result in value) is not visible in
// this listing.
75 HtmlParser::get_parameter(const string
& param
, string
& value
) const
77 map
<string
, string
>::const_iterator i
= parameters
.find(param
);
// No such parameter on the current tag.
78 if (i
== parameters
.end()) return false;
83 // UTF-8 encoded entity is always <= the entity itself in length, even if the
84 // trailing ';' is missing - for numeric (decimal and hex) entities:
86 // <= UTF-8 &#<..> &#x<..>
94 // Also true for named entities. This means we can work in-place within the string.
// Decodes HTML character references in s: decimal numeric (&#NNN), hex
// numeric (&#xHHH or &#XHHH) and named entities looked up with keyword2().
// The decoded text is written back into s itself through the `out` iterator.
98 HtmlParser::decode_entities(string
&s
)
// out is the write position and in the read position; both begin at the
// start of s.
100 string::iterator out
= s
.begin();
101 string::iterator in
= out
;
102 string::iterator amp
= in
;
// Visit each '&' in s in turn.
103 while ((amp
= find(amp
, s
.end(), '&')) != s
.end()) {
// val holds the code point being decoded; it stays 0 unless one of the
// branches below assigns it.
104 unsigned int val
= 0;
105 string::iterator end
, p
= amp
+ 1;
// "&#" introduces a numeric character reference.
106 if (p
!= s
.end() && *p
== '#') {
// 'x' or 'X' selects hexadecimal digits.
108 if (p
!= s
.end() && (*p
== 'x' || *p
== 'X')) {
// Accumulate hex digits four bits at a time.
// NOTE(review): no overflow check is visible here, so an overlong digit
// run silently discards the high bits of val.
110 while (++p
!= s
.end() && C_isxdigit(*p
)) {
111 val
= (val
<< 4) | hex_digit(*p
);
// Accumulate decimal digits.
// NOTE(review): likewise no overflow check is visible; val wraps on an
// overlong digit run.
116 while (p
!= s
.end() && C_isdigit(*p
)) {
117 val
= val
* 10 + (*p
- '0');
// Named entity: the name runs up to the first non-alphanumeric character.
123 end
= find_if(p
, s
.end(), C_isnotalnum
);
// A non-negative k indexes named_ent_codepoint; otherwise val is left as it
// was.
124 int k
= keyword2(tab
, s
.data() + (p
- s
.begin()), end
- p
);
125 if (k
>= 0) val
= named_ent_codepoint
[k
];
// The terminating ';' is optional; step over it when present.
127 if (end
!= s
.end() && *end
== ';') ++end
;
// Copy the literal text from the read position up to this '&' down to the
// write position.
130 out
= copy(in
, amp
, out
);
138 // Convert unicode value val to UTF-8.
// seq receives the encoded bytes and len their count; those bytes are then
// copied to the write position.
140 unsigned len
= Xapian::Unicode::nonascii_to_utf8(val
, seq
);
141 out
= copy(seq
, seq
+ len
, out
);
// Parses the HTML document in body. A leading byte order mark is handled
// first; the code then repeatedly converts and entity-decodes the text up to
// the next tag, and processes what follows: comments (including the
// htdig_noindex and UdmComment markers), CDATA sections, DOCTYPE and other
// SGML declarations, PHP / XML declarations, and opening or closing tags with
// their parameters. Tags are reported through opening_tag() and closing_tag().
// NOTE(review): this listing omits some lines of the function, so comments
// below describe only the statements that are visible.
153 HtmlParser::parse(const string
& body
)
// Content starts here unless a byte order mark is found and skipped below.
156 string::const_iterator begin_after_bom
= body
.begin();
157 if (body
.size() >= 3) {
// Second and third bytes of the UTF-8 BOM (EF BB BF); the test of body[0] is
// not visible in this listing.
160 if (body
[1] == '\xbb' && body
[2] == '\xbf') {
// Step over the 3-byte BOM.
162 begin_after_bom
+= 3;
167 // Match either \xfe\xff or \xff\xfe.
// 0xfe ^ 0xff == 1, so a single XOR covers both byte orders.
// NOTE(review): presumably body[0] has already been narrowed to 0xfe or 0xff
// by code not visible here - confirm.
168 if ((body
[1] ^ body
[0]) == 1) {
169 // Convert to "utf-16" which will remove the BOM for us.
171 convert_to_utf8(body
, "utf-16", utf8_body
);
183 string::const_iterator start
= begin_after_bom
;
186 // Skip through until we find an HTML tag, a comment, or the end of
187 // document. Ignore isolated occurrences of '<' which don't start
189 string::const_iterator p
= start
;
191 p
= find(p
, body
.end(), '<');
192 if (p
== body
.end()) break;
// ch is the character following '<'.
// NOTE(review): when '<' is the last character of body, p + 1 equals
// body.end() and this dereferences the end iterator - confirm that is
// acceptable for the string implementation in use.
193 unsigned char ch
= *(p
+ 1);
195 // Tag, closing tag, or comment (or SGML declaration).
// While in_script is set a letter after '<' does not end the text run; only
// '/' and '!' do.
196 if ((!in_script
&& C_isalpha(ch
)) || ch
== '/' || ch
== '!') break;
199 // PHP code or XML declaration.
200 // XML declaration is only valid at the start of the first line.
201 if (p
!= begin_after_bom
|| body
.size() < 20) break;
203 // XML declaration looks something like this:
204 // <?xml version="1.0" encoding="UTF-8"?>
205 if (p
[2] != 'x' || p
[3] != 'm' || p
[4] != 'l') break;
206 if (strchr(" \t\r\n", p
[5]) == NULL
) break;
// The declaration is taken to end at the first '?' found from p + 6 onwards.
208 string::const_iterator decl_end
= find(p
+ 6, body
.end(), '?');
209 if (decl_end
== body
.end()) break;
211 // Default charset for XML is UTF-8.
// Look for: encoding, optional whitespace, '=', optional whitespace, then a
// quoted name. Any deviation abandons the attempt via break.
214 string
decl(p
+ 6, decl_end
);
215 size_t enc
= decl
.find("encoding");
216 if (enc
== string::npos
) break;
218 enc
= decl
.find_first_not_of(" \t\r\n", enc
+ 8);
219 if (enc
== string::npos
|| enc
== decl
.size()) break;
221 if (decl
[enc
] != '=') break;
223 enc
= decl
.find_first_not_of(" \t\r\n", enc
+ 1);
224 if (enc
== string::npos
|| enc
== decl
.size()) break;
226 if (decl
[enc
] != '"' && decl
[enc
] != '\'') break;
// Remember which quote opened the value and step past it.
228 char quote
= decl
[enc
++];
229 size_t enc_end
= decl
.find(quote
, enc
);
// NOTE(review): this tests enc, which has already been checked against npos
// above and then incremented; enc_end (the position of the closing quote) is
// the value that can be npos here. If the closing quote is missing,
// enc_end - enc is a very large count and assign() takes the rest of decl.
231 if (enc
!= string::npos
)
232 charset
.assign(decl
, enc
, enc_end
- enc
);
239 // Process text up to start of tag.
// text is the range [start, p) of body.
241 string
text(body
, start
- body
.begin(), p
- start
);
242 convert_to_utf8(text
, charset
);
243 decode_entities(text
);
247 if (p
== body
.end()) break;
251 if (start
== body
.end()) break;
254 if (++start
== body
.end()) break;
256 // Comment, SGML declaration, or HTML5 DTD.
257 char first_ch
= *start
;
258 if (++start
== body
.end()) break;
// "--" after "<!" starts a comment.
259 if (first_ch
== '-' && *start
== '-') {
261 string::const_iterator close
= find(start
, body
.end(), '>');
262 // An unterminated comment swallows rest of document
263 // (like Netscape, but unlike MSIE IIRC)
264 if (close
== body
.end()) break;
// Step p from one '>' to the next until one is directly preceded by "--", or
// the end of body is reached.
268 while (p
!= body
.end() && (*(p
- 1) != '-' || *(p
- 2) != '-'))
269 p
= find(p
+ 1, body
.end(), '>');
271 if (p
!= body
.end()) {
272 // Check for htdig's "ignore this bit" comments.
// The length must be that of the marker plus 2 (presumably the "--" that
// precedes '>').
273 if (p
- start
== CONST_STRLEN("htdig_noindex") + 2 &&
274 memcmp(&*start
, "htdig_noindex",
275 CONST_STRLEN("htdig_noindex")) == 0) {
// Resume just after the matching end marker; if there is none, stop
// processing the document.
276 auto i
= body
.find("<!--/htdig_noindex-->",
277 p
+ 1 - body
.begin());
278 if (i
== string::npos
) break;
279 start
= body
.begin() + i
+
280 CONST_STRLEN("<!--/htdig_noindex-->");
283 // Check for udmcomment (similar to htdig's)
284 if (p
- start
== CONST_STRLEN("UdmComment") + 2 &&
285 memcmp(&*start
, "UdmComment",
286 CONST_STRLEN("UdmComment")) == 0) {
// Same handling as for htdig_noindex, with the UdmComment end marker.
287 auto i
= body
.find("<!--/UdmComment-->",
288 p
+ 1 - body
.begin());
289 if (i
== string::npos
) break;
290 start
= body
.begin() + i
+
291 CONST_STRLEN("<!--/UdmComment-->");
294 // If we found --> skip to there.
297 // Otherwise skip to the first > we found (as Netscape does).
// "<![CDATA[" section: first_ch supplies the '[' and the compare() checks for
// "CDATA[" at start.
300 } else if (first_ch
== '[' &&
301 body
.size() - (start
- body
.begin()) > 6 &&
302 body
.compare(start
- body
.begin(), 6, "CDATA[", 6) == 0) {
304 string::size_type b
= start
- body
.begin();
// NOTE(review): when "]]>" is absent, i is npos and i - b is a very large
// count, so text runs to the end of body; the npos check only happens after
// that text has been converted.
306 i
= body
.find("]]>", b
);
307 string
text(body
, b
, i
- b
);
308 convert_to_utf8(text
, charset
);
310 if (i
== string::npos
) break;
// i + 2 is the position of the '>' that ends "]]>".
311 start
= body
.begin() + i
+ 2;
// Case-insensitive match of "doctype" followed by whitespace (first_ch
// supplies the 'd').
312 } else if (C_tolower(first_ch
) == 'd' &&
313 body
.end() - start
> 6 &&
314 C_tolower(start
[0]) == 'o' &&
315 C_tolower(start
[1]) == 'c' &&
316 C_tolower(start
[2]) == 't' &&
317 C_tolower(start
[3]) == 'y' &&
318 C_tolower(start
[4]) == 'p' &&
319 C_tolower(start
[5]) == 'e' &&
320 C_isspace(start
[6])) {
321 // DOCTYPE declaration.
323 while (start
!= body
.end() && C_isspace(*start
)) {
326 if (start
== body
.end()) break;
// Case-insensitive "html" followed by '>' or whitespace.
327 if (body
.end() - start
>= 5 &&
328 C_tolower(start
[0]) == 'h' &&
329 C_tolower(start
[1]) == 't' &&
330 C_tolower(start
[2]) == 'm' &&
331 C_tolower(start
[3]) == 'l' &&
332 (start
[4] == '>' || C_isspace(start
[4]))) {
336 while (start
!= body
.end() && C_isspace(*start
)) {
339 if (start
== body
.end()) break;
343 // Default charset for HTML5 is UTF-8.
// Case-insensitive "system" followed by whitespace. The minimum of 29 is
// presumably "SYSTEM" (6) + one space + two quotes + the 19-character
// legacy-compat URI + '>'.
346 } else if (body
.end() - start
>= 29 &&
347 C_tolower(start
[0]) == 's' &&
348 C_tolower(start
[1]) == 'y' &&
349 C_tolower(start
[2]) == 's' &&
350 C_tolower(start
[3]) == 't' &&
351 C_tolower(start
[4]) == 'e' &&
352 C_tolower(start
[5]) == 'm' &&
353 C_isspace(start
[6])) {
355 while (start
!= body
.end() && C_isspace(*start
)) {
358 size_t left
= body
.end() - start
;
// Require an opening quote, the same quote again right after a URI of the
// expected length, and the URI text itself to match.
359 if (left
>= HTML5_LEGACY_COMPAT_LEN
+ 3 &&
360 (*start
== '\'' || *start
== '"') &&
361 start
[HTML5_LEGACY_COMPAT_LEN
+ 1] == *start
&&
362 body
.compare(start
- body
.begin() + 1,
363 HTML5_LEGACY_COMPAT_LEN
,
365 HTML5_LEGACY_COMPAT_LEN
) == 0) {
366 // HTML5 legacy compatibility doctype:
367 // <!DOCTYPE html SYSTEM "about:legacy-compat">
// Step over the URI and both quotes.
368 start
+= HTML5_LEGACY_COMPAT_LEN
+ 2;
369 // Default charset for HTML5 is UTF-8.
// The search begins at start - 1, so a '>' at that position is found too.
373 start
= find(start
- 1, body
.end(), '>');
374 if (start
== body
.end()) break;
376 // Some other SGML declaration - ignore it.
377 start
= find(start
- 1, body
.end(), '>');
378 if (start
== body
.end()) break;
381 } else if (*start
== '?') {
382 if (++start
== body
.end()) break;
383 // PHP - swallow until ?> or EOF
384 start
= find(start
+ 1, body
.end(), '>');
// Keep moving to the next '>' until one is directly preceded by '?'.
387 while (start
!= body
.end() && *(start
- 1) != '?')
388 start
= find(start
+ 1, body
.end(), '>');
390 if (start
== body
.end()) {
391 // The closing ?> at the end of a file is optional so ignore
392 // the rest of the document if there isn't one:
393 // https://www.php.net/basic-syntax.instruction-separation
395 // PHP ignores an immediately trailing newline after the
397 // https://www.php.net/basic-syntax.instruction-separation
398 // Testing shows \n, \r and \r\n are skipped.
// NOTE(review): no end-of-body check is visible before either dereference of
// start here; if start reaches body.end() this reads through the end
// iterator - confirm.
400 if (*start
== '\r') ++start
;
401 if (*start
== '\n') ++start
;
404 // opening or closing tag
// Move past the current character and any whitespace that follows it.
409 start
= find_if(start
+ 1, body
.end(), C_isnotspace
);
// The tag name ends at the first character for which p_nottag is true; the
// name is the range [p, start) of body.
413 start
= find_if(start
, body
.end(), p_nottag
);
414 string
tag(body
, p
- body
.begin(), start
- p
);
415 // convert tagname to lowercase
416 lowercase_string(tag
);
// The statement controlled by this test is not visible in this listing.
419 if (!closing_tag(tag
))
// A closing script tag ends script mode.
421 if (in_script
&& tag
== "script") in_script
= false;
423 /* ignore any bogus parameters on closing tags */
424 p
= find(start
, body
.end(), '>');
425 if (p
== body
.end()) break;
// Set when the tag ends with "/>".
428 bool empty_element
= false;
429 // FIXME: parse parameters lazily.
430 while (start
< body
.end() && *start
!= '>') {
// The parameter name runs from start to the first whitespace, '=' or '>'.
433 p
= find_if(start
, body
.end(), p_whitespaceeqgt
);
435 size_t name_len
= p
- start
;
// A '/' at start with '>' at p marks an empty element.
437 if (*start
== '/' && p
< body
.end() && *p
== '>') {
438 // E.g. <tag foo="bar" />
440 empty_element
= true;
445 name
.assign(body
, start
- body
.begin(), name_len
);
// Skip whitespace after the name.
447 p
= find_if(p
, body
.end(), C_isnotspace
);
// '=' introduces a value; skip whitespace after it.
450 if (start
!= body
.end() && *start
== '=') {
451 start
= find_if(start
+ 1, body
.end(), C_isnotspace
);
// Quoted value: look for the matching quote character.
456 if (quote
== '"' || quote
== '\'') {
458 p
= find(start
, body
.end(), quote
);
// With a closing quote present, the value is the range [start, p) of body.
461 if (p
!= body
.end()) {
463 value
.assign(body
, start
- body
.begin(), p
- start
);
466 // unquoted or no closing quote
467 p
= find_if(start
, body
.end(), p_whitespacegt
);
468 value
.assign(body
, start
- body
.begin(), p
- start
);
// Skip whitespace before the next parameter.
470 start
= find_if(p
, body
.end(), C_isnotspace
);
473 // convert parameter name to lowercase
474 lowercase_string(name
);
475 // in case of multiple entries, use the first
476 // (as Netscape does)
477 parameters
.insert(make_pair(name
, value
));
// Writes each parameter to cout as name="value", each preceded by a space.
// NOTE(review): presumably debug-only output; any enclosing conditional is
// not visible in this listing.
483 map
<string
, string
>::const_iterator x
;
484 for (x
= parameters
.begin(); x
!= parameters
.end(); ++x
) {
485 cout
<< " " << x
->first
<< "=\"" << x
->second
<< "\"";
// The statements controlled by the next two tests are not visible in this
// listing.
489 if (!opening_tag(tag
))
494 if (!closing_tag(tag
))
498 // In <script> tags we ignore opening tags to avoid problems
500 if (tag
== "script") in_script
= true;
// Step over the '>' that ends the tag.
502 if (start
!= body
.end() && *start
== '>') ++start
;