2 * @brief XML (and HTML) parser
4 /* Copyright 1999,2000,2001 BrightStation PLC
5 * Copyright 2001 Ananova Ltd
6 * Copyright 2002-2023 Olly Betts
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
26 #include "xmlparser.h"
31 #include "namedents.h"
32 #include "stringutils.h"
33 #include "utf8convert.h"
// The literal below is the system identifier that parse() compares against
// when recognising <!DOCTYPE html SYSTEM "about:legacy-compat">.  The _LEN
// macro is that literal passed through CONST_STRLEN (defined outside this
// excerpt - presumably its length excluding the trailing NUL; confirm).
44 // HTML5 legacy compatibility doctype.
45 #define HTML5_LEGACY_COMPAT "about:legacy-compat"
46 #define HTML5_LEGACY_COMPAT_LEN CONST_STRLEN(HTML5_LEGACY_COMPAT)
// Helper taking `str` by non-const reference and walking it from begin() to
// end() with a mutable iterator.  The loop body is not part of this excerpt;
// going by the name and by the call site in parse() ("Convert tagname to
// lowercase") it presumably lower-cases each character in place - confirm
// against the full file.
49 lowercase_string(string
&str
)
51 for (string::iterator i
= str
.begin(); i
!= str
.end(); ++i
) {
// Body of a character predicate whose signature line is not part of this
// excerpt (presumably p_nottag, which parse() passes to find_if to locate the
// end of a tag name - confirm).  The expression is true for any character
// that cannot be part of a tag name: i.e. anything that is not alphanumeric
// (per C_isalnum) and is not one of '.', '-' or ':'.
59 // ':' for XML namespaces.
60 return !C_isalnum(c
) && c
!= '.' && c
!= '-' && c
!= ':';
// Character predicate: true when `c` is whitespace (per C_isspace), '=' or
// '>'.  get_attribute() passes it to find_if to find where an attribute name
// ends.
64 p_whitespaceeqgt(char c
)
66 return C_isspace(c
) || c
== '=' || c
== '>';
// Look for the attribute called `name` in the raw attribute text of the
// current tag (the members attribute_data / attribute_len) and store its
// value into `value` via value.assign().  The return type and the return
// statements are not visible in this excerpt (presumably a bool reporting
// whether the attribute was found - confirm).
//
// The function does not modify the object (it is declared const); it only
// reads attribute_data / attribute_len and writes to `value`.
70 XmlParser::get_attribute(const string
& name
, string
& value
) const
72 // Search the data each time an attribute is requested - in practice we
73 // aren't often asked for more than one attribute, and this way we can stop
74 // once we find the requested one, and avoid the overhead building up a
75 // data structure to hold the parsed attributes.
77 // In both XML and HTML it's invalid for the same attribute name to occur
78 // more than once on the same start tag (ignoring ASCII case for HTML) - in
79 // this situation, we just take the first (which is what browsers seem to
// The scan window is [attribute_data, attribute_data + attribute_len).
81 const char* p
= attribute_data
;
82 const char* end
= p
+ attribute_len
;
// `start` marks the beginning of the current attribute name; the name runs
// up to the first whitespace, '=' or '>' (or `end`).
84 const char* start
= p
;
85 p
= find_if(p
, end
, p_whitespaceeqgt
);
// Length of the attribute name just scanned.
87 size_t len
= p
- start
;
// Length comparison first; the byte-wise comparisons below read `len` bytes
// from both sides (the conditions selecting between them are not visible in
// this excerpt).
88 bool found
= (name
.size() == len
);
91 // XML attribute names are case sensitive.
92 found
= memcmp(start
, name
.data(), len
) == 0;
94 // Compare with lower-cased version of attribute name from tag.
// Only the tag's text is lower-cased here, so `name` is presumably expected
// to be passed in lower case already - confirm against callers.
95 for (size_t i
= 0; i
!= len
; ++i
) {
96 if (C_tolower(start
[i
]) != name
[i
]) {
// Skip any whitespace between the attribute name and a possible '='.
104 p
= find_if(p
, end
, [](char ch
) { return !C_isspace(ch
); });
106 if (p
== end
|| *p
!= '=') {
107 // Boolean attribute - e.g. <input type=checkbox checked>
// Step over the '=' (hence p + 1) and any whitespace following it.
115 p
= find_if(p
+ 1, end
, [](char ch
) { return !C_isspace(ch
); });
// `quote` is assigned on a line not visible here.  For a quoted value,
// ++start moves past the opening quote and the value ends at the next
// occurrence of the same quote character (or at `end` if there is none).
120 if (quote
== '"' || quote
== '\'') {
121 p
= find(++start
, end
, quote
);
// Otherwise (unquoted value): the value ends at the first whitespace.
124 p
= find_if(start
, end
, [](char ch
) { return C_isspace(ch
); });
// [start, p) is the attribute value, copied out without the quotes.
128 value
.assign(start
, p
);
// Skip whitespace preceding the next attribute.
135 p
= find_if(p
, end
, [](char ch
) { return !C_isspace(ch
); });
140 // UTF-8 encoded entity is always <= the entity itself in length, even if the
141 // trailing ';' is missing - for numeric (decimal and hex) entities:
143 // <= UTF-8 &#<..> &#x<..>
149 // U+7FFFFFFF 6 12 11
151 // Also true for named entities. This means we can work in-place within the
// Replace character entities in `s` with their UTF-8 encoding, rewriting the
// string in place.  Three iterators into `s` are used: `amp` is the '&'
// currently being examined, `in` is the start of the not-yet-copied input and
// `out` is the write position.  Numeric entities ("&#" decimal, "&#x"/"&#X"
// hex) are accumulated into `val`; anything else after '&' is treated as a
// named entity and looked up with keyword2().  Several statements (including
// how `in`/`amp` advance and how `s` is finally shortened) are not visible in
// this excerpt.
155 XmlParser::decode_entities(string
& s
)
157 string::iterator out
= s
.begin();
158 string::iterator in
= out
;
159 string::iterator amp
= in
;
// Process each '&' in turn, searching onward from the current `amp`.
160 while ((amp
= find(amp
, s
.end(), '&')) != s
.end()) {
// val stays 0 unless an entity is successfully decoded below.
161 unsigned int val
= 0;
162 string::iterator end
, p
= amp
+ 1;
163 if (p
!= s
.end() && *p
== '#') {
165 if (p
!= s
.end() && (*p
== 'x' || *p
== 'X')) {
// Hex digits: the pre-increment first steps over the 'x'/'X'; each digit
// contributes 4 bits.
// NOTE(review): no overflow check on `val` is visible here, so a long digit
// run wraps modulo 2^N (well-defined for unsigned) - confirm this is
// handled or acceptable.
167 while (++p
!= s
.end() && C_isxdigit(*p
)) {
168 val
= (val
<< 4) | hex_digit(*p
);
// Decimal digits.
173 while (p
!= s
.end() && C_isdigit(*p
)) {
174 val
= val
* 10 + (*p
- '0');
// Named entity: the name extends to the first non-alphanumeric character.
// `tab` is defined outside this excerpt; a non-negative result from
// keyword2() is used as an index into named_ent_codepoint.
180 end
= find_if(p
, s
.end(), C_isnotalnum
);
181 int k
= keyword2(tab
, s
.data() + (p
- s
.begin()), end
- p
);
182 if (k
>= 0) val
= named_ent_codepoint
[k
];
// The terminating ';' is optional; consume it when present.
184 if (end
!= s
.end() && *end
== ';') ++end
;
// Copy the literal text preceding this '&' down to the write position.
187 out
= copy(in
, amp
, out
);
195 // Convert Unicode value val to UTF-8.
// `seq` (the temporary buffer) is declared on a line not visible here.
197 unsigned len
= Xapian::Unicode::nonascii_to_utf8(val
, seq
);
198 out
= copy(seq
, seq
+ len
, out
);
// Parse the document in `text`.  From the lines visible in this excerpt the
// function: (1) looks for a byte-order mark and either drops a UTF-8 BOM or
// converts UTF-16 input to UTF-8; (2) repeatedly searches for '<' to separate
// text content from markup; (3) hands decoded text to process_content(); and
// (4) dispatches on what follows '<' - comment / CDATA / DOCTYPE / other SGML
// declaration, PHP or XML declaration, or an opening/closing tag (reported
// through opening_tag() / closing_tag()).  Many lines of the original are
// missing from this excerpt, so the added notes describe only what is
// visible.
210 XmlParser::parse(string_view text
)
// BOM detection is only attempted when at least 3 bytes are available.
213 if (text
.size() >= 3) {
// 0xBB 0xBF are the second and third bytes of the UTF-8 BOM (EF BB BF); the
// test of text[0] is not visible in this excerpt.
216 if (text
[1] == '\xbb' && text
[2] == '\xbf') {
218 text
.remove_prefix(3);
223 // Match either \xfe\xff or \xff\xfe.
// The XOR of those two bytes is 1 in either order; presumably this is only
// reached once text[0] is known to be 0xFE or 0xFF - confirm.
224 if ((text
[1] ^ text
[0]) == 1) {
225 // Convert from "utf-16" which will select the appropriate BE
226 // or LE variant based on the BOM and also remove the BOM for
229 convert_to_utf8(text
, "utf-16", utf8_text
);
// `start` tracks the beginning of the not-yet-processed content.
240 auto start
= text
.begin();
243 // Skip through until we find a tag, a comment, or the end of document.
244 // Ignore isolated occurrences of '<' which don't start a tag or
248 p
= find(p
, text
.end(), '<');
249 if (p
== text
.end()) break;
// NOTE(review): if the '<' found above is the final character of `text`,
// p + 1 == text.end() and this dereference reads one element past the
// string_view.  No guard is visible between these two lines - confirm that
// callers guarantee a readable trailing byte, or add a check.
250 unsigned char ch
= *(p
+ 1);
252 // Opening tag, closing tag, or comment/SGML declaration.
253 if ((state
!= HTML_IN_SCRIPT
&& C_isalpha(ch
)) || ch
== '/' || ch
== '!')
257 // PHP code or XML declaration.
258 // XML declaration is only valid at the start of the first line.
259 if (p
!= text
.begin() || text
.size() < 20) break;
261 // XML declaration looks something like this:
262 // <?xml version="1.0" encoding="UTF-8"?>
263 if (p
[2] != 'x' || p
[3] != 'm' || p
[4] != 'l') break;
264 if (strchr(" \t\r\n", p
[5]) == NULL
) break;
266 // Switch for XML mode for XHTML.
// The declaration body is taken to run from just after "<?xml " (p + 6) up
// to the first '?'.
269 auto decl_end
= find(p
+ 6, text
.end(), '?');
270 if (decl_end
== text
.end()) break;
272 // Default charset for XML is UTF-8.
275 string_view
decl(p
+ 6, decl_end
- (p
+ 6));
// Locate `encoding`, then optional whitespace, '=', optional whitespace and
// a quoted value; give up (break) as soon as any step fails.
276 size_t enc
= decl
.find("encoding");
277 if (enc
== decl
.npos
) break;
// 8 == length of "encoding".
279 enc
= decl
.find_first_not_of(" \t\r\n", enc
+ 8);
280 if (enc
== decl
.npos
) break;
282 if (decl
[enc
] != '=') break;
284 enc
= decl
.find_first_not_of(" \t\r\n", enc
+ 1);
285 if (enc
== decl
.npos
) break;
287 if (decl
[enc
] != '"' && decl
[enc
] != '\'') break;
// Remember which quote opened the value and step past it.
289 char quote
= decl
[enc
++];
290 size_t enc_end
= decl
.find(quote
, enc
);
// The charset is only updated when the closing quote was found.
292 if (enc_end
!= decl
.npos
)
293 charset
.assign(decl
, enc
, enc_end
- enc
);
300 // Process content up to start of tag.
303 convert_to_utf8(string_view(text
.data() + (start
- text
.begin()),
306 decode_entities(content
);
307 process_content(content
);
310 if (p
== text
.end()) break;
314 if (start
== text
.end()) break;
317 if (++start
== text
.end()) break;
319 // Comment, SGML declaration, or HTML5 DTD.
320 char first_ch
= *start
;
321 if (++start
== text
.end()) break;
322 if (first_ch
== '-' && *start
== '-') {
324 auto close
= find(start
, text
.end(), '>');
325 // An unterminated comment swallows rest of document
326 // (like Netscape, but unlike MSIE IIRC)
327 if (close
== text
.end()) break;
// Keep moving to later '>' characters until one is immediately preceded by
// "--" (p's starting value for this loop is set on a line not shown).
331 while (p
!= text
.end() && (*(p
- 1) != '-' || *(p
- 2) != '-'))
332 p
= find(p
+ 1, text
.end(), '>');
334 if (p
!= text
.end()) {
336 // Check for htdig's "ignore this bit" comments.
337 if (p
- start
== CONST_STRLEN("htdig_noindex") + 2 &&
338 memcmp(&*start
, "htdig_noindex",
339 CONST_STRLEN("htdig_noindex")) == 0) {
// Skip everything up to and including the matching end marker; if there is
// none, the rest of the document is ignored.
340 auto i
= text
.find("<!--/htdig_noindex-->",
341 p
+ 1 - text
.begin());
342 if (i
== text
.npos
) break;
343 start
= text
.begin() + i
+
344 CONST_STRLEN("<!--/htdig_noindex-->");
347 // Check for udmcomment (similar to htdig's)
348 if (p
- start
== CONST_STRLEN("UdmComment") + 2 &&
349 memcmp(&*start
, "UdmComment",
350 CONST_STRLEN("UdmComment")) == 0) {
351 auto i
= text
.find("<!--/UdmComment-->",
352 p
+ 1 - text
.begin());
353 if (i
== text
.npos
) break;
354 start
= text
.begin() + i
+
355 CONST_STRLEN("<!--/UdmComment-->");
359 // If we found --> skip to there.
362 // Otherwise skip to the first > we found (as Netscape does).
// CDATA section: more than 6 characters must remain so that the 6-byte
// memcmp against "CDATA[" stays within `text`.
365 } else if (first_ch
== '[' &&
366 text
.size() - (start
- text
.begin()) > 6 &&
367 memcmp(&*start
, "CDATA[", CONST_STRLEN("CDATA[")) == 0) {
// b = offset at which the search for "]]>" begins; e = offset of "]]>", or
// the end of the text when the section is unterminated.
369 string_view::size_type b
= start
- text
.begin();
370 string_view::size_type i
= text
.find("]]>", b
);
371 string_view::size_type e
= (i
== text
.npos
) ? text
.size() : i
;
373 convert_to_utf8(string_view(text
.data() + b
, e
- b
),
375 process_content(content
);
376 if (i
== text
.npos
) break;
// i + 2 is the final '>' of "]]>".
377 start
= text
.begin() + i
+ 2;
// Case-insensitive match of "doctype" followed by whitespace (the leading
// 'd' is first_ch, the remaining six letters are start[0..5]).
378 } else if (C_tolower(first_ch
) == 'd' &&
379 text
.end() - start
> 6 &&
380 C_tolower(start
[0]) == 'o' &&
381 C_tolower(start
[1]) == 'c' &&
382 C_tolower(start
[2]) == 't' &&
383 C_tolower(start
[3]) == 'y' &&
384 C_tolower(start
[4]) == 'p' &&
385 C_tolower(start
[5]) == 'e' &&
386 C_isspace(start
[6])) {
387 // DOCTYPE declaration.
389 while (start
!= text
.end() && C_isspace(*start
)) {
392 if (start
== text
.end()) break;
// Case-insensitive "html" followed by '>' or whitespace.
393 if (text
.end() - start
>= 5 &&
394 C_tolower(start
[0]) == 'h' &&
395 C_tolower(start
[1]) == 't' &&
396 C_tolower(start
[2]) == 'm' &&
397 C_tolower(start
[3]) == 'l' &&
398 (start
[4] == '>' || C_isspace(start
[4]))) {
402 while (start
!= text
.end() && C_isspace(*start
)) {
405 if (start
== text
.end()) break;
409 // Default charset for HTML5 is UTF-8.
// Case-insensitive "system" followed by whitespace.
412 } else if (text
.end() - start
>= 29 &&
413 C_tolower(start
[0]) == 's' &&
414 C_tolower(start
[1]) == 'y' &&
415 C_tolower(start
[2]) == 's' &&
416 C_tolower(start
[3]) == 't' &&
417 C_tolower(start
[4]) == 'e' &&
418 C_tolower(start
[5]) == 'm' &&
419 C_isspace(start
[6])) {
421 while (start
!= text
.end() && C_isspace(*start
)) {
// Require: an opening quote, the legacy-compat identifier, and the same
// quote character closing it (LEN + 3 characters must remain).
424 size_t left
= text
.end() - start
;
425 if (left
>= HTML5_LEGACY_COMPAT_LEN
+ 3 &&
426 (*start
== '\'' || *start
== '"') &&
427 start
[HTML5_LEGACY_COMPAT_LEN
+ 1] == *start
&&
428 text
.compare(start
- text
.begin() + 1,
429 HTML5_LEGACY_COMPAT_LEN
,
431 HTML5_LEGACY_COMPAT_LEN
) == 0) {
432 // HTML5 legacy compatibility doctype:
433 // <!DOCTYPE html SYSTEM "about:legacy-compat">
// Step over the opening quote, the identifier and the closing quote.
434 start
+= HTML5_LEGACY_COMPAT_LEN
+ 2;
435 // Default charset for HTML5 is UTF-8.
439 start
= find(start
- 1, text
.end(), '>');
440 if (start
== text
.end()) break;
442 // Some other SGML declaration - ignore it.
443 start
= find(start
- 1, text
.end(), '>');
444 if (start
== text
.end()) break;
447 } else if (*start
== '?') {
448 if (++start
== text
.end()) break;
449 // PHP - swallow until ?> or EOF
450 start
= find(start
+ 1, text
.end(), '>');
// Accept only a '>' that is immediately preceded by '?'.
453 while (start
!= text
.end() && *(start
- 1) != '?')
454 start
= find(start
+ 1, text
.end(), '>');
456 if (start
== text
.end()) {
457 // The closing ?> at the end of a file is optional so ignore
458 // the rest of the document if there isn't one:
459 // https://www.php.net/basic-syntax.instruction-separation
461 // PHP ignores an immediately trailing newline after the
463 // https://www.php.net/basic-syntax.instruction-separation
464 // Testing shows \n, \r and \r\n are skipped.
466 if (*start
== '\r') ++start
;
467 if (*start
== '\n') ++start
;
470 // Opening or closing tag.
471 bool closing
= false;
475 start
= find_if(start
+ 1, text
.end(), C_isnotspace
);
// The tag name is the run of characters up to the first one rejected by
// p_nottag.
478 p
= find_if(start
, text
.end(), p_nottag
);
479 string
tag(start
, p
);
481 // Convert tagname to lowercase.
482 lowercase_string(tag
);
486 if (!closing_tag(tag
))
488 if (state
== HTML_IN_SCRIPT
&& tag
== "script")
493 if (p
< text
.end() && *p
!= '>') {
494 // We often aren't asked for the attributes, so parse them
495 // lazily - for now we just need to skip balanced single and
498 // Ignore attributes on closing tags (they're bogus) but still
499 // skip balanced quotes, since that's what browsers do.
// The three find_if calls below stop at a quote character or '>': first
// either kind of quote, then (presumably inside a quoted value) only the
// matching quote kind.  The surrounding loop structure is not visible here.
501 p
= find_if(p
, text
.end(),
503 return ch
== '"' || ch
== '\'' || ch
== '>';
505 if (p
== text
.end() || *p
== '>') {
509 p
= find_if(p
, text
.end(),
511 return ch
== '\'' || ch
== '>';
514 p
= find_if(p
, text
.end(),
516 return ch
== '"' || ch
== '>';
// Record the raw attribute text rather than parsing it now; get_attribute()
// scans attribute_data / attribute_len on demand.
523 attribute_len
= p
- start
;
524 bool empty_element
= false;
525 if (attribute_len
> 0) {
526 // Check for empty element (e.g. <br/>).
527 attribute_data
= &*start
;
529 // <a href=foo/> isn't an empty element though
530 if (attribute_len
== 1 ||
534 empty_element
= true;
539 if (!opening_tag(tag
))
544 if (!closing_tag(tag
))
548 if (state
== HTML
&& tag
== "script") {
549 // In HTML <script> tags we ignore opening tags to avoid
550 // problems with "a<b".
551 state
= HTML_IN_SCRIPT
;
// Step over the '>' that terminated the construct just handled, if present.
554 if (start
!= text
.end() && *start
== '>') ++start
;
557 if (p
== text
.end()) break;