[ci] Update macos jobs
[xapian.git] / xapian-applications / omega / xmlparser.cc
blob85471bafed8881b43fd3519a58b9f2cbcfd60bc6
1 /** @file
2 * @brief XML (and HTML) parser
3 */
4 /* Copyright 1999,2000,2001 BrightStation PLC
5 * Copyright 2001 Ananova Ltd
6 * Copyright 2002-2023 Olly Betts
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
21 * USA
24 #include <config.h>
26 #include "xmlparser.h"
28 #include <xapian.h>
30 #include "keyword.h"
31 #include "namedents.h"
32 #include "stringutils.h"
33 #include "utf8convert.h"
35 #include <algorithm>
37 #include <cctype>
38 #include <cstring>
39 #include <cstdio>
40 #include <cstdlib>
42 using namespace std;
44 // HTML5 legacy compatibility doctype.
45 #define HTML5_LEGACY_COMPAT "about:legacy-compat"
46 #define HTML5_LEGACY_COMPAT_LEN CONST_STRLEN(HTML5_LEGACY_COMPAT)
48 static inline void
49 lowercase_string(string &str)
51 for (string::iterator i = str.begin(); i != str.end(); ++i) {
52 *i = C_tolower(*i);
56 static inline bool
57 p_nottag(char c)
59 // ':' for XML namespaces.
60 return !C_isalnum(c) && c != '.' && c != '-' && c != ':';
63 static inline bool
64 p_whitespaceeqgt(char c)
66 return C_isspace(c) || c == '=' || c == '>';
69 bool
70 XmlParser::get_attribute(const string& name, string& value) const
72 // Search the data each time an attribute is requested - in practice we
73 // aren't often asked for more than one attribute, and this way we can stop
74 // once we find the requested one, and avoid the overhead building up a
75 // data structure to hold the parsed attributes.
77 // In both XML and HTML it's invalid for the same attribute name to occur
78 // more than once on the same start tag (ignoring ASCII case for HTML) - in
79 // this situation, we just take the first (which is what browsers seem to
80 // do).
81 const char* p = attribute_data;
82 const char* end = p + attribute_len;
83 while (p != end) {
84 const char* start = p;
85 p = find_if(p, end, p_whitespaceeqgt);
87 size_t len = p - start;
88 bool found = (name.size() == len);
89 if (found) {
90 if (state == XML) {
91 // XML attribute names are case sensitive.
92 found = memcmp(start, name.data(), len) == 0;
93 } else {
94 // Compare with lower-cased version of attribute name from tag.
95 for (size_t i = 0; i != len; ++i) {
96 if (C_tolower(start[i]) != name[i]) {
97 found = false;
98 break;
104 p = find_if(p, end, [](char ch) { return !C_isspace(ch); });
106 if (p == end || *p != '=') {
107 // Boolean attribute - e.g. <input type=checkbox checked>
108 if (found) {
109 value.clear();
110 return true;
112 continue;
115 p = find_if(p + 1, end, [](char ch) { return !C_isspace(ch); });
116 if (p == end) break;
118 start = p;
119 char quote = *p;
120 if (quote == '"' || quote == '\'') {
121 p = find(++start, end, quote);
122 } else {
123 quote = 0;
124 p = find_if(start, end, [](char ch) { return C_isspace(ch); });
127 if (found) {
128 value.assign(start, p);
129 return true;
132 if (p == end) break;
134 if (quote) ++p;
135 p = find_if(p, end, [](char ch) { return !C_isspace(ch); });
137 return false;
140 // UTF-8 encoded entity is always <= the entity itself in length, even if the
141 // trailing ';' is missing - for numeric (decimal and hex) entities:
143 // <= UTF-8 &#<..> &#x<..>
144 // U+007F 1 5 5
145 // U+07FF 2 6 6
146 // U+FFFF 3 7 7
147 // U+1FFFFF 4 9 9
148 // U+3FFFFFF 5 10 10
149 // U+7FFFFFFF 6 12 11
151 // Also true for named entities. This means we can work in-place within the
152 // string.
154 void
155 XmlParser::decode_entities(string& s)
157 string::iterator out = s.begin();
158 string::iterator in = out;
159 string::iterator amp = in;
160 while ((amp = find(amp, s.end(), '&')) != s.end()) {
161 unsigned int val = 0;
162 string::iterator end, p = amp + 1;
163 if (p != s.end() && *p == '#') {
164 ++p;
165 if (p != s.end() && (*p == 'x' || *p == 'X')) {
166 // hex
167 while (++p != s.end() && C_isxdigit(*p)) {
168 val = (val << 4) | hex_digit(*p);
170 end = p;
171 } else {
172 // number
173 while (p != s.end() && C_isdigit(*p)) {
174 val = val * 10 + (*p - '0');
175 ++p;
177 end = p;
179 } else {
180 end = find_if(p, s.end(), C_isnotalnum);
181 int k = keyword2(tab, s.data() + (p - s.begin()), end - p);
182 if (k >= 0) val = named_ent_codepoint[k];
184 if (end != s.end() && *end == ';') ++end;
185 if (val) {
186 if (in != out) {
187 out = copy(in, amp, out);
188 } else {
189 out = amp;
191 in = end;
192 if (val < 0x80) {
193 *out++ = char(val);
194 } else {
195 // Convert Unicode value val to UTF-8.
196 char seq[4];
197 unsigned len = Xapian::Unicode::nonascii_to_utf8(val, seq);
198 out = copy(seq, seq + len, out);
201 amp = end;
204 if (in != out) {
205 s.erase(out, in);
209 void
210 XmlParser::parse(string_view text)
212 // Check for BOM.
213 if (text.size() >= 3) {
214 switch (text[0]) {
215 case '\xef':
216 if (text[1] == '\xbb' && text[2] == '\xbf') {
217 charset = "utf-8";
218 text.remove_prefix(3);
220 break;
221 case '\xfe':
222 case '\xff':
223 // Match either \xfe\xff or \xff\xfe.
224 if ((text[1] ^ text[0]) == 1) {
225 // Convert from "utf-16" which will select the appropriate BE
226 // or LE variant based on the BOM and also remove the BOM for
227 // us.
228 string utf8_text;
229 convert_to_utf8(text, "utf-16", utf8_text);
230 charset = "utf-8";
231 parse(utf8_text);
232 return;
234 break;
238 attribute_len = 0;
240 auto start = text.begin();
242 while (true) {
243 // Skip through until we find a tag, a comment, or the end of document.
244 // Ignore isolated occurrences of '<' which don't start a tag or
245 // comment.
246 auto p = start;
247 while (true) {
248 p = find(p, text.end(), '<');
249 if (p == text.end()) break;
250 unsigned char ch = *(p + 1);
252 // Opening tag, closing tag, or comment/SGML declaration.
253 if ((state != HTML_IN_SCRIPT && C_isalpha(ch)) || ch == '/' || ch == '!')
254 break;
256 if (ch == '?') {
257 // PHP code or XML declaration.
258 // XML declaration is only valid at the start of the first line.
259 if (p != text.begin() || text.size() < 20) break;
261 // XML declaration looks something like this:
262 // <?xml version="1.0" encoding="UTF-8"?>
263 if (p[2] != 'x' || p[3] != 'm' || p[4] != 'l') break;
264 if (strchr(" \t\r\n", p[5]) == NULL) break;
266 // Switch for XML mode for XHTML.
267 state = XML;
269 auto decl_end = find(p + 6, text.end(), '?');
270 if (decl_end == text.end()) break;
272 // Default charset for XML is UTF-8.
273 charset = "utf-8";
275 string_view decl(p + 6, decl_end - (p + 6));
276 size_t enc = decl.find("encoding");
277 if (enc == decl.npos) break;
279 enc = decl.find_first_not_of(" \t\r\n", enc + 8);
280 if (enc == decl.npos) break;
282 if (decl[enc] != '=') break;
284 enc = decl.find_first_not_of(" \t\r\n", enc + 1);
285 if (enc == decl.npos) break;
287 if (decl[enc] != '"' && decl[enc] != '\'') break;
289 char quote = decl[enc++];
290 size_t enc_end = decl.find(quote, enc);
292 if (enc_end != decl.npos)
293 charset.assign(decl, enc, enc_end - enc);
295 break;
297 ++p;
300 // Process content up to start of tag.
301 if (p > start) {
302 string content;
303 convert_to_utf8(string_view(text.data() + (start - text.begin()),
304 p - start),
305 charset, content);
306 decode_entities(content);
307 process_content(content);
310 if (p == text.end()) break;
312 start = p + 1;
314 if (start == text.end()) break;
316 if (*start == '!') {
317 if (++start == text.end()) break;
319 // Comment, SGML declaration, or HTML5 DTD.
320 char first_ch = *start;
321 if (++start == text.end()) break;
322 if (first_ch == '-' && *start == '-') {
323 ++start;
324 auto close = find(start, text.end(), '>');
325 // An unterminated comment swallows rest of document
326 // (like Netscape, but unlike MSIE IIRC)
327 if (close == text.end()) break;
329 p = close;
330 // look for -->
331 while (p != text.end() && (*(p - 1) != '-' || *(p - 2) != '-'))
332 p = find(p + 1, text.end(), '>');
334 if (p != text.end()) {
335 if (state != XML) {
336 // Check for htdig's "ignore this bit" comments.
337 if (p - start == CONST_STRLEN("htdig_noindex") + 2 &&
338 memcmp(&*start, "htdig_noindex",
339 CONST_STRLEN("htdig_noindex")) == 0) {
340 auto i = text.find("<!--/htdig_noindex-->",
341 p + 1 - text.begin());
342 if (i == text.npos) break;
343 start = text.begin() + i +
344 CONST_STRLEN("<!--/htdig_noindex-->");
345 continue;
347 // Check for udmcomment (similar to htdig's)
348 if (p - start == CONST_STRLEN("UdmComment") + 2 &&
349 memcmp(&*start, "UdmComment",
350 CONST_STRLEN("UdmComment")) == 0) {
351 auto i = text.find("<!--/UdmComment-->",
352 p + 1 - text.begin());
353 if (i == text.npos) break;
354 start = text.begin() + i +
355 CONST_STRLEN("<!--/UdmComment-->");
356 continue;
359 // If we found --> skip to there.
360 start = p;
361 } else {
362 // Otherwise skip to the first > we found (as Netscape does).
363 start = close;
365 } else if (first_ch == '[' &&
366 text.size() - (start - text.begin()) > 6 &&
367 memcmp(&*start, "CDATA[", CONST_STRLEN("CDATA[")) == 0) {
368 start += 6;
369 string_view::size_type b = start - text.begin();
370 string_view::size_type i = text.find("]]>", b);
371 string_view::size_type e = (i == text.npos) ? text.size() : i;
372 string content;
373 convert_to_utf8(string_view(text.data() + b, e - b),
374 charset, content);
375 process_content(content);
376 if (i == text.npos) break;
377 start = text.begin() + i + 2;
378 } else if (C_tolower(first_ch) == 'd' &&
379 text.end() - start > 6 &&
380 C_tolower(start[0]) == 'o' &&
381 C_tolower(start[1]) == 'c' &&
382 C_tolower(start[2]) == 't' &&
383 C_tolower(start[3]) == 'y' &&
384 C_tolower(start[4]) == 'p' &&
385 C_tolower(start[5]) == 'e' &&
386 C_isspace(start[6])) {
387 // DOCTYPE declaration.
388 start += 7;
389 while (start != text.end() && C_isspace(*start)) {
390 ++start;
392 if (start == text.end()) break;
393 if (text.end() - start >= 5 &&
394 C_tolower(start[0]) == 'h' &&
395 C_tolower(start[1]) == 't' &&
396 C_tolower(start[2]) == 'm' &&
397 C_tolower(start[3]) == 'l' &&
398 (start[4] == '>' || C_isspace(start[4]))) {
399 start += 4;
401 // HTML doctype.
402 while (start != text.end() && C_isspace(*start)) {
403 ++start;
405 if (start == text.end()) break;
407 if (*start == '>') {
408 // <!DOCTYPE html>
409 // Default charset for HTML5 is UTF-8.
410 charset = "utf-8";
412 } else if (text.end() - start >= 29 &&
413 C_tolower(start[0]) == 's' &&
414 C_tolower(start[1]) == 'y' &&
415 C_tolower(start[2]) == 's' &&
416 C_tolower(start[3]) == 't' &&
417 C_tolower(start[4]) == 'e' &&
418 C_tolower(start[5]) == 'm' &&
419 C_isspace(start[6])) {
420 start += 7;
421 while (start != text.end() && C_isspace(*start)) {
422 ++start;
424 size_t left = text.end() - start;
425 if (left >= HTML5_LEGACY_COMPAT_LEN + 3 &&
426 (*start == '\'' || *start == '"') &&
427 start[HTML5_LEGACY_COMPAT_LEN + 1] == *start &&
428 text.compare(start - text.begin() + 1,
429 HTML5_LEGACY_COMPAT_LEN,
430 HTML5_LEGACY_COMPAT,
431 HTML5_LEGACY_COMPAT_LEN) == 0) {
432 // HTML5 legacy compatibility doctype:
433 // <!DOCTYPE html SYSTEM "about:legacy-compat">
434 start += HTML5_LEGACY_COMPAT_LEN + 2;
435 // Default charset for HTML5 is UTF-8.
436 charset = "utf-8";
439 start = find(start - 1, text.end(), '>');
440 if (start == text.end()) break;
441 } else {
442 // Some other SGML declaration - ignore it.
443 start = find(start - 1, text.end(), '>');
444 if (start == text.end()) break;
446 ++start;
447 } else if (*start == '?') {
448 if (++start == text.end()) break;
449 // PHP - swallow until ?> or EOF
450 start = find(start + 1, text.end(), '>');
452 // look for ?>
453 while (start != text.end() && *(start - 1) != '?')
454 start = find(start + 1, text.end(), '>');
456 if (start == text.end()) {
457 // The closing ?> at the end of a file is optional so ignore
458 // the rest of the document if there isn't one:
459 // https://www.php.net/basic-syntax.instruction-separation
460 } else {
461 // PHP ignores an immediately trailing newline after the
462 // closing tag:
463 // https://www.php.net/basic-syntax.instruction-separation
464 // Testing shows \n, \r and \r\n are skipped.
465 ++start;
466 if (*start == '\r') ++start;
467 if (*start == '\n') ++start;
469 } else {
470 // Opening or closing tag.
471 bool closing = false;
473 if (*start == '/') {
474 closing = true;
475 start = find_if(start + 1, text.end(), C_isnotspace);
478 p = find_if(start, text.end(), p_nottag);
479 string tag(start, p);
480 if (state != XML) {
481 // Convert tagname to lowercase.
482 lowercase_string(tag);
485 if (closing) {
486 if (!closing_tag(tag))
487 return;
488 if (state == HTML_IN_SCRIPT && tag == "script")
489 state = HTML;
492 start = p;
493 if (p < text.end() && *p != '>') {
494 // We often aren't asked for the attributes, so parse them
495 // lazily - for now we just need to skip balanced single and
496 // double quotes.
498 // Ignore attributes on closing tags (they're bogus) but still
499 // skip balanced quotes, since that's what browsers do.
500 while (true) {
501 p = find_if(p, text.end(),
502 [](char ch) {
503 return ch == '"' || ch == '\'' || ch == '>';
505 if (p == text.end() || *p == '>') {
506 break;
508 if (*p == '"') {
509 p = find_if(p, text.end(),
510 [](char ch) {
511 return ch == '\'' || ch == '>';
513 } else {
514 p = find_if(p, text.end(),
515 [](char ch) {
516 return ch == '"' || ch == '>';
522 if (!closing) {
523 attribute_len = p - start;
524 bool empty_element = false;
525 if (attribute_len > 0) {
526 // Check for empty element (e.g. <br/>).
527 attribute_data = &*start;
528 if (p[-1] == '/') {
529 // <a href=foo/> isn't an empty element though
530 if (attribute_len == 1 ||
531 C_isspace(p[-2]) ||
532 p[-2] == '"' ||
533 p[-2] == '\'') {
534 empty_element = true;
535 --attribute_len;
539 if (!opening_tag(tag))
540 return;
541 attribute_len = 0;
543 if (empty_element) {
544 if (!closing_tag(tag))
545 return;
548 if (state == HTML && tag == "script") {
549 // In HTML <script> tags we ignore opening tags to avoid
550 // problems with "a<b".
551 state = HTML_IN_SCRIPT;
554 if (start != text.end() && *start == '>') ++start;
557 if (p == text.end()) break;
558 start = p + 1;