xapian-applications/omega/xmlparser.cc

   1 /** @file
   2  * @brief XML (and HTML) parser
   3  */
   4 /* Copyright 1999,2000,2001 BrightStation PLC
   5  * Copyright 2001 Ananova Ltd
   6  * Copyright 2002-2023 Olly Betts
   7  *
   8  * This program is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU General Public License as
  10  * published by the Free Software Foundation; either version 2 of the
  11  * License, or (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License
  19  * along with this program; if not, write to the Free Software
  20  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
  21  * USA
  22  */
  23
  24 #include <config.h>
  25
  26 #include "xmlparser.h"
  27
  28 #include <xapian.h>
  29
  30 #include "keyword.h"
  31 #include "namedents.h"
  32 #include "stringutils.h"
  33 #include "utf8convert.h"
  34
  35 #include <algorithm>
  36
  37 #include <cctype>
  38 #include <cstring>
  39 #include <cstdio>
  40 #include <cstdlib>
  41
  42 using namespace std;
  43
  44 // HTML5 legacy compatibility doctype.
  45 #define HTML5_LEGACY_COMPAT "about:legacy-compat"
  46 #define HTML5_LEGACY_COMPAT_LEN CONST_STRLEN(HTML5_LEGACY_COMPAT)
  47
  48 static inline void
  49 lowercase_string(string &str)
  50 {
  51     for (string::iterator i = str.begin(); i != str.end(); ++i) {
  52         *i = C_tolower(*i);
  53     }
  54 }
  55
  56 static inline bool
  57 p_nottag(char c)
  58 {
  59     // ':' for XML namespaces.
  60     return !C_isalnum(c) && c != '.' && c != '-' && c != ':';
  61 }
  62
  63 static inline bool
  64 p_whitespaceeqgt(char c)
  65 {
  66     return C_isspace(c) || c == '=' || c == '>';
  67 }
  68
  69 bool
  70 XmlParser::get_attribute(const string& name, string& value) const
  71 {
  72     // Search the data each time an attribute is requested - in practice we
  73     // aren't often asked for more than one attribute, and this way we can stop
  74     // once we find the requested one, and avoid the overhead building up a
  75     // data structure to hold the parsed attributes.
  76     //
  77     // In both XML and HTML it's invalid for the same attribute name to occur
  78     // more than once on the same start tag (ignoring ASCII case for HTML) - in
  79     // this situation, we just take the first (which is what browsers seem to
  80     // do).
  81     const char* p = attribute_data;
  82     const char* end = p + attribute_len;
  83     while (p != end) {
  84         const char* start = p;
  85         p = find_if(p, end, p_whitespaceeqgt);
  86
  87         size_t len = p - start;
  88         bool found = (name.size() == len);
  89         if (found) {
  90             if (state == XML) {
  91                 // XML attribute names are case sensitive.
  92                 found = memcmp(start, name.data(), len) == 0;
  93             } else {
  94                 // Compare with lower-cased version of attribute name from tag.
  95                 for (size_t i = 0; i != len; ++i) {
  96                     if (C_tolower(start[i]) != name[i]) {
  97                         found = false;
  98                         break;
  99                     }
 100                 }
 101             }
 102         }
 103
 104         p = find_if(p, end, [](char ch) { return !C_isspace(ch); });
 105
 106         if (p == end || *p != '=') {
 107             // Boolean attribute - e.g. <input type=checkbox checked>
 108             if (found) {
 109                 value.clear();
 110                 return true;
 111             }
 112             continue;
 113         }
 114
 115         p = find_if(p + 1, end, [](char ch) { return !C_isspace(ch); });
 116         if (p == end) break;
 117
 118         start = p;
 119         char quote = *p;
 120         if (quote == '"' || quote == '\'') {
 121             p = find(++start, end, quote);
 122         } else {
 123             quote = 0;
 124             p = find_if(start, end, [](char ch) { return C_isspace(ch); });
 125         }
 126
 127         if (found) {
 128             value.assign(start, p);
 129             return true;
 130         }
 131
 132         if (p == end) break;
 133
 134         if (quote) ++p;
 135         p = find_if(p, end, [](char ch) { return !C_isspace(ch); });
 136     }
 137     return false;
 138 }
 139
 140 // UTF-8 encoded entity is always <= the entity itself in length, even if the
 141 // trailing ';' is missing - for numeric (decimal and hex) entities:
 142 //
 143 // <=           UTF-8   &#<..>  &#x<..>
 144 // U+007F       1       5       5
 145 // U+07FF       2       6       6
 146 // U+FFFF       3       7       7
 147 // U+1FFFFF     4       9       9
 148 // U+3FFFFFF    5       10      10
 149 // U+7FFFFFFF   6       12      11
 150 //
 151 // Also true for named entities.  This means we can work in-place within the
 152 // string.
 153
 154 void
 155 XmlParser::decode_entities(string& s)
 156 {
 157     string::iterator out = s.begin();
 158     string::iterator in = out;
 159     string::iterator amp = in;
 160     while ((amp = find(amp, s.end(), '&')) != s.end()) {
 161         unsigned int val = 0;
 162         string::iterator end, p = amp + 1;
 163         if (p != s.end() && *p == '#') {
 164             ++p;
 165             if (p != s.end() && (*p == 'x' || *p == 'X')) {
 166                 // hex
 167                 while (++p != s.end() && C_isxdigit(*p)) {
 168                     val = (val << 4) | hex_digit(*p);
 169                 }
 170                 end = p;
 171             } else {
 172                 // number
 173                 while (p != s.end() && C_isdigit(*p)) {
 174                     val = val * 10 + (*p - '0');
 175                     ++p;
 176                 }
 177                 end = p;
 178             }
 179         } else {
 180             end = find_if(p, s.end(), C_isnotalnum);
 181             int k = keyword2(tab, s.data() + (p - s.begin()), end - p);
 182             if (k >= 0) val = named_ent_codepoint[k];
 183         }
 184         if (end != s.end() && *end == ';') ++end;
 185         if (val) {
 186             if (in != out) {
 187                 out = copy(in, amp, out);
 188             } else {
 189                 out = amp;
 190             }
 191             in = end;
 192             if (val < 0x80) {
 193                 *out++ = char(val);
 194             } else {
 195                 // Convert Unicode value val to UTF-8.
 196                 char seq[4];
 197                 unsigned len = Xapian::Unicode::nonascii_to_utf8(val, seq);
 198                 out = copy(seq, seq + len, out);
 199             }
 200         }
 201         amp = end;
 202     }
 203
 204     if (in != out) {
 205         s.erase(out, in);
 206     }
 207 }
 208
 209 void
 210 XmlParser::parse(string_view text)
 211 {
 212     // Check for BOM.
 213     if (text.size() >= 3) {
 214         switch (text[0]) {
 215           case '\xef':
 216             if (text[1] == '\xbb' && text[2] == '\xbf') {
 217                 charset = "utf-8";
 218                 text.remove_prefix(3);
 219             }
 220             break;
 221           case '\xfe':
 222           case '\xff':
 223             // Match either \xfe\xff or \xff\xfe.
 224             if ((text[1] ^ text[0]) == 1) {
 225                 // Convert from "utf-16" which will select the appropriate BE
 226                 // or LE variant based on the BOM and also remove the BOM for
 227                 // us.
 228                 string utf8_text;
 229                 convert_to_utf8(text, "utf-16", utf8_text);
 230                 charset = "utf-8";
 231                 parse(utf8_text);
 232                 return;
 233             }
 234             break;
 235         }
 236     }
 237
 238     attribute_len = 0;
 239
 240     auto start = text.begin();
 241
 242     while (true) {
 243         // Skip through until we find a tag, a comment, or the end of document.
 244         // Ignore isolated occurrences of '<' which don't start a tag or
 245         // comment.
 246         auto p = start;
 247         while (true) {
 248             p = find(p, text.end(), '<');
 249             if (p == text.end()) break;
 250             unsigned char ch = *(p + 1);
 251
 252             // Opening tag, closing tag, or comment/SGML declaration.
 253             if ((state != HTML_IN_SCRIPT && C_isalpha(ch)) || ch == '/' || ch == '!')
 254                 break;
 255
 256             if (ch == '?') {
 257                 // PHP code or XML declaration.
 258                 // XML declaration is only valid at the start of the first line.
 259                 if (p != text.begin() || text.size() < 20) break;
 260
 261                 // XML declaration looks something like this:
 262                 // <?xml version="1.0" encoding="UTF-8"?>
 263                 if (p[2] != 'x' || p[3] != 'm' || p[4] != 'l') break;
 264                 if (strchr(" \t\r\n", p[5]) == NULL) break;
 265
 266                 // Switch for XML mode for XHTML.
 267                 state = XML;
 268
 269                 auto decl_end = find(p + 6, text.end(), '?');
 270                 if (decl_end == text.end()) break;
 271
 272                 // Default charset for XML is UTF-8.
 273                 charset = "utf-8";
 274
 275                 string_view decl(p + 6, decl_end - (p + 6));
 276                 size_t enc = decl.find("encoding");
 277                 if (enc == decl.npos) break;
 278
 279                 enc = decl.find_first_not_of(" \t\r\n", enc + 8);
 280                 if (enc == decl.npos) break;
 281
 282                 if (decl[enc] != '=') break;
 283
 284                 enc = decl.find_first_not_of(" \t\r\n", enc + 1);
 285                 if (enc == decl.npos) break;
 286
 287                 if (decl[enc] != '"' && decl[enc] != '\'') break;
 288
 289                 char quote = decl[enc++];
 290                 size_t enc_end = decl.find(quote, enc);
 291
 292                 if (enc_end != decl.npos)
 293                     charset.assign(decl, enc, enc_end - enc);
 294
 295                 break;
 296             }
 297             ++p;
 298         }
 299
 300         // Process content up to start of tag.
 301         if (p > start) {
 302             string content;
 303             convert_to_utf8(string_view(text.data() + (start - text.begin()),
 304                                         p - start),
 305                             charset, content);
 306             decode_entities(content);
 307             process_content(content);
 308         }
 309
 310         if (p == text.end()) break;
 311
 312         start = p + 1;
 313
 314         if (start == text.end()) break;
 315
 316         if (*start == '!') {
 317             if (++start == text.end()) break;
 318
 319             // Comment, SGML declaration, or HTML5 DTD.
 320             char first_ch = *start;
 321             if (++start == text.end()) break;
 322             if (first_ch == '-' && *start == '-') {
 323                 ++start;
 324                 auto close = find(start, text.end(), '>');
 325                 // An unterminated comment swallows rest of document
 326                 // (like Netscape, but unlike MSIE IIRC)
 327                 if (close == text.end()) break;
 328
 329                 p = close;
 330                 // look for -->
 331                 while (p != text.end() && (*(p - 1) != '-' || *(p - 2) != '-'))
 332                     p = find(p + 1, text.end(), '>');
 333
 334                 if (p != text.end()) {
 335                     if (state != XML) {
 336                         // Check for htdig's "ignore this bit" comments.
 337                         if (p - start == CONST_STRLEN("htdig_noindex") + 2 &&
 338                             memcmp(&*start, "htdig_noindex",
 339                                    CONST_STRLEN("htdig_noindex")) == 0) {
 340                             auto i = text.find("<!--/htdig_noindex-->",
 341                                                p + 1 - text.begin());
 342                             if (i == text.npos) break;
 343                             start = text.begin() + i +
 344                                 CONST_STRLEN("<!--/htdig_noindex-->");
 345                             continue;
 346                         }
 347                         // Check for udmcomment (similar to htdig's)
 348                         if (p - start == CONST_STRLEN("UdmComment") + 2 &&
 349                             memcmp(&*start, "UdmComment",
 350                                    CONST_STRLEN("UdmComment")) == 0) {
 351                             auto i = text.find("<!--/UdmComment-->",
 352                                                p + 1 - text.begin());
 353                             if (i == text.npos) break;
 354                             start = text.begin() + i +
 355                                 CONST_STRLEN("<!--/UdmComment-->");
 356                             continue;
 357                         }
 358                     }
 359                     // If we found --> skip to there.
 360                     start = p;
 361                 } else {
 362                     // Otherwise skip to the first > we found (as Netscape does).
 363                     start = close;
 364                 }
 365             } else if (first_ch == '[' &&
 366                        text.size() - (start - text.begin()) > 6 &&
 367                        memcmp(&*start, "CDATA[", CONST_STRLEN("CDATA[")) == 0) {
 368                 start += 6;
 369                 string_view::size_type b = start - text.begin();
 370                 string_view::size_type i = text.find("]]>", b);
 371                 string_view::size_type e = (i == text.npos) ? text.size() : i;
 372                 string content;
 373                 convert_to_utf8(string_view(text.data() + b, e - b),
 374                                 charset, content);
 375                 process_content(content);
 376                 if (i == text.npos) break;
 377                 start = text.begin() + i + 2;
 378             } else if (C_tolower(first_ch) == 'd' &&
 379                        text.end() - start > 6 &&
 380                        C_tolower(start[0]) == 'o' &&
 381                        C_tolower(start[1]) == 'c' &&
 382                        C_tolower(start[2]) == 't' &&
 383                        C_tolower(start[3]) == 'y' &&
 384                        C_tolower(start[4]) == 'p' &&
 385                        C_tolower(start[5]) == 'e' &&
 386                        C_isspace(start[6])) {
 387                 // DOCTYPE declaration.
 388                 start += 7;
 389                 while (start != text.end() && C_isspace(*start)) {
 390                     ++start;
 391                 }
 392                 if (start == text.end()) break;
 393                 if (text.end() - start >= 5 &&
 394                     C_tolower(start[0]) == 'h' &&
 395                     C_tolower(start[1]) == 't' &&
 396                     C_tolower(start[2]) == 'm' &&
 397                     C_tolower(start[3]) == 'l' &&
 398                     (start[4] == '>' || C_isspace(start[4]))) {
 399                     start += 4;
 400
 401                     // HTML doctype.
 402                     while (start != text.end() && C_isspace(*start)) {
 403                         ++start;
 404                     }
 405                     if (start == text.end()) break;
 406
 407                     if (*start == '>') {
 408                         // <!DOCTYPE html>
 409                         // Default charset for HTML5 is UTF-8.
 410                         charset = "utf-8";
 411                     }
 412                 } else if (text.end() - start >= 29 &&
 413                            C_tolower(start[0]) == 's' &&
 414                            C_tolower(start[1]) == 'y' &&
 415                            C_tolower(start[2]) == 's' &&
 416                            C_tolower(start[3]) == 't' &&
 417                            C_tolower(start[4]) == 'e' &&
 418                            C_tolower(start[5]) == 'm' &&
 419                            C_isspace(start[6])) {
 420                     start += 7;
 421                     while (start != text.end() && C_isspace(*start)) {
 422                         ++start;
 423                     }
 424                     size_t left = text.end() - start;
 425                     if (left >= HTML5_LEGACY_COMPAT_LEN + 3 &&
 426                         (*start == '\'' || *start == '"') &&
 427                         start[HTML5_LEGACY_COMPAT_LEN + 1] == *start &&
 428                         text.compare(start - text.begin() + 1,
 429                                      HTML5_LEGACY_COMPAT_LEN,
 430                                      HTML5_LEGACY_COMPAT,
 431                                      HTML5_LEGACY_COMPAT_LEN) == 0) {
 432                         // HTML5 legacy compatibility doctype:
 433                         // <!DOCTYPE html SYSTEM "about:legacy-compat">
 434                         start += HTML5_LEGACY_COMPAT_LEN + 2;
 435                         // Default charset for HTML5 is UTF-8.
 436                         charset = "utf-8";
 437                     }
 438                 }
 439                 start = find(start - 1, text.end(), '>');
 440                 if (start == text.end()) break;
 441             } else {
 442                 // Some other SGML declaration - ignore it.
 443                 start = find(start - 1, text.end(), '>');
 444                 if (start == text.end()) break;
 445             }
 446             ++start;
 447         } else if (*start == '?') {
 448             if (++start == text.end()) break;
 449             // PHP - swallow until ?> or EOF
 450             start = find(start + 1, text.end(), '>');
 451
 452             // look for ?>
 453             while (start != text.end() && *(start - 1) != '?')
 454                 start = find(start + 1, text.end(), '>');
 455
 456             if (start == text.end()) {
 457                 // The closing ?> at the end of a file is optional so ignore
 458                 // the rest of the document if there isn't one:
 459                 // https://www.php.net/basic-syntax.instruction-separation
 460             } else {
 461                 // PHP ignores an immediately trailing newline after the
 462                 // closing tag:
 463                 // https://www.php.net/basic-syntax.instruction-separation
 464                 // Testing shows \n, \r and \r\n are skipped.
 465                 ++start;
 466                 if (*start == '\r') ++start;
 467                 if (*start == '\n') ++start;
 468             }
 469         } else {
 470             // Opening or closing tag.
 471             bool closing = false;
 472
 473             if (*start == '/') {
 474                 closing = true;
 475                 start = find_if(start + 1, text.end(), C_isnotspace);
 476             }
 477
 478             p = find_if(start, text.end(), p_nottag);
 479             string tag(start, p);
 480             if (state != XML) {
 481                 // Convert tagname to lowercase.
 482                 lowercase_string(tag);
 483             }
 484
 485             if (closing) {
 486                 if (!closing_tag(tag))
 487                     return;
 488                 if (state == HTML_IN_SCRIPT && tag == "script")
 489                     state = HTML;
 490             }
 491
 492             start = p;
 493             if (p < text.end() && *p != '>') {
 494                 // We often aren't asked for the attributes, so parse them
 495                 // lazily - for now we just need to skip balanced single and
 496                 // double quotes.
 497                 //
 498                 // Ignore attributes on closing tags (they're bogus) but still
 499                 // skip balanced quotes, since that's what browsers do.
 500                 while (true) {
 501                     p = find_if(p, text.end(),
 502                                 [](char ch) {
 503                                     return ch == '"' || ch == '\'' || ch == '>';
 504                                 });
 505                     if (p == text.end() || *p == '>') {
 506                         break;
 507                     }
 508                     if (*p == '"') {
 509                         p = find_if(p, text.end(),
 510                                     [](char ch) {
 511                                         return ch == '\'' || ch == '>';
 512                                     });
 513                     } else {
 514                         p = find_if(p, text.end(),
 515                                     [](char ch) {
 516                                         return ch == '"' || ch == '>';
 517                                     });
 518                     }
 519                 }
 520             }
 521
 522             if (!closing) {
 523                 attribute_len = p - start;
 524                 bool empty_element = false;
 525                 if (attribute_len > 0) {
 526                     // Check for empty element (e.g. <br/>).
 527                     attribute_data = &*start;
 528                     if (p[-1] == '/') {
 529                         // <a href=foo/> isn't an empty element though
 530                         if (attribute_len == 1 ||
 531                             C_isspace(p[-2]) ||
 532                             p[-2] == '"' ||
 533                             p[-2] == '\'') {
 534                             empty_element = true;
 535                             --attribute_len;
 536                         }
 537                     }
 538                 }
 539                 if (!opening_tag(tag))
 540                     return;
 541                 attribute_len = 0;
 542
 543                 if (empty_element) {
 544                     if (!closing_tag(tag))
 545                         return;
 546                 }
 547
 548                 if (state == HTML && tag == "script") {
 549                     // In HTML <script> tags we ignore opening tags to avoid
 550                     // problems with "a<b".
 551                     state = HTML_IN_SCRIPT;
 552                 }
 553
 554                 if (start != text.end() && *start == '>') ++start;
 555             }
 556
 557             if (p == text.end()) break;
 558             start = p + 1;
 559         }
 560     }
 561 }