xapian-applications/omega/htmlparse.cc

   1 /* htmlparse.cc: simple HTML parser for omega indexer
   2  *
   3  * Copyright 1999,2000,2001 BrightStation PLC
   4  * Copyright 2001 Ananova Ltd
   5  * Copyright 2002,2006,2007,2008,2009,2010,2011,2012,2015,2016,2018,2020 Olly Betts
   6  *
   7  * This program is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License as
   9  * published by the Free Software Foundation; either version 2 of the
  10  * License, or (at your option) any later version.
  11  *
  12  * This program is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  * GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with this program; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
  20  * USA
  21  */
  22
  23 #include <config.h>
  24
  25 #include "htmlparse.h"
  26
  27 #include <xapian.h>
  28
  29 #include "keyword.h"
  30 #include "namedents.h"
  31 #include "stringutils.h"
  32 #include "utf8convert.h"
  33
  34 #include <algorithm>
  35
  36 #include <cctype>
  37 #include <cstring>
  38 #include <cstdio>
  39 #include <cstdlib>
  40
  41 using namespace std;
  42
  43 // HTML5 legacy compatibility doctype.
  44 #define HTML5_LEGACY_COMPAT "about:legacy-compat"
  45 #define HTML5_LEGACY_COMPAT_LEN CONST_STRLEN(HTML5_LEGACY_COMPAT)
  46
  47 static inline void
  48 lowercase_string(string &str)
  49 {
  50     for (string::iterator i = str.begin(); i != str.end(); ++i) {
  51         *i = C_tolower(*i);
  52     }
  53 }
  54
  55 static inline bool
  56 p_nottag(char c)
  57 {
  58     // ':' for XML namespaces.
  59     return !C_isalnum(c) && c != '.' && c != '-' && c != ':';
  60 }
  61
  62 static inline bool
  63 p_whitespacegt(char c)
  64 {
  65     return C_isspace(c) || c == '>';
  66 }
  67
  68 static inline bool
  69 p_whitespaceeqgt(char c)
  70 {
  71     return C_isspace(c) || c == '=' || c == '>';
  72 }
  73
  74 bool
  75 HtmlParser::get_parameter(const string & param, string & value) const
  76 {
  77     map<string, string>::const_iterator i = parameters.find(param);
  78     if (i == parameters.end()) return false;
  79     value = i->second;
  80     return true;
  81 }
  82
  83 // UTF-8 encoded entity is always <= the entity itself in length, even if the
  84 // trailing ';' is missing - for numeric (decimal and hex) entities:
  85 //
  86 // <=           UTF-8   &#<..>  &#x<..>
  87 // U+007F       1       5       5
  88 // U+07FF       2       6       6
  89 // U+FFFF       3       7       7
  90 // U+1FFFFF     4       9       9
  91 // U+3FFFFFF    5       10      10
  92 // U+7FFFFFFF   6       12      11
  93 //
  94 // Also true for named entities.  This means we can work in-place within the
  95 // string.
  96
  97 void
  98 HtmlParser::decode_entities(string &s)
  99 {
 100     string::iterator out = s.begin();
 101     string::iterator in = out;
 102     string::iterator amp = in;
 103     while ((amp = find(amp, s.end(), '&')) != s.end()) {
 104         unsigned int val = 0;
 105         string::iterator end, p = amp + 1;
 106         if (p != s.end() && *p == '#') {
 107             ++p;
 108             if (p != s.end() && (*p == 'x' || *p == 'X')) {
 109                 // hex
 110                 while (++p != s.end() && C_isxdigit(*p)) {
 111                     val = (val << 4) | hex_digit(*p);
 112                 }
 113                 end = p;
 114             } else {
 115                 // number
 116                 while (p != s.end() && C_isdigit(*p)) {
 117                     val = val * 10 + (*p - '0');
 118                     ++p;
 119                 }
 120                 end = p;
 121             }
 122         } else {
 123             end = find_if(p, s.end(), C_isnotalnum);
 124             int k = keyword2(tab, s.data() + (p - s.begin()), end - p);
 125             if (k >= 0) val = named_ent_codepoint[k];
 126         }
 127         if (end != s.end() && *end == ';') ++end;
 128         if (val) {
 129             if (in != out) {
 130                 out = copy(in, amp, out);
 131             } else {
 132                 out = amp;
 133             }
 134             in = end;
 135             if (val < 0x80) {
 136                 *out++ = char(val);
 137             } else {
 138                 // Convert unicode value val to UTF-8.
 139                 char seq[4];
 140                 unsigned len = Xapian::Unicode::nonascii_to_utf8(val, seq);
 141                 out = copy(seq, seq + len, out);
 142             }
 143         }
 144         amp = end;
 145     }
 146
 147     if (in != out) {
 148         s.erase(out, in);
 149     }
 150 }
 151
 152 void
 153 HtmlParser::parse(const string& body)
 154 {
 155     // Check for BOM.
 156     string::const_iterator begin_after_bom = body.begin();
 157     if (body.size() >= 3) {
 158         switch (body[0]) {
 159           case '\xef':
 160             if (body[1] == '\xbb' && body[2] == '\xbf') {
 161                 charset = "utf-8";
 162                 begin_after_bom += 3;
 163             }
 164             break;
 165           case '\xfe':
 166           case '\xff':
 167             // Match either \xfe\xff or \xff\xfe.
 168             if ((body[1] ^ body[0]) == 1) {
 169                 // Convert to "utf-16" which will remove the BOM for us.
 170                 string utf8_body;
 171                 convert_to_utf8(body, "utf-16", utf8_body);
 172                 charset = "utf-8";
 173                 parse(utf8_body);
 174                 return;
 175             }
 176             break;
 177         }
 178     }
 179
 180     in_script = false;
 181
 182     parameters.clear();
 183     string::const_iterator start = begin_after_bom;
 184
 185     while (true) {
 186         // Skip through until we find an HTML tag, a comment, or the end of
 187         // document.  Ignore isolated occurrences of '<' which don't start
 188         // a tag or comment.
 189         string::const_iterator p = start;
 190         while (true) {
 191             p = find(p, body.end(), '<');
 192             if (p == body.end()) break;
 193             unsigned char ch = *(p + 1);
 194
 195             // Tag, closing tag, or comment (or SGML declaration).
 196             if ((!in_script && C_isalpha(ch)) || ch == '/' || ch == '!') break;
 197
 198             if (ch == '?') {
 199                 // PHP code or XML declaration.
 200                 // XML declaration is only valid at the start of the first line.
 201                 if (p != begin_after_bom || body.size() < 20) break;
 202
 203                 // XML declaration looks something like this:
 204                 // <?xml version="1.0" encoding="UTF-8"?>
 205                 if (p[2] != 'x' || p[3] != 'm' || p[4] != 'l') break;
 206                 if (strchr(" \t\r\n", p[5]) == NULL) break;
 207
 208                 string::const_iterator decl_end = find(p + 6, body.end(), '?');
 209                 if (decl_end == body.end()) break;
 210
 211                 // Default charset for XML is UTF-8.
 212                 charset = "utf-8";
 213
 214                 string decl(p + 6, decl_end);
 215                 size_t enc = decl.find("encoding");
 216                 if (enc == string::npos) break;
 217
 218                 enc = decl.find_first_not_of(" \t\r\n", enc + 8);
 219                 if (enc == string::npos || enc == decl.size()) break;
 220
 221                 if (decl[enc] != '=') break;
 222
 223                 enc = decl.find_first_not_of(" \t\r\n", enc + 1);
 224                 if (enc == string::npos || enc == decl.size()) break;
 225
 226                 if (decl[enc] != '"' && decl[enc] != '\'') break;
 227
 228                 char quote = decl[enc++];
 229                 size_t enc_end = decl.find(quote, enc);
 230
 231                 if (enc != string::npos)
 232                     charset.assign(decl, enc, enc_end - enc);
 233
 234                 break;
 235             }
 236             ++p;
 237         }
 238
 239         // Process text up to start of tag.
 240         if (p > start) {
 241             string text(body, start - body.begin(), p - start);
 242             convert_to_utf8(text, charset);
 243             decode_entities(text);
 244             process_text(text);
 245         }
 246
 247         if (p == body.end()) break;
 248
 249         start = p + 1;
 250
 251         if (start == body.end()) break;
 252
 253         if (*start == '!') {
 254             if (++start == body.end()) break;
 255
 256             // Comment, SGML declaration, or HTML5 DTD.
 257             char first_ch = *start;
 258             if (++start == body.end()) break;
 259             if (first_ch == '-' && *start == '-') {
 260                 ++start;
 261                 string::const_iterator close = find(start, body.end(), '>');
 262                 // An unterminated comment swallows rest of document
 263                 // (like Netscape, but unlike MSIE IIRC)
 264                 if (close == body.end()) break;
 265
 266                 p = close;
 267                 // look for -->
 268                 while (p != body.end() && (*(p - 1) != '-' || *(p - 2) != '-'))
 269                     p = find(p + 1, body.end(), '>');
 270
 271                 if (p != body.end()) {
 272                     // Check for htdig's "ignore this bit" comments.
 273                     if (p - start == CONST_STRLEN("htdig_noindex") + 2 &&
 274                         memcmp(&*start, "htdig_noindex",
 275                                CONST_STRLEN("htdig_noindex")) == 0) {
 276                         auto i = body.find("<!--/htdig_noindex-->",
 277                                            p + 1 - body.begin());
 278                         if (i == string::npos) break;
 279                         start = body.begin() + i +
 280                             CONST_STRLEN("<!--/htdig_noindex-->");
 281                         continue;
 282                     }
 283                     // Check for udmcomment (similar to htdig's)
 284                     if (p - start == CONST_STRLEN("UdmComment") + 2 &&
 285                         memcmp(&*start, "UdmComment",
 286                                CONST_STRLEN("UdmComment")) == 0) {
 287                         auto i = body.find("<!--/UdmComment-->",
 288                                            p + 1 - body.begin());
 289                         if (i == string::npos) break;
 290                         start = body.begin() + i +
 291                             CONST_STRLEN("<!--/UdmComment-->");
 292                         continue;
 293                     }
 294                     // If we found --> skip to there.
 295                     start = p;
 296                 } else {
 297                     // Otherwise skip to the first > we found (as Netscape does).
 298                     start = close;
 299                 }
 300             } else if (first_ch == '[' &&
 301                        body.size() - (start - body.begin()) > 6 &&
 302                        body.compare(start - body.begin(), 6, "CDATA[", 6) == 0) {
 303                 start += 6;
 304                 string::size_type b = start - body.begin();
 305                 string::size_type i;
 306                 i = body.find("]]>", b);
 307                 string text(body, b, i - b);
 308                 convert_to_utf8(text, charset);
 309                 process_text(text);
 310                 if (i == string::npos) break;
 311                 start = body.begin() + i + 2;
 312             } else if (C_tolower(first_ch) == 'd' &&
 313                        body.end() - start > 6 &&
 314                        C_tolower(start[0]) == 'o' &&
 315                        C_tolower(start[1]) == 'c' &&
 316                        C_tolower(start[2]) == 't' &&
 317                        C_tolower(start[3]) == 'y' &&
 318                        C_tolower(start[4]) == 'p' &&
 319                        C_tolower(start[5]) == 'e' &&
 320                        C_isspace(start[6])) {
 321                 // DOCTYPE declaration.
 322                 start += 7;
 323                 while (start != body.end() && C_isspace(*start)) {
 324                     ++start;
 325                 }
 326                 if (start == body.end()) break;
 327                 if (body.end() - start >= 5 &&
 328                     C_tolower(start[0]) == 'h' &&
 329                     C_tolower(start[1]) == 't' &&
 330                     C_tolower(start[2]) == 'm' &&
 331                     C_tolower(start[3]) == 'l' &&
 332                     (start[4] == '>' || C_isspace(start[4]))) {
 333                     start += 4;
 334
 335                     // HTML doctype.
 336                     while (start != body.end() && C_isspace(*start)) {
 337                         ++start;
 338                     }
 339                     if (start == body.end()) break;
 340
 341                     if (*start == '>') {
 342                         // <!DOCTYPE html>
 343                         // Default charset for HTML5 is UTF-8.
 344                         charset = "utf-8";
 345                     }
 346                 } else if (body.end() - start >= 29 &&
 347                            C_tolower(start[0]) == 's' &&
 348                            C_tolower(start[1]) == 'y' &&
 349                            C_tolower(start[2]) == 's' &&
 350                            C_tolower(start[3]) == 't' &&
 351                            C_tolower(start[4]) == 'e' &&
 352                            C_tolower(start[5]) == 'm' &&
 353                            C_isspace(start[6])) {
 354                     start += 7;
 355                     while (start != body.end() && C_isspace(*start)) {
 356                         ++start;
 357                     }
 358                     size_t left = body.end() - start;
 359                     if (left >= HTML5_LEGACY_COMPAT_LEN + 3 &&
 360                         (*start == '\'' || *start == '"') &&
 361                         start[HTML5_LEGACY_COMPAT_LEN + 1] == *start &&
 362                         body.compare(start - body.begin() + 1,
 363                                      HTML5_LEGACY_COMPAT_LEN,
 364                                      HTML5_LEGACY_COMPAT,
 365                                      HTML5_LEGACY_COMPAT_LEN) == 0) {
 366                         // HTML5 legacy compatibility doctype:
 367                         // <!DOCTYPE html SYSTEM "about:legacy-compat">
 368                         start += HTML5_LEGACY_COMPAT_LEN + 2;
 369                         // Default charset for HTML5 is UTF-8.
 370                         charset = "utf-8";
 371                     }
 372                 }
 373                 start = find(start - 1, body.end(), '>');
 374                 if (start == body.end()) break;
 375             } else {
 376                 // Some other SGML declaration - ignore it.
 377                 start = find(start - 1, body.end(), '>');
 378                 if (start == body.end()) break;
 379             }
 380             ++start;
 381         } else if (*start == '?') {
 382             if (++start == body.end()) break;
 383             // PHP - swallow until ?> or EOF
 384             start = find(start + 1, body.end(), '>');
 385
 386             // look for ?>
 387             while (start != body.end() && *(start - 1) != '?')
 388                 start = find(start + 1, body.end(), '>');
 389
 390             if (start == body.end()) {
 391                 // The closing ?> at the end of a file is optional so ignore
 392                 // the rest of the document if there isn't one:
 393                 // https://www.php.net/basic-syntax.instruction-separation
 394             } else {
 395                 // PHP ignores an immediately trailing newline after the
 396                 // closing tag:
 397                 // https://www.php.net/basic-syntax.instruction-separation
 398                 // Testing shows \n, \r and \r\n are skipped.
 399                 ++start;
 400                 if (*start == '\r') ++start;
 401                 if (*start == '\n') ++start;
 402             }
 403         } else {
 404             // opening or closing tag
 405             int closing = 0;
 406
 407             if (*start == '/') {
 408                 closing = 1;
 409                 start = find_if(start + 1, body.end(), C_isnotspace);
 410             }
 411
 412             p = start;
 413             start = find_if(start, body.end(), p_nottag);
 414             string tag(body, p - body.begin(), start - p);
 415             // convert tagname to lowercase
 416             lowercase_string(tag);
 417
 418             if (closing) {
 419                 if (!closing_tag(tag))
 420                     return;
 421                 if (in_script && tag == "script") in_script = false;
 422
 423                 /* ignore any bogus parameters on closing tags */
 424                 p = find(start, body.end(), '>');
 425                 if (p == body.end()) break;
 426                 start = p + 1;
 427             } else {
 428                 bool empty_element = false;
 429                 // FIXME: parse parameters lazily.
 430                 while (start < body.end() && *start != '>') {
 431                     string name, value;
 432
 433                     p = find_if(start, body.end(), p_whitespaceeqgt);
 434
 435                     size_t name_len = p - start;
 436                     if (name_len == 1) {
 437                         if (*start == '/' && p < body.end() && *p == '>') {
 438                             // E.g. <tag foo="bar" />
 439                             start = p;
 440                             empty_element = true;
 441                             break;
 442                         }
 443                     }
 444
 445                     name.assign(body, start - body.begin(), name_len);
 446
 447                     p = find_if(p, body.end(), C_isnotspace);
 448
 449                     start = p;
 450                     if (start != body.end() && *start == '=') {
 451                         start = find_if(start + 1, body.end(), C_isnotspace);
 452
 453                         p = body.end();
 454
 455                         int quote = *start;
 456                         if (quote == '"' || quote == '\'') {
 457                             ++start;
 458                             p = find(start, body.end(), quote);
 459                         }
 460
 461                         if (p != body.end()) {
 462                             // quoted
 463                             value.assign(body, start - body.begin(), p - start);
 464                             ++p;
 465                         } else {
 466                             // unquoted or no closing quote
 467                             p = find_if(start, body.end(), p_whitespacegt);
 468                             value.assign(body, start - body.begin(), p - start);
 469                         }
 470                         start = find_if(p, body.end(), C_isnotspace);
 471
 472                         if (!name.empty()) {
 473                             // convert parameter name to lowercase
 474                             lowercase_string(name);
 475                             // in case of multiple entries, use the first
 476                             // (as Netscape does)
 477                             parameters.insert(make_pair(name, value));
 478                         }
 479                     }
 480                 }
 481 #if 0
 482                 cout << "<" << tag;
 483                 map<string, string>::const_iterator x;
 484                 for (x = parameters.begin(); x != parameters.end(); ++x) {
 485                     cout << " " << x->first << "=\"" << x->second << "\"";
 486                 }
 487                 cout << ">\n";
 488 #endif
 489                 if (!opening_tag(tag))
 490                     return;
 491                 parameters.clear();
 492
 493                 if (empty_element) {
 494                     if (!closing_tag(tag))
 495                         return;
 496                 }
 497
 498                 // In <script> tags we ignore opening tags to avoid problems
 499                 // with "a<b".
 500                 if (tag == "script") in_script = true;
 501
 502                 if (start != body.end() && *start == '>') ++start;
 503             }
 504         }
 505     }
 506 }