xapian-applications/omega/htmlparser.cc

   1 /** @file
   2  * @brief subclass of XmlParser for extracting text from HTML.
   3  */
   4 /* Copyright 1999,2000,2001 BrightStation PLC
   5  * Copyright 2002-2023 Olly Betts
   6  *
   7  * This program is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License as
   9  * published by the Free Software Foundation; either version 2 of the
  10  * License, or (at your option) any later version.
  11  *
  12  * This program is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  * GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with this program; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
  20  * USA
  21  */
  22
  23 #include <config.h>
  24
  25 #include "htmlparser.h"
  26
  27 #include "datetime.h"
  28 #include "html-tok.h"
  29 #include "keyword.h"
  30 #include "stringutils.h"
  31 #include "utf8convert.h"
  32
  33 #include <cstring>
  34
  35 using namespace std;
  36
  37 static inline void
  38 lowercase_string(string &str)
  39 {
  40     for (string::iterator i = str.begin(); i != str.end(); ++i) {
  41         *i = C_tolower(*i);
  42     }
  43 }
  44
  45 void
  46 HtmlParser::parse(string_view text,
  47                   const string& charset_,
  48                   bool charset_from_meta_)
  49 {
  50     charset = charset_;
  51     charset_from_meta = charset_from_meta_;
  52     XmlParser::parse(text);
  53 }
  54
  55 void
  56 HtmlParser::process_content(const string& content)
  57 {
  58     if (!content.empty() && !in_script_tag && !in_style_tag) {
  59         string::size_type b = content.find_first_not_of(WHITESPACE);
  60         if (b) pending_space = true;
  61         while (b != string::npos) {
  62             if (pending_space && !target->empty())
  63                 *target += ' ';
  64             string::size_type e = content.find_first_of(WHITESPACE, b);
  65             if (e == string::npos) {
  66                 target->append(content.data() + b, content.size() - b);
  67                 pending_space = false;
  68                 return;
  69             }
  70             target->append(content.data() + b, e - b);
  71             pending_space = true;
  72             b = content.find_first_not_of(WHITESPACE, e + 1);
  73         }
  74     }
  75 }
  76
  77 bool
  78 HtmlParser::opening_tag(const string& tag)
  79 {
  80     int k = keyword(tab, tag.data(), tag.size());
  81     if (k < 0)
  82         return true;
  83     pending_space = pending_space || (token_flags[k] & TOKEN_SPACE);
  84     switch (html_tag(k)) {
  85         case INPUT: {
  86             string type;
  87             if (!get_attribute("type", type))
  88                 break;
  89             if (type == "checkbox") {
  90                 if (get_attribute("checked", type)) {
  91                     *target += "\xe2\x98\x91"; // U+2611 BALLOT BOX WITH CHECK
  92                 } else {
  93                     *target += "\xe2\x98\x90"; // U+2610 BALLOT BOX
  94                 }
  95             }
  96             break;
  97         }
  98         case META: {
  99             string content;
 100             if (get_attribute("content", content)) {
 101                 string name;
 102                 if (get_attribute("name", name)) {
 103                     lowercase_string(name);
 104                     if (name == "description") {
 105                         convert_to_utf8(content, charset);
 106                         decode_entities(content);
 107                         if (description_as_sample && sample.empty()) {
 108                             swap(sample, content);
 109                         } else {
 110                             // If we're not using the description as the
 111                             // sample, or for second and subsequent
 112                             // descriptions, treat as keywords.
 113                             if (keywords.empty()) {
 114                                 swap(keywords, content);
 115                             } else {
 116                                 keywords += ' ';
 117                                 keywords += content;
 118                             }
 119                         }
 120                     } else if (name == "keywords" ||
 121                                name == "dcterms.subject" ||
 122                                name == "dcterms.description") {
 123                         // LibreOffice HTML export puts "Subject" and
 124                         // "Keywords" into DCTERMS.subject, and "Comments"
 125                         // into DCTERMS.description.  Best option seems to
 126                         // be to treat all of these as keywords, i.e. just
 127                         // more text to index, but not show in/as the
 128                         // sample.
 129                         if (!keywords.empty()) keywords += ' ';
 130                         convert_to_utf8(content, charset);
 131                         decode_entities(content);
 132                         keywords += content;
 133                     } else if (name == "author" ||
 134                                name == "dcterms.creator" ||
 135                                name == "dcterms.contributor") {
 136                         // LibreOffice HTML export includes DCTERMS.creator
 137                         // and DCTERMS.contributor.
 138                         if (!author.empty()) author += ' ';
 139                         convert_to_utf8(content, charset);
 140                         decode_entities(content);
 141                         author += content;
 142                     } else if (name == "classification") {
 143                         if (!topic.empty()) topic += ' ';
 144                         convert_to_utf8(content, charset);
 145                         decode_entities(content);
 146                         topic += content;
 147                     } else if (!ignoring_metarobots && name == "robots") {
 148                         decode_entities(content);
 149                         lowercase_string(content);
 150                         if (content.find("none") != string::npos ||
 151                             content.find("noindex") != string::npos) {
 152                             indexing_allowed = false;
 153                             return false;
 154                         }
 155                     } else if (name == "created" ||
 156                                name == "dcterms.issued") {
 157                         created = parse_datetime(content);
 158                     }
 159                     break;
 160                 }
 161                 // If the current charset came from a meta tag, don't
 162                 // force reparsing again!
 163                 if (charset_from_meta) break;
 164                 string hdr;
 165                 if (get_attribute("http-equiv", hdr)) {
 166                     lowercase_string(hdr);
 167                     if (hdr == "content-type") {
 168                         lowercase_string(content);
 169                         size_t start = content.find("charset=");
 170                         if (start == string::npos) break;
 171                         start += 8;
 172                         if (start == content.size()) break;
 173                         size_t end = start;
 174                         if (content[start] != '"') {
 175                             while (end < content.size()) {
 176                                 unsigned char ch = content[end];
 177                                 if (ch <= 32 || ch >= 127 ||
 178                                     strchr(";()<>@,:\\\"/[]?={}", ch))
 179                                     break;
 180                                 ++end;
 181                             }
 182                         } else {
 183                             ++start;
 184                             ++end;
 185                             while (end < content.size()) {
 186                                 unsigned char ch = content[end];
 187                                 if (ch == '"') break;
 188                                 if (ch == '\\') content.erase(end, 1);
 189                                 ++end;
 190                             }
 191                         }
 192                         string newcharset(content, start, end - start);
 193                         if (charset != newcharset) {
 194                             throw newcharset;
 195                         }
 196                     }
 197                 }
 198                 break;
 199             }
 200             if (charset_from_meta) break;
 201             string newcharset;
 202             if (get_attribute("charset", newcharset)) {
 203                 // HTML5 added: <meta charset="...">
 204                 lowercase_string(newcharset);
 205                 if (charset != newcharset) {
 206                     throw newcharset;
 207                 }
 208             }
 209             break;
 210         }
 211         case STYLE:
 212             in_style_tag = true;
 213             break;
 214         case SCRIPT:
 215             in_script_tag = true;
 216             break;
 217         case TITLE:
 218             target = &title;
 219             pending_space = false;
 220             break;
 221         default:
 222             /* No action */
 223             break;
 224     }
 225     return true;
 226 }
 227
 228 bool
 229 HtmlParser::closing_tag(const string& tag)
 230 {
 231     int k = keyword(tab, tag.data(), tag.size());
 232     if (k < 0 || (token_flags[k] & TOKEN_VOID))
 233         return true;
 234     pending_space = pending_space || (token_flags[k] & TOKEN_SPACE);
 235     switch (html_tag(k)) {
 236         case STYLE:
 237             in_style_tag = false;
 238             break;
 239         case SCRIPT:
 240             in_script_tag = false;
 241             break;
 242         case TITLE:
 243             target = &dump;
 244             pending_space = false;
 245             break;
 246         default:
 247             /* No action */
 248             break;
 249     }
 250     return true;
 251 }