xapian-applications/omega/myhtmlparse.cc

   1 /** @file
   2  * @brief subclass of HtmlParser for extracting text from HTML.
   3  */
   4 /* Copyright 1999,2000,2001 BrightStation PLC
   5  * Copyright 2002,2003,2004,2006,2007,2008,2010,2011,2012,2013,2014,2015,2017 Olly Betts
   6  *
   7  * This program is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License as
   9  * published by the Free Software Foundation; either version 2 of the
  10  * License, or (at your option) any later version.
  11  *
  12  * This program is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  * GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with this program; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
  20  * USA
  21  */
  22
  23 #include <config.h>
  24
  25 #include "myhtmlparse.h"
  26
  27 #include "datetime.h"
  28 #include "keyword.h"
  29 #include "my-html-tok.h"
  30 #include "stringutils.h"
  31 #include "utf8convert.h"
  32
  33 #include <cstring>
  34
  35 using namespace std;
  36
  37 static const char whitespace[] = "_ \t\r\r\f";
  38
  39 static inline void
  40 lowercase_string(string &str)
  41 {
  42     for (string::iterator i = str.begin(); i != str.end(); ++i) {
  43         *i = C_tolower(*i);
  44     }
  45 }
  46
  47 void
  48 MyHtmlParser::parse_html(const string &text, const string &charset_,
  49                          bool charset_from_meta_)
  50 {
  51     charset = charset_;
  52     charset_from_meta = charset_from_meta_;
  53     parse(text);
  54 }
  55
  56 void
  57 MyHtmlParser::process_text(const string &text)
  58 {
  59     if (!text.empty() && !in_script_tag && !in_style_tag) {
  60         string::size_type b = text.find_first_not_of(WHITESPACE);
  61         if (b && !pending_space) pending_space = SPACE;
  62         while (b != string::npos) {
  63             if (pending_space && !target->empty())
  64                 *target += whitespace[pending_space];
  65             string::size_type e = text.find_first_of(WHITESPACE, b);
  66             if (e == string::npos) {
  67                 target->append(text.data() + b, text.size() - b);
  68                 pending_space = 0;
  69                 return;
  70             }
  71             target->append(text.data() + b, e - b);
  72             pending_space = SPACE;
  73             b = text.find_first_not_of(WHITESPACE, e + 1);
  74         }
  75     }
  76 }
  77
  78 bool
  79 MyHtmlParser::opening_tag(const string &tag)
  80 {
  81     int k = keyword(tab, tag.data(), tag.size());
  82     if (k < 0)
  83         return true;
  84     pending_space = max(pending_space, (token_space[k] & TOKEN_SPACE_MASK));
  85     switch (html_tag(k)) {
  86         case P:
  87             if (pending_space < PAGE) {
  88                 string style;
  89                 if (get_parameter("style", style)) {
  90                     // As produced by Libreoffice's HTML export:
  91                     if (style.find("page-break-before: always") != string::npos)
  92                         pending_space = PAGE;
  93                 }
  94             }
  95             break;
  96         case META: {
  97             string content;
  98             if (get_parameter("content", content)) {
  99                 string name;
 100                 if (get_parameter("name", name)) {
 101                     lowercase_string(name);
 102                     if (name == "description") {
 103                         convert_to_utf8(content, charset);
 104                         decode_entities(content);
 105                         if (description_as_sample && sample.empty()) {
 106                             swap(sample, content);
 107                         } else {
 108                             // If we're not using the description as the
 109                             // sample, or for second and subsequent
 110                             // descriptions, treat as keywords.
 111                             if (keywords.empty()) {
 112                                 swap(keywords, content);
 113                             } else {
 114                                 keywords += ' ';
 115                                 keywords += content;
 116                             }
 117                         }
 118                     } else if (name == "keywords" ||
 119                                name == "dcterms.subject" ||
 120                                name == "dcterms.description") {
 121                         // LibreOffice HTML export puts "Subject" and
 122                         // "Keywords" into DCTERMS.subject, and "Comments"
 123                         // into DCTERMS.description.  Best option seems to
 124                         // be to treat all of these as keywords, i.e. just
 125                         // more text to index, but not show in/as the
 126                         // sample.
 127                         if (!keywords.empty()) keywords += ' ';
 128                         convert_to_utf8(content, charset);
 129                         decode_entities(content);
 130                         keywords += content;
 131                     } else if (name == "author" ||
 132                                name == "dcterms.creator" ||
 133                                name == "dcterms.contributor") {
 134                         // LibreOffice HTML export includes DCTERMS.creator
 135                         // and DCTERMS.contributor.
 136                         if (!author.empty()) author += ' ';
 137                         convert_to_utf8(content, charset);
 138                         decode_entities(content);
 139                         author += content;
 140                     } else if (name == "classification") {
 141                         if (!topic.empty()) topic += ' ';
 142                         convert_to_utf8(content, charset);
 143                         decode_entities(content);
 144                         topic += content;
 145                     } else if (!ignoring_metarobots && name == "robots") {
 146                         decode_entities(content);
 147                         lowercase_string(content);
 148                         if (content.find("none") != string::npos ||
 149                             content.find("noindex") != string::npos) {
 150                             indexing_allowed = false;
 151                             return false;
 152                         }
 153                     } else if (name == "created" ||
 154                                name == "dcterms.issued") {
 155                         created = parse_datetime(content);
 156                     }
 157                     break;
 158                 }
 159                 // If the current charset came from a meta tag, don't
 160                 // force reparsing again!
 161                 if (charset_from_meta) break;
 162                 string hdr;
 163                 if (get_parameter("http-equiv", hdr)) {
 164                     lowercase_string(hdr);
 165                     if (hdr == "content-type") {
 166                         lowercase_string(content);
 167                         size_t start = content.find("charset=");
 168                         if (start == string::npos) break;
 169                         start += 8;
 170                         if (start == content.size()) break;
 171                         size_t end = start;
 172                         if (content[start] != '"') {
 173                             while (end < content.size()) {
 174                                 unsigned char ch = content[end];
 175                                 if (ch <= 32 || ch >= 127 ||
 176                                     strchr(";()<>@,:\\\"/[]?={}", ch))
 177                                     break;
 178                                 ++end;
 179                             }
 180                         } else {
 181                             ++start;
 182                             ++end;
 183                             while (end < content.size()) {
 184                                 unsigned char ch = content[end];
 185                                 if (ch == '"') break;
 186                                 if (ch == '\\') content.erase(end, 1);
 187                                 ++end;
 188                             }
 189                         }
 190                         string newcharset(content, start, end - start);
 191                         if (charset != newcharset) {
 192                             throw newcharset;
 193                         }
 194                     }
 195                 }
 196                 break;
 197             }
 198             if (charset_from_meta) break;
 199             string newcharset;
 200             if (get_parameter("charset", newcharset)) {
 201                 // HTML5 added: <meta charset="...">
 202                 lowercase_string(newcharset);
 203                 if (charset != newcharset) {
 204                     throw newcharset;
 205                 }
 206             }
 207             break;
 208         }
 209         case STYLE:
 210             in_style_tag = true;
 211             break;
 212         case SCRIPT:
 213             in_script_tag = true;
 214             break;
 215         case TITLE:
 216             target = &title;
 217             pending_space = 0;
 218             break;
 219         default:
 220             /* No action */
 221             break;
 222     }
 223     return true;
 224 }
 225
 226 bool
 227 MyHtmlParser::closing_tag(const string &tag)
 228 {
 229     int k = keyword(tab, tag.data(), tag.size());
 230     if (k < 0 || (token_space[k] & NOCLOSE))
 231         return true;
 232     pending_space = max(pending_space, (token_space[k] & TOKEN_SPACE_MASK));
 233     switch (html_tag(k)) {
 234         case STYLE:
 235             in_style_tag = false;
 236             break;
 237         case SCRIPT:
 238             in_script_tag = false;
 239             break;
 240         case TITLE:
 241             target = &dump;
 242             pending_space = 0;
 243             break;
 244         default:
 245             /* No action */
 246             break;
 247     }
 248     return true;
 249 }