xapian-applications/omega/myhtmlparse.h

   1 /** @file
   2  * @brief subclass of HtmlParser for extracting text from HTML.
   3  */
   4 /* Copyright 1999,2000,2001 BrightStation PLC
   5  * Copyright 2002,2003,2004,2006,2008,2010,2011,2012,2013,2016,2017 Olly Betts
   6  *
   7  * This program is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License as
   9  * published by the Free Software Foundation; either version 2 of the
  10  * License, or (at your option) any later version.
  11  *
  12  * This program is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  * GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with this program; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
  20  * USA
  21  */
  22
  23 #ifndef OMEGA_INCLUDED_MYHTMLPARSE_H
  24 #define OMEGA_INCLUDED_MYHTMLPARSE_H
  25
  26 #include "htmlparse.h"
  27
  28 #include <ctime>
  29
   30 // Space, tab, newline and carriage return.  FIXME: should \xa0 be included
   31 // too?  It's a non-breaking space in iso-8859-1 but not in every charset;
   32 // perhaps a span consisting only of \xa0 should collapse to a single \xa0.
   33 #define WHITESPACE " \t\n\r"
  34
  35 class MyHtmlParser : public HtmlParser {
  36   public:
  37     int pending_space;
  38     bool in_script_tag;
  39     bool in_style_tag;
  40     bool indexing_allowed;
  41     bool ignoring_metarobots;
  42     bool charset_from_meta;
  43     bool description_as_sample;
  44     string title, sample, keywords, dump, author, topic;
  45     time_t created;
  46     string * target;
  47
  48     void process_text(const string &text);
  49     bool opening_tag(const string &tag);
  50     bool closing_tag(const string &tag);
  51     void parse_html(const string &text, const string &charset_,
  52                     bool charset_from_meta_);
  53     void ignore_metarobots() { ignoring_metarobots = true; }
  54     MyHtmlParser()
  55         : pending_space(0),
  56           in_script_tag(false),
  57           in_style_tag(false),
  58           indexing_allowed(true),
  59           ignoring_metarobots(false),
  60           charset_from_meta(false),
  61           description_as_sample(false),
  62           created(time_t(-1)),
  63           target(&dump) { }
  64
  65     void reset() {
  66         pending_space = 0;
  67         in_script_tag = false;
  68         in_style_tag = false;
  69         indexing_allowed = true;
  70         ignoring_metarobots = false;
  71         charset_from_meta = false;
  72         description_as_sample = false;
  73         title.resize(0);
  74         sample.resize(0);
  75         keywords.resize(0);
  76         dump.resize(0);
  77         author.resize(0);
  78         topic.resize(0);
  79         created = time_t(-1);
  80         target = &dump;
  81     }
  82 };
  83
  84 #endif // OMEGA_INCLUDED_MYHTMLPARSE_H