Index gzip-compressed SVG files
[xapian.git] / xapian-applications / omega / myhtmlparse.h
blob2d568b156b9ad415b6cb84cfbabfa6c110d23ee2
1 /** @file
2 * @brief subclass of HtmlParser for extracting text from HTML.
3 */
4 /* Copyright 1999,2000,2001 BrightStation PLC
5 * Copyright 2002,2003,2004,2006,2008,2010,2011,2012,2013,2016,2017 Olly Betts
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
20 * USA
21 */
23 #ifndef OMEGA_INCLUDED_MYHTMLPARSE_H
24 #define OMEGA_INCLUDED_MYHTMLPARSE_H
26 #include "htmlparse.h"
28 #include <ctime>
30 // FIXME: Should we include \xa0, which is a non-breaking space in iso-8859-1 but
31 // not in all charsets?  And perhaps a span made up entirely of \xa0 should
32 // become a single \xa0?
// The characters listed here are space, tab, newline and carriage return.
33 #define WHITESPACE " \t\n\r"
// MyHtmlParser derives from HtmlParser and carries the state and the results
// of extracting text from an HTML document.  Every member is public.
// Only the constructor, ignore_metarobots() and reset() are defined in this
// header; the other member functions are declarations whose definitions live
// elsewhere.
35 class MyHtmlParser : public HtmlParser {
36 public:
// Integer counter, 0 initially.  Presumably tracks whitespace that has been
// seen but not yet written out -- confirm against the implementation.
37 int pending_space;
// Both start false.  Presumably true while inside <script> / <style> elements
// -- confirm against opening_tag()/closing_tag().
38 bool in_script_tag;
39 bool in_style_tag;
// Starts true.  NOTE(review): looks like this is what a robots meta tag would
// clear -- confirm.
40 bool indexing_allowed;
// Starts false; set to true by ignore_metarobots().
41 bool ignoring_metarobots;
// Both start false.  How they are consumed is not visible in this header.
42 bool charset_from_meta;
43 bool description_as_sample;
// String fields; all of them are emptied by reset().
44 string title, sample, keywords, dump, author, topic;
// Initialised to time_t(-1); presumably that value means "not set" -- confirm.
45 time_t created;
// Points at `dump` after construction and after reset().  Presumably it
// selects which of the string fields currently receives text -- confirm.
46 string * target;
// Declarations only.  NOTE(review): these look like overrides of HtmlParser
// hooks -- confirm against htmlparse.h.
48 void process_text(const string &text);
49 bool opening_tag(const string &tag);
50 bool closing_tag(const string &tag);
51 void parse_html(const string &text, const string &charset_,
52 bool charset_from_meta_);
// Sets the ignoring_metarobots flag.  reset() clears it again, so a caller
// that wants the flag set must call this again after each reset().
53 void ignore_metarobots() { ignoring_metarobots = true; }
// Default constructor: zeroes pending_space, sets indexing_allowed to true and
// every other flag to false, sets created to time_t(-1) and points target at
// dump.  The string members are default-constructed.
54 MyHtmlParser()
55 : pending_space(0),
56 in_script_tag(false),
57 in_style_tag(false),
58 indexing_allowed(true),
59 ignoring_metarobots(false),
60 charset_from_meta(false),
61 description_as_sample(false),
62 created(time_t(-1)),
63 target(&dump) { }
// Puts every member declared in this class back to the value the constructor
// gives it and empties all six string fields.  Base-class state is not touched
// by the statements visible here.
65 void reset() {
66 pending_space = 0;
67 in_script_tag = false;
68 in_style_tag = false;
69 indexing_allowed = true;
70 ignoring_metarobots = false;
71 charset_from_meta = false;
72 description_as_sample = false;
// resize(0) empties each string.
73 title.resize(0);
74 sample.resize(0);
75 keywords.resize(0);
76 dump.resize(0);
77 author.resize(0);
78 topic.resize(0);
79 created = time_t(-1);
80 target = &dump;
// NOTE(review): the closing braces of reset() and of the class are not present
// in this copy of the file.
84 #endif // OMEGA_INCLUDED_MYHTMLPARSE_H