Fix typo in WSA error code
[xapian.git] / xapian-applications / omega / myhtmlparse.cc
blob2dbf53e569f65db26d7ee18817f314f65e823a6d
1 /** @file
2 * @brief subclass of HtmlParser for extracting text from HTML.
3 */
/* Copyright 1999,2000,2001 BrightStation PLC
 * Copyright 2002,2003,2004,2006,2007,2008,2010,2011,2012,2013,2014,2015,2017 Olly Betts
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
 * USA
 */
23 #include <config.h>
25 #include "myhtmlparse.h"
27 #include "datetime.h"
28 #include "keyword.h"
29 #include "my-html-tok.h"
30 #include "stringutils.h"
31 #include "utf8convert.h"
33 #include <cstring>
35 using namespace std;
/** Separator characters, indexed by the pending whitespace level.
 *
 *  process_text() only appends an entry from this table when pending_space
 *  is non-zero, so the '_' in slot 0 is just a placeholder there.
 */
static const char whitespace[] = {
    '_', ' ', '\t', '\r', '\r', '\f', '\0'
};
39 static inline void
40 lowercase_string(string &str)
42 for (string::iterator i = str.begin(); i != str.end(); ++i) {
43 *i = C_tolower(*i);
47 void
48 MyHtmlParser::parse_html(const string &text, const string &charset_,
49 bool charset_from_meta_)
51 charset = charset_;
52 charset_from_meta = charset_from_meta_;
53 parse(text);
56 void
57 MyHtmlParser::process_text(const string &text)
59 if (!text.empty() && !in_script_tag && !in_style_tag) {
60 string::size_type b = text.find_first_not_of(WHITESPACE);
61 if (b && !pending_space) pending_space = SPACE;
62 while (b != string::npos) {
63 if (pending_space && !target->empty())
64 *target += whitespace[pending_space];
65 string::size_type e = text.find_first_of(WHITESPACE, b);
66 if (e == string::npos) {
67 target->append(text.data() + b, text.size() - b);
68 pending_space = 0;
69 return;
71 target->append(text.data() + b, e - b);
72 pending_space = SPACE;
73 b = text.find_first_not_of(WHITESPACE, e + 1);
78 bool
79 MyHtmlParser::opening_tag(const string &tag)
81 int k = keyword(tab, tag.data(), tag.size());
82 if (k < 0)
83 return true;
84 pending_space = max(pending_space, (token_space[k] & TOKEN_SPACE_MASK));
85 switch (html_tag(k)) {
86 case P:
87 if (pending_space < PAGE) {
88 string style;
89 if (get_parameter("style", style)) {
90 // As produced by Libreoffice's HTML export:
91 if (style.find("page-break-before: always") != string::npos)
92 pending_space = PAGE;
95 break;
96 case META: {
97 string content;
98 if (get_parameter("content", content)) {
99 string name;
100 if (get_parameter("name", name)) {
101 lowercase_string(name);
102 if (name == "description") {
103 convert_to_utf8(content, charset);
104 decode_entities(content);
105 if (description_as_sample && sample.empty()) {
106 swap(sample, content);
107 } else {
108 // If we're not using the description as the
109 // sample, or for second and subsequent
110 // descriptions, treat as keywords.
111 if (keywords.empty()) {
112 swap(keywords, content);
113 } else {
114 keywords += ' ';
115 keywords += content;
118 } else if (name == "keywords" ||
119 name == "dcterms.subject" ||
120 name == "dcterms.description") {
121 // LibreOffice HTML export puts "Subject" and
122 // "Keywords" into DCTERMS.subject, and "Comments"
123 // into DCTERMS.description. Best option seems to
124 // be to treat all of these as keywords, i.e. just
125 // more text to index, but not show in/as the
126 // sample.
127 if (!keywords.empty()) keywords += ' ';
128 convert_to_utf8(content, charset);
129 decode_entities(content);
130 keywords += content;
131 } else if (name == "author" ||
132 name == "dcterms.creator" ||
133 name == "dcterms.contributor") {
134 // LibreOffice HTML export includes DCTERMS.creator
135 // and DCTERMS.contributor.
136 if (!author.empty()) author += ' ';
137 convert_to_utf8(content, charset);
138 decode_entities(content);
139 author += content;
140 } else if (name == "classification") {
141 if (!topic.empty()) topic += ' ';
142 convert_to_utf8(content, charset);
143 decode_entities(content);
144 topic += content;
145 } else if (!ignoring_metarobots && name == "robots") {
146 decode_entities(content);
147 lowercase_string(content);
148 if (content.find("none") != string::npos ||
149 content.find("noindex") != string::npos) {
150 indexing_allowed = false;
151 return false;
153 } else if (name == "created" ||
154 name == "dcterms.issued") {
155 created = parse_datetime(content);
157 break;
159 // If the current charset came from a meta tag, don't
160 // force reparsing again!
161 if (charset_from_meta) break;
162 string hdr;
163 if (get_parameter("http-equiv", hdr)) {
164 lowercase_string(hdr);
165 if (hdr == "content-type") {
166 lowercase_string(content);
167 size_t start = content.find("charset=");
168 if (start == string::npos) break;
169 start += 8;
170 if (start == content.size()) break;
171 size_t end = start;
172 if (content[start] != '"') {
173 while (end < content.size()) {
174 unsigned char ch = content[end];
175 if (ch <= 32 || ch >= 127 ||
176 strchr(";()<>@,:\\\"/[]?={}", ch))
177 break;
178 ++end;
180 } else {
181 ++start;
182 ++end;
183 while (end < content.size()) {
184 unsigned char ch = content[end];
185 if (ch == '"') break;
186 if (ch == '\\') content.erase(end, 1);
187 ++end;
190 string newcharset(content, start, end - start);
191 if (charset != newcharset) {
192 throw newcharset;
196 break;
198 if (charset_from_meta) break;
199 string newcharset;
200 if (get_parameter("charset", newcharset)) {
201 // HTML5 added: <meta charset="...">
202 lowercase_string(newcharset);
203 if (charset != newcharset) {
204 throw newcharset;
207 break;
209 case STYLE:
210 in_style_tag = true;
211 break;
212 case SCRIPT:
213 in_script_tag = true;
214 break;
215 case TITLE:
216 target = &title;
217 pending_space = 0;
218 break;
219 default:
220 /* No action */
221 break;
223 return true;
226 bool
227 MyHtmlParser::closing_tag(const string &tag)
229 int k = keyword(tab, tag.data(), tag.size());
230 if (k < 0 || (token_space[k] & NOCLOSE))
231 return true;
232 pending_space = max(pending_space, (token_space[k] & TOKEN_SPACE_MASK));
233 switch (html_tag(k)) {
234 case STYLE:
235 in_style_tag = false;
236 break;
237 case SCRIPT:
238 in_script_tag = false;
239 break;
240 case TITLE:
241 target = &dump;
242 pending_space = 0;
243 break;
244 default:
245 /* No action */
246 break;
248 return true;