Fix tg_termpos1 for 64-bit termpos
[xapian.git] / xapian-applications / omega / htmlparser.cc
blob da269506edcaa7b212e94d8a962ac13ee3e8a14c
1 /** @file
2 * @brief subclass of XmlParser for extracting text from HTML.
3 */
4 /* Copyright 1999,2000,2001 BrightStation PLC
5 * Copyright 2002-2023 Olly Betts
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
20 * USA
23 #include <config.h>
25 #include "htmlparser.h"
27 #include "datetime.h"
28 #include "html-tok.h"
29 #include "keyword.h"
30 #include "stringutils.h"
31 #include "utf8convert.h"
33 #include <cstring>
35 using namespace std;
37 static inline void
38 lowercase_string(string &str)
40 for (string::iterator i = str.begin(); i != str.end(); ++i) {
41 *i = C_tolower(*i);
45 void
46 HtmlParser::parse(string_view text,
47 const string& charset_,
48 bool charset_from_meta_)
50 charset = charset_;
51 charset_from_meta = charset_from_meta_;
52 XmlParser::parse(text);
55 void
56 HtmlParser::process_content(const string& content)
58 if (!content.empty() && !in_script_tag && !in_style_tag) {
59 string::size_type b = content.find_first_not_of(WHITESPACE);
60 if (b) pending_space = true;
61 while (b != string::npos) {
62 if (pending_space && !target->empty())
63 *target += ' ';
64 string::size_type e = content.find_first_of(WHITESPACE, b);
65 if (e == string::npos) {
66 target->append(content.data() + b, content.size() - b);
67 pending_space = false;
68 return;
70 target->append(content.data() + b, e - b);
71 pending_space = true;
72 b = content.find_first_not_of(WHITESPACE, e + 1);
/** Handle an opening tag.
 *
 *  The tag name is looked up with keyword(); tags it does not know are
 *  ignored.  Known tags may set pending_space (TOKEN_SPACE flag) and then:
 *   - INPUT: a checkbox appends a ballot-box character to *target.
 *   - META: feeds sample/keywords/author/topic/created, honours "robots",
 *     and detects a charset declaration.
 *   - STYLE / SCRIPT: set in_style_tag / in_script_tag.
 *   - TITLE: redirects target to the title string.
 *
 *  @return false only when a robots meta tag forbids indexing (in which case
 *	    indexing_allowed is also set to false); true otherwise.
 *
 *  @throws std::string  the newly declared charset name, when a meta tag
 *	    declares a charset different from the current one and
 *	    charset_from_meta is not set.  Presumably the caller catches this
 *	    and reparses with the new charset -- confirm against the caller.
 */
77 bool
78 HtmlParser::opening_tag(const string& tag)
// A negative result from keyword() means the tag is not in the table, so
// there is nothing to do for it.
80 int k = keyword(tab, tag.data(), tag.size());
81 if (k < 0)
82 return true;
// Tags flagged TOKEN_SPACE cause a space to be owed before the next text.
83 pending_space = pending_space || (token_flags[k] & TOKEN_SPACE);
84 switch (html_tag(k)) {
85 case INPUT: {
86 string type;
87 if (!get_attribute("type", type))
88 break;
89 if (type == "checkbox") {
// `type` is reused as a scratch buffer here; only the presence of the
// "checked" attribute matters, not its value.
90 if (get_attribute("checked", type)) {
91 *target += "\xe2\x98\x91"; // U+2611 BALLOT BOX WITH CHECK
92 } else {
93 *target += "\xe2\x98\x90"; // U+2610 BALLOT BOX
96 break;
98 case META: {
99 string content;
100 if (get_attribute("content", content)) {
101 string name;
102 if (get_attribute("name", name)) {
// Meta names are compared case-insensitively by lowercasing first.
103 lowercase_string(name);
104 if (name == "description") {
105 convert_to_utf8(content, charset);
106 decode_entities(content);
107 if (description_as_sample && sample.empty()) {
108 swap(sample, content);
109 } else {
110 // If we're not using the description as the
111 // sample, or for second and subsequent
112 // descriptions, treat as keywords.
113 if (keywords.empty()) {
114 swap(keywords, content);
115 } else {
116 keywords += ' ';
117 keywords += content;
120 } else if (name == "keywords" ||
121 name == "dcterms.subject" ||
122 name == "dcterms.description") {
123 // LibreOffice HTML export puts "Subject" and
124 // "Keywords" into DCTERMS.subject, and "Comments"
125 // into DCTERMS.description. Best option seems to
126 // be to treat all of these as keywords, i.e. just
127 // more text to index, but not show in/as the
128 // sample.
129 if (!keywords.empty()) keywords += ' ';
130 convert_to_utf8(content, charset);
131 decode_entities(content);
132 keywords += content;
133 } else if (name == "author" ||
134 name == "dcterms.creator" ||
135 name == "dcterms.contributor") {
136 // LibreOffice HTML export includes DCTERMS.creator
137 // and DCTERMS.contributor.
138 if (!author.empty()) author += ' ';
139 convert_to_utf8(content, charset);
140 decode_entities(content);
141 author += content;
142 } else if (name == "classification") {
143 if (!topic.empty()) topic += ' ';
144 convert_to_utf8(content, charset);
145 decode_entities(content);
146 topic += content;
147 } else if (!ignoring_metarobots && name == "robots") {
// Substring match on the lowercased value: "none" or "noindex" anywhere
// in the content disables indexing and makes this function return false.
148 decode_entities(content);
149 lowercase_string(content);
150 if (content.find("none") != string::npos ||
151 content.find("noindex") != string::npos) {
152 indexing_allowed = false;
153 return false;
155 } else if (name == "created" ||
156 name == "dcterms.issued") {
157 created = parse_datetime(content);
// A meta tag with a name attribute is fully handled by the chain above.
159 break;
161 // If the current charset came from a meta tag, don't
162 // force reparsing again!
163 if (charset_from_meta) break;
164 string hdr;
165 if (get_attribute("http-equiv", hdr)) {
166 lowercase_string(hdr);
167 if (hdr == "content-type") {
// Extract the value following "charset=" from the lowercased content.
168 lowercase_string(content);
169 size_t start = content.find("charset=");
170 if (start == string::npos) break;
// 8 == strlen("charset="); give up if nothing follows it.
171 start += 8;
172 if (start == content.size()) break;
173 size_t end = start;
174 if (content[start] != '"') {
// Unquoted value: stop at the first control/space/non-ASCII byte or at
// any of the separator characters listed below.
175 while (end < content.size()) {
176 unsigned char ch = content[end];
177 if (ch <= 32 || ch >= 127 ||
178 strchr(";()<>@,:\\\"/[]?={}", ch))
179 break;
180 ++end;
182 } else {
// Quoted value: skip the opening quote and scan to the closing quote.
// A backslash is erased in place, so the ++end that follows steps over
// the character it escaped.
183 ++start;
184 ++end;
185 while (end < content.size()) {
186 unsigned char ch = content[end];
187 if (ch == '"') break;
188 if (ch == '\\') content.erase(end, 1);
189 ++end;
// Signal a charset change by throwing the new charset name.
192 string newcharset(content, start, end - start);
193 if (charset != newcharset) {
194 throw newcharset;
198 break;
// No content attribute: look for the standalone charset attribute instead,
// again unless the current charset already came from a meta tag.
200 if (charset_from_meta) break;
201 string newcharset;
202 if (get_attribute("charset", newcharset)) {
203 // HTML5 added: <meta charset="...">
204 lowercase_string(newcharset);
205 if (charset != newcharset) {
206 throw newcharset;
209 break;
// process_content() discards text while either of these flags is set.
211 case STYLE:
212 in_style_tag = true;
213 break;
214 case SCRIPT:
215 in_script_tag = true;
216 break;
// Text inside <title> is collected into `title`, starting with no owed space.
217 case TITLE:
218 target = &title;
219 pending_space = false;
220 break;
221 default:
222 /* No action */
223 break;
225 return true;
228 bool
229 HtmlParser::closing_tag(const string& tag)
231 int k = keyword(tab, tag.data(), tag.size());
232 if (k < 0 || (token_flags[k] & TOKEN_VOID))
233 return true;
234 pending_space = pending_space || (token_flags[k] & TOKEN_SPACE);
235 switch (html_tag(k)) {
236 case STYLE:
237 in_style_tag = false;
238 break;
239 case SCRIPT:
240 in_script_tag = false;
241 break;
242 case TITLE:
243 target = &dump;
244 pending_space = false;
245 break;
246 default:
247 /* No action */
248 break;
250 return true;