2 * @brief subclass of XmlParser for extracting text from HTML.
4 /* Copyright 1999,2000,2001 BrightStation PLC
5 * Copyright 2002-2023 Olly Betts
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
25 #include "htmlparser.h"
30 #include "stringutils.h"
31 #include "utf8convert.h"
// Helper taking a mutable string reference and walking it from begin() to
// end() with a mutable iterator.
// NOTE(review): the function's return type and the loop body are not visible
// in this listing; going by the name it presumably lowercases each character
// in place - confirm against the full source.
38 lowercase_string(string
&str
)
40 for (string::iterator i
= str
.begin(); i
!= str
.end(); ++i
) {
// Entry point for parsing an HTML document held in `text`.
//
// Records charset_from_meta_ in the charset_from_meta member and then hands
// `text` to the base class via XmlParser::parse().
// NOTE(review): the statement that consumes charset_ is not visible in this
// listing; presumably it is stored as the charset used for conversion -
// confirm against the full source.
46 HtmlParser::parse(string_view text
,
47 const string
& charset_
,
48 bool charset_from_meta_
)
51 charset_from_meta
= charset_from_meta_
;
52 XmlParser::parse(text
);
// Handle a run of character data: split `content` on the characters in
// WHITESPACE (defined elsewhere) and append each word to *target.
//
// The pending_space member carries "a separator is owed" state across calls
// so that a separator can be placed between words from adjacent pieces of
// text.
// NOTE(review): several lines of this function (the statement guarded by the
// pending_space test, loop exits and closing braces) are not visible in this
// listing.
56 HtmlParser::process_content(const string
& content
)
// Nothing is done for empty content or while in_script_tag / in_style_tag is
// set.
58 if (!content
.empty() && !in_script_tag
&& !in_style_tag
) {
59 string::size_type b
= content
.find_first_not_of(WHITESPACE
);
// b is non-zero when content starts with a WHITESPACE character (and is npos
// when it consists only of such characters), so a separator becomes pending.
60 if (b
) pending_space
= true;
// b is the start of the current word; npos means no words remain.
61 while (b
!= string::npos
) {
// A separator is only wanted if there is pending space and *target already
// has some text.  NOTE(review): the guarded statement is not visible here;
// presumably it appends a single space to *target - confirm.
62 if (pending_space
&& !target
->empty())
// e is the end of the current word (the next WHITESPACE character at or
// after b).
64 string::size_type e
= content
.find_first_of(WHITESPACE
, b
);
65 if (e
== string::npos
) {
// Last word runs to the end of content: append it, and no separator is owed
// since nothing followed it.
66 target
->append(content
.data() + b
, content
.size() - b
);
67 pending_space
= false;
// Word is followed by whitespace: append just the word [b, e).
70 target
->append(content
.data() + b
, e
- b
);
// Skip the run of whitespace to find the start of the next word.
72 b
= content
.find_first_not_of(WHITESPACE
, e
+ 1);
// Handle an opening tag named `tag`.
//
// The tag name is looked up with keyword(tab, ...) to get an index k, which
// is used both to consult token_flags[] and to drive a switch on html_tag(k).
// The branches visible in this listing:
//  - read a "type" attribute and, for "checkbox", append a ballot box
//    character to *target;
//  - read a "content" attribute and dispatch on the lowercased "name"
//    attribute to fill in sample, keywords, author, topic, indexing_allowed
//    and created;
//  - look for a character set in an "http-equiv" content-type value or in a
//    "charset" attribute and compare it with the current charset;
//  - set in_script_tag;
//  - clear pending_space.
// NOTE(review): the case labels, several declarations (e.g. of `end`), and
// the statements executed when a different charset is found are not visible
// in this listing - consult the full source before relying on the details.
78 HtmlParser::opening_tag(const string
& tag
)
80 int k
= keyword(tab
, tag
.data(), tag
.size());
// Tags flagged TOKEN_SPACE cause a separator to become pending; an already
// pending separator is kept.
83 pending_space
= pending_space
|| (token_flags
[k
] & TOKEN_SPACE
);
84 switch (html_tag(k
)) {
// Form control handling: only acts on type="checkbox".
// NOTE(review): the statement taken when there is no "type" attribute is not
// visible here.
87 if (!get_attribute("type", type
))
89 if (type
== "checkbox") {
// `type` is reused as a scratch output variable; only the presence of the
// "checked" attribute matters.  NOTE(review): the second append below is
// presumably the else branch (unchecked box) - the else itself is not
// visible here.
90 if (get_attribute("checked", type
)) {
91 *target
+= "\xe2\x98\x91"; // U+2611 BALLOT BOX WITH CHECK
93 *target
+= "\xe2\x98\x90"; // U+2610 BALLOT BOX
// Metadata handling: only applies when a "content" attribute is present.
100 if (get_attribute("content", content
)) {
102 if (get_attribute("name", name
)) {
// The name is lowercased before the comparisons below.
103 lowercase_string(name
);
104 if (name
== "description") {
105 convert_to_utf8(content
, charset
);
106 decode_entities(content
);
// The first description becomes the sample when description_as_sample is
// set and no sample has been stored yet.
107 if (description_as_sample
&& sample
.empty()) {
108 swap(sample
, content
);
110 // If we're not using the description as the
111 // sample, or for second and subsequent
112 // descriptions, treat as keywords.
113 if (keywords
.empty()) {
114 swap(keywords
, content
);
120 } else if (name
== "keywords" ||
121 name
== "dcterms.subject" ||
122 name
== "dcterms.description") {
123 // LibreOffice HTML export puts "Subject" and
124 // "Keywords" into DCTERMS.subject, and "Comments"
125 // into DCTERMS.description. Best option seems to
126 // be to treat all of these as keywords, i.e. just
127 // more text to index, but not show in/as the
// A space separates this value from any keywords already collected.
// NOTE(review): the statement appending content to keywords is not visible
// here.
129 if (!keywords
.empty()) keywords
+= ' ';
130 convert_to_utf8(content
, charset
);
131 decode_entities(content
);
133 } else if (name
== "author" ||
134 name
== "dcterms.creator" ||
135 name
== "dcterms.contributor") {
136 // LibreOffice HTML export includes DCTERMS.creator
137 // and DCTERMS.contributor.
138 if (!author
.empty()) author
+= ' ';
139 convert_to_utf8(content
, charset
);
140 decode_entities(content
);
142 } else if (name
== "classification") {
143 if (!topic
.empty()) topic
+= ' ';
144 convert_to_utf8(content
, charset
);
145 decode_entities(content
);
// The robots value is honoured unless ignoring_metarobots is set: a value
// containing "none" or "noindex" (after lowercasing) disallows indexing.
147 } else if (!ignoring_metarobots
&& name
== "robots") {
148 decode_entities(content
);
149 lowercase_string(content
);
150 if (content
.find("none") != string::npos
||
151 content
.find("noindex") != string::npos
) {
152 indexing_allowed
= false;
155 } else if (name
== "created" ||
156 name
== "dcterms.issued") {
157 created
= parse_datetime(content
);
161 // If the current charset came from a meta tag, don't
162 // force reparsing again!
163 if (charset_from_meta
) break;
// Look for a character set in an http-equiv="content-type" value.
165 if (get_attribute("http-equiv", hdr
)) {
166 lowercase_string(hdr
);
167 if (hdr
== "content-type") {
// The whole value is lowercased, so the extracted charset name is lowercase
// too.
168 lowercase_string(content
);
169 size_t start
= content
.find("charset=");
170 if (start
== string::npos
) break;
// NOTE(review): the lines between these checks are not visible here;
// presumably start is advanced past "charset=" and `end` is declared
// starting from it - confirm.
172 if (start
== content
.size()) break;
// Unquoted value: scan forward over the characters of the charset name,
// stopping at a control character or space, a non-ASCII character, or one
// of the listed separator characters.
174 if (content
[start
] != '"') {
175 while (end
< content
.size()) {
176 unsigned char ch
= content
[end
];
177 if (ch
<= 32 || ch
>= 127 ||
178 strchr(";()<>@,:\\\"/[]?={}", ch
))
// NOTE(review): presumably the quoted-value branch - scan to the closing
// double quote, erasing each backslash so that the character it escapes is
// kept.  The surrounding else and the increments of `end` are not visible
// here.
185 while (end
< content
.size()) {
186 unsigned char ch
= content
[end
];
187 if (ch
== '"') break;
188 if (ch
== '\\') content
.erase(end
, 1);
// Extract the charset name found above and compare it with the current one.
// NOTE(review): the action taken when they differ is not visible here.
192 string
newcharset(content
, start
, end
- start
);
193 if (charset
!= newcharset
) {
// As above: a charset which itself came from a meta tag is not overridden.
200 if (charset_from_meta
) break;
202 if (get_attribute("charset", newcharset
)) {
203 // HTML5 added: <meta charset="...">
204 lowercase_string(newcharset
);
205 if (charset
!= newcharset
) {
// Flag consulted by process_content(), which ignores text while it is set.
215 in_script_tag
= true;
219 pending_space
= false;
// Handle a closing tag named `tag`.
//
// The tag name is looked up with keyword(tab, ...); the visible branches of
// the switch on html_tag(k) clear in_style_tag, clear in_script_tag, and
// clear pending_space.
// NOTE(review): the case labels and the statement guarded by the first test
// are not visible in this listing.
229 HtmlParser::closing_tag(const string
& tag
)
231 int k
= keyword(tab
, tag
.data(), tag
.size());
// Unknown tags (k < 0) and tags flagged TOKEN_VOID get special treatment.
// NOTE(review): presumably an early return, which would also keep the
// token_flags[k] lookup below from seeing a negative index - confirm.
232 if (k
< 0 || (token_flags
[k
] & TOKEN_VOID
))
// Tags flagged TOKEN_SPACE cause a separator to become pending; an already
// pending separator is kept.
234 pending_space
= pending_space
|| (token_flags
[k
] & TOKEN_SPACE
);
235 switch (html_tag(k
)) {
// Leaving these elements re-enables text handling in process_content(),
// which ignores content while either flag is set.
237 in_style_tag
= false;
240 in_script_tag
= false;
244 pending_space
= false;