2 * @brief subclass of HtmlParser for extracting text from HTML.
4 /* Copyright 1999,2000,2001 BrightStation PLC
5 * Copyright 2002,2003,2004,2006,2007,2008,2010,2011,2012,2013,2014,2015,2017 Olly Betts
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
25 #include "myhtmlparse.h"
29 #include "my-html-tok.h"
30 #include "stringutils.h"
31 #include "utf8convert.h"
// Separator characters written between runs of text.  process_text indexes
// this table with the current pending_space value, and only does so when
// pending_space is non-zero, so entry 0 ('_') is just a placeholder.
// NOTE(review): the positions presumably line up with the SPACE..PAGE
// constants defined elsewhere - confirm before reordering entries.
37 static const char whitespace
[] = "_ \t\r\r\f";
// Helper which takes a string by mutable reference and walks it from
// begin() to end() with an iterator.  Callers in this file apply it to
// attribute values (meta name, http-equiv, content, charset) before
// comparing them against lowercase literals.
// NOTE(review): the loop body is not visible in this excerpt; presumably it
// lowercases each character in place - confirm.
40 lowercase_string(string
&str
)
42 for (string::iterator i
= str
.begin(); i
!= str
.end(); ++i
) {
// Entry point for parsing an HTML document.
//
// @param text                The HTML to parse.
// @param charset_            Character set for the document.
//                            NOTE(review): presumably stored as the current
//                            charset; that assignment is not visible here.
// @param charset_from_meta_  Recorded in charset_from_meta.  opening_tag
//                            consults that flag so a charset which already
//                            came from a meta tag doesn't force another
//                            reparse.
48 MyHtmlParser::parse_html(const string
&text
, const string
&charset_
,
49 bool charset_from_meta_
)
52 charset_from_meta
= charset_from_meta_
;
// Handle a run of character data from the document.
//
// The text is split into words on WHITESPACE and each word is appended to
// *target.  Runs of whitespace are not copied; instead pending_space records
// that a separator is due, and the matching character from whitespace[] is
// written just before the next word.
//
// @param text  The character data to process.
57 MyHtmlParser::process_text(const string
&text
)
// Nothing is extracted from empty text or while inside a script or style
// element.
59 if (!text
.empty() && !in_script_tag
&& !in_style_tag
) {
// b is the offset of the first non-whitespace character, or npos if the
// text is entirely whitespace.
60 string::size_type b
= text
.find_first_not_of(WHITESPACE
);
// Leading whitespace (b != 0, which includes the npos case) requests a plain
// space, but only if no separator is already pending.
61 if (b
&& !pending_space
) pending_space
= SPACE
;
62 while (b
!= string::npos
) {
// Emit the pending separator, but never as the first character of the
// target.
63 if (pending_space
&& !target
->empty())
64 *target
+= whitespace
[pending_space
];
// e is the offset just past the end of the current word.
65 string::size_type e
= text
.find_first_of(WHITESPACE
, b
);
// No more whitespace: the final word runs to the end of the text.
// NOTE(review): the statements which leave the loop after this append are
// not visible in this excerpt.
66 if (e
== string::npos
) {
67 target
->append(text
.data() + b
, text
.size() - b
);
// Copy the word [b, e) and note that whitespace followed it.
71 target
->append(text
.data() + b
, e
- b
);
72 pending_space
= SPACE
;
// Move on to the start of the next word (npos ends the loop).
73 b
= text
.find_first_not_of(WHITESPACE
, e
+ 1);
// Handle an opening tag.
//
// The tag name is looked up with keyword(), pending_space is raised to the
// spacing recorded for that tag in token_space, and then the tag kind is
// dispatched on.  The handling visible here covers:
//  - checking a style attribute for an explicit page break;
//  - metadata given as name/content pairs (description, keywords, author,
//    classification, robots, creation date);
//  - charset declarations, via http-equiv="content-type" or the HTML5
//    charset attribute;
//  - setting in_script_tag so process_text skips the element's text.
//
// @param tag  The tag name.
79 MyHtmlParser::opening_tag(const string
&tag
)
// Map the tag name to its index in the keyword table.
81 int k
= keyword(tab
, tag
.data(), tag
.size());
// Keep the stronger of the separator already pending and the one this tag
// asks for; max() means a pending separator is never weakened.
84 pending_space
= max(pending_space
, (token_space
[k
] & TOKEN_SPACE_MASK
));
85 switch (html_tag(k
)) {
// Only inspect the style attribute when the pending separator is weaker
// than PAGE.
87 if (pending_space
< PAGE
) {
89 if (get_parameter("style", style
)) {
90 // As produced by LibreOffice's HTML export:
91 if (style
.find("page-break-before: always") != string::npos
)
// Metadata handling: everything below needs a content attribute.
98 if (get_parameter("content", content
)) {
// The name is lowercased before being compared with the literals below.
100 if (get_parameter("name", name
)) {
101 lowercase_string(name
);
102 if (name
== "description") {
103 convert_to_utf8(content
, charset
);
104 decode_entities(content
);
// The first description becomes the sample when description_as_sample is
// set and no sample has been stored yet.
105 if (description_as_sample
&& sample
.empty()) {
106 swap(sample
, content
);
108 // If we're not using the description as the
109 // sample, or for second and subsequent
110 // descriptions, treat as keywords.
111 if (keywords
.empty()) {
112 swap(keywords
, content
);
118 } else if (name
== "keywords" ||
119 name
== "dcterms.subject" ||
120 name
== "dcterms.description") {
121 // LibreOffice HTML export puts "Subject" and
122 // "Keywords" into DCTERMS.subject, and "Comments"
123 // into DCTERMS.description. Best option seems to
124 // be to treat all of these as keywords, i.e. just
125 // more text to index, but not show in/as the
// sample.
// A space separates this value from any keywords collected earlier.
127 if (!keywords
.empty()) keywords
+= ' ';
128 convert_to_utf8(content
, charset
);
129 decode_entities(content
);
131 } else if (name
== "author" ||
132 name
== "dcterms.creator" ||
133 name
== "dcterms.contributor") {
134 // LibreOffice HTML export includes DCTERMS.creator
135 // and DCTERMS.contributor.
136 if (!author
.empty()) author
+= ' ';
137 convert_to_utf8(content
, charset
);
138 decode_entities(content
);
// "classification" values are collected in topic.
140 } else if (name
== "classification") {
141 if (!topic
.empty()) topic
+= ' ';
142 convert_to_utf8(content
, charset
);
143 decode_entities(content
);
// Robots directives are honoured unless ignoring_metarobots is set.
145 } else if (!ignoring_metarobots
&& name
== "robots") {
146 decode_entities(content
);
147 lowercase_string(content
);
// A value containing "none" or "noindex" forbids indexing.
148 if (content
.find("none") != string::npos
||
149 content
.find("noindex") != string::npos
) {
150 indexing_allowed
= false;
153 } else if (name
== "created" ||
154 name
== "dcterms.issued") {
155 created
= parse_datetime(content
);
159 // If the current charset came from a meta tag, don't
160 // force reparsing again!
161 if (charset_from_meta
) break;
// Look for a charset in an http-equiv="content-type" declaration.
163 if (get_parameter("http-equiv", hdr
)) {
164 lowercase_string(hdr
);
165 if (hdr
== "content-type") {
166 lowercase_string(content
);
167 size_t start
= content
.find("charset=");
168 if (start
== string::npos
) break;
// NOTE(review): a statement between these two checks is not visible in this
// excerpt; presumably start is advanced past "charset=" - confirm.
170 if (start
== content
.size()) break;
// Unquoted value: each character is tested against those which can't be
// part of the name (<= 32, >= 127, or one of the listed separators).
172 if (content
[start
] != '"') {
173 while (end
< content
.size()) {
174 unsigned char ch
= content
[end
];
175 if (ch
<= 32 || ch
>= 127 ||
176 strchr(";()<>@,:\\\"/[]?={}", ch
))
// Quoted value: stop at the closing quote.  A backslash is erased so that
// the character it escapes takes its place in content.
183 while (end
< content
.size()) {
184 unsigned char ch
= content
[end
];
185 if (ch
== '"') break;
186 if (ch
== '\\') content
.erase(end
, 1);
// The charset name is the [start, end) slice of content; it only matters if
// it differs from the charset currently in use.
190 string
newcharset(content
, start
, end
- start
);
191 if (charset
!= newcharset
) {
// As above: a charset which came from a meta tag is not overridden again.
198 if (charset_from_meta
) break;
200 if (get_parameter("charset", newcharset
)) {
201 // HTML5 added: <meta charset="...">
202 lowercase_string(newcharset
);
203 if (charset
!= newcharset
) {
// While this flag is set, process_text discards the text it is given.
213 in_script_tag
= true;
// Handle a closing tag.
//
// The tag name is looked up with keyword(), pending_space is raised to the
// spacing recorded for the tag (as for opening tags), and the
// in_style_tag / in_script_tag flags are cleared so that process_text
// resumes collecting text.
//
// @param tag  The tag name.
227 MyHtmlParser::closing_tag(const string
&tag
)
229 int k
= keyword(tab
, tag
.data(), tag
.size());
// Unknown tags (k < 0) and tags flagged NOCLOSE in token_space are
// special-cased.
// NOTE(review): the statement taken is not visible in this excerpt;
// presumably an early return, since token_space[k] is indexed below.
230 if (k
< 0 || (token_space
[k
] & NOCLOSE
))
// Keep the stronger of the pending separator and the one this tag asks for.
232 pending_space
= max(pending_space
, (token_space
[k
] & TOKEN_SPACE_MASK
));
233 switch (html_tag(k
)) {
// Leaving a style element: text is collected again.
235 in_style_tag
= false;
// Leaving a script element: text is collected again.
238 in_script_tag
= false;