2 * Copyright (C) 2004-2008 Geometer Plus <contact@geometerplus.com>
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
23 #include <ZLInputStream.h>
24 #include <ZLXMLReader.h>
26 #include <ZLStringUtil.h>
27 #include <ZLUnicodeUtil.h>
29 #include "HtmlReader.h"
30 #include "HtmlEntityCollection.h"
// Constructs the reader. The encoding name is forwarded unchanged to
// EncodedTextReader in the initializer list (presumably the base class).
// NOTE(review): the constructor body and its closing brace are not visible
// in this view.
32 HtmlReader::HtmlReader(const std::string
&encoding
) : EncodedTextReader(encoding
) {
// Destructor.
// NOTE(review): the body is not visible in this view.
35 HtmlReader::~HtmlReader() {
// Re-initialises `tag` from the raw tag text `name`:
//   - tag.Attributes is emptied;
//   - tag.Start becomes true unless `name` begins with '/';
//   - tag.Name is derived from `name` and then upper-cased in place.
// NOTE(review): several lines of this function (the body of the empty-name
// branch and the condition guarding the substr(1) assignment) are not
// visible in this view.
38 void HtmlReader::setTag(HtmlTag
&tag
, const std::string
&name
) {
39 tag
.Attributes
.clear();
// Empty tag text is treated separately; the branch body is not visible here.
41 if (name
.length() == 0) {
// A leading '/' marks the tag as not being a start tag.
46 tag
.Start
= name
[0] != '/';
// Drops the first character of `name` -- presumably the leading '/' of a
// closing tag; confirm against the guarding condition, which is not visible.
50 tag
.Name
= name
.substr(1);
// Upper-case the tag name in place, one char at a time.
// NOTE(review): toupper() receives a plain char here, while the isspace()
// calls in this file cast to unsigned char first -- consider doing the same.
53 const size_t len
= tag
.Name
.length();
54 for (size_t i
= 0; i
< len
; ++i
) {
55 tag
.Name
[i
] = toupper(tag
.Name
[i
]);
// Parse state that readDocument() switches to when it meets '&' while in
// PS_ATTRIBUTEVALUE.
// NOTE(review): the enclosing enum declaration is not visible in this view.
68 PS_SPECIAL_IN_ATTRIBUTEVALUE
,
// Tells whether `ch` is acceptable inside an entity reference of kind `type`:
// letters for ST_NAME, decimal digits for ST_DEC, hexadecimal digits for
// ST_HEX. The expression is false for every other `type`.
// NOTE(review): the `return` keyword and the closing brace are not visible
// in this view.
// NOTE(review): `ch` is handed to isalpha/isdigit/isxdigit as a plain char,
// while the isspace() calls in this file cast to unsigned char first --
// consider doing the same here.
79 static bool allowSymbol(SpecialType type
, char ch
) {
81 ((type
== ST_NAME
) && isalpha(ch
)) ||
82 ((type
== ST_DEC
) && isdigit(ch
)) ||
83 ((type
== ST_HEX
) && isxdigit(ch
));
// Converts the collected text of an entity reference into an int.
// Three conversions are visible:
//   - a lookup through HtmlEntityCollection::symbolNumber(txt);
//   - a base-10 strtol() that skips the first character of `txt`;
//   - a base-16 strtol() that skips the first two characters of `txt`.
// NOTE(review): the code that selects among them from `type` is not visible
// in this view; presumably ST_NAME / ST_DEC / ST_HEX respectively -- confirm.
// NOTE(review): strtol() returns long; the result is narrowed to int here.
86 static int specialSymbolNumber(SpecialType type
, const std::string
&txt
) {
90 return HtmlEntityCollection::symbolNumber(txt
);
// Skips one leading character (presumably '#') before parsing decimal digits.
92 return strtol(txt
.c_str() + 1, &end
, 10);
// Skips two leading characters (presumably "#x") before parsing hex digits.
94 return strtol(txt
.c_str() + 2, &end
, 16);
// Passes `to` and `from` to myConverter->convert() and then resets the
// converter; presumably this appends the converted form of `from` to `to`.
// A null-converter check comes first.
// NOTE(review): what happens when myConverter is null is not visible in
// this view.
100 void HtmlReader::appendString(std::string
&to
, std::string
&from
) {
101 if (myConverter
.isNull()) {
104 myConverter
->convert(to
, from
);
// Reset after every call.
105 myConverter
->reset();
// Reads an HTML document from `stream` and reports it through the handler
// callbacks visible below: startDocumentHandler(), characterDataHandler(),
// tagHandler() and endDocumentHandler().
//
// The input is read in chunks of BUFSIZE bytes and scanned one char at a
// time by a state machine (`state`, starting in PS_TEXT). Partial tokens
// are accumulated in currentString / attributeValueString / specialString,
// so a token may continue across chunk boundaries. A false return from
// characterDataHandler() or tagHandler() jumps to the endOfProcessing label.
//
// NOTE(review): many lines of this function (case labels, else-arms, some
// declarations, closing braces and the endOfProcessing label itself) are not
// visible in this view; the comments below describe the visible lines only.
// NOTE(review): `buffer` is allocated with new char[BUFSIZE]; the matching
// delete[] is not visible here -- confirm it exists on every exit path.
// NOTE(review): isalpha(*ptr), isdigit(*ptr) and toupper(currentString[i])
// receive a plain char, while the isspace() calls cast to unsigned char
// first -- consider making them consistent.
110 void HtmlReader::readDocument(ZLInputStream
&stream
) {
// Nothing can be parsed if the stream fails to open (branch body not visible).
111 if (!stream
.open()) {
115 startDocumentHandler();
117 ParseState state
= PS_TEXT
;
118 SpecialType state_special
= ST_UNKNOWN
;
119 std::string currentString
;
120 std::string attributeValueString
;
121 std::string specialString
;
122 int quotationCounter
= 0;
// Sliding record of the last two chars seen inside a comment; used below to
// recognise the closing "-->".
124 char endOfComment
[2] = "\0";
// The input is consumed in chunks of at most BUFSIZE bytes.
126 const size_t BUFSIZE
= 2048;
127 char *buffer
= new char[BUFSIZE
];
// Read the next chunk. `start` marks the beginning of the pending span
// [start, ptr) that the handler and append() calls below consume.
131 length
= stream
.read(buffer
, BUFSIZE
);
132 char *start
= buffer
;
133 char *endOfBuffer
= buffer
+ length
;
134 for (char *ptr
= buffer
; ptr
< endOfBuffer
; ++ptr
) {
// Hand the pending text to the handler; a false return aborts parsing.
138 if (!characterDataHandler(start
, ptr
- start
, true)) {
139 goto endOfProcessing
;
// Tag offset = `offset` plus the position inside the current chunk.
// `offset` presumably counts the bytes of earlier chunks; its declaration
// and update are not visible here.
143 currentTag
.Offset
= offset
+ (ptr
- buffer
);
146 if (!characterDataHandler(start
, ptr
- start
, true)) {
147 goto endOfProcessing
;
// A new entity reference starts with an unknown kind.
151 state_special
= ST_UNKNOWN
;
// Entity-reference handling. The `state == PS_SPECIAL` selections below
// return to PS_TEXT or PS_ATTRIBUTEVALUE depending on where the reference
// was found. The first steps classify the reference kind.
155 case PS_SPECIAL_IN_ATTRIBUTEVALUE
:
156 if (state_special
== ST_UNKNOWN
) {
158 state_special
= ST_NUM
;
159 } else if (isalpha(*ptr
)) {
160 state_special
= ST_NAME
;
163 state
= (state
== PS_SPECIAL
) ? PS_TEXT
: PS_ATTRIBUTEVALUE
;
// ST_NUM is refined into ST_HEX or ST_DEC (the ST_HEX condition is not
// visible here).
165 } else if (state_special
== ST_NUM
) {
167 state_special
= ST_HEX
;
168 } else if (isdigit(*ptr
)) {
169 state_special
= ST_DEC
;
172 state
= (state
== PS_SPECIAL
) ? PS_TEXT
: PS_ATTRIBUTEVALUE
;
// Resolve the collected entity text to a number.
176 specialString
.append(start
, ptr
- start
);
177 int number
= specialSymbolNumber(state_special
, specialString
);
// Values in 128..159 take a separate path: a single char `ch` (its
// declaration is not visible here) goes to the handler or the converter.
178 if ((128 <= number
) && (number
<= 159)) {
180 if (state
== PS_SPECIAL
) {
181 characterDataHandler(&ch
, 1, true);
183 myConverter
->convert(attributeValueString
, &ch
, &ch
+ 1);
// Any other non-zero number is encoded through ZLUnicodeUtil::ucs2ToUtf8().
185 } else if (number
!= 0) {
187 int len
= ZLUnicodeUtil::ucs2ToUtf8(buffer
, number
);
188 if (state
== PS_SPECIAL
) {
189 characterDataHandler(buffer
, len
, false);
191 attributeValueString
.append(buffer
, len
);
// Re-emit the reference text itself, wrapped in '&' and ';' (presumably
// the unresolved case -- the enclosing else is not visible here).
194 specialString
= "&" + specialString
+ ";";
195 if (state
== PS_SPECIAL
) {
196 characterDataHandler(specialString
.c_str(), specialString
.length(), false);
198 attributeValueString
+= specialString
;
201 specialString
.erase();
203 state
= (state
== PS_SPECIAL
) ? PS_TEXT
: PS_ATTRIBUTEVALUE
;
// A char that is not allowed for the current reference kind ends entity mode.
204 } else if (!allowSymbol(state_special
, *ptr
)) {
206 state
= (state
== PS_SPECIAL
) ? PS_TEXT
: PS_ATTRIBUTEVALUE
;
// '!' selects comment mode; anything else starts a tag name.
211 state
= (*ptr
== '!') ? PS_COMMENT
: PS_TAGNAME
;
// Comment scanning: endOfComment slides over the input until "--" followed
// by '>' is seen, then it is cleared.
214 if ((endOfComment
[0] == '\0') && (*ptr
!= '-')) {
216 } else if ((endOfComment
[0] == '-') && (endOfComment
[1] == '-') && (*ptr
== '>')) {
219 endOfComment
[0] = '\0';
220 endOfComment
[1] = '\0';
222 endOfComment
[0] = endOfComment
[1];
223 endOfComment
[1] = *ptr
;
// A tag name ends at '>' or at whitespace.
227 if ((*ptr
== '>') || isspace((unsigned char)*ptr
)) {
228 currentString
.append(start
, ptr
- start
);
230 setTag(currentTag
, currentString
);
231 currentString
.erase();
// An empty tag name goes back to text on '>' and to PS_SKIPTAG otherwise.
232 if (currentTag
.Name
== "") {
233 state
= (*ptr
== '>') ? PS_TEXT
: PS_SKIPTAG
;
236 if (!tagHandler(currentTag
)) {
237 goto endOfProcessing
;
241 state
= PS_ATTRIBUTENAME
;
// An attribute name ends at '>', '=' or whitespace; it is upper-cased before
// being added to the current tag.
246 case PS_ATTRIBUTENAME
:
247 if ((*ptr
== '>') || (*ptr
== '=') || isspace((unsigned char)*ptr
)) {
248 if ((ptr
!= start
) || !currentString
.empty()) {
249 currentString
.append(start
, ptr
- start
);
250 for (unsigned int i
= 0; i
< currentString
.length(); ++i
) {
251 currentString
[i
] = toupper(currentString
[i
]);
253 currentTag
.addAttribute(currentString
);
254 currentString
.erase();
258 if (!tagHandler(currentTag
)) {
259 goto endOfProcessing
;
// '=' introduces a value; otherwise keep reading attribute names.
263 state
= (*ptr
== '=') ? PS_ATTRIBUTEVALUE
: PS_ATTRIBUTENAME
;
// Attribute value handling; quotationCounter tracks quote characters (its
// updates are not visible here).
267 case PS_ATTRIBUTEVALUE
:
269 if (((ptr
== start
) && currentString
.empty()) || (quotationCounter
> 0)) {
// '&' inside a value: flush what was collected so far and enter entity mode.
272 } else if (*ptr
== '&') {
273 currentString
.append(start
, ptr
- start
);
275 appendString(attributeValueString
, currentString
);
276 state
= PS_SPECIAL_IN_ATTRIBUTEVALUE
;
277 state_special
= ST_UNKNOWN
;
// Unless exactly one quote is pending, '>' or whitespace ends the value.
278 } else if ((quotationCounter
!= 1) && ((*ptr
== '>') || isspace((unsigned char)*ptr
))) {
279 if ((ptr
!= start
) || !currentString
.empty()) {
280 currentString
.append(start
, ptr
- start
);
// Strip the first and the last character when the value starts with a
// double quote.
281 if (currentString
[0] == '"') {
282 currentString
= currentString
.substr(1, currentString
.length() - 2);
284 appendString(attributeValueString
, currentString
);
285 currentTag
.setLastAttributeValue(attributeValueString
);
286 attributeValueString
.erase();
287 quotationCounter
= 0;
291 if (!tagHandler(currentTag
)) {
292 goto endOfProcessing
;
296 state
= PS_ATTRIBUTENAME
;
// End of chunk: carry over whatever is still pending, depending on the state.
308 if (start
!= endOfBuffer
) {
311 if (!characterDataHandler(start
, endOfBuffer
- start
, true)) {
312 goto endOfProcessing
;
// Unfinished attribute names/values are kept in currentString.
316 case PS_ATTRIBUTENAME
:
317 case PS_ATTRIBUTEVALUE
:
318 currentString
.append(start
, endOfBuffer
- start
);
// Unfinished entity references are kept in specialString.
321 case PS_SPECIAL_IN_ATTRIBUTEVALUE
:
322 specialString
.append(start
, endOfBuffer
- start
);
// A read that returns anything other than BUFSIZE bytes ends the loop.
331 } while (length
== BUFSIZE
);
335 endDocumentHandler();