2 * Copyright (C) 2004-2008 Geometer Plus <contact@geometerplus.com>
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 #include <ZLFileImage.h>
23 #include <ZLStringUtil.h>
25 #include "HtmlBookReader.h"
26 #include "HtmlTagActions.h"
27 #include "../txt/PlainTextFormat.h"
28 #include "../util/MiscUtil.h"
29 #include "../../bookmodel/BookModel.h"
30 #include "StyleSheetParser.h"
32 HtmlTagAction::HtmlTagAction(HtmlBookReader
&reader
) : myReader(reader
) {
35 HtmlTagAction::~HtmlTagAction() {
38 void HtmlTagAction::reset() {
41 DummyHtmlTagAction::DummyHtmlTagAction(HtmlBookReader
&reader
) : HtmlTagAction(reader
) {
44 void DummyHtmlTagAction::run(const HtmlReader::HtmlTag
&) {
47 HtmlControlTagAction::HtmlControlTagAction(HtmlBookReader
&reader
, FBTextKind kind
) : HtmlTagAction(reader
), myKind(kind
) {
// Opens or closes the inline text kind this action was created with (myKind),
// using myReader.myKindList as the list of currently open kinds.
// NOTE(review): the embedded numbering jumps (51->53, 54->60, 62->66, 74->79),
// so the declaration of `index`, the body of the search loop and the
// conditions that select between the two halves below are not visible in this
// extract — confirm against the pristine file before relying on this summary.
50 void HtmlControlTagAction::run(const HtmlReader::HtmlTag
&tag
) {
51 std::vector
<FBTextKind
> &list
= myReader
.myKindList
;
// Scan the open kinds from the most recent entry backwards, looking for myKind.
53 for (index
= list
.size() - 1; index
>= 0; --index
) {
54 if (list
[index
] == myKind
) {
// Register myKind as open in the book reader and in the reader's kind list,
// then emit the opening control for it.
60 bookReader().pushKind(myKind
);
61 myReader
.myKindList
.push_back(myKind
);
62 bookReader().addControl(myKind
, true);
// Emit closing controls and pop kinds from the end of the list down to and
// including position `index`.
66 for (int i
= list
.size() - 1; i
>= index
; --i
) {
67 bookReader().addControl(list
[i
], false);
68 bookReader().popKind();
// Re-emit opening controls and re-push every kind stored after `index`.
70 for (unsigned int j
= index
+ 1; j
< list
.size(); ++j
) {
71 bookReader().addControl(list
[j
], true);
72 bookReader().pushKind(list
[j
]);
// Remove the entry at `index` from the open-kind list.
74 list
.erase(list
.begin() + index
);
79 HtmlHeaderTagAction::HtmlHeaderTagAction(HtmlBookReader
&reader
, FBTextKind kind
) : HtmlTagAction(reader
), myKind(kind
) {
// Handles a header tag: closes the current paragraph, updates the
// table-of-contents state, pushes or pops myKind, and begins a new paragraph.
// NOTE(review): the embedded numbering jumps (84->86, 90->93, 93->95, 98->101),
// so the condition that separates the "push" part from the "pop" part is not
// visible in this extract — presumably the tag's start/end flag; confirm.
82 void HtmlHeaderTagAction::run(const HtmlReader::HtmlTag
&tag
) {
83 bookReader().endParagraph();
84 myReader
.myIsStarted
= false;
// Only touch the contents (TOC) model when TOC building is on and titles are
// not being ignored.
86 if (myReader
.myBuildTableOfContent
&& !myReader
.myIgnoreTitles
) {
// When no contents paragraph is open yet: insert an end-of-section paragraph,
// enter title mode and begin a contents paragraph.
87 if (!bookReader().contentsParagraphIsOpen()) {
88 bookReader().insertEndOfSectionParagraph();
89 bookReader().enterTitle();
90 bookReader().beginContentsParagraph();
93 bookReader().pushKind(myKind
);
95 bookReader().popKind();
// Mirror of the check above: end the contents paragraph and leave title mode.
96 if (myReader
.myBuildTableOfContent
&& !myReader
.myIgnoreTitles
) {
97 bookReader().endContentsParagraph();
98 bookReader().exitTitle();
101 bookReader().beginParagraph();
104 HtmlIgnoreTagAction::HtmlIgnoreTagAction(HtmlBookReader
&reader
) : HtmlTagAction(reader
) {
// Adjusts myReader.myIgnoreDataCounter: one path increments it, the other
// decrements it. characterDataHandler() tests this counter against zero.
// NOTE(review): the lines choosing between the two statements (embedded
// numbers 108 and 110) are missing from this extract — presumably increment
// on the opening tag and decrement on the closing tag; confirm.
107 void HtmlIgnoreTagAction::run(const HtmlReader::HtmlTag
&tag
) {
109 ++myReader
.myIgnoreDataCounter
;
111 --myReader
.myIgnoreDataCounter
;
115 HtmlHrefTagAction::HtmlHrefTagAction(HtmlBookReader
&reader
) : HtmlTagAction(reader
) {
// Handles an anchor tag: NAME attributes become hyperlink labels, HREF
// attributes open an internal or external hyperlink control, and the other
// branch closes whichever hyperlink is currently open.
// NOTE(review): the embedded numbering jumps (118->120, 128->130, 136->141,
// 143->147); the condition guarding the attribute loop is not visible in this
// extract — presumably the tag's start flag; confirm.
118 void HtmlHrefTagAction::run(const HtmlReader::HtmlTag
&tag
) {
120 for (unsigned int i
= 0; i
< tag
.Attributes
.size(); ++i
) {
// Attribute names are compared against upper-case literals.
121 if (tag
.Attributes
[i
].Name
== "NAME") {
122 bookReader().addHyperlinkLabel(tag
.Attributes
[i
].Value
);
// An HREF is only honoured while no hyperlink is open (type still REGULAR).
123 } else if ((hyperlinkType() == REGULAR
) && (tag
.Attributes
[i
].Name
== "HREF")) {
124 std::string value
= tag
.Attributes
[i
].Value
;
// If the link starts with this reader's own file name (and is longer than
// it), strip that prefix so that only the remainder is examined below.
125 if (!myReader
.myFileName
.empty() &&
126 (value
.length() > myReader
.myFileName
.length()) &&
127 (value
.substr(0, myReader
.myFileName
.length()) == myReader
.myFileName
)) {
128 value
= value
.substr(myReader
.myFileName
.length());
130 if (!value
.empty()) {
// "#label" is an internal link; the '#' itself is not part of the label.
131 if (value
[0] == '#') {
132 setHyperlinkType(INTERNAL_HYPERLINK
);
133 bookReader().addHyperlinkControl(INTERNAL_HYPERLINK
, value
.substr(1));
// Anything MiscUtil::isReference() accepts is treated as an external link.
134 } else if (MiscUtil::isReference(value
)) {
135 setHyperlinkType(EXTERNAL_HYPERLINK
);
136 bookReader().addHyperlinkControl(EXTERNAL_HYPERLINK
, value
);
// Otherwise, if a hyperlink is open, emit its closing control and go back to
// REGULAR.
141 } else if (hyperlinkType() != REGULAR
) {
142 bookReader().addControl(hyperlinkType(), false);
143 setHyperlinkType(REGULAR
);
147 void HtmlHrefTagAction::reset() {
148 setHyperlinkType(REGULAR
);
151 FBTextKind
HtmlHrefTagAction::hyperlinkType() const {
152 return myHyperlinkType
;
155 void HtmlHrefTagAction::setHyperlinkType(FBTextKind hyperlinkType
) {
156 myHyperlinkType
= hyperlinkType
;
159 HtmlImageTagAction::HtmlImageTagAction(HtmlBookReader
&reader
) : HtmlTagAction(reader
) {
// Handles an image tag: ends the current paragraph, registers the image named
// by the SRC attribute with the book reader, and begins a new paragraph.
// NOTE(review): the embedded numbering jumps (162->164, 170->175); the guard
// around this body, the end of the addImage() call and whatever follows it
// inside the loop are not visible in this extract.
162 void HtmlImageTagAction::run(const HtmlReader::HtmlTag
&tag
) {
164 bookReader().endParagraph();
165 for (unsigned int i
= 0; i
< tag
.Attributes
.size(); ++i
) {
// Attribute names are compared against upper-case literals.
166 if (tag
.Attributes
[i
].Name
== "SRC") {
// The URL-decoded SRC value doubles as the image id and, appended to the
// reader's base directory path, as the image file path.
167 std::string fileName
= MiscUtil::decodeHtmlURL(tag
.Attributes
[i
].Value
);
168 bookReader().addImageReference(fileName
);
169 bookReader().addImage(fileName
,
170 new ZLFileImage("image/auto", myReader
.myBaseDirPath
+ fileName
, 0)
175 bookReader().beginParagraph();
179 HtmlBreakTagAction::HtmlBreakTagAction(HtmlBookReader
&reader
, BreakType breakType
) : HtmlTagAction(reader
), myBreakType(breakType
) {
// Starts a new paragraph when the tag's start/end state matches the break
// policy this action was created with.
// NOTE(review): the embedded numbering jumps (184->188, 192->194); whatever
// follows the flag reset at 184 (e.g. an early exit) is not visible in this
// extract — confirm against the pristine file.
182 void HtmlBreakTagAction::run(const HtmlReader::HtmlTag
&tag
) {
// A pending "don't break" request is consumed here: the flag is cleared.
183 if (myReader
.myDontBreakParagraph
) {
184 myReader
.myDontBreakParagraph
= false;
// Break on a start tag if BREAK_AT_START is set, or on an end tag if
// BREAK_AT_END is set.
188 if ((tag
.Start
&& (myBreakType
& BREAK_AT_START
)) ||
189 (!tag
.Start
&& (myBreakType
& BREAK_AT_END
))) {
190 bookReader().endParagraph();
// Push REGULAR when the kind stack is empty, before the new paragraph begins.
191 if (bookReader().isKindStackEmpty()) {
192 bookReader().pushKind(REGULAR
);
194 bookReader().beginParagraph();
198 HtmlPreTagAction::HtmlPreTagAction(HtmlBookReader
&reader
) : HtmlTagAction(reader
) {
// Handles a PRE tag: ends the current paragraph, switches the reader's
// preformatted mode on or off to match tag.Start, resets the space and break
// counters, and begins a new paragraph.
// NOTE(review): the embedded numbering jumps (206->208, 208->210, 210->213);
// the condition choosing between pushKind and popKind is not visible in this
// extract — presumably tag.Start; confirm.
201 void HtmlPreTagAction::run(const HtmlReader::HtmlTag
&tag
) {
202 bookReader().endParagraph();
203 myReader
.myIsPreformatted
= tag
.Start
;
204 myReader
.mySpaceCounter
= -1;
205 myReader
.myBreakCounter
= 0;
// The PREFORMATTED kind is pushed/popped only when the format's break type is
// exactly BREAK_PARAGRAPH_AT_NEW_LINE (an equality test, not a bit test).
206 if (myReader
.myFormat
.breakType() == PlainTextFormat::BREAK_PARAGRAPH_AT_NEW_LINE
) {
208 bookReader().pushKind(PREFORMATTED
);
210 bookReader().popKind();
213 bookReader().beginParagraph();
216 HtmlListTagAction::HtmlListTagAction(HtmlBookReader
&reader
, int startIndex
) : HtmlTagAction(reader
), myStartIndex(startIndex
) {
// Maintains myReader.myListNumStack: one branch pushes myStartIndex, the
// other pops the top entry if the stack is not empty.
// NOTE(review): the first condition (embedded number 220) is missing from
// this extract — presumably a test of tag.Start; confirm.
219 void HtmlListTagAction::run(const HtmlReader::HtmlTag
&tag
) {
221 myReader
.myListNumStack
.push(myStartIndex
);
// Guarded pop: nothing is popped from an empty stack.
222 } else if (!myReader
.myListNumStack
.empty()) {
223 myReader
.myListNumStack
.pop();
227 HtmlListItemTagAction::HtmlListItemTagAction(HtmlBookReader
&reader
) : HtmlTagAction(reader
) {
// Handles a list item: starts a new paragraph and, when inside a list, emits
// indentation followed by a bullet or the item's number.
// NOTE(review): the embedded numbering jumps (230->232, 236->238, 238->241,
// 241->243, 245->248); the declaration of `number`, the condition choosing
// bullet vs. number and the start/end branch are not visible in this extract.
230 void HtmlListItemTagAction::run(const HtmlReader::HtmlTag
&tag
) {
232 bookReader().endParagraph();
233 bookReader().beginParagraph();
234 if (!myReader
.myListNumStack
.empty()) {
// Indent by 3 units per nesting level (stack depth).
235 bookReader().addFixedHSpace(3 * myReader
.myListNumStack
.size());
// Reference into the stack, so the post-increment below updates the stored
// counter of the innermost list.
236 int &index
= myReader
.myListNumStack
.top();
// "\342\200\242" is the UTF-8 encoding of U+2022 (bullet); with the trailing
// space that is 4 bytes.
238 myReader
.addConvertedDataToBuffer("\342\200\242 ", 4, false);
241 ZLStringUtil::appendNumber(number
, index
++);
243 myReader
.addConvertedDataToBuffer(number
.data(), number
.length(), false);
245 myReader
.myDontBreakParagraph
= true;
248 myReader
.myDontBreakParagraph
= false;
252 HtmlTableTagAction::HtmlTableTagAction(HtmlBookReader
&reader
) : HtmlTagAction(reader
) {
// Toggles myReader.myIgnoreTitles, the flag HtmlHeaderTagAction::run() checks
// before touching the table of contents.
// NOTE(review): the lines choosing between the two assignments (embedded
// numbers 256 and 258) are missing from this extract — presumably true on the
// opening tag and false on the closing tag; confirm.
255 void HtmlTableTagAction::run(const HtmlReader::HtmlTag
&tag
) {
257 myReader
.myIgnoreTitles
= true;
259 myReader
.myIgnoreTitles
= false;
263 HtmlStyleTagAction::HtmlStyleTagAction(HtmlBookReader
&reader
) : HtmlTagAction(reader
) {
// On a start tag installs a new StyleSheetParser bound to the reader's style
// sheet table; on an end tag clears the parser (assigns 0).
// characterDataHandler() feeds text to the parser while it is non-null.
266 void HtmlStyleTagAction::run(const HtmlReader::HtmlTag
&tag
) {
267 myReader
.myStyleSheetParser
= tag
.Start
? new StyleSheetParser(myReader
.myStyleSheetTable
) : 0;
// NOTE(review): the line below is not token-split like the live code around
// it and its neighbours (embedded numbers 268-269, 271-272) are missing; it
// may come from a commented-out debugging region — confirm before relying on
// the dump() call being executed.
270 myReader.myStyleSheetTable.dump();
// Factory: returns the action object for a tag name. Names are compared
// against upper-case literals. Visible mappings:
//   STRONG, B, I, TT, CODE, CITE, SUB, SUP -> HtmlControlTagAction
//   H1..H6                                 -> HtmlHeaderTagAction
//   HEAD, TITLE, SELECT, SCRIPT            -> HtmlIgnoreTagAction
//   STYLE                                  -> HtmlStyleTagAction
//   A                                      -> HtmlHrefTagAction
//   TR, DIV, DT, P, BR                     -> HtmlBreakTagAction
//   IMG                                    -> HtmlImageTagAction
//   UL, MENU, DIR (start 0), OL (start 1)  -> HtmlListTagAction
//   LI                                     -> HtmlListItemTagAction
//   PRE (only if myProcessPreTag)          -> HtmlPreTagAction
//   TABLE                                  -> HtmlTableTagAction
// The final return yields a DummyHtmlTagAction.
// NOTE(review): the embedded numbering jumps (275->277, 342->344, 345->348,
// 372->375). The first condition — the one that selects the EMPHASIS action —
// is missing from this extract.
275 shared_ptr
<HtmlTagAction
> HtmlBookReader::createAction(const std::string
&tag
) {
// Inline text kinds.
277 return new HtmlControlTagAction(*this, EMPHASIS
);
278 } else if (tag
== "STRONG") {
279 return new HtmlControlTagAction(*this, STRONG
);
280 } else if (tag
== "B") {
281 return new HtmlControlTagAction(*this, BOLD
);
282 } else if (tag
== "I") {
283 return new HtmlControlTagAction(*this, ITALIC
);
// TT and CODE share the CODE kind.
284 } else if (tag
== "TT") {
285 return new HtmlControlTagAction(*this, CODE
);
286 } else if (tag
== "CODE") {
287 return new HtmlControlTagAction(*this, CODE
);
288 } else if (tag
== "CITE") {
289 return new HtmlControlTagAction(*this, CITE
);
290 } else if (tag
== "SUB") {
291 return new HtmlControlTagAction(*this, SUB
);
292 } else if (tag
== "SUP") {
293 return new HtmlControlTagAction(*this, SUP
);
// Headers.
294 } else if (tag
== "H1") {
295 return new HtmlHeaderTagAction(*this, H1
);
296 } else if (tag
== "H2") {
297 return new HtmlHeaderTagAction(*this, H2
);
298 } else if (tag
== "H3") {
299 return new HtmlHeaderTagAction(*this, H3
);
300 } else if (tag
== "H4") {
301 return new HtmlHeaderTagAction(*this, H4
);
302 } else if (tag
== "H5") {
303 return new HtmlHeaderTagAction(*this, H5
);
304 } else if (tag
== "H6") {
305 return new HtmlHeaderTagAction(*this, H6
);
// Tags mapped to the ignore action, plus STYLE, which gets its own action.
306 } else if (tag
== "HEAD") {
307 return new HtmlIgnoreTagAction(*this);
308 } else if (tag
== "TITLE") {
309 return new HtmlIgnoreTagAction(*this);
310 } else if (tag
== "STYLE") {
311 return new HtmlStyleTagAction(*this);
312 } else if (tag
== "SELECT") {
313 return new HtmlIgnoreTagAction(*this);
314 } else if (tag
== "SCRIPT") {
315 return new HtmlIgnoreTagAction(*this);
316 } else if (tag
== "A") {
317 return new HtmlHrefTagAction(*this);
// Paragraph-breaking tags, each with its own break policy.
318 } else if (tag
== "TR") {
319 return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_END
);
320 } else if (tag
== "DIV") {
321 return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_END
);
322 } else if (tag
== "DT") {
323 return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_START
);
324 } else if (tag
== "P") {
325 return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_START_AND_AT_END
);
326 } else if (tag
== "BR") {
327 return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_START_AND_AT_END
);
328 } else if (tag
== "IMG") {
329 return new HtmlImageTagAction(*this);
// Lists: a start index of 0 for UL/MENU/DIR, 1 for OL.
330 } else if (tag
== "UL") {
331 return new HtmlListTagAction(*this, 0);
332 } else if (tag
== "MENU") {
333 return new HtmlListTagAction(*this, 0);
334 } else if (tag
== "DIR") {
335 return new HtmlListTagAction(*this, 0);
336 } else if (tag
== "OL") {
337 return new HtmlListTagAction(*this, 1);
338 } else if (tag
== "LI") {
339 return new HtmlListItemTagAction(*this);
// PRE gets a dedicated action only when PRE processing is enabled.
340 } else if (tag
== "PRE") {
341 if (myProcessPreTag
) {
342 return new HtmlPreTagAction(*this);
344 } else if (tag
== "TABLE") {
345 return new HtmlTableTagAction(*this);
// NOTE(review): the branches from DD to BODY below have no visible bodies and
// are not token-split like the live code above; they may belong to a
// commented-out region of the pristine file — confirm.
348 } else if (tag == "DD") {
350 } else if (tag == "DL") {
352 } else if (tag == "DFN") {
354 } else if (tag == "SAMP") {
356 } else if (tag == "KBD") {
358 } else if (tag == "VAR") {
360 } else if (tag == "ABBR") {
362 } else if (tag == "ACRONYM") {
364 } else if (tag == "BLOCKQUOTE") {
366 } else if (tag == "Q") {
368 } else if (tag == "INS") {
370 } else if (tag == "DEL") {
372 } else if (tag == "BODY") {
// The last visible return yields the no-op action.
375 return new DummyHtmlTagAction(*this);
378 void HtmlBookReader::setBuildTableOfContent(bool build
) {
379 myBuildTableOfContent
= build
;
382 void HtmlBookReader::setProcessPreTag(bool process
) {
383 myProcessPreTag
= process
;
386 HtmlBookReader::HtmlBookReader(const std::string
&baseDirectoryPath
, BookModel
&model
, const PlainTextFormat
&format
, const std::string
&encoding
) : HtmlReader(encoding
), myBookReader(model
), myBaseDirPath(baseDirectoryPath
), myFormat(format
), myBuildTableOfContent(true), myProcessPreTag(true) {
389 HtmlBookReader::~HtmlBookReader() {
// Adds `len` bytes starting at `text` to both the main text and the contents
// (TOC) data of the book reader. One path runs the bytes through myConverter
// first; the other wraps them in a std::string unchanged.
// NOTE(review): the embedded numbering jumps (392->394, 395->404, 407->409,
// 411->413); the body of the whitespace-skipping loop and the condition
// choosing between the two paths (presumably `convert`) are not visible in
// this extract.
392 void HtmlBookReader::addConvertedDataToBuffer(const char *text
, int len
, bool convert
) {
// While a "don't break paragraph" request is pending, leading whitespace is
// scanned first.
// NOTE(review): isspace() is called on a plain char here, unlike the other
// isspace calls in this file, which cast to unsigned char; a negative char
// value is undefined behaviour for isspace — consider adding the cast.
394 if (myDontBreakParagraph
) {
395 while ((len
> 0) && isspace(*text
)) {
// Converting path: convert into the scratch buffer, add it to both targets,
// then empty the buffer for the next call.
404 myConverter
->convert(myConverterBuffer
, text
, text
+ len
);
405 myBookReader
.addData(myConverterBuffer
);
406 myBookReader
.addContentsData(myConverterBuffer
);
407 myConverterBuffer
.erase();
// Raw path: the bytes are added as they are.
409 std::string
strText(text
, len
);
410 myBookReader
.addData(strText
);
411 myBookReader
.addContentsData(strText
);
// Clear the pending "don't break paragraph" request.
413 myDontBreakParagraph
= false;
// Tag callback: resets the converter, then looks up the action for tag.Name
// in myActionMap, creating and caching it via createAction() on first use.
// NOTE(review): the tail of the function (embedded numbers 424-428) is
// missing from this extract, so the use of `action` and the returned value
// are not visible here.
417 bool HtmlBookReader::tagHandler(const HtmlTag
&tag
) {
418 myConverter
->reset();
// operator[] is used for the lookup, so the key exists in the map afterwards
// even before the assignment below.
420 shared_ptr
<HtmlTagAction
> action
= myActionMap
[tag
.Name
];
421 if (action
.isNull()) {
422 action
= createAction(tag
.Name
);
423 myActionMap
[tag
.Name
] = action
;
// Splits preformatted text into paragraphs according to the break-type bits
// of myFormat. Three strategies are tested in order: break at every new line,
// break at lines with indent, break at empty lines. Text that does not
// trigger a break is forwarded to addConvertedDataToBuffer().
// NOTE(review): this extract has many gaps in the embedded numbering (e.g.
// 436->440, 446->448, 449->452, 460->463, 463->467, 471->478, 484->489,
// 492->498). The per-character tests, the counter updates and the updates of
// `start` are not visible, so the comments below describe only what can be
// seen — confirm against the pristine file before changing anything.
430 void HtmlBookReader::preformattedCharacterDataHandler(const char *text
, int len
, bool convert
) {
// `start` marks the beginning of the not-yet-emitted text; `end` is one past
// the last byte.
431 const char *start
= text
;
432 const char *end
= text
+ len
;
434 int breakType
= myFormat
.breakType();
// Strategy 1: BREAK_PARAGRAPH_AT_NEW_LINE.
435 if (breakType
& PlainTextFormat::BREAK_PARAGRAPH_AT_NEW_LINE
) {
436 for (const char *ptr
= text
; ptr
!= end
; ++ptr
) {
// Emit the text collected since `start`...
440 addConvertedDataToBuffer(start
, ptr
- start
, convert
);
// ...or a single space (the guarding condition is not visible in this
// extract).
442 static const std::string SPACE
= " ";
443 myBookReader
.addData(SPACE
);
445 myBookReader
.endParagraph();
446 myBookReader
.beginParagraph();
// A non-negative mySpaceCounter: whitespace is tested, and the accumulated
// count is emitted as fixed horizontal space.
448 } else if (mySpaceCounter
>= 0) {
449 if (isspace((unsigned char)*ptr
)) {
452 myBookReader
.addFixedHSpace(mySpaceCounter
);
// Emit whatever remains after the last break.
457 addConvertedDataToBuffer(start
, end
- start
, convert
);
// Strategy 2: BREAK_PARAGRAPH_AT_LINE_WITH_INDENT.
458 } else if (breakType
& PlainTextFormat::BREAK_PARAGRAPH_AT_LINE_WITH_INDENT
) {
459 for (const char *ptr
= text
; ptr
!= end
; ++ptr
) {
460 if (isspace((unsigned char)*ptr
)) {
463 } else if (mySpaceCounter
>= 0) {
// An indent larger than the format's ignored indent starts a new paragraph;
// the text before the indent is emitted first, without the counted
// whitespace.
467 if (mySpaceCounter
> myFormat
.ignoredIndent()) {
468 if (ptr
- start
> mySpaceCounter
) {
469 addConvertedDataToBuffer(start
, ptr
- start
- mySpaceCounter
, convert
);
470 myBookReader
.endParagraph();
471 myBookReader
.beginParagraph();
// Clamp a negative counter to 0 before using it to trim the trailing
// whitespace from the final chunk.
478 mySpaceCounter
= std::max(mySpaceCounter
, 0);
479 if (end
- start
> mySpaceCounter
) {
480 addConvertedDataToBuffer(start
, end
- start
- mySpaceCounter
, convert
);
// Strategy 3: BREAK_PARAGRAPH_AT_EMPTY_LINE.
482 } else if (breakType
& PlainTextFormat::BREAK_PARAGRAPH_AT_EMPTY_LINE
) {
483 for (const char *ptr
= start
; ptr
!= end
; ++ptr
) {
484 if (isspace((unsigned char)*ptr
)) {
// A break counter above 1 ends the paragraph after emitting the pending text.
489 if (myBreakCounter
> 1) {
490 addConvertedDataToBuffer(start
, ptr
- start
, convert
);
491 myBookReader
.endParagraph();
492 myBookReader
.beginParagraph();
// Emit whatever remains after the last break.
498 addConvertedDataToBuffer(start
, end
- start
, convert
);
// Character-data callback. Visible checks, in order: an active style sheet
// parser receives the text; a non-zero ignore counter is tested; preformatted
// mode delegates to preformattedCharacterDataHandler(); otherwise the text is
// scanned for its first non-whitespace byte and forwarded from a pointer into
// the text to its end.
// NOTE(review): the embedded numbering jumps (504->508, 508->512, 513->517,
// 521->528, 528->533); the statements following each check (presumably early
// returns), the body of the scan loop and the returned value are not visible
// in this extract.
502 bool HtmlBookReader::characterDataHandler(const char *text
, int len
, bool convert
) {
// A non-null style sheet parser is fed the text.
503 if (!myStyleSheetParser
.isNull()) {
504 myStyleSheetParser
->parse(text
, len
);
508 if (myIgnoreDataCounter
!= 0) {
512 if (myIsPreformatted
) {
513 preformattedCharacterDataHandler(text
, len
, convert
);
517 const char *ptr
= text
;
518 const char *end
= text
+ len
;
// Look for the first non-whitespace byte.
520 for (; ptr
!= end
; ++ptr
) {
521 if (!isspace((unsigned char)*ptr
)) {
528 addConvertedDataToBuffer(ptr
, end
- ptr
, convert
);
// Resets per-document state before parsing starts: empties the list number
// stack and the converter buffer, resets the book reader and opens the first
// REGULAR paragraph in the main text model, clears the parsing flags, walks
// the cached tag actions, and drops any style sheet parser.
// NOTE(review): the embedded numbering jumps (537->540, 547->551, 553->559);
// the body of the action loop and any statements in the other gaps are not
// visible in this extract.
533 void HtmlBookReader::startDocumentHandler() {
// Pop the list number stack until it is empty.
534 while (!myListNumStack
.empty()) {
535 myListNumStack
.pop();
537 myConverterBuffer
.erase();
540 myBookReader
.reset();
541 myBookReader
.setMainTextModel();
542 myBookReader
.pushKind(REGULAR
);
543 myBookReader
.beginParagraph();
544 myIgnoreDataCounter
= 0;
545 myIsPreformatted
= false;
546 myDontBreakParagraph
= false;
// Visit every action cached in myActionMap (the loop body is not visible —
// presumably each action's reset(); confirm).
547 for (std::map
<std::string
,shared_ptr
<HtmlTagAction
> >::const_iterator it
= myActionMap
.begin(); it
!= myActionMap
.end(); ++it
) {
551 myIgnoreTitles
= false;
553 myStyleSheetParser
= 0;
559 void HtmlBookReader::endDocumentHandler() {
560 myBookReader
.endParagraph();
563 void HtmlBookReader::setFileName(const std::string fileName
) {
564 myFileName
= fileName
;