Initial commit. FBReader 0.8.12
[lbook_fbreader.git] / fbreader / src / formats / html / HtmlBookReader.cpp
bloba1f1069fcf11f98ca1ca7ec6fb89622598b84b3e
/*
 * Copyright (C) 2004-2008 Geometer Plus <contact@geometerplus.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA.
 */
20 #include <cctype>
22 #include <ZLFileImage.h>
23 #include <ZLStringUtil.h>
25 #include "HtmlBookReader.h"
26 #include "HtmlTagActions.h"
27 #include "../txt/PlainTextFormat.h"
28 #include "../util/MiscUtil.h"
29 #include "../../bookmodel/BookModel.h"
30 #include "StyleSheetParser.h"
32 HtmlTagAction::HtmlTagAction(HtmlBookReader &reader) : myReader(reader) {
35 HtmlTagAction::~HtmlTagAction() {
38 void HtmlTagAction::reset() {
41 DummyHtmlTagAction::DummyHtmlTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) {
44 void DummyHtmlTagAction::run(const HtmlReader::HtmlTag&) {
47 HtmlControlTagAction::HtmlControlTagAction(HtmlBookReader &reader, FBTextKind kind) : HtmlTagAction(reader), myKind(kind) {
50 void HtmlControlTagAction::run(const HtmlReader::HtmlTag &tag) {
51 std::vector<FBTextKind> &list = myReader.myKindList;
52 int index;
53 for (index = list.size() - 1; index >= 0; --index) {
54 if (list[index] == myKind) {
55 break;
58 if (tag.Start) {
59 if (index == -1) {
60 bookReader().pushKind(myKind);
61 myReader.myKindList.push_back(myKind);
62 bookReader().addControl(myKind, true);
64 } else {
65 if (index >= 0) {
66 for (int i = list.size() - 1; i >= index; --i) {
67 bookReader().addControl(list[i], false);
68 bookReader().popKind();
70 for (unsigned int j = index + 1; j < list.size(); ++j) {
71 bookReader().addControl(list[j], true);
72 bookReader().pushKind(list[j]);
74 list.erase(list.begin() + index);
79 HtmlHeaderTagAction::HtmlHeaderTagAction(HtmlBookReader &reader, FBTextKind kind) : HtmlTagAction(reader), myKind(kind) {
82 void HtmlHeaderTagAction::run(const HtmlReader::HtmlTag &tag) {
83 bookReader().endParagraph();
84 myReader.myIsStarted = false;
85 if (tag.Start) {
86 if (myReader.myBuildTableOfContent && !myReader.myIgnoreTitles) {
87 if (!bookReader().contentsParagraphIsOpen()) {
88 bookReader().insertEndOfSectionParagraph();
89 bookReader().enterTitle();
90 bookReader().beginContentsParagraph();
93 bookReader().pushKind(myKind);
94 } else {
95 bookReader().popKind();
96 if (myReader.myBuildTableOfContent && !myReader.myIgnoreTitles) {
97 bookReader().endContentsParagraph();
98 bookReader().exitTitle();
101 bookReader().beginParagraph();
104 HtmlIgnoreTagAction::HtmlIgnoreTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) {
107 void HtmlIgnoreTagAction::run(const HtmlReader::HtmlTag &tag) {
108 if (tag.Start) {
109 ++myReader.myIgnoreDataCounter;
110 } else {
111 --myReader.myIgnoreDataCounter;
115 HtmlHrefTagAction::HtmlHrefTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) {
118 void HtmlHrefTagAction::run(const HtmlReader::HtmlTag &tag) {
119 if (tag.Start) {
120 for (unsigned int i = 0; i < tag.Attributes.size(); ++i) {
121 if (tag.Attributes[i].Name == "NAME") {
122 bookReader().addHyperlinkLabel(tag.Attributes[i].Value);
123 } else if ((hyperlinkType() == REGULAR) && (tag.Attributes[i].Name == "HREF")) {
124 std::string value = tag.Attributes[i].Value;
125 if (!myReader.myFileName.empty() &&
126 (value.length() > myReader.myFileName.length()) &&
127 (value.substr(0, myReader.myFileName.length()) == myReader.myFileName)) {
128 value = value.substr(myReader.myFileName.length());
130 if (!value.empty()) {
131 if (value[0] == '#') {
132 setHyperlinkType(INTERNAL_HYPERLINK);
133 bookReader().addHyperlinkControl(INTERNAL_HYPERLINK, value.substr(1));
134 } else if (MiscUtil::isReference(value)) {
135 setHyperlinkType(EXTERNAL_HYPERLINK);
136 bookReader().addHyperlinkControl(EXTERNAL_HYPERLINK, value);
141 } else if (hyperlinkType() != REGULAR) {
142 bookReader().addControl(hyperlinkType(), false);
143 setHyperlinkType(REGULAR);
147 void HtmlHrefTagAction::reset() {
148 setHyperlinkType(REGULAR);
151 FBTextKind HtmlHrefTagAction::hyperlinkType() const {
152 return myHyperlinkType;
155 void HtmlHrefTagAction::setHyperlinkType(FBTextKind hyperlinkType) {
156 myHyperlinkType = hyperlinkType;
159 HtmlImageTagAction::HtmlImageTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) {
162 void HtmlImageTagAction::run(const HtmlReader::HtmlTag &tag) {
163 if (tag.Start) {
164 bookReader().endParagraph();
165 for (unsigned int i = 0; i < tag.Attributes.size(); ++i) {
166 if (tag.Attributes[i].Name == "SRC") {
167 std::string fileName = MiscUtil::decodeHtmlURL(tag.Attributes[i].Value);
168 bookReader().addImageReference(fileName);
169 bookReader().addImage(fileName,
170 new ZLFileImage("image/auto", myReader.myBaseDirPath + fileName, 0)
172 break;
175 bookReader().beginParagraph();
179 HtmlBreakTagAction::HtmlBreakTagAction(HtmlBookReader &reader, BreakType breakType) : HtmlTagAction(reader), myBreakType(breakType) {
182 void HtmlBreakTagAction::run(const HtmlReader::HtmlTag &tag) {
183 if (myReader.myDontBreakParagraph) {
184 myReader.myDontBreakParagraph = false;
185 return;
188 if ((tag.Start && (myBreakType & BREAK_AT_START)) ||
189 (!tag.Start && (myBreakType & BREAK_AT_END))) {
190 bookReader().endParagraph();
191 if (bookReader().isKindStackEmpty()) {
192 bookReader().pushKind(REGULAR);
194 bookReader().beginParagraph();
198 HtmlPreTagAction::HtmlPreTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) {
201 void HtmlPreTagAction::run(const HtmlReader::HtmlTag &tag) {
202 bookReader().endParagraph();
203 myReader.myIsPreformatted = tag.Start;
204 myReader.mySpaceCounter = -1;
205 myReader.myBreakCounter = 0;
206 if (myReader.myFormat.breakType() == PlainTextFormat::BREAK_PARAGRAPH_AT_NEW_LINE) {
207 if (tag.Start) {
208 bookReader().pushKind(PREFORMATTED);
209 } else {
210 bookReader().popKind();
213 bookReader().beginParagraph();
216 HtmlListTagAction::HtmlListTagAction(HtmlBookReader &reader, int startIndex) : HtmlTagAction(reader), myStartIndex(startIndex) {
219 void HtmlListTagAction::run(const HtmlReader::HtmlTag &tag) {
220 if (tag.Start) {
221 myReader.myListNumStack.push(myStartIndex);
222 } else if (!myReader.myListNumStack.empty()) {
223 myReader.myListNumStack.pop();
227 HtmlListItemTagAction::HtmlListItemTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) {
230 void HtmlListItemTagAction::run(const HtmlReader::HtmlTag &tag) {
231 if (tag.Start) {
232 bookReader().endParagraph();
233 bookReader().beginParagraph();
234 if (!myReader.myListNumStack.empty()) {
235 bookReader().addFixedHSpace(3 * myReader.myListNumStack.size());
236 int &index = myReader.myListNumStack.top();
237 if (index == 0) {
238 myReader.addConvertedDataToBuffer("\342\200\242 ", 4, false);
239 } else {
240 std::string number;
241 ZLStringUtil::appendNumber(number, index++);
242 number += ". ";
243 myReader.addConvertedDataToBuffer(number.data(), number.length(), false);
245 myReader.myDontBreakParagraph = true;
247 } else {
248 myReader.myDontBreakParagraph = false;
252 HtmlTableTagAction::HtmlTableTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) {
255 void HtmlTableTagAction::run(const HtmlReader::HtmlTag &tag) {
256 if (tag.Start) {
257 myReader.myIgnoreTitles = true;
258 } else {
259 myReader.myIgnoreTitles = false;
263 HtmlStyleTagAction::HtmlStyleTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) {
266 void HtmlStyleTagAction::run(const HtmlReader::HtmlTag &tag) {
267 myReader.myStyleSheetParser = tag.Start ? new StyleSheetParser(myReader.myStyleSheetTable) : 0;
269 if (!tag.Start) {
270 myReader.myStyleSheetTable.dump();
275 shared_ptr<HtmlTagAction> HtmlBookReader::createAction(const std::string &tag) {
276 if (tag == "EM") {
277 return new HtmlControlTagAction(*this, EMPHASIS);
278 } else if (tag == "STRONG") {
279 return new HtmlControlTagAction(*this, STRONG);
280 } else if (tag == "B") {
281 return new HtmlControlTagAction(*this, BOLD);
282 } else if (tag == "I") {
283 return new HtmlControlTagAction(*this, ITALIC);
284 } else if (tag == "TT") {
285 return new HtmlControlTagAction(*this, CODE);
286 } else if (tag == "CODE") {
287 return new HtmlControlTagAction(*this, CODE);
288 } else if (tag == "CITE") {
289 return new HtmlControlTagAction(*this, CITE);
290 } else if (tag == "SUB") {
291 return new HtmlControlTagAction(*this, SUB);
292 } else if (tag == "SUP") {
293 return new HtmlControlTagAction(*this, SUP);
294 } else if (tag == "H1") {
295 return new HtmlHeaderTagAction(*this, H1);
296 } else if (tag == "H2") {
297 return new HtmlHeaderTagAction(*this, H2);
298 } else if (tag == "H3") {
299 return new HtmlHeaderTagAction(*this, H3);
300 } else if (tag == "H4") {
301 return new HtmlHeaderTagAction(*this, H4);
302 } else if (tag == "H5") {
303 return new HtmlHeaderTagAction(*this, H5);
304 } else if (tag == "H6") {
305 return new HtmlHeaderTagAction(*this, H6);
306 } else if (tag == "HEAD") {
307 return new HtmlIgnoreTagAction(*this);
308 } else if (tag == "TITLE") {
309 return new HtmlIgnoreTagAction(*this);
310 } else if (tag == "STYLE") {
311 return new HtmlStyleTagAction(*this);
312 } else if (tag == "SELECT") {
313 return new HtmlIgnoreTagAction(*this);
314 } else if (tag == "SCRIPT") {
315 return new HtmlIgnoreTagAction(*this);
316 } else if (tag == "A") {
317 return new HtmlHrefTagAction(*this);
318 } else if (tag == "TR") {
319 return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_END);
320 } else if (tag == "DIV") {
321 return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_END);
322 } else if (tag == "DT") {
323 return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_START);
324 } else if (tag == "P") {
325 return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_START_AND_AT_END);
326 } else if (tag == "BR") {
327 return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_START_AND_AT_END);
328 } else if (tag == "IMG") {
329 return new HtmlImageTagAction(*this);
330 } else if (tag == "UL") {
331 return new HtmlListTagAction(*this, 0);
332 } else if (tag == "MENU") {
333 return new HtmlListTagAction(*this, 0);
334 } else if (tag == "DIR") {
335 return new HtmlListTagAction(*this, 0);
336 } else if (tag == "OL") {
337 return new HtmlListTagAction(*this, 1);
338 } else if (tag == "LI") {
339 return new HtmlListItemTagAction(*this);
340 } else if (tag == "PRE") {
341 if (myProcessPreTag) {
342 return new HtmlPreTagAction(*this);
344 } else if (tag == "TABLE") {
345 return new HtmlTableTagAction(*this);
348 } else if (tag == "DD") {
349 return 0;
350 } else if (tag == "DL") {
351 return 0;
352 } else if (tag == "DFN") {
353 return 0;
354 } else if (tag == "SAMP") {
355 return 0;
356 } else if (tag == "KBD") {
357 return 0;
358 } else if (tag == "VAR") {
359 return 0;
360 } else if (tag == "ABBR") {
361 return 0;
362 } else if (tag == "ACRONYM") {
363 return 0;
364 } else if (tag == "BLOCKQUOTE") {
365 return 0;
366 } else if (tag == "Q") {
367 return 0;
368 } else if (tag == "INS") {
369 return 0;
370 } else if (tag == "DEL") {
371 return 0;
372 } else if (tag == "BODY") {
373 return 0;
375 return new DummyHtmlTagAction(*this);
378 void HtmlBookReader::setBuildTableOfContent(bool build) {
379 myBuildTableOfContent = build;
382 void HtmlBookReader::setProcessPreTag(bool process) {
383 myProcessPreTag = process;
386 HtmlBookReader::HtmlBookReader(const std::string &baseDirectoryPath, BookModel &model, const PlainTextFormat &format, const std::string &encoding) : HtmlReader(encoding), myBookReader(model), myBaseDirPath(baseDirectoryPath), myFormat(format), myBuildTableOfContent(true), myProcessPreTag(true) {
389 HtmlBookReader::~HtmlBookReader() {
392 void HtmlBookReader::addConvertedDataToBuffer(const char *text, int len, bool convert) {
393 if (len > 0) {
394 if (myDontBreakParagraph) {
395 while ((len > 0) && isspace(*text)) {
396 --len;
397 ++text;
399 if (len == 0) {
400 return;
403 if (convert) {
404 myConverter->convert(myConverterBuffer, text, text + len);
405 myBookReader.addData(myConverterBuffer);
406 myBookReader.addContentsData(myConverterBuffer);
407 myConverterBuffer.erase();
408 } else {
409 std::string strText(text, len);
410 myBookReader.addData(strText);
411 myBookReader.addContentsData(strText);
413 myDontBreakParagraph = false;
417 bool HtmlBookReader::tagHandler(const HtmlTag &tag) {
418 myConverter->reset();
420 shared_ptr<HtmlTagAction> action = myActionMap[tag.Name];
421 if (action.isNull()) {
422 action = createAction(tag.Name);
423 myActionMap[tag.Name] = action;
425 action->run(tag);
427 return true;
430 void HtmlBookReader::preformattedCharacterDataHandler(const char *text, int len, bool convert) {
431 const char *start = text;
432 const char *end = text + len;
434 int breakType = myFormat.breakType();
435 if (breakType & PlainTextFormat::BREAK_PARAGRAPH_AT_NEW_LINE) {
436 for (const char *ptr = text; ptr != end; ++ptr) {
437 if (*ptr == '\n') {
438 mySpaceCounter = 0;
439 if (start < ptr) {
440 addConvertedDataToBuffer(start, ptr - start, convert);
441 } else {
442 static const std::string SPACE = " ";
443 myBookReader.addData(SPACE);
445 myBookReader.endParagraph();
446 myBookReader.beginParagraph();
447 start = ptr + 1;
448 } else if (mySpaceCounter >= 0) {
449 if (isspace((unsigned char)*ptr)) {
450 ++mySpaceCounter;
451 } else {
452 myBookReader.addFixedHSpace(mySpaceCounter);
453 mySpaceCounter = -1;
457 addConvertedDataToBuffer(start, end - start, convert);
458 } else if (breakType & PlainTextFormat::BREAK_PARAGRAPH_AT_LINE_WITH_INDENT) {
459 for (const char *ptr = text; ptr != end; ++ptr) {
460 if (isspace((unsigned char)*ptr)) {
461 if (*ptr == '\n') {
462 mySpaceCounter = 0;
463 } else if (mySpaceCounter >= 0) {
464 ++mySpaceCounter;
466 } else {
467 if (mySpaceCounter > myFormat.ignoredIndent()) {
468 if (ptr - start > mySpaceCounter) {
469 addConvertedDataToBuffer(start, ptr - start - mySpaceCounter, convert);
470 myBookReader.endParagraph();
471 myBookReader.beginParagraph();
473 start = ptr;
475 mySpaceCounter = -1;
478 mySpaceCounter = std::max(mySpaceCounter, 0);
479 if (end - start > mySpaceCounter) {
480 addConvertedDataToBuffer(start, end - start - mySpaceCounter, convert);
482 } else if (breakType & PlainTextFormat::BREAK_PARAGRAPH_AT_EMPTY_LINE) {
483 for (const char *ptr = start; ptr != end; ++ptr) {
484 if (isspace((unsigned char)*ptr)) {
485 if (*ptr == '\n') {
486 ++myBreakCounter;
488 } else {
489 if (myBreakCounter > 1) {
490 addConvertedDataToBuffer(start, ptr - start, convert);
491 myBookReader.endParagraph();
492 myBookReader.beginParagraph();
493 start = ptr;
495 myBreakCounter = 0;
498 addConvertedDataToBuffer(start, end - start, convert);
502 bool HtmlBookReader::characterDataHandler(const char *text, int len, bool convert) {
503 if (!myStyleSheetParser.isNull()) {
504 myStyleSheetParser->parse(text, len);
505 return true;
508 if (myIgnoreDataCounter != 0) {
509 return true;
512 if (myIsPreformatted) {
513 preformattedCharacterDataHandler(text, len, convert);
514 return true;
517 const char *ptr = text;
518 const char *end = text + len;
519 if (!myIsStarted) {
520 for (; ptr != end; ++ptr) {
521 if (!isspace((unsigned char)*ptr)) {
522 myIsStarted = true;
523 break;
527 if (myIsStarted) {
528 addConvertedDataToBuffer(ptr, end - ptr, convert);
530 return true;
533 void HtmlBookReader::startDocumentHandler() {
534 while (!myListNumStack.empty()) {
535 myListNumStack.pop();
537 myConverterBuffer.erase();
538 myKindList.clear();
540 myBookReader.reset();
541 myBookReader.setMainTextModel();
542 myBookReader.pushKind(REGULAR);
543 myBookReader.beginParagraph();
544 myIgnoreDataCounter = 0;
545 myIsPreformatted = false;
546 myDontBreakParagraph = false;
547 for (std::map<std::string,shared_ptr<HtmlTagAction> >::const_iterator it = myActionMap.begin(); it != myActionMap.end(); ++it) {
548 it->second->reset();
550 myIsStarted = false;
551 myIgnoreTitles = false;
553 myStyleSheetParser = 0;
555 mySpaceCounter = -1;
556 myBreakCounter = 0;
559 void HtmlBookReader::endDocumentHandler() {
560 myBookReader.endParagraph();
563 void HtmlBookReader::setFileName(const std::string fileName) {
564 myFileName = fileName;