BibleGateway: Fix bitrotted scraper, use mobile.*
[kworship.git] / kworship / bible / biblegateway / KwBibleModuleBibleGateway.cpp
blobcf32a7096813ea0a1ca6c484a26c85288e8901fb
1 /***************************************************************************
2 * This file is part of KWorship. *
3 * Copyright 2008 James Hogan <james@albanarts.com> *
4 * *
5 * KWorship is free software: you can redistribute it and/or modify *
6 * it under the terms of the GNU General Public License as published by *
7 * the Free Software Foundation, either version 2 of the License, or *
8 * (at your option) any later version. *
9 * *
10 * KWorship is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU General Public License for more details. *
14 * *
15 * You should have received a copy of the GNU General Public License *
16 * along with KWorship. If not, write to the Free Software Foundation, *
17 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
18 ***************************************************************************/
20 /**
21 * @file KwBibleModuleBibleGateway.cpp
22 * @brief A BibleGateway bible module.
23 * @author James Hogan <james@albanarts.com>
26 #include "KwBibleModuleBibleGateway.h"
27 #include "KwBiblePassage.h"
29 #include <KIO/NetAccess>
30 #include <KMessageBox>
31 #include <KLocale>
32 #include <dom/html_document.h>
33 #include <dom/html_element.h>
34 #include <dom/html_block.h>
36 #include <QStringList>
37 #include <QFile>
40 * Constructors + destructor
43 /// Default constructor.
44 KwBibleModuleBibleGateway::KwBibleModuleBibleGateway(QString vurl)
45 : KwBibleModule()
47 KUrl url("http://mobile.biblegateway.com/" + vurl);
49 QString tmpFile;
50 if (KIO::NetAccess::download(url, tmpFile, 0))
52 QFile file(tmpFile);
53 if (file.open(QFile::ReadOnly | QFile::Text))
55 QByteArray rawPage = file.readAll();
56 file.close();
58 DOM::HTMLDocument doc;
59 doc.loadXML(QString::fromUtf8(rawPage));
60 DOM::Element bookList = doc.getElementById("booklist");
61 bool tableFound = false;
62 if (!bookList.isNull())
64 // Get the next table
65 DOM::Node sibling = bookList.nextSibling();
66 while (!sibling.isNull() && sibling.nodeType() != DOM::Node::ELEMENT_NODE)
68 sibling = sibling.nextSibling();
70 DOM::Element tableElement(sibling);
71 if (!tableElement.isNull() && tableElement.tagName() == "table")
73 tableFound = true;
74 // Each row except header is a book
75 DOM::NodeList rows = tableElement.getElementsByTagName("tr");
76 bool firstRow = true;
77 for (unsigned int row = 0; row < rows.length(); ++row)
79 DOM::NodeList cells = DOM::Element(rows.item(row)).getElementsByTagName("td");
80 if (cells.length() == 2)
82 // First cell is the name
83 m_bookList.push_back(Book());
84 Book* book = &m_bookList[m_bookList.size()-1];
85 book->name = DOM::HTMLElement(cells.item(0)).innerText().string();
86 // Also check if text is right-to-left
87 if (firstRow)
89 setRightToLeft(DOM::HTMLElement(cells.item(0)).className().string().toLower().contains("rtl"));
90 firstRow = false;
92 // Second cell is the chapter links
93 DOM::NodeList chapterLinks = DOM::Element(cells.item(1)).getElementsByTagName("a");
94 for (unsigned int chapter = 0; chapter < chapterLinks.length(); ++chapter)
96 // Check the chapter number is right
97 DOM::HTMLElement link = chapterLinks.item(chapter);
98 bool numeric;
99 int check = link.innerText().string().toInt(&numeric);
100 if (!numeric || check != (int)chapter+1)
102 KMessageBox::error(0, i18n("Error parsing webpage: %1", i18n("Non sequential chapter list in book '%1'", book->name)));
103 break;
105 // Get the link
106 book->chapters.push_back(Chapter());
107 Chapter* chapter = &book->chapters[book->chapters.size()-1];
108 chapter->url = "http://mobile.biblegateway.com/" + link.getAttribute("href").string();
109 chapter->fetched = false;
115 if (!tableFound)
117 // Book list reference node not found
118 KMessageBox::error(0, i18n("Error parsing webpage: %1", i18n("Book list table not found")));
122 KIO::NetAccess::removeTempFile(tmpFile);
124 else
126 KMessageBox::error(0, KIO::NetAccess::lastErrorString());
130 /// Destructor.
131 KwBibleModuleBibleGateway::~KwBibleModuleBibleGateway()
136 * Main interface
139 QString KwBibleModuleBibleGateway::name()
141 return QString();
144 QString KwBibleModuleBibleGateway::description()
146 return QString();
149 QString KwBibleModuleBibleGateway::managerId()
151 return "BibleGateway.com";
154 int KwBibleModuleBibleGateway::numChapters(int book)
156 if (book >= 0 && book < m_bookList.size())
158 return m_bookList[book].chapters.size();
160 return 0;
163 int KwBibleModuleBibleGateway::numVerses(int book, int chapter)
165 Chapter* chap = fetchChapter(book, chapter);
166 if (0 != chap)
168 return chap->verses.size();
170 else
172 return 0;
176 bool KwBibleModuleBibleGateway::fillPassageVerse(int bookIndex, int chapterIndex, int verseIndex, KwBiblePassage* outPassage)
178 Chapter* chapter = fetchChapter(bookIndex, chapterIndex);
179 if (0 != chapter)
181 outPassage->initVerse(bookIndex, 1+chapterIndex, 1+verseIndex,
182 chapter->verses[verseIndex].heading,
183 chapter->verses[verseIndex].content);
185 return false;
189 * Protected virtual interface
192 void KwBibleModuleBibleGateway::obtainBooks()
194 QStringList list;
195 for (int book = 0; book < m_bookList.size(); ++book)
197 list << m_bookList[book].name;
199 setBooks(list);
203 * Private functions
206 /// Ensure chapter contents are fetched.
207 KwBibleModuleBibleGateway::Chapter* KwBibleModuleBibleGateway::fetchChapter(int book, int chapter)
209 if (book >= 0 && book < m_bookList.size())
211 Book* bookObj = &m_bookList[book];
212 if (chapter >= 0 && chapter < m_bookList[book].chapters.size())
214 Chapter* chap = &bookObj->chapters[chapter];
215 if (!chap->fetched)
217 QString tmpFile;
218 if (KIO::NetAccess::download(chap->url, tmpFile, 0))
220 QFile file(tmpFile);
221 if (file.open(QFile::ReadOnly | QFile::Text))
223 QByteArray rawPage = file.readAll();
224 file.close();
226 DOM::HTMLDocument doc;
227 doc.loadXML(QString::fromUtf8(rawPage));
229 // Find all spans with class="sup"
230 DOM::NodeList sups = doc.body().getElementsByClassName("versenum");
231 int verse = 0;
232 for (unsigned int i = 0; i < sups.length(); ++i)
234 DOM::HTMLElement sup = sups.item(i);
235 if (sup.tagName() == "sup")
237 // Get the verse number and validate
238 bool numeric;
239 QString verseNumber = sup.innerText().string();
240 Verse verseInfo;
241 int check = verseNumber.toInt(&numeric);
242 if (!numeric)
244 // Its not going to be a verse if it isn't numeric
245 KMessageBox::error(0, i18n("Error parsing webpage: %1", i18n("Non numeric superscript encountered: '%1'. It may correspond to a verse range.", verseNumber)));
246 continue;
248 ++verse;
249 if (check != verse)
251 KMessageBox::error(0, i18n("Error parsing webpage: %1", i18n("Non sequential verse list in chapter %1 of book '%2'. Expected verse %3 but got verse %4.", (chapter+1), bookObj->name, verse, check)));
252 break;
255 // Get any headers before it
256 DOM::Node sibling;
257 sibling = sup.previousSibling();
258 while (!sibling.isNull())
260 DOM::Element siblingElement = sibling;
261 if (!siblingElement.isNull())
263 // Stop at a sup class="versenum"
264 if (siblingElement.tagName() == "sup" && siblingElement.getAttribute("class") == "versenum")
266 break;
268 // See if its an interesting heading
269 DOM::HTMLHeadingElement heading = siblingElement;
270 if (!heading.isNull())
272 verseInfo.heading = heading.toHTML() + verseInfo.heading;
275 sibling = sibling.previousSibling();
278 // Get any text after it until the next sup
279 sibling = sup.nextSibling();
280 while (!sibling.isNull())
282 DOM::Element siblingElement = sibling;
283 bool append = true;
284 if (!siblingElement.isNull())
286 // Stop at a sup class="versenum"
287 if (siblingElement.tagName() == "sup")
289 if (siblingElement.getAttribute("class") == "versenum")
291 break;
293 // ignore footnote references
294 else if (siblingElement.getAttribute("class") == "footnote")
296 append = false;
299 // and the actual footnotes section
300 else if (siblingElement.tagName() == "div")
302 if (siblingElement.getAttribute("class") == "footnotes")
304 break;
308 // Also stop at headings
309 DOM::HTMLHeadingElement heading = siblingElement;
310 if (!heading.isNull())
312 break;
315 if (append)
317 verseInfo.content += sibling.toHTML();
319 sibling = sibling.nextSibling();
322 chap->verses.push_back(verseInfo);
327 KIO::NetAccess::removeTempFile(tmpFile);
329 else
331 KMessageBox::error(0, KIO::NetAccess::lastErrorString());
333 chap->fetched = true;
336 return chap;
339 return 0;