Initialize Article member correctly
[dueringa_WikiWalker.git] / src / WikiWalker.cpp
blob b706f6e42187076fbb97cc4d1a2b4ec358e48bf9
//! \file WikiWalker.cpp

#include "WikiWalker.h"

#include <iostream>
#include <fstream>
#include <cassert>

#include "WikimediaJsonToArticleConverter.h"
#include "CurlUrlCreator.h"
#include "Article.h"
#include "WalkerException.h"
#include "ToJsonWriter.h"
#include "CacheJsonToArticleConverter.h"
void WikiWalker::startWalking(std::string url)
{
    std::string apiBaseUrl = "";

    // only Wikipedia articles are supported; the domain and wiki path
    // must be included in the URL.
    std::string domain = "wikipedia.org/";
    std::string findUrl = domain + "wiki/";
    size_t domainpos = url.find(findUrl);

    if(domainpos == std::string::npos) {
        throw WalkerException("Must be a Wikipedia URL");
    }

    // ugly URL checking
    size_t subdomainpos = url.find("http://");

    if(subdomainpos != std::string::npos) {
        if(subdomainpos != 0) {
            throw WalkerException("http:// must be at the beginning of the URL");
        }
    } else {
        subdomainpos = url.find("https://");

        if(subdomainpos != std::string::npos) {
            if(subdomainpos != 0) {
                throw WalkerException("https:// must be at the beginning of the URL");
            }
        } else {
            // no protocol given, default to HTTPS
            apiBaseUrl = "https://";
        }
    }

    apiBaseUrl.append(url.substr(0, domainpos + domain.length())).append("w/api.php");
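    // e.g. for "https://en.wikipedia.org/wiki/Example", apiBaseUrl is now
    // "https://en.wikipedia.org/w/api.php"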

    CurlUrlCreator creator(apiBaseUrl);

    // extract Wikipedia title
    std::string title = url.substr(domainpos + findUrl.length());

    creator.addParameter("action", "query").addParameter("format", "json")
           .addParameter("prop", "links").addParameter("pllimit", "max")
           .addParameter("plnamespace", "0").addParameter("formatversion", "1");
    creator.addParameter("titles", title);
    std::string json = grabber.grabUrl(creator.buildUrl());

    if(json != "") {
        WikimediaJsonToArticleConverter conv;
        Article* article = conv.convertToArticle(json, articleSet);

        // follow the API's continuation data until all links are fetched
        while(conv.hasMoreData() && conv.getContinuationData() != "") {
            creator.addParameter("plcontinue", conv.getContinuationData());

            json = grabber.grabUrl(creator.buildUrl());
            Article* article2 = conv.convertToArticle(json, articleSet);

            if(article != article2) {
                // merge links from the continuation response into the
                // first article
                for(auto x = article2->linkBegin(); x != article2->linkEnd(); x++) {
                    article->addLink(*x);
                }

                // delete duplicate article
                delete article2;
            }
        }

        std::cout << "Article " << article->getTitle() << " has "
                  << article->getNumLinks() << " links" << std::endl;
    } else {
        std::cerr << "Error fetching article" << std::endl;
    }
}
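
//! Reads previously collected articles from a JSON cache file into the
//! article set.
//! \param cacheFile path to the cache file to read from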
void WikiWalker::readCache(std::string cacheFile)
{
    CacheJsonToArticleConverter cjta;
    std::ifstream cache(cacheFile);

    if(!cache.is_open()) {
        throw WalkerException("Error reading from cache file."
                              " Check permissions, and whether file exists.");
    }

    std::string json;

    //! \todo what happens with megabyte-sized data? looks like str.max_size is the limit
    std::getline(cache, json);

    // the cache is expected to consist of a single JSON line
    assert(cache.eof());

    if(cache.fail()) {
        cache.close();
        throw WalkerException("Error reading from file");
    }

    cjta.convertToArticle(json, articleSet);
}
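
//! Writes the current article set to a cache file as JSON.
//! \param cacheFile path to the cache file to write to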
void WikiWalker::writeCache(std::string cacheFile)
{
    ToJsonWriter w;

    std::ofstream cache(cacheFile, std::ios::trunc);

    if(!cache.is_open()) {
        throw WalkerException("Error writing to cache file. Check permissions.");
    }

    w.output(articleSet, cache);

    if(cache.fail() || cache.bad()) {
        cache.close();
        throw WalkerException("I/O exception when writing to cache file");
    }

    cache.flush();
    cache.close();
}