//! \file WikiWalker.cpp

#include "WikiWalker.h"

#include <fstream>
#include <iostream>
#include <string>

#include "CacheJsonToArticleConverter.h"
#include "CurlUrlCreator.h"
#include "ToJsonWriter.h"
#include "WalkerException.h"
#include "WikimediaJsonToArticleConverter.h"
16 void WikiWalker::startWalking(std::string url
)
18 std::string apiBaseUrl
= "";
20 // this must be included in the URL.
21 std::string domain
= "wikipedia.org/";
22 std::string findUrl
= domain
+ "wiki/";
23 size_t domainpos
= url
.find(findUrl
);
25 if(domainpos
== std::string::npos
) {
26 throw WalkerException("Must be an Wikipedia URL");
30 size_t subdomainpos
= url
.find("http://");
32 if(subdomainpos
!= std::string::npos
) {
33 if(subdomainpos
!= 0) {
34 throw WalkerException("http:// must be at the beginning of the URL");
37 subdomainpos
= url
.find("https://");
39 if(subdomainpos
!= std::string::npos
) {
40 if(subdomainpos
!= 0) {
41 throw WalkerException("https:// must be at the beginning of the URL");
44 apiBaseUrl
= "https://";
48 apiBaseUrl
.append(url
.substr(0, domainpos
+ domain
.length())).append("w/api.php");
50 CurlUrlCreator
creator(apiBaseUrl
);
52 // extract Wikipedia title
53 std::string title
= url
.substr(domainpos
+ findUrl
.length());
55 creator
.addParameter("action", "query").addParameter("format", "json")
56 .addParameter("prop", "links").addParameter("pllimit", "max")
57 .addParameter("plnamespace", "0").addParameter("formatversion", "1");
58 creator
.addParameter("titles", title
);
60 std::string json
= grabber
.grabUrl(creator
.buildUrl());
63 WikimediaJsonToArticleConverter conv
;
64 Article
* article
= conv
.convertToArticle(json
, articleSet
);
66 while(conv
.hasMoreData() && conv
.getContinuationData() != "") {
67 creator
.addParameter("plcontinue", conv
.getContinuationData());
69 json
= grabber
.grabUrl(creator
.buildUrl());
70 Article
* article2
= conv
.convertToArticle(json
, articleSet
);
72 if(article
!= article2
) {
73 for(auto x
= article2
->linkBegin(); x
!= article2
->linkEnd(); x
++) {
77 // delete duplicate article
82 std::cout
<< "Article " << article
->getTitle() << " has " << article
->getNumLinks()
83 << " links" << std::endl
;
85 std::cerr
<< "Error fetching article" << std::endl
;
89 void WikiWalker::readCache(std::string cacheFile
)
91 CacheJsonToArticleConverter cjta
;
92 std::ifstream
cache(cacheFile
);
94 if(!cache
.is_open()) {
95 throw WalkerException("Error reading from cache file."
96 " Check permissions, and whether file exists.");
101 //! \todo what happend with megabyte-big data? looks like str.max_size is the limit
102 std::getline(cache
, json
);
108 throw WalkerException("Error reading from file");
111 cjta
.convertToArticle(json
, articleSet
);
114 void WikiWalker::writeCache(std::string cacheFile
)
118 std::ofstream
cache(cacheFile
, std::ios::trunc
);
120 if(!cache
.is_open()) {
121 throw WalkerException("Error writing to cache file. Check permissions.");
124 w
.output(articleSet
, cache
);
126 if(cache
.fail() || cache
.bad()) {
128 throw WalkerException("I/O eception when writing to cache file");