//! \file WikiWalker.cpp

#include "WikiWalker.h"

#include <fstream>
#include <iostream>
#include <string>

#include "CacheJsonToArticleConverter.h"
#include "CurlUrlCreator.h"
#include "ToJsonWriter.h"
#include "WalkerException.h"
#include "WikimediaJsonToArticleConverter.h"
16 void WikiWalker::startWalking(std::string url
)
18 std::string apiBaseUrl
= "";
20 // this must be included in the URL.
21 std::string domain
= "wikipedia.org/";
22 std::string findUrl
= domain
+ "wiki/";
23 size_t domainpos
= url
.find(findUrl
);
25 if(domainpos
== std::string::npos
) {
26 throw WalkerException("Must be an Wikipedia URL");
30 size_t subdomainpos
= url
.find("http://");
32 if(subdomainpos
!= std::string::npos
) {
33 if(subdomainpos
!= 0) {
34 throw WalkerException("http:// must be at the beginning of the URL");
37 subdomainpos
= url
.find("https://");
39 if(subdomainpos
!= std::string::npos
) {
40 if(subdomainpos
!= 0) {
41 throw WalkerException("https:// must be at the beginning of the URL");
44 apiBaseUrl
= "https://";
48 apiBaseUrl
.append(url
.substr(0, domainpos
+ domain
.length())).append("w/api.php");
50 CurlUrlCreator
creator(apiBaseUrl
);
52 // extract Wikipedia title
53 std::string title
= url
.substr(domainpos
+ findUrl
.length());
55 creator
.addParameter("action", "query").addParameter("format", "json")
56 .addParameter("prop", "links").addParameter("pllimit", "max")
57 .addParameter("plnamespace", "0").addParameter("formatversion", "1");
58 creator
.addParameter("titles", title
);
60 std::string json
= grabber
.grabUrl(creator
.buildUrl());
63 WikimediaJsonToArticleConverter conv
;
64 Article
* article
= conv
.convertToArticle(json
, articleSet
);
66 while(conv
.hasMoreData() && conv
.getContinuationData() != "") {
67 creator
.addParameter("plcontinue", conv
.getContinuationData());
69 json
= grabber
.grabUrl(creator
.buildUrl());
70 Article
* article2
= conv
.convertToArticle(json
, articleSet
);
72 if(article
!= article2
) {
73 for(auto x
= article2
->linkBegin(); x
!= article2
->linkEnd(); x
++) {
77 // delete duplicate article
82 std::cout
<< "Article " << article
->getTitle() << " has " << article
->getNumLinks()
83 << " links" << std::endl
;
85 std::cerr
<< "Error fetching article" << std::endl
;
89 void WikiWalker::readCache(std::string cacheFile
)
91 CacheJsonToArticleConverter cjta
;
92 std::ifstream
cache(cacheFile
);
94 if(!cache
.is_open()) {
95 throw WalkerException("Error reading from cache file."
96 " Check permissions, and whether file exists.");
101 //! \todo what happend with megabyte-big data? looks like str.max_size is the limit
102 std::getline(cache
, json
);
108 throw WalkerException("Error reading from file");
111 cjta
.convertToArticle(json
, articleSet
);
114 void WikiWalker::writeCache(std::string cacheFile
)
118 std::ofstream
cache(cacheFile
, std::ios::trunc
);
120 if(!cache
.is_open()) {
121 throw WalkerException("Error writing to cache file. Check permissions.");
124 w
.output(articleSet
, cache
);
126 if(cache
.fail() || cache
.bad()) {
128 throw WalkerException("I/O eception when writing to cache file");