Support conversion of linkshere
[dueringa_WikiWalker.git] / src / WikimediaJsonToArticleConverter.cpp
blob7a9f617f047e5e5bc235a436b32b173c46aac9ca
1 //! \file WikimediaJsonToArticleConverter.cpp
3 #include "WikimediaJsonToArticleConverter.h"
5 #include <cassert>
6 #include <sstream>
8 #include <json/json.h>
10 #include "Article.h"
11 #include "WalkerException.h"
13 namespace WikiWalker
15 //! \todo really ugly workaround, passing in the
16 //! CollectionUtils::ArticleCollection instance...
17 //! :/
18 WikimediaJsonToArticleConverter::ContinuationStatus
19 WikimediaJsonToArticleConverter::convert(
20 const std::string& json,
21 CollectionUtils::ArticleCollection& articleCache)
23 Json::Value document;
24 Json::CharReaderBuilder crb;
25 Json::CharReaderBuilder::strictMode(&crb.settings_);
26 std::istringstream jsonStream(json);
27 bool success = Json::parseFromStream(crb, jsonStream, &document, nullptr);
29 if(!success) {
30 throw WalkerException("Error parsing JSON");
33 auto allPages = document.get("query", Json::Value::nullSingleton())
34 .get("pages", Json::Value::nullSingleton());
36 for(auto& onePage : allPages) {
37 //! get normalized title not necessary, "title" is already
38 std::string oneTitle =
39 onePage.get("title", Json::Value::nullSingleton()).asString();
41 //! \todo find a better solution than get-compare-add
42 auto wantedArticle = CollectionUtils::get(articleCache, oneTitle);
44 if(wantedArticle == nullptr) {
45 wantedArticle = std::make_shared<Article>(oneTitle);
46 CollectionUtils::add(articleCache, wantedArticle);
49 if(onePage.isMember("missing") || onePage.isMember("invalid")) {
50 wantedArticle->marked(true);
51 wantedArticle->analyzed(true);
52 continue;
55 // add links
56 //! \todo support linkshere
57 std::shared_ptr<Article> par;
58 if(onePage.isMember("links")) {
59 for(const auto& linked :
60 onePage.get("links", Json::Value::nullSingleton())) {
61 auto linkedPageTitle =
62 linked.get("title", Json::Value::nullSingleton()).asString();
63 par = CollectionUtils::get(articleCache, linkedPageTitle);
65 if(par == nullptr) {
66 par = std::make_shared<Article>(linkedPageTitle);
67 CollectionUtils::add(articleCache, par);
70 wantedArticle->addLink(par);
73 // It might happen there are no links. In this case, set analyzed anyway.
74 wantedArticle->analyzed(true);
75 } else if(onePage.isMember("linkshere")) {
76 for(const auto& linksHere :
77 onePage.get("linkshere", Json::Value::nullSingleton())) {
78 auto linkingPageTitle =
79 linksHere.get("title", Json::Value::nullSingleton()).asString();
80 par = CollectionUtils::get(articleCache, linkingPageTitle);
82 if(par == nullptr) {
83 par = std::make_shared<Article>(linkingPageTitle);
84 CollectionUtils::add(articleCache, par);
87 par->addLink(wantedArticle);
88 /* par.isAnalyzed? this is strictly speaking not true, since we only
89 * know this article links here, not where else it links...
90 * However, it's still automatically set once we call addLink */
95 bool moreData;
97 // always clear, otherwise insert won't happen
98 continuationData_.clear();
100 if(document.isMember("continue")) {
101 moreData = true;
102 const auto contData =
103 document.get("continue", Json::Value::nullSingleton());
104 assert(contData.isObject());
106 for(auto it = contData.begin(); it != contData.end(); it++) {
107 continuationData_.emplace(it.name(), it->asString());
109 } else {
110 moreData = false;
113 return moreData ? ContinuationStatus::ConversionNeedsMoreData
114 : ContinuationStatus::ConversionCompleted;
116 } // namespace WikiWalker