1 //! \file WikimediaJsonToArticleConverter.cpp
3 #include "WikimediaJsonToArticleConverter.h"
11 #include "WalkerException.h"
15 //! \todo really ugly workaround, passing in the
16 //! CollectionUtils::ArticleCollection instance...
//!
//! Parses a JSON response and merges the pages it describes into
//! \p articleCache.
//!
//! For every entry under "query" -> "pages" the article with that title is
//! looked up in \p articleCache; when the lookup yields nullptr a new Article
//! is created and added. Pages that have a "missing" or "invalid" member get
//! marked(true) and analyzed(true). Entries under "links" are added as links
//! of the page; for entries under "linkshere" the page is added as a link of
//! the listed article. Afterwards continuationData_ is cleared and refilled
//! with the name/value pairs of the top-level "continue" member, if present.
//!
//! \param json JSON text, parsed through Json::parseFromStream with
//!        strict-mode reader settings
//! \param articleCache collection that is searched for already known articles
//!        and receives every article created here
//! \return ContinuationStatus::ConversionNeedsMoreData when moreData is true,
//!         ContinuationStatus::ConversionCompleted otherwise
//! \throws WalkerException with the message "Error parsing JSON"
//!
//! NOTE(review): this listing has gaps - the declarations of `document` and
//! `moreData`, all brace-only lines and some guarding conditions are not
//! visible. The notes inside the body mark every place where a comment relies
//! on such an unseen line; confirm them against the complete file.
18 WikimediaJsonToArticleConverter::ContinuationStatus
19 WikimediaJsonToArticleConverter::convert(
20 const std::string
& json
,
21 CollectionUtils::ArticleCollection
& articleCache
)
// Build a reader with strict-mode settings and parse the whole input string
// through a string stream into `document`.
24 Json::CharReaderBuilder crb
;
25 Json::CharReaderBuilder::strictMode(&crb
.settings_
);
26 std::istringstream
jsonStream(json
);
// nullptr is passed for the error-string argument, so no parser diagnostics
// are collected; only the boolean result is kept.
27 bool success
= Json::parseFromStream(crb
, jsonStream
, &document
, nullptr);
// NOTE(review): the condition guarding this throw is not visible in this
// listing - presumably it tests `success`; confirm.
30 throw WalkerException("Error parsing JSON");
// "query" and "pages" are both fetched with the null singleton as fallback,
// so a response lacking either member does not fail here.
33 auto allPages
= document
.get("query", Json::Value::nullSingleton())
34 .get("pages", Json::Value::nullSingleton());
36 for(auto& onePage
: allPages
) {
37 //! get normalized title not necessary, "title" is already
38 std::string oneTitle
=
39 onePage
.get("title", Json::Value::nullSingleton()).asString();
41 //! \todo find a better solution than get-compare-add
42 auto wantedArticle
= CollectionUtils::get(articleCache
, oneTitle
);
// Title not found in the cache: create the article and register it.
44 if(wantedArticle
== nullptr) {
45 wantedArticle
= std::make_shared
<Article
>(oneTitle
);
46 CollectionUtils::add(articleCache
, wantedArticle
);
// Pages with a "missing" or "invalid" member are flagged as marked and
// analyzed.
// NOTE(review): whether the rest of the iteration is skipped for such pages
// is not visible in this listing; confirm.
49 if(onePage
.isMember("missing") || onePage
.isMember("invalid")) {
50 wantedArticle
->marked(true);
51 wantedArticle
->analyzed(true);
// NOTE(review): the \todo below looks stale - a "linkshere" branch exists
// further down in this function.
56 //! \todo support linkshere
// `par` holds the article on the other end of the link currently processed;
// it is reused by both the "links" and the "linkshere" branch.
57 std::shared_ptr
<Article
> par
;
// "links": every listed title becomes a link of wantedArticle.
58 if(onePage
.isMember("links")) {
59 for(const auto& linked
:
60 onePage
.get("links", Json::Value::nullSingleton())) {
61 auto linkedPageTitle
=
62 linked
.get("title", Json::Value::nullSingleton()).asString();
63 par
= CollectionUtils::get(articleCache
, linkedPageTitle
);
// NOTE(review): the guard for the creation below is not visible here -
// presumably a `par == nullptr` check mirroring the one used for
// wantedArticle above; confirm.
66 par
= std::make_shared
<Article
>(linkedPageTitle
);
67 CollectionUtils::add(articleCache
, par
);
70 wantedArticle
->addLink(par
);
73 // It might happen there are no links. In this case, set analyzed anyway.
74 wantedArticle
->analyzed(true);
// "linkshere": the direction is reversed - wantedArticle is added as a link
// of every listed article. This branch is only taken when the page has no
// "links" member.
75 } else if(onePage
.isMember("linkshere")) {
76 for(const auto& linksHere
:
77 onePage
.get("linkshere", Json::Value::nullSingleton())) {
78 auto linkingPageTitle
=
79 linksHere
.get("title", Json::Value::nullSingleton()).asString();
80 par
= CollectionUtils::get(articleCache
, linkingPageTitle
);
// NOTE(review): as in the "links" branch, the guard for this creation is not
// visible here - presumably a `par == nullptr` check; confirm.
83 par
= std::make_shared
<Article
>(linkingPageTitle
);
84 CollectionUtils::add(articleCache
, par
);
87 par
->addLink(wantedArticle
);
88 /* par.isAnalyzed? this is strictly speaking not true, since we only
89 * know this article links here, not where else it links...
90 * However, it's still automatically set once we call addLink */
97 // always clear, otherwise insert won't happen
98 continuationData_
.clear();
// Copy every name/value pair of the top-level "continue" member into
// continuationData_, converting each value to a string.
100 if(document
.isMember("continue")) {
102 const auto contData
=
103 document
.get("continue", Json::Value::nullSingleton());
104 assert(contData
.isObject());
106 for(auto it
= contData
.begin(); it
!= contData
.end(); it
++) {
107 continuationData_
.emplace(it
.name(), it
->asString());
// NOTE(review): neither the declaration of `moreData` nor any assignment to
// it is visible in this listing - presumably it is set when a "continue"
// member was found; confirm.
113 return moreData
? ContinuationStatus::ConversionNeedsMoreData
114 : ContinuationStatus::ConversionCompleted
;
116 } // namespace WikiWalker