3 #include "WikimediaApi.h"
7 #include <LUrlParser.h>
9 #include "CurlUrlCreator.h"
10 #include "StringUtils.h"
11 #include "WalkerException.h"
12 #include "WikimediaJsonToArticleConverter.h"
17 * \brief Fetches articles and converts them
19 * \param collection the collection to store converted articles into
20 * \param creator the prepared CurlUrlCreator
21 * \param grabber the CurlWikiGrabber used for fetching
22 * \note continuation parameters are taken from the converter's continuationData(); the function has no continueKey parameter
23 * \param generator which generator to use, if any
// Fetches the URL built by `creator` through `grabber`, converts the returned
// JSON into `collection`, and repeats the fetch/convert cycle while the
// converter reports that more data is needed.
// NOTE(review): this view of the function is incomplete - the switch header,
// the break statements, and the condition guarding the final throw are not
// visible here. Confirm against the full file before changing any logic.
25 static void grabAndConvert(CollectionUtils::ArticleCollection
& collection
,
26 CurlUrlCreator
& creator
,
27 CurlWikiGrabber
& grabber
,
28 WikimediaApi::WikimediaGenerator generator
)
// ForwardLinkGenerator: add generator=links together with gplnamespace=0 and
// gpllimit=max to the request parameters.
31 case WikimediaApi::WikimediaGenerator::ForwardLinkGenerator
:
32 creator
.addParameter({{"generator", "links"},
33 {"gplnamespace", "0"},
34 {"gpllimit", "max"}});
// NoGenerator: no generator parameters are visible for this case.
// NOTE(review): the lines between this label and the throw below are missing;
// presumably the throw belongs to a default branch - confirm.
36 case WikimediaApi::WikimediaGenerator::NoGenerator
:
40 throw WalkerException("Unsupported generator");
// Initial request: build the URL from the accumulated parameters and fetch it.
44 std::string json
= grabber
.grabUrl(creator
.buildUrl());
// Convert the first response; the returned status drives the loop below.
46 WikimediaJsonToArticleConverter conv
;
47 auto conversionStatus
= conv
.convert(json
, collection
);
// Keep going only while the converter asks for more data AND it actually
// supplied continuation parameters to send with the next request.
49 while(WikimediaJsonToArticleConverter::ContinuationStatus::
50 ConversionNeedsMoreData
== conversionStatus
&&
51 !conv
.continuationData().empty()) {
// Feed the converter's continuation parameters back into the URL creator.
52 creator
.addParameter(conv
.continuationData());
// Fetch and convert the next batch into the same collection.
54 json
= grabber
.grabUrl(creator
.buildUrl());
55 conversionStatus
= conv
.convert(json
, collection
);
// NOTE(review): the condition guarding this throw is not visible in this
// excerpt - confirm which conversion status triggers it.
58 throw WalkerException("Error fetching article");
63 * \brief Gets a CurlUrlCreator with common properties
64 * \param baseUrl the Wikimedia API base URL
65 * \param title the article title
66 * \returns the prepared CurlUrlCreator
// Builds a CurlUrlCreator for `baseUrl` and adds the query parameters shared by
// all requests.
// NOTE(review): the parameter list is cut off in this view (only "action" and
// "formatversion" are visible, and `title` is not yet used in the visible
// lines); the return statement is also not visible - confirm in the full file.
68 static CurlUrlCreator
getUrlCreator(std::string baseUrl
, std::string title
)
// baseUrl is taken by value and moved into the creator.
70 CurlUrlCreator
creator(std::move(baseUrl
));
// Common parameters: action=query, formatversion=2 (further entries not visible).
71 creator
.addParameter({{"action", "query"},
73 {"formatversion", "2"},
78 WikimediaApi::WikimediaApi(std::string baseUrl
) : baseUrl_(std::move(baseUrl
))
80 auto result
= LUrlParser::clParseURL::ParseURL(baseUrl_
);
81 if(!result
.IsValid()) {
82 throw WalkerException("Invalid URL");
// Requests the "linkshere" property for `title` and stores the converted
// result in `collection`, delegating the fetch/convert loop to grabAndConvert.
// NOTE(review): this view is incomplete - the guard around the first throw and
// the remaining addParameter entries are not visible; confirm in the full file.
86 void WikimediaApi::fetchBackwardLinks(
87 const std::string
& title
,
88 WikimediaApi::WikimediaGenerator generator
,
89 CollectionUtils::ArticleCollection
& collection
)
// Raised for an empty title (per the message); the enclosing condition is not
// visible in this excerpt.
92 throw WalkerException("Title can't be empty.");
// Start from the creator carrying the common query parameters.
95 CurlUrlCreator creator
= getUrlCreator(baseUrl_
, title
);
// Select the linkshere property (further parameters are cut off here).
96 creator
.addParameter({{"prop", "linkshere"},
// Fetch with the member grabber and convert into the caller's collection.
101 grabAndConvert(collection
, creator
, grabber_
, generator
);
104 void WikimediaApi::fetchForwardLinks(
105 const std::string
& title
,
106 WikimediaApi::WikimediaGenerator generator
,
107 CollectionUtils::ArticleCollection
& collection
)
110 throw WalkerException("Title can't be empty.");
113 CurlUrlCreator creator
= getUrlCreator(baseUrl_
, title
);
114 creator
.addParameter(
115 {{"prop", "links"}, {"plnamespace", "0"}, {"pllimit", "max"}});
117 grabAndConvert(collection
, creator
, grabber_
, generator
);
120 namespace WikimediaApiUtils
122 WikimediaUrlInfo
parseArticleUrl(const std::string
& articleUrl
)
124 WikimediaUrlInfo urlInfo
;
127 auto parsedUrl
= LUrlParser::clParseURL::ParseURL(articleUrl
);
128 if(!parsedUrl
.IsValid()) {
129 // if URL with no protocol is passed, use HTTPS
130 std::string protocol
= "https://";
131 parsedUrl
= LUrlParser::clParseURL::ParseURL(protocol
+ articleUrl
);
133 if(!parsedUrl
.IsValid()) {
134 throw WalkerException("Invalid URL");
138 std::string path
= parsedUrl
.m_Path
;
139 std::string pathMustStartWith
= "wiki/";
141 // path must begin with /wiki/
142 if(!StringUtils::startsWith(path
, pathMustStartWith
)) {
143 throw WalkerException("Must be an Wikimedia URL which uses /wiki/");
146 // extract Wikipedia title
148 path
.substr(pathMustStartWith
.length(),
149 path
.length() - pathMustStartWith
.length());
152 throw WalkerException("Must be an Wikimedia URL - Article missing");
155 std::string apiBaseUrl
;
157 apiBaseUrl
= parsedUrl
.m_Scheme
;
158 apiBaseUrl
.append("://");
159 apiBaseUrl
.append(parsedUrl
.m_Host
);
161 if(!parsedUrl
.m_Port
.empty()) {
162 apiBaseUrl
.append(":");
163 apiBaseUrl
.append(parsedUrl
.m_Port
);
166 apiBaseUrl
.append("/w/api.php");
168 urlInfo
.articleTitle
= std::move(title
);
169 urlInfo
.apiBaseUrl
= std::move(apiBaseUrl
);
173 } // namespace WikimediaApiUtils
174 } // namespace WikiWalker