Support conversion of linkshere
[dueringa_WikiWalker.git] / src / WikimediaApi.cpp
blobee67ae4d45604afdbd0819c459c75e52d796b2e9
1 //! \file
3 #include "WikimediaApi.h"
5 #include <utility>
7 #include <LUrlParser.h>
9 #include "CurlUrlCreator.h"
10 #include "StringUtils.h"
11 #include "WalkerException.h"
12 #include "WikimediaJsonToArticleConverter.h"
14 namespace WikiWalker
16 /*!
17 * \brief Fetches articles and converts them
19 * \param collection the collection to store converted articles into
20 * \param creator the prepared CurlUrlCreator
21 * \param grabber the CurlWikiGrabber used for fetching
22 * \param continueKey the key/parameter used for continuation
23 * \param generator which generator to use, if any
25 static void grabAndConvert(CollectionUtils::ArticleCollection& collection,
26 CurlUrlCreator& creator,
27 CurlWikiGrabber& grabber,
28 WikimediaApi::WikimediaGenerator generator)
30 switch(generator) {
31 case WikimediaApi::WikimediaGenerator::ForwardLinkGenerator:
32 creator.addParameter({{"generator", "links"},
33 {"gplnamespace", "0"},
34 {"gpllimit", "max"}});
35 break;
36 case WikimediaApi::WikimediaGenerator::NoGenerator:
37 // nothing to do
38 break;
39 default:
40 throw WalkerException("Unsupported generator");
41 break;
44 std::string json = grabber.grabUrl(creator.buildUrl());
45 if(!json.empty()) {
46 WikimediaJsonToArticleConverter conv;
47 auto conversionStatus = conv.convert(json, collection);
49 while(WikimediaJsonToArticleConverter::ContinuationStatus::
50 ConversionNeedsMoreData == conversionStatus &&
51 !conv.continuationData().empty()) {
52 creator.addParameter(conv.continuationData());
54 json = grabber.grabUrl(creator.buildUrl());
55 conversionStatus = conv.convert(json, collection);
57 } else {
58 throw WalkerException("Error fetching article");
62 /*!
63 * \brief Gets a CurlUrlCreator with common properties
64 * \param baseUrl the Wikimedia API base URL
65 * \param title the article title
66 * \returns the prepared CurlUrlCreator
68 static CurlUrlCreator getUrlCreator(std::string baseUrl, std::string title)
70 CurlUrlCreator creator(std::move(baseUrl));
71 creator.addParameter({{"action", "query"},
72 {"format", "json"},
73 {"formatversion", "2"},
74 {"titles", title}});
75 return creator;
78 WikimediaApi::WikimediaApi(std::string baseUrl) : baseUrl_(std::move(baseUrl))
80 auto result = LUrlParser::clParseURL::ParseURL(baseUrl_);
81 if(!result.IsValid()) {
82 throw WalkerException("Invalid URL");
86 void WikimediaApi::fetchBackwardLinks(
87 const std::string& title,
88 WikimediaApi::WikimediaGenerator generator,
89 CollectionUtils::ArticleCollection& collection)
91 if(title.empty()) {
92 throw WalkerException("Title can't be empty.");
95 CurlUrlCreator creator = getUrlCreator(baseUrl_, title);
96 creator.addParameter({{"prop", "linkshere"},
97 {"lhprop", "title"},
98 {"lhnamespace", "0"},
99 {"lhlimit", "max"}});
101 grabAndConvert(collection, creator, grabber_, generator);
104 void WikimediaApi::fetchForwardLinks(
105 const std::string& title,
106 WikimediaApi::WikimediaGenerator generator,
107 CollectionUtils::ArticleCollection& collection)
109 if(title.empty()) {
110 throw WalkerException("Title can't be empty.");
113 CurlUrlCreator creator = getUrlCreator(baseUrl_, title);
114 creator.addParameter(
115 {{"prop", "links"}, {"plnamespace", "0"}, {"pllimit", "max"}});
117 grabAndConvert(collection, creator, grabber_, generator);
120 namespace WikimediaApiUtils
122 WikimediaUrlInfo parseArticleUrl(const std::string& articleUrl)
124 WikimediaUrlInfo urlInfo;
126 // try parsing URL
127 auto parsedUrl = LUrlParser::clParseURL::ParseURL(articleUrl);
128 if(!parsedUrl.IsValid()) {
129 // if URL with no protocol is passed, use HTTPS
130 std::string protocol = "https://";
131 parsedUrl = LUrlParser::clParseURL::ParseURL(protocol + articleUrl);
133 if(!parsedUrl.IsValid()) {
134 throw WalkerException("Invalid URL");
138 std::string path = parsedUrl.m_Path;
139 std::string pathMustStartWith = "wiki/";
141 // path must begin with /wiki/
142 if(!StringUtils::startsWith(path, pathMustStartWith)) {
143 throw WalkerException("Must be an Wikimedia URL which uses /wiki/");
146 // extract Wikipedia title
147 std::string title =
148 path.substr(pathMustStartWith.length(),
149 path.length() - pathMustStartWith.length());
151 if(title.empty()) {
152 throw WalkerException("Must be an Wikimedia URL - Article missing");
155 std::string apiBaseUrl;
157 apiBaseUrl = parsedUrl.m_Scheme;
158 apiBaseUrl.append("://");
159 apiBaseUrl.append(parsedUrl.m_Host);
161 if(!parsedUrl.m_Port.empty()) {
162 apiBaseUrl.append(":");
163 apiBaseUrl.append(parsedUrl.m_Port);
166 apiBaseUrl.append("/w/api.php");
168 urlInfo.articleTitle = std::move(title);
169 urlInfo.apiBaseUrl = std::move(apiBaseUrl);
171 return urlInfo;
173 } // namespace WikimediaApiUtils
174 } // namespace WikiWalker