Merge pull request #26166 from ksooo/improve-plugin-ctx-menus
[xbmc.git] / xbmc / utils / ScraperUrl.cpp
blobf131b165957e248c43869af800161cb79e00b047
1 /*
2 * Copyright (C) 2005-2018 Team Kodi
3 * This file is part of Kodi - https://kodi.tv
5 * SPDX-License-Identifier: GPL-2.0-or-later
6 * See LICENSES/README.md for more information.
7 */
9 #include "ScraperUrl.h"
11 #include "CharsetConverter.h"
12 #include "ServiceBroker.h"
13 #include "URIUtils.h"
14 #include "URL.h"
15 #include "XMLUtils.h"
16 #include "filesystem/CurlFile.h"
17 #include "filesystem/ZipFile.h"
18 #include "settings/AdvancedSettings.h"
19 #include "settings/SettingsComponent.h"
20 #include "utils/CharsetDetection.h"
21 #include "utils/Mime.h"
22 #include "utils/StringUtils.h"
23 #include "utils/XBMCTinyXML.h"
24 #include "utils/log.h"
26 #include <algorithm>
27 #include <cstring>
28 #include <sstream>
30 CScraperUrl::CScraperUrl() : m_relevance(0.0), m_parsed(false)
34 CScraperUrl::CScraperUrl(const std::string& strUrl) : CScraperUrl()
36 ParseFromData(strUrl);
39 CScraperUrl::CScraperUrl(const TiXmlElement* element) : CScraperUrl()
41 ParseAndAppendUrl(element);
44 CScraperUrl::~CScraperUrl() = default;
46 void CScraperUrl::Clear()
48 m_urls.clear();
49 m_data.clear();
50 m_relevance = 0.0;
51 m_parsed = false;
54 void CScraperUrl::SetData(std::string data)
56 m_data = std::move(data);
57 m_parsed = false;
60 const CScraperUrl::SUrlEntry CScraperUrl::GetFirstUrlByType(const std::string& type) const
62 const auto url = std::find_if(m_urls.begin(), m_urls.end(), [type](const SUrlEntry& url) {
63 return url.m_type == UrlType::General && (type.empty() || url.m_aspect == type);
64 });
65 if (url != m_urls.end())
66 return *url;
68 return SUrlEntry();
71 const CScraperUrl::SUrlEntry CScraperUrl::GetSeasonUrl(int season, const std::string& type) const
73 const auto url = std::find_if(m_urls.begin(), m_urls.end(), [season, type](const SUrlEntry& url) {
74 return url.m_type == UrlType::Season && url.m_season == season &&
75 (type.empty() || type == "thumb" || url.m_aspect == type);
76 });
77 if (url != m_urls.end())
78 return *url;
80 return SUrlEntry();
83 unsigned int CScraperUrl::GetMaxSeasonUrl() const
85 unsigned int maxSeason = 0;
86 for (const auto& url : m_urls)
88 if (url.m_type == UrlType::Season && url.m_season > 0 &&
89 static_cast<unsigned int>(url.m_season) > maxSeason)
90 maxSeason = url.m_season;
92 return maxSeason;
95 std::string CScraperUrl::GetFirstThumbUrl() const
97 if (m_urls.empty())
98 return {};
100 return GetThumbUrl(m_urls.front());
103 void CScraperUrl::GetThumbUrls(std::vector<std::string>& thumbs,
104 const std::string& type,
105 int season,
106 bool unique) const
108 for (const auto& url : m_urls)
110 if (url.m_aspect == type || type.empty() || url.m_aspect.empty())
112 if ((url.m_type == CScraperUrl::UrlType::General && season == -1) ||
113 (url.m_type == CScraperUrl::UrlType::Season && url.m_season == season))
115 std::string thumbUrl = GetThumbUrl(url);
116 if (!unique || std::find(thumbs.begin(), thumbs.end(), thumbUrl) == thumbs.end())
117 thumbs.push_back(thumbUrl);
123 bool CScraperUrl::Parse()
125 if (m_parsed)
126 return true;
128 auto dataToParse = m_data;
129 m_data.clear();
130 return ParseFromData(dataToParse);
133 bool CScraperUrl::ParseFromData(const std::string& data)
135 if (data.empty())
136 return false;
138 CXBMCTinyXML doc;
139 /* strUrl is coming from internal sources (usually generated by scraper or from database)
140 * so strUrl is always in UTF-8 */
141 doc.Parse(data, TIXML_ENCODING_UTF8);
143 auto pElement = doc.RootElement();
144 if (pElement == nullptr)
146 m_urls.emplace_back(data);
147 m_data = data;
149 else
151 while (pElement != nullptr)
153 ParseAndAppendUrl(pElement);
154 pElement = pElement->NextSiblingElement(pElement->Value());
158 m_parsed = true;
159 return true;
162 bool CScraperUrl::ParseAndAppendUrl(const TiXmlElement* element)
164 if (element == nullptr || element->FirstChild() == nullptr ||
165 element->FirstChild()->Value() == nullptr)
166 return false;
168 bool wasEmpty = m_data.empty();
170 std::stringstream stream;
171 stream << *element;
172 m_data += stream.str();
174 SUrlEntry url(element->FirstChild()->ValueStr());
175 url.m_spoof = XMLUtils::GetAttribute(element, "spoof");
177 const char* szPost = element->Attribute("post");
178 if (szPost && StringUtils::CompareNoCase(szPost, "yes") == 0)
179 url.m_post = true;
180 else
181 url.m_post = false;
183 const char* szIsGz = element->Attribute("gzip");
184 if (szIsGz && StringUtils::CompareNoCase(szIsGz, "yes") == 0)
185 url.m_isgz = true;
186 else
187 url.m_isgz = false;
189 url.m_cache = XMLUtils::GetAttribute(element, "cache");
191 const char* szType = element->Attribute("type");
192 if (szType && StringUtils::CompareNoCase(szType, "season") == 0)
194 url.m_type = UrlType::Season;
195 const char* szSeason = element->Attribute("season");
196 if (szSeason)
197 url.m_season = atoi(szSeason);
200 url.m_aspect = XMLUtils::GetAttribute(element, "aspect");
201 url.m_preview = XMLUtils::GetAttribute(element, "preview");
203 m_urls.push_back(url);
205 if (wasEmpty)
206 m_parsed = true;
208 return true;
211 // XML format is of strUrls is:
212 // <TAG><url>...</url>...</TAG> (parsed by ParseElement) or <url>...</url> (ditto)
213 bool CScraperUrl::ParseAndAppendUrlsFromEpisodeGuide(const std::string& episodeGuide)
215 if (episodeGuide.empty())
216 return false;
218 // ok, now parse the xml file
219 CXBMCTinyXML doc;
220 /* strUrls is coming from internal sources so strUrls is always in UTF-8 */
221 doc.Parse(episodeGuide, TIXML_ENCODING_UTF8);
222 if (doc.RootElement() == nullptr)
223 return false;
225 bool wasEmpty = m_data.empty();
227 TiXmlHandle docHandle(&doc);
228 auto link = docHandle.FirstChild("episodeguide").Element();
229 if (link->FirstChildElement("url"))
231 for (link = link->FirstChildElement("url"); link; link = link->NextSiblingElement("url"))
232 ParseAndAppendUrl(link);
234 else if (link->FirstChild() && link->FirstChild()->Value())
235 ParseAndAppendUrl(link);
237 if (wasEmpty)
238 m_parsed = true;
240 return true;
243 void CScraperUrl::AddParsedUrl(const std::string& url,
244 const std::string& aspect,
245 const std::string& preview,
246 const std::string& referrer,
247 const std::string& cache,
248 bool post,
249 bool isgz,
250 int season)
252 bool wasEmpty = m_data.empty();
254 TiXmlElement thumb("thumb");
255 thumb.SetAttribute("spoof", referrer);
256 thumb.SetAttribute("cache", cache);
257 if (post)
258 thumb.SetAttribute("post", "yes");
259 if (isgz)
260 thumb.SetAttribute("gzip", "yes");
261 if (season >= 0)
263 thumb.SetAttribute("season", std::to_string(season));
264 thumb.SetAttribute("type", "season");
266 thumb.SetAttribute("aspect", aspect);
267 thumb.SetAttribute("preview", preview);
268 TiXmlText text(url);
269 thumb.InsertEndChild(text);
271 m_data << thumb;
273 SUrlEntry nUrl(url);
274 nUrl.m_spoof = referrer;
275 nUrl.m_post = post;
276 nUrl.m_isgz = isgz;
277 nUrl.m_cache = cache;
278 nUrl.m_preview = preview;
279 if (season >= 0)
281 nUrl.m_type = UrlType::Season;
282 nUrl.m_season = season;
284 nUrl.m_aspect = aspect;
286 m_urls.push_back(nUrl);
288 if (wasEmpty)
289 m_parsed = true;
292 std::string CScraperUrl::GetThumbUrl(const CScraperUrl::SUrlEntry& entry)
294 if (entry.m_spoof.empty())
295 return entry.m_url;
297 return entry.m_url + "|Referer=" + CURL::Encode(entry.m_spoof);
300 bool CScraperUrl::Get(const SUrlEntry& scrURL,
301 std::string& strHTML,
302 XFILE::CCurlFile& http,
303 const std::string& cacheContext)
305 CURL url(scrURL.m_url);
306 http.SetReferer(scrURL.m_spoof);
307 std::string strCachePath;
309 if (!scrURL.m_cache.empty())
311 strCachePath = URIUtils::AddFileToFolder(
312 CServiceBroker::GetSettingsComponent()->GetAdvancedSettings()->m_cachePath, "scrapers",
313 cacheContext, scrURL.m_cache);
314 if (XFILE::CFile::Exists(strCachePath))
316 XFILE::CFile file;
317 std::vector<uint8_t> buffer;
318 if (file.LoadFile(strCachePath, buffer) > 0)
320 strHTML.assign(reinterpret_cast<char*>(buffer.data()), buffer.size());
321 return true;
326 auto strHTML1 = strHTML;
328 if (scrURL.m_post)
330 std::string strOptions = url.GetOptions();
331 strOptions = strOptions.substr(1);
332 url.SetOptions("");
334 if (!http.Post(url.Get(), strOptions, strHTML1))
335 return false;
337 else if (!http.Get(url.Get(), strHTML1))
338 return false;
340 strHTML = strHTML1;
342 const auto mimeType = http.GetProperty(XFILE::FILE_PROPERTY_MIME_TYPE);
343 CMime::EFileType ftype = CMime::GetFileTypeFromMime(mimeType);
344 if (ftype == CMime::FileTypeUnknown)
345 ftype = CMime::GetFileTypeFromContent(strHTML);
347 if (ftype == CMime::FileTypeZip || ftype == CMime::FileTypeGZip)
349 XFILE::CZipFile file;
350 std::string strBuffer;
351 auto iSize = file.UnpackFromMemory(
352 strBuffer, strHTML, scrURL.m_isgz); // FIXME: use FileTypeGZip instead of scrURL.m_isgz?
353 if (iSize > 0)
355 strHTML = strBuffer;
356 CLog::Log(LOGDEBUG, "{}: Archive \"{}\" was unpacked in memory", __FUNCTION__, scrURL.m_url);
358 else
359 CLog::Log(LOGWARNING, "{}: \"{}\" looks like archive but cannot be unpacked", __FUNCTION__,
360 scrURL.m_url);
363 const auto reportedCharset = http.GetProperty(XFILE::FILE_PROPERTY_CONTENT_CHARSET);
364 if (ftype == CMime::FileTypeHtml)
366 std::string realHtmlCharset, converted;
367 if (!CCharsetDetection::ConvertHtmlToUtf8(strHTML, converted, reportedCharset, realHtmlCharset))
368 CLog::Log(LOGWARNING,
369 "{}: Can't find precise charset for HTML \"{}\", using \"{}\" as fallback",
370 __FUNCTION__, scrURL.m_url, realHtmlCharset);
371 else
372 CLog::Log(LOGDEBUG, "{}: Using \"{}\" charset for HTML \"{}\"", __FUNCTION__, realHtmlCharset,
373 scrURL.m_url);
375 strHTML = converted;
377 else if (ftype == CMime::FileTypeXml)
379 CXBMCTinyXML xmlDoc;
380 xmlDoc.Parse(strHTML, reportedCharset);
382 const auto realXmlCharset = xmlDoc.GetUsedCharset();
383 if (!realXmlCharset.empty())
385 CLog::Log(LOGDEBUG, "{}: Using \"{}\" charset for XML \"{}\"", __FUNCTION__, realXmlCharset,
386 scrURL.m_url);
387 std::string converted;
388 g_charsetConverter.ToUtf8(realXmlCharset, strHTML, converted);
389 strHTML = converted;
392 else if (ftype == CMime::FileTypePlainText ||
393 StringUtils::EqualsNoCase(mimeType.substr(0, 5), "text/"))
395 std::string realTextCharset;
396 std::string converted;
397 CCharsetDetection::ConvertPlainTextToUtf8(strHTML, converted, reportedCharset, realTextCharset);
398 strHTML = converted;
399 if (reportedCharset != realTextCharset)
400 CLog::Log(LOGWARNING,
401 "{}: Using \"{}\" charset for plain text \"{}\" instead of server reported \"{}\" "
402 "charset",
403 __FUNCTION__, realTextCharset, scrURL.m_url, reportedCharset);
404 else
405 CLog::Log(LOGDEBUG, "{}: Using \"{}\" charset for plain text \"{}\"", __FUNCTION__,
406 realTextCharset, scrURL.m_url);
408 else if (!reportedCharset.empty())
410 CLog::Log(LOGDEBUG, "{}: Using \"{}\" charset for \"{}\"", __FUNCTION__, reportedCharset,
411 scrURL.m_url);
412 if (reportedCharset != "UTF-8")
414 std::string converted;
415 g_charsetConverter.ToUtf8(reportedCharset, strHTML, converted);
416 strHTML = converted;
419 else
420 CLog::Log(LOGDEBUG, "{}: Using content of \"{}\" as binary or text with \"UTF-8\" charset",
421 __FUNCTION__, scrURL.m_url);
423 if (!scrURL.m_cache.empty())
425 const auto strCachePath = URIUtils::AddFileToFolder(
426 CServiceBroker::GetSettingsComponent()->GetAdvancedSettings()->m_cachePath, "scrapers",
427 cacheContext, scrURL.m_cache);
428 XFILE::CFile file;
429 if (!file.OpenForWrite(strCachePath, true) ||
430 file.Write(strHTML.data(), strHTML.size()) != static_cast<ssize_t>(strHTML.size()))
431 return false;
433 return true;