2 * Copyright (C) 2005-2018 Team Kodi
3 * This file is part of Kodi - https://kodi.tv
5 * SPDX-License-Identifier: GPL-2.0-or-later
6 * See LICENSES/README.md for more information.
9 #include "ScraperUrl.h"
11 #include "CharsetConverter.h"
12 #include "ServiceBroker.h"
16 #include "filesystem/CurlFile.h"
17 #include "filesystem/ZipFile.h"
18 #include "settings/AdvancedSettings.h"
19 #include "settings/SettingsComponent.h"
20 #include "utils/CharsetDetection.h"
21 #include "utils/Mime.h"
22 #include "utils/StringUtils.h"
23 #include "utils/XBMCTinyXML.h"
24 #include "utils/log.h"
// Default constructor: starts with a relevance of 0.0 and the "parsed" flag cleared.
30 CScraperUrl::CScraperUrl() : m_relevance(0.0), m_parsed(false)
// Constructs from a string: delegates to the default constructor for member
// initialisation, then passes strUrl to ParseFromData().
34 CScraperUrl::CScraperUrl(const std::string
& strUrl
) : CScraperUrl()
36 ParseFromData(strUrl
);
// Constructs from an XML element: delegates to the default constructor for member
// initialisation, then passes the element to ParseAndAppendUrl().
39 CScraperUrl::CScraperUrl(const TiXmlElement
* element
) : CScraperUrl()
41 ParseAndAppendUrl(element
);
// Destructor is explicitly defaulted; no hand-written cleanup is performed here.
44 CScraperUrl::~CScraperUrl() = default;
// NOTE(review): presumably resets the stored URL entries, raw data and flags to
// their initial state - confirm against the function body.
46 void CScraperUrl::Clear()
// Replaces m_data with the given string.
// `data` is taken by value and moved into the member, so callers passing an
// rvalue avoid a copy.
54 void CScraperUrl::SetData(std::string data
)
56 m_data
= std::move(data
);
// Searches m_urls for the first entry whose m_type is UrlType::General and whose
// m_aspect equals `type`. An empty `type` accepts any aspect.
// The result is returned by value (const SUrlEntry).
60 const CScraperUrl::SUrlEntry
CScraperUrl::GetFirstUrlByType(const std::string
& type
) const
// NOTE(review): the lambda captures `type` by copy; it is only used inside this
// find_if call, so a by-reference capture would avoid copying the string.
62 const auto url
= std::find_if(m_urls
.begin(), m_urls
.end(), [type
](const SUrlEntry
& url
) {
63 return url
.m_type
== UrlType::General
&& (type
.empty() || url
.m_aspect
== type
);
// Branch on whether find_if located a matching entry.
65 if (url
!= m_urls
.end())
// Searches m_urls for the first entry of type UrlType::Season whose m_season
// equals `season` and whose aspect is acceptable for `type`.
// The aspect is accepted when `type` is empty, when `type` is the literal "thumb",
// or when the entry's m_aspect equals `type`.
// The result is returned by value (const SUrlEntry).
71 const CScraperUrl::SUrlEntry
CScraperUrl::GetSeasonUrl(int season
, const std::string
& type
) const
// NOTE(review): `type` is captured by copy; the lambda is only used inside this
// find_if call, so a by-reference capture would avoid copying the string.
73 const auto url
= std::find_if(m_urls
.begin(), m_urls
.end(), [season
, type
](const SUrlEntry
& url
) {
74 return url
.m_type
== UrlType::Season
&& url
.m_season
== season
&&
75 (type
.empty() || type
== "thumb" || url
.m_aspect
== type
);
// Branch on whether find_if located a matching entry.
77 if (url
!= m_urls
.end())
// Scans m_urls and tracks, in maxSeason, the largest positive m_season found
// among entries of type UrlType::Season. maxSeason starts at 0.
83 unsigned int CScraperUrl::GetMaxSeasonUrl() const
85 unsigned int maxSeason
= 0;
86 for (const auto& url
: m_urls
)
// The `m_season > 0` test comes first so that only positive values reach the
// cast to unsigned int used for the comparison.
88 if (url
.m_type
== UrlType::Season
&& url
.m_season
> 0 &&
89 static_cast<unsigned int>(url
.m_season
) > maxSeason
)
90 maxSeason
= url
.m_season
;
// Returns the thumb URL string built by GetThumbUrl() for the first entry in m_urls.
// NOTE(review): std::vector::front() requires a non-empty container - confirm that
// an emptiness guard precedes the return below.
95 std::string
CScraperUrl::GetFirstThumbUrl() const
100 return GetThumbUrl(m_urls
.front());
// Appends thumb URL strings for matching entries of m_urls to `thumbs`.
//   thumbs - output vector; existing contents are kept and new URLs are pushed back.
//   type   - aspect filter; see the aspect test below.
// `season` and `unique` are also read by the body (see the tests below).
103 void CScraperUrl::GetThumbUrls(std::vector
<std::string
>& thumbs
,
104 const std::string
& type
,
108 for (const auto& url
: m_urls
)
// Aspect filter: the entry passes when its aspect equals `type`, when `type` is
// empty, or when the entry itself has no aspect.
110 if (url
.m_aspect
== type
|| type
.empty() || url
.m_aspect
.empty())
// Season filter: General entries are selected by season == -1; Season entries
// must carry exactly the requested season number.
112 if ((url
.m_type
== CScraperUrl::UrlType::General
&& season
== -1) ||
113 (url
.m_type
== CScraperUrl::UrlType::Season
&& url
.m_season
== season
))
115 std::string thumbUrl
= GetThumbUrl(url
);
// When `unique` is set, skip URLs already present in `thumbs` (linear search).
116 if (!unique
|| std::find(thumbs
.begin(), thumbs
.end(), thumbUrl
) == thumbs
.end())
117 thumbs
.push_back(thumbUrl
);
// Parses the content currently held in m_data by handing it to ParseFromData()
// and returns that call's result.
123 bool CScraperUrl::Parse()
// Parse from a copy: ParseFromData() takes its input by const reference and the
// parsing path (ParseAndAppendUrl) appends to m_data, so the input must not
// alias the member.
128 auto dataToParse
= m_data
;
130 return ParseFromData(dataToParse
);
// Parses `data` as XML (UTF-8) and appends a URL entry for the root element and
// for each following sibling element that has the same tag name.
// If the document has no root element, `data` itself is used to construct a
// single entry in m_urls.
133 bool CScraperUrl::ParseFromData(const std::string
& data
)
139 /* data is coming from internal sources (usually generated by scraper or from database)
140 * so data is always in UTF-8 */
141 doc
.Parse(data
, TIXML_ENCODING_UTF8
);
143 auto pElement
= doc
.RootElement();
// No XML root: fall back to building an entry directly from the raw string.
144 if (pElement
== nullptr)
146 m_urls
.emplace_back(data
);
// Walk the root element and its same-named siblings.
151 while (pElement
!= nullptr)
153 ParseAndAppendUrl(pElement
);
154 pElement
= pElement
->NextSiblingElement(pElement
->Value());
// Builds one SUrlEntry from an XML element and appends it to m_urls.
// The entry's URL is the value of the element's first child; the remaining
// fields come from the attributes "spoof", "post", "gzip", "cache", "type",
// "season", "aspect" and "preview". The content of a local string stream is
// also appended to m_data.
162 bool CScraperUrl::ParseAndAppendUrl(const TiXmlElement
* element
)
// Guard: a usable element needs a first child that has a value.
164 if (element
== nullptr || element
->FirstChild() == nullptr ||
165 element
->FirstChild()->Value() == nullptr)
// Remember whether m_data was empty before it is appended to below.
168 bool wasEmpty
= m_data
.empty();
// NOTE(review): presumably the element is serialised into `stream` before its
// content is appended to m_data - confirm.
170 std::stringstream stream
;
172 m_data
+= stream
.str();
174 SUrlEntry
url(element
->FirstChild()->ValueStr());
175 url
.m_spoof
= XMLUtils::GetAttribute(element
, "spoof");
// "post" and "gzip" are flags; they are matched case-insensitively against "yes".
177 const char* szPost
= element
->Attribute("post");
178 if (szPost
&& StringUtils::CompareNoCase(szPost
, "yes") == 0)
183 const char* szIsGz
= element
->Attribute("gzip");
184 if (szIsGz
&& StringUtils::CompareNoCase(szIsGz
, "yes") == 0)
189 url
.m_cache
= XMLUtils::GetAttribute(element
, "cache");
// type="season" (case-insensitive) marks the entry as a season URL and reads
// the season number from the "season" attribute.
191 const char* szType
= element
->Attribute("type");
192 if (szType
&& StringUtils::CompareNoCase(szType
, "season") == 0)
194 url
.m_type
= UrlType::Season
;
195 const char* szSeason
= element
->Attribute("season");
// atoi() reports no errors: non-numeric text yields 0.
// NOTE(review): atoi() must not receive a null pointer - confirm szSeason is
// checked before this call.
197 url
.m_season
= atoi(szSeason
);
200 url
.m_aspect
= XMLUtils::GetAttribute(element
, "aspect");
201 url
.m_preview
= XMLUtils::GetAttribute(element
, "preview");
203 m_urls
.push_back(url
);
211 // XML format of strUrls is:
212 // <TAG><url>...</url>...</TAG> (parsed by ParseAndAppendUrl) or <url>...</url> (ditto)
// Parses an episode guide XML string and appends its URLs via ParseAndAppendUrl().
// The <episodeguide> element is looked up in the parsed document. If it has
// <url> children, each of them is appended; otherwise, if it has a first child
// with a value, the <episodeguide> element itself is appended.
// An empty string and a document without a root element are both tested for
// up front.
213 bool CScraperUrl::ParseAndAppendUrlsFromEpisodeGuide(const std::string
& episodeGuide
)
215 if (episodeGuide
.empty())
218 // ok, now parse the xml file
220 /* episodeGuide is coming from internal sources so it is always in UTF-8 */
221 doc
.Parse(episodeGuide
, TIXML_ENCODING_UTF8
);
222 if (doc
.RootElement() == nullptr)
// Remember whether m_data was empty before any URL is appended.
225 bool wasEmpty
= m_data
.empty();
227 TiXmlHandle
docHandle(&doc
);
// NOTE(review): TiXmlHandle::Element() yields a null pointer when no
// <episodeguide> child exists (e.g. the root element has another name), and
// `link` is dereferenced right below - confirm whether a null check is needed.
228 auto link
= docHandle
.FirstChild("episodeguide").Element();
229 if (link
->FirstChildElement("url"))
// `link` is reused as the cursor over the <url> children.
231 for (link
= link
->FirstChildElement("url"); link
; link
= link
->NextSiblingElement("url"))
232 ParseAndAppendUrl(link
);
// No <url> children: treat the element's own content as the URL.
234 else if (link
->FirstChild() && link
->FirstChild()->Value())
235 ParseAndAppendUrl(link
);
// Adds an already-parsed URL. The same values are written twice: as attributes
// of an XML <thumb> element and as fields of an SUrlEntry (nUrl) that is pushed
// onto m_urls.
//   url      - the URL itself.
//   aspect   - stored as the "aspect" attribute / m_aspect.
//   preview  - stored as the "preview" attribute / m_preview.
//   referrer - stored as the "spoof" attribute / m_spoof.
//   cache    - stored as the "cache" attribute / m_cache.
// A `season` value is also read by the body for the season attribute / m_season.
// NOTE(review): the "post", "gzip" and season/type attributes are presumably set
// conditionally, and the <thumb> element is presumably serialised into m_data -
// confirm.
243 void CScraperUrl::AddParsedUrl(const std::string
& url
,
244 const std::string
& aspect
,
245 const std::string
& preview
,
246 const std::string
& referrer
,
247 const std::string
& cache
,
// Remember whether m_data was empty before this URL is added.
252 bool wasEmpty
= m_data
.empty();
// XML representation of the URL.
254 TiXmlElement
thumb("thumb");
255 thumb
.SetAttribute("spoof", referrer
);
256 thumb
.SetAttribute("cache", cache
);
258 thumb
.SetAttribute("post", "yes");
260 thumb
.SetAttribute("gzip", "yes");
263 thumb
.SetAttribute("season", std::to_string(season
));
264 thumb
.SetAttribute("type", "season");
266 thumb
.SetAttribute("aspect", aspect
);
267 thumb
.SetAttribute("preview", preview
);
269 thumb
.InsertEndChild(text
);
// In-memory representation of the same URL.
274 nUrl
.m_spoof
= referrer
;
277 nUrl
.m_cache
= cache
;
278 nUrl
.m_preview
= preview
;
281 nUrl
.m_type
= UrlType::Season
;
282 nUrl
.m_season
= season
;
284 nUrl
.m_aspect
= aspect
;
286 m_urls
.push_back(nUrl
);
// Builds the thumb URL string for an entry.
// With a non-empty m_spoof the result is the entry's URL followed by
// "|Referer=" and the URL-encoded spoof value.
// NOTE(review): with an empty m_spoof the plain m_url is presumably returned - confirm.
292 std::string
CScraperUrl::GetThumbUrl(const CScraperUrl::SUrlEntry
& entry
)
294 if (entry
.m_spoof
.empty())
297 return entry
.m_url
+ "|Referer=" + CURL::Encode(entry
.m_spoof
);
300 bool CScraperUrl::Get(const SUrlEntry
& scrURL
,
301 std::string
& strHTML
,
302 XFILE::CCurlFile
& http
,
303 const std::string
& cacheContext
)
305 CURL
url(scrURL
.m_url
);
306 http
.SetReferer(scrURL
.m_spoof
);
307 std::string strCachePath
;
309 if (!scrURL
.m_cache
.empty())
311 strCachePath
= URIUtils::AddFileToFolder(
312 CServiceBroker::GetSettingsComponent()->GetAdvancedSettings()->m_cachePath
, "scrapers",
313 cacheContext
, scrURL
.m_cache
);
314 if (XFILE::CFile::Exists(strCachePath
))
317 std::vector
<uint8_t> buffer
;
318 if (file
.LoadFile(strCachePath
, buffer
) > 0)
320 strHTML
.assign(reinterpret_cast<char*>(buffer
.data()), buffer
.size());
326 auto strHTML1
= strHTML
;
330 std::string strOptions
= url
.GetOptions();
331 strOptions
= strOptions
.substr(1);
334 if (!http
.Post(url
.Get(), strOptions
, strHTML1
))
337 else if (!http
.Get(url
.Get(), strHTML1
))
342 const auto mimeType
= http
.GetProperty(XFILE::FILE_PROPERTY_MIME_TYPE
);
343 CMime::EFileType ftype
= CMime::GetFileTypeFromMime(mimeType
);
344 if (ftype
== CMime::FileTypeUnknown
)
345 ftype
= CMime::GetFileTypeFromContent(strHTML
);
347 if (ftype
== CMime::FileTypeZip
|| ftype
== CMime::FileTypeGZip
)
349 XFILE::CZipFile file
;
350 std::string strBuffer
;
351 auto iSize
= file
.UnpackFromMemory(
352 strBuffer
, strHTML
, scrURL
.m_isgz
); // FIXME: use FileTypeGZip instead of scrURL.m_isgz?
356 CLog::Log(LOGDEBUG
, "{}: Archive \"{}\" was unpacked in memory", __FUNCTION__
, scrURL
.m_url
);
359 CLog::Log(LOGWARNING
, "{}: \"{}\" looks like archive but cannot be unpacked", __FUNCTION__
,
363 const auto reportedCharset
= http
.GetProperty(XFILE::FILE_PROPERTY_CONTENT_CHARSET
);
364 if (ftype
== CMime::FileTypeHtml
)
366 std::string realHtmlCharset
, converted
;
367 if (!CCharsetDetection::ConvertHtmlToUtf8(strHTML
, converted
, reportedCharset
, realHtmlCharset
))
368 CLog::Log(LOGWARNING
,
369 "{}: Can't find precise charset for HTML \"{}\", using \"{}\" as fallback",
370 __FUNCTION__
, scrURL
.m_url
, realHtmlCharset
);
372 CLog::Log(LOGDEBUG
, "{}: Using \"{}\" charset for HTML \"{}\"", __FUNCTION__
, realHtmlCharset
,
377 else if (ftype
== CMime::FileTypeXml
)
380 xmlDoc
.Parse(strHTML
, reportedCharset
);
382 const auto realXmlCharset
= xmlDoc
.GetUsedCharset();
383 if (!realXmlCharset
.empty())
385 CLog::Log(LOGDEBUG
, "{}: Using \"{}\" charset for XML \"{}\"", __FUNCTION__
, realXmlCharset
,
387 std::string converted
;
388 g_charsetConverter
.ToUtf8(realXmlCharset
, strHTML
, converted
);
392 else if (ftype
== CMime::FileTypePlainText
||
393 StringUtils::EqualsNoCase(mimeType
.substr(0, 5), "text/"))
395 std::string realTextCharset
;
396 std::string converted
;
397 CCharsetDetection::ConvertPlainTextToUtf8(strHTML
, converted
, reportedCharset
, realTextCharset
);
399 if (reportedCharset
!= realTextCharset
)
400 CLog::Log(LOGWARNING
,
401 "{}: Using \"{}\" charset for plain text \"{}\" instead of server reported \"{}\" "
403 __FUNCTION__
, realTextCharset
, scrURL
.m_url
, reportedCharset
);
405 CLog::Log(LOGDEBUG
, "{}: Using \"{}\" charset for plain text \"{}\"", __FUNCTION__
,
406 realTextCharset
, scrURL
.m_url
);
408 else if (!reportedCharset
.empty())
410 CLog::Log(LOGDEBUG
, "{}: Using \"{}\" charset for \"{}\"", __FUNCTION__
, reportedCharset
,
412 if (reportedCharset
!= "UTF-8")
414 std::string converted
;
415 g_charsetConverter
.ToUtf8(reportedCharset
, strHTML
, converted
);
420 CLog::Log(LOGDEBUG
, "{}: Using content of \"{}\" as binary or text with \"UTF-8\" charset",
421 __FUNCTION__
, scrURL
.m_url
);
423 if (!scrURL
.m_cache
.empty())
425 const auto strCachePath
= URIUtils::AddFileToFolder(
426 CServiceBroker::GetSettingsComponent()->GetAdvancedSettings()->m_cachePath
, "scrapers",
427 cacheContext
, scrURL
.m_cache
);
429 if (!file
.OpenForWrite(strCachePath
, true) ||
430 file
.Write(strHTML
.data(), strHTML
.size()) != static_cast<ssize_t
>(strHTML
.size()))