xbmc/utils/ScraperUrl.cpp

   1 /*
   2  *  Copyright (C) 2005-2018 Team Kodi
   3  *  This file is part of Kodi - https://kodi.tv
   4  *
   5  *  SPDX-License-Identifier: GPL-2.0-or-later
   6  *  See LICENSES/README.md for more information.
   7  */
   8
   9 #include "ScraperUrl.h"
  10
  11 #include "CharsetConverter.h"
  12 #include "ServiceBroker.h"
  13 #include "URIUtils.h"
  14 #include "URL.h"
  15 #include "XMLUtils.h"
  16 #include "filesystem/CurlFile.h"
  17 #include "filesystem/ZipFile.h"
  18 #include "settings/AdvancedSettings.h"
  19 #include "settings/SettingsComponent.h"
  20 #include "utils/CharsetDetection.h"
  21 #include "utils/Mime.h"
  22 #include "utils/StringUtils.h"
  23 #include "utils/XBMCTinyXML.h"
  24 #include "utils/log.h"
  25
  26 #include <algorithm>
  27 #include <cstring>
  28 #include <sstream>
  29
  30 CScraperUrl::CScraperUrl() : m_relevance(0.0), m_parsed(false)
  31 {
  32 }
  33
  34 CScraperUrl::CScraperUrl(const std::string& strUrl) : CScraperUrl()
  35 {
  36   ParseFromData(strUrl);
  37 }
  38
  39 CScraperUrl::CScraperUrl(const TiXmlElement* element) : CScraperUrl()
  40 {
  41   ParseAndAppendUrl(element);
  42 }
  43
  44 CScraperUrl::~CScraperUrl() = default;
  45
  46 void CScraperUrl::Clear()
  47 {
  48   m_urls.clear();
  49   m_data.clear();
  50   m_relevance = 0.0;
  51   m_parsed = false;
  52 }
  53
  54 void CScraperUrl::SetData(std::string data)
  55 {
  56   m_data = std::move(data);
  57   m_parsed = false;
  58 }
  59
  60 const CScraperUrl::SUrlEntry CScraperUrl::GetFirstUrlByType(const std::string& type) const
  61 {
  62   const auto url = std::find_if(m_urls.begin(), m_urls.end(), [type](const SUrlEntry& url) {
  63     return url.m_type == UrlType::General && (type.empty() || url.m_aspect == type);
  64   });
  65   if (url != m_urls.end())
  66     return *url;
  67
  68   return SUrlEntry();
  69 }
  70
  71 const CScraperUrl::SUrlEntry CScraperUrl::GetSeasonUrl(int season, const std::string& type) const
  72 {
  73   const auto url = std::find_if(m_urls.begin(), m_urls.end(), [season, type](const SUrlEntry& url) {
  74     return url.m_type == UrlType::Season && url.m_season == season &&
  75            (type.empty() || type == "thumb" || url.m_aspect == type);
  76   });
  77   if (url != m_urls.end())
  78     return *url;
  79
  80   return SUrlEntry();
  81 }
  82
  83 unsigned int CScraperUrl::GetMaxSeasonUrl() const
  84 {
  85   unsigned int maxSeason = 0;
  86   for (const auto& url : m_urls)
  87   {
  88     if (url.m_type == UrlType::Season && url.m_season > 0 &&
  89         static_cast<unsigned int>(url.m_season) > maxSeason)
  90       maxSeason = url.m_season;
  91   }
  92   return maxSeason;
  93 }
  94
  95 std::string CScraperUrl::GetFirstThumbUrl() const
  96 {
  97   if (m_urls.empty())
  98     return {};
  99
 100   return GetThumbUrl(m_urls.front());
 101 }
 102
 103 void CScraperUrl::GetThumbUrls(std::vector<std::string>& thumbs,
 104                                const std::string& type,
 105                                int season,
 106                                bool unique) const
 107 {
 108   for (const auto& url : m_urls)
 109   {
 110     if (url.m_aspect == type || type.empty() || url.m_aspect.empty())
 111     {
 112       if ((url.m_type == CScraperUrl::UrlType::General && season == -1) ||
 113           (url.m_type == CScraperUrl::UrlType::Season && url.m_season == season))
 114       {
 115         std::string thumbUrl = GetThumbUrl(url);
 116         if (!unique || std::find(thumbs.begin(), thumbs.end(), thumbUrl) == thumbs.end())
 117           thumbs.push_back(thumbUrl);
 118       }
 119     }
 120   }
 121 }
 122
 123 bool CScraperUrl::Parse()
 124 {
 125   if (m_parsed)
 126     return true;
 127
 128   auto dataToParse = m_data;
 129   m_data.clear();
 130   return ParseFromData(dataToParse);
 131 }
 132
 133 bool CScraperUrl::ParseFromData(const std::string& data)
 134 {
 135   if (data.empty())
 136     return false;
 137
 138   CXBMCTinyXML doc;
 139   /* strUrl is coming from internal sources (usually generated by scraper or from database)
 140    * so strUrl is always in UTF-8 */
 141   doc.Parse(data, TIXML_ENCODING_UTF8);
 142
 143   auto pElement = doc.RootElement();
 144   if (pElement == nullptr)
 145   {
 146     m_urls.emplace_back(data);
 147     m_data = data;
 148   }
 149   else
 150   {
 151     while (pElement != nullptr)
 152     {
 153       ParseAndAppendUrl(pElement);
 154       pElement = pElement->NextSiblingElement(pElement->Value());
 155     }
 156   }
 157
 158   m_parsed = true;
 159   return true;
 160 }
 161
 162 bool CScraperUrl::ParseAndAppendUrl(const TiXmlElement* element)
 163 {
 164   if (element == nullptr || element->FirstChild() == nullptr ||
 165       element->FirstChild()->Value() == nullptr)
 166     return false;
 167
 168   bool wasEmpty = m_data.empty();
 169
 170   std::stringstream stream;
 171   stream << *element;
 172   m_data += stream.str();
 173
 174   SUrlEntry url(element->FirstChild()->ValueStr());
 175   url.m_spoof = XMLUtils::GetAttribute(element, "spoof");
 176
 177   const char* szPost = element->Attribute("post");
 178   if (szPost && StringUtils::CompareNoCase(szPost, "yes") == 0)
 179     url.m_post = true;
 180   else
 181     url.m_post = false;
 182
 183   const char* szIsGz = element->Attribute("gzip");
 184   if (szIsGz && StringUtils::CompareNoCase(szIsGz, "yes") == 0)
 185     url.m_isgz = true;
 186   else
 187     url.m_isgz = false;
 188
 189   url.m_cache = XMLUtils::GetAttribute(element, "cache");
 190
 191   const char* szType = element->Attribute("type");
 192   if (szType && StringUtils::CompareNoCase(szType, "season") == 0)
 193   {
 194     url.m_type = UrlType::Season;
 195     const char* szSeason = element->Attribute("season");
 196     if (szSeason)
 197       url.m_season = atoi(szSeason);
 198   }
 199
 200   url.m_aspect = XMLUtils::GetAttribute(element, "aspect");
 201   url.m_preview = XMLUtils::GetAttribute(element, "preview");
 202
 203   m_urls.push_back(url);
 204
 205   if (wasEmpty)
 206     m_parsed = true;
 207
 208   return true;
 209 }
 210
 211 // XML format is of strUrls is:
 212 // <TAG><url>...</url>...</TAG> (parsed by ParseElement) or <url>...</url> (ditto)
 213 bool CScraperUrl::ParseAndAppendUrlsFromEpisodeGuide(const std::string& episodeGuide)
 214 {
 215   if (episodeGuide.empty())
 216     return false;
 217
 218   // ok, now parse the xml file
 219   CXBMCTinyXML doc;
 220   /* strUrls is coming from internal sources so strUrls is always in UTF-8 */
 221   doc.Parse(episodeGuide, TIXML_ENCODING_UTF8);
 222   if (doc.RootElement() == nullptr)
 223     return false;
 224
 225   bool wasEmpty = m_data.empty();
 226
 227   TiXmlHandle docHandle(&doc);
 228   auto link = docHandle.FirstChild("episodeguide").Element();
 229   if (link->FirstChildElement("url"))
 230   {
 231     for (link = link->FirstChildElement("url"); link; link = link->NextSiblingElement("url"))
 232       ParseAndAppendUrl(link);
 233   }
 234   else if (link->FirstChild() && link->FirstChild()->Value())
 235     ParseAndAppendUrl(link);
 236
 237   if (wasEmpty)
 238     m_parsed = true;
 239
 240   return true;
 241 }
 242
 243 void CScraperUrl::AddParsedUrl(const std::string& url,
 244                                const std::string& aspect,
 245                                const std::string& preview,
 246                                const std::string& referrer,
 247                                const std::string& cache,
 248                                bool post,
 249                                bool isgz,
 250                                int season)
 251 {
 252   bool wasEmpty = m_data.empty();
 253
 254   TiXmlElement thumb("thumb");
 255   thumb.SetAttribute("spoof", referrer);
 256   thumb.SetAttribute("cache", cache);
 257   if (post)
 258     thumb.SetAttribute("post", "yes");
 259   if (isgz)
 260     thumb.SetAttribute("gzip", "yes");
 261   if (season >= 0)
 262   {
 263     thumb.SetAttribute("season", std::to_string(season));
 264     thumb.SetAttribute("type", "season");
 265   }
 266   thumb.SetAttribute("aspect", aspect);
 267   thumb.SetAttribute("preview", preview);
 268   TiXmlText text(url);
 269   thumb.InsertEndChild(text);
 270
 271   m_data << thumb;
 272
 273   SUrlEntry nUrl(url);
 274   nUrl.m_spoof = referrer;
 275   nUrl.m_post = post;
 276   nUrl.m_isgz = isgz;
 277   nUrl.m_cache = cache;
 278   nUrl.m_preview = preview;
 279   if (season >= 0)
 280   {
 281     nUrl.m_type = UrlType::Season;
 282     nUrl.m_season = season;
 283   }
 284   nUrl.m_aspect = aspect;
 285
 286   m_urls.push_back(nUrl);
 287
 288   if (wasEmpty)
 289     m_parsed = true;
 290 }
 291
 292 std::string CScraperUrl::GetThumbUrl(const CScraperUrl::SUrlEntry& entry)
 293 {
 294   if (entry.m_spoof.empty())
 295     return entry.m_url;
 296
 297   return entry.m_url + "|Referer=" + CURL::Encode(entry.m_spoof);
 298 }
 299
 300 bool CScraperUrl::Get(const SUrlEntry& scrURL,
 301                       std::string& strHTML,
 302                       XFILE::CCurlFile& http,
 303                       const std::string& cacheContext)
 304 {
 305   CURL url(scrURL.m_url);
 306   http.SetReferer(scrURL.m_spoof);
 307   std::string strCachePath;
 308
 309   if (!scrURL.m_cache.empty())
 310   {
 311     strCachePath = URIUtils::AddFileToFolder(
 312         CServiceBroker::GetSettingsComponent()->GetAdvancedSettings()->m_cachePath, "scrapers",
 313         cacheContext, scrURL.m_cache);
 314     if (XFILE::CFile::Exists(strCachePath))
 315     {
 316       XFILE::CFile file;
 317       std::vector<uint8_t> buffer;
 318       if (file.LoadFile(strCachePath, buffer) > 0)
 319       {
 320         strHTML.assign(reinterpret_cast<char*>(buffer.data()), buffer.size());
 321         return true;
 322       }
 323     }
 324   }
 325
 326   auto strHTML1 = strHTML;
 327
 328   if (scrURL.m_post)
 329   {
 330     std::string strOptions = url.GetOptions();
 331     strOptions = strOptions.substr(1);
 332     url.SetOptions("");
 333
 334     if (!http.Post(url.Get(), strOptions, strHTML1))
 335       return false;
 336   }
 337   else if (!http.Get(url.Get(), strHTML1))
 338     return false;
 339
 340   strHTML = strHTML1;
 341
 342   const auto mimeType = http.GetProperty(XFILE::FILE_PROPERTY_MIME_TYPE);
 343   CMime::EFileType ftype = CMime::GetFileTypeFromMime(mimeType);
 344   if (ftype == CMime::FileTypeUnknown)
 345     ftype = CMime::GetFileTypeFromContent(strHTML);
 346
 347   if (ftype == CMime::FileTypeZip || ftype == CMime::FileTypeGZip)
 348   {
 349     XFILE::CZipFile file;
 350     std::string strBuffer;
 351     auto iSize = file.UnpackFromMemory(
 352         strBuffer, strHTML, scrURL.m_isgz); // FIXME: use FileTypeGZip instead of scrURL.m_isgz?
 353     if (iSize > 0)
 354     {
 355       strHTML = strBuffer;
 356       CLog::Log(LOGDEBUG, "{}: Archive \"{}\" was unpacked in memory", __FUNCTION__, scrURL.m_url);
 357     }
 358     else
 359       CLog::Log(LOGWARNING, "{}: \"{}\" looks like archive but cannot be unpacked", __FUNCTION__,
 360                 scrURL.m_url);
 361   }
 362
 363   const auto reportedCharset = http.GetProperty(XFILE::FILE_PROPERTY_CONTENT_CHARSET);
 364   if (ftype == CMime::FileTypeHtml)
 365   {
 366     std::string realHtmlCharset, converted;
 367     if (!CCharsetDetection::ConvertHtmlToUtf8(strHTML, converted, reportedCharset, realHtmlCharset))
 368       CLog::Log(LOGWARNING,
 369                 "{}: Can't find precise charset for HTML \"{}\", using \"{}\" as fallback",
 370                 __FUNCTION__, scrURL.m_url, realHtmlCharset);
 371     else
 372       CLog::Log(LOGDEBUG, "{}: Using \"{}\" charset for HTML \"{}\"", __FUNCTION__, realHtmlCharset,
 373                 scrURL.m_url);
 374
 375     strHTML = converted;
 376   }
 377   else if (ftype == CMime::FileTypeXml)
 378   {
 379     CXBMCTinyXML xmlDoc;
 380     xmlDoc.Parse(strHTML, reportedCharset);
 381
 382     const auto realXmlCharset = xmlDoc.GetUsedCharset();
 383     if (!realXmlCharset.empty())
 384     {
 385       CLog::Log(LOGDEBUG, "{}: Using \"{}\" charset for XML \"{}\"", __FUNCTION__, realXmlCharset,
 386                 scrURL.m_url);
 387       std::string converted;
 388       g_charsetConverter.ToUtf8(realXmlCharset, strHTML, converted);
 389       strHTML = converted;
 390     }
 391   }
 392   else if (ftype == CMime::FileTypePlainText ||
 393            StringUtils::EqualsNoCase(mimeType.substr(0, 5), "text/"))
 394   {
 395     std::string realTextCharset;
 396     std::string converted;
 397     CCharsetDetection::ConvertPlainTextToUtf8(strHTML, converted, reportedCharset, realTextCharset);
 398     strHTML = converted;
 399     if (reportedCharset != realTextCharset)
 400       CLog::Log(LOGWARNING,
 401                 "{}: Using \"{}\" charset for plain text \"{}\" instead of server reported \"{}\" "
 402                 "charset",
 403                 __FUNCTION__, realTextCharset, scrURL.m_url, reportedCharset);
 404     else
 405       CLog::Log(LOGDEBUG, "{}: Using \"{}\" charset for plain text \"{}\"", __FUNCTION__,
 406                 realTextCharset, scrURL.m_url);
 407   }
 408   else if (!reportedCharset.empty())
 409   {
 410     CLog::Log(LOGDEBUG, "{}: Using \"{}\" charset for \"{}\"", __FUNCTION__, reportedCharset,
 411               scrURL.m_url);
 412     if (reportedCharset != "UTF-8")
 413     {
 414       std::string converted;
 415       g_charsetConverter.ToUtf8(reportedCharset, strHTML, converted);
 416       strHTML = converted;
 417     }
 418   }
 419   else
 420     CLog::Log(LOGDEBUG, "{}: Using content of \"{}\" as binary or text with \"UTF-8\" charset",
 421               __FUNCTION__, scrURL.m_url);
 422
 423   if (!scrURL.m_cache.empty())
 424   {
 425     const auto strCachePath = URIUtils::AddFileToFolder(
 426         CServiceBroker::GetSettingsComponent()->GetAdvancedSettings()->m_cachePath, "scrapers",
 427         cacheContext, scrURL.m_cache);
 428     XFILE::CFile file;
 429     if (!file.OpenForWrite(strCachePath, true) ||
 430         file.Write(strHTML.data(), strHTML.size()) != static_cast<ssize_t>(strHTML.size()))
 431       return false;
 432   }
 433   return true;
 434 }