xbmc/utils/ScraperParser.cpp

   1 /*
   2  *  Copyright (C) 2012-2018 Team Kodi
   3  *  This file is part of Kodi - https://kodi.tv
   4  *
   5  *  SPDX-License-Identifier: GPL-2.0-or-later
   6  *  See LICENSES/README.md for more information.
   7  */
   8
   9 #include "ScraperParser.h"
  10
  11 #include "guilib/LocalizeStrings.h"
  12 #include "RegExp.h"
  13 #include "HTMLUtil.h"
  14 #include "addons/Scraper.h"
  15 #include "URL.h"
  16 #include "utils/StringUtils.h"
  17 #include "log.h"
  18 #include "CharsetConverter.h"
  19 #ifdef HAVE_LIBXSLT
  20 #include "utils/XSLTUtils.h"
  21 #endif
  22 #include "utils/XMLUtils.h"
  23 #include <sstream>
  24 #include <cstring>
  25
  26 using namespace ADDON;
  27 using namespace XFILE;
  28
  29 CScraperParser::CScraperParser()
  30 {
  31   m_pRootElement = NULL;
  32   m_document = NULL;
  33   m_SearchStringEncoding = "UTF-8";
  34   m_scraper = NULL;
  35   m_isNoop = true;
  36 }
  37
  38 CScraperParser::CScraperParser(const CScraperParser& parser)
  39 {
  40   m_pRootElement = NULL;
  41   m_document = NULL;
  42   m_SearchStringEncoding = "UTF-8";
  43   m_scraper = NULL;
  44   m_isNoop = true;
  45   *this = parser;
  46 }
  47
  48 CScraperParser &CScraperParser::operator=(const CScraperParser &parser)
  49 {
  50   if (this != &parser)
  51   {
  52     Clear();
  53     if (parser.m_document)
  54     {
  55       m_scraper = parser.m_scraper;
  56       m_document = new CXBMCTinyXML(*parser.m_document);
  57       LoadFromXML();
  58     }
  59     else
  60       m_scraper = NULL;
  61   }
  62   return *this;
  63 }
  64
  65 CScraperParser::~CScraperParser()
  66 {
  67   Clear();
  68 }
  69
  70 void CScraperParser::Clear()
  71 {
  72   m_pRootElement = NULL;
  73   delete m_document;
  74
  75   m_document = NULL;
  76   m_strFile.clear();
  77 }
  78
  79 bool CScraperParser::Load(const std::string& strXMLFile)
  80 {
  81   Clear();
  82
  83   m_document = new CXBMCTinyXML();
  84
  85   if (!m_document)
  86     return false;
  87
  88   m_strFile = strXMLFile;
  89
  90   if (m_document->LoadFile(strXMLFile))
  91     return LoadFromXML();
  92
  93   delete m_document;
  94   m_document = NULL;
  95   return false;
  96 }
  97
  98 bool CScraperParser::LoadFromXML()
  99 {
 100   if (!m_document)
 101     return false;
 102
 103   m_pRootElement = m_document->RootElement();
 104   std::string strValue = m_pRootElement->ValueStr();
 105   if (strValue == "scraper")
 106   {
 107     TiXmlElement* pChildElement = m_pRootElement->FirstChildElement("CreateSearchUrl");
 108     if (pChildElement)
 109     {
 110       m_isNoop = false;
 111       if (!(m_SearchStringEncoding = pChildElement->Attribute("SearchStringEncoding")))
 112         m_SearchStringEncoding = "UTF-8";
 113     }
 114
 115     pChildElement = m_pRootElement->FirstChildElement("CreateArtistSearchUrl");
 116     if (pChildElement)
 117     {
 118       m_isNoop = false;
 119       if (!(m_SearchStringEncoding = pChildElement->Attribute("SearchStringEncoding")))
 120         m_SearchStringEncoding = "UTF-8";
 121     }
 122     pChildElement = m_pRootElement->FirstChildElement("CreateAlbumSearchUrl");
 123     if (pChildElement)
 124     {
 125       m_isNoop = false;
 126       if (!(m_SearchStringEncoding = pChildElement->Attribute("SearchStringEncoding")))
 127         m_SearchStringEncoding = "UTF-8";
 128     }
 129
 130     return true;
 131   }
 132
 133   delete m_document;
 134   m_document = NULL;
 135   m_pRootElement = NULL;
 136   return false;
 137 }
 138
 139 void CScraperParser::ReplaceBuffers(std::string& strDest)
 140 {
 141   // insert buffers
 142   size_t iIndex;
 143   for (int i=MAX_SCRAPER_BUFFERS-1; i>=0; i--)
 144   {
 145     iIndex = 0;
 146     std::string temp = StringUtils::Format("$${}", i + 1);
 147     while ((iIndex = strDest.find(temp,iIndex)) != std::string::npos)
 148     {
 149       strDest.replace(strDest.begin()+iIndex,strDest.begin()+iIndex+temp.size(),m_param[i]);
 150       iIndex += m_param[i].length();
 151     }
 152   }
 153   // insert settings
 154   iIndex = 0;
 155   while ((iIndex = strDest.find("$INFO[", iIndex)) != std::string::npos)
 156   {
 157     size_t iEnd = strDest.find(']', iIndex);
 158     std::string strInfo = strDest.substr(iIndex+6, iEnd - iIndex - 6);
 159     std::string strReplace;
 160     if (m_scraper)
 161       strReplace = m_scraper->GetSetting(strInfo);
 162     strDest.replace(strDest.begin()+iIndex,strDest.begin()+iEnd+1,strReplace);
 163     iIndex += strReplace.length();
 164   }
 165   // insert localize strings
 166   iIndex = 0;
 167   while ((iIndex = strDest.find("$LOCALIZE[", iIndex)) != std::string::npos)
 168   {
 169     size_t iEnd = strDest.find(']', iIndex);
 170     std::string strInfo = strDest.substr(iIndex+10, iEnd - iIndex - 10);
 171     std::string strReplace;
 172     if (m_scraper)
 173       strReplace = g_localizeStrings.GetAddonString(m_scraper->ID(), strtol(strInfo.c_str(),NULL,10));
 174     strDest.replace(strDest.begin()+iIndex,strDest.begin()+iEnd+1,strReplace);
 175     iIndex += strReplace.length();
 176   }
 177   iIndex = 0;
 178   while ((iIndex = strDest.find("\\n",iIndex)) != std::string::npos)
 179     strDest.replace(strDest.begin()+iIndex,strDest.begin()+iIndex+2,"\n");
 180 }
 181
 182 void CScraperParser::ParseExpression(const std::string& input, std::string& dest, TiXmlElement* element, bool bAppend)
 183 {
 184   std::string strOutput = XMLUtils::GetAttribute(element, "output");
 185
 186   TiXmlElement* pExpression = element->FirstChildElement("expression");
 187   if (pExpression)
 188   {
 189     bool bInsensitive=true;
 190     const char* sensitive = pExpression->Attribute("cs");
 191     if (sensitive)
 192       if (StringUtils::CompareNoCase(sensitive, "yes") == 0)
 193         bInsensitive=false; // match case sensitive
 194
 195     CRegExp::utf8Mode eUtf8 = CRegExp::autoUtf8;
 196     const char* const strUtf8 = pExpression->Attribute("utf8");
 197     if (strUtf8)
 198     {
 199       if (StringUtils::CompareNoCase(strUtf8, "yes") == 0)
 200         eUtf8 = CRegExp::forceUtf8;
 201       else if (StringUtils::CompareNoCase(strUtf8, "no") == 0)
 202         eUtf8 = CRegExp::asciiOnly;
 203       else if (StringUtils::CompareNoCase(strUtf8, "auto") == 0)
 204         eUtf8 = CRegExp::autoUtf8;
 205     }
 206
 207     CRegExp reg(bInsensitive, eUtf8);
 208     std::string strExpression;
 209     if (pExpression->FirstChild())
 210       strExpression = pExpression->FirstChild()->Value();
 211     else
 212       strExpression = "(.*)";
 213     ReplaceBuffers(strExpression);
 214     ReplaceBuffers(strOutput);
 215
 216     if (!reg.RegComp(strExpression.c_str()))
 217     {
 218       return;
 219     }
 220
 221     bool bRepeat = false;
 222     const char* szRepeat = pExpression->Attribute("repeat");
 223     if (szRepeat)
 224       if (StringUtils::CompareNoCase(szRepeat, "yes") == 0)
 225         bRepeat = true;
 226
 227     const char* szClear = pExpression->Attribute("clear");
 228     if (szClear)
 229       if (StringUtils::CompareNoCase(szClear, "yes") == 0)
 230         dest=""; // clear no matter if regexp fails
 231
 232     bool bClean[MAX_SCRAPER_BUFFERS];
 233     GetBufferParams(bClean,pExpression->Attribute("noclean"),true);
 234
 235     bool bTrim[MAX_SCRAPER_BUFFERS];
 236     GetBufferParams(bTrim,pExpression->Attribute("trim"),false);
 237
 238     bool bFixChars[MAX_SCRAPER_BUFFERS];
 239     GetBufferParams(bFixChars,pExpression->Attribute("fixchars"),false);
 240
 241     bool bEncode[MAX_SCRAPER_BUFFERS];
 242     GetBufferParams(bEncode,pExpression->Attribute("encode"),false);
 243
 244     int iOptional = -1;
 245     pExpression->QueryIntAttribute("optional",&iOptional);
 246
 247     int iCompare = -1;
 248     pExpression->QueryIntAttribute("compare",&iCompare);
 249     if (iCompare > -1)
 250       StringUtils::ToLower(m_param[iCompare-1]);
 251     std::string curInput = input;
 252     for (int iBuf=0;iBuf<MAX_SCRAPER_BUFFERS;++iBuf)
 253     {
 254       if (bClean[iBuf])
 255         InsertToken(strOutput,iBuf+1,"!!!CLEAN!!!");
 256       if (bTrim[iBuf])
 257         InsertToken(strOutput,iBuf+1,"!!!TRIM!!!");
 258       if (bFixChars[iBuf])
 259         InsertToken(strOutput,iBuf+1,"!!!FIXCHARS!!!");
 260       if (bEncode[iBuf])
 261         InsertToken(strOutput,iBuf+1,"!!!ENCODE!!!");
 262     }
 263     int i = reg.RegFind(curInput.c_str());
 264     while (i > -1 && (i < (int)curInput.size() || curInput.empty()))
 265     {
 266       if (!bAppend)
 267       {
 268         dest = "";
 269         bAppend = true;
 270       }
 271       std::string strCurOutput=strOutput;
 272
 273       if (iOptional > -1) // check that required param is there
 274       {
 275         char temp[12];
 276         snprintf(temp, sizeof(temp), "\\%i", iOptional);
 277         std::string szParam = reg.GetReplaceString(temp);
 278         CRegExp reg2;
 279         reg2.RegComp("(.*)(\\\\\\(.*\\\\2.*)\\\\\\)(.*)");
 280         int i2=reg2.RegFind(strCurOutput.c_str());
 281         while (i2 > -1)
 282         {
 283           std::string szRemove(reg2.GetMatch(2));
 284           int iRemove = szRemove.size();
 285           int i3 = strCurOutput.find(szRemove);
 286           if (!szParam.empty())
 287           {
 288             strCurOutput.erase(i3+iRemove,2);
 289             strCurOutput.erase(i3,2);
 290           }
 291           else
 292             strCurOutput.replace(strCurOutput.begin()+i3,strCurOutput.begin()+i3+iRemove+2,"");
 293
 294           i2 = reg2.RegFind(strCurOutput.c_str());
 295         }
 296       }
 297
 298       int iLen = reg.GetFindLen();
 299       // nasty hack #1 - & means \0 in a replace string
 300       StringUtils::Replace(strCurOutput, "&","!!!AMPAMP!!!");
 301       std::string result = reg.GetReplaceString(strCurOutput);
 302       if (!result.empty())
 303       {
 304         std::string strResult(result);
 305         StringUtils::Replace(strResult, "!!!AMPAMP!!!","&");
 306         Clean(strResult);
 307         ReplaceBuffers(strResult);
 308         if (iCompare > -1)
 309         {
 310           std::string strResultNoCase = strResult;
 311           StringUtils::ToLower(strResultNoCase);
 312           if (strResultNoCase.find(m_param[iCompare-1]) != std::string::npos)
 313             dest += strResult;
 314         }
 315         else
 316           dest += strResult;
 317       }
 318       if (bRepeat && iLen > 0)
 319       {
 320         curInput.erase(0,i+iLen>(int)curInput.size()?curInput.size():i+iLen);
 321         i = reg.RegFind(curInput.c_str());
 322       }
 323       else
 324         i = -1;
 325     }
 326   }
 327 }
 328
 329 void CScraperParser::ParseXSLT(const std::string& input, std::string& dest, TiXmlElement* element, bool bAppend)
 330 {
 331 #ifdef HAVE_LIBXSLT
 332   TiXmlElement* pSheet = element->FirstChildElement();
 333   if (pSheet)
 334   {
 335     XSLTUtils xsltUtils;
 336     std::string strXslt;
 337     strXslt << *pSheet;
 338     ReplaceBuffers(strXslt);
 339
 340     if (!xsltUtils.SetInput(input))
 341       CLog::Log(LOGDEBUG, "could not parse input XML");
 342
 343     if (!xsltUtils.SetStylesheet(strXslt))
 344       CLog::Log(LOGDEBUG, "could not parse stylesheet XML");
 345
 346     xsltUtils.XSLTTransform(dest);
 347   }
 348 #endif
 349 }
 350
 351 TiXmlElement *FirstChildScraperElement(TiXmlElement *element)
 352 {
 353   for (TiXmlElement *child = element->FirstChildElement(); child; child = child->NextSiblingElement())
 354   {
 355 #ifdef HAVE_LIBXSLT
 356     if (child->ValueStr() == "XSLT")
 357       return child;
 358 #endif
 359     if (child->ValueStr() == "RegExp")
 360       return child;
 361   }
 362   return NULL;
 363 }
 364
 365 TiXmlElement *NextSiblingScraperElement(TiXmlElement *element)
 366 {
 367   for (TiXmlElement *next = element->NextSiblingElement(); next; next = next->NextSiblingElement())
 368   {
 369 #ifdef HAVE_LIBXSLT
 370     if (next->ValueStr() == "XSLT")
 371       return next;
 372 #endif
 373     if (next->ValueStr() == "RegExp")
 374       return next;
 375   }
 376   return NULL;
 377 }
 378
 379 void CScraperParser::ParseNext(TiXmlElement* element)
 380 {
 381   TiXmlElement* pReg = element;
 382   while (pReg)
 383   {
 384     TiXmlElement* pChildReg = FirstChildScraperElement(pReg);
 385     if (pChildReg)
 386       ParseNext(pChildReg);
 387     else
 388     {
 389       TiXmlElement* pChildReg = pReg->FirstChildElement("clear");
 390       if (pChildReg)
 391         ParseNext(pChildReg);
 392     }
 393
 394     int iDest = 1;
 395     bool bAppend = false;
 396     const char* szDest = pReg->Attribute("dest");
 397     if (szDest && strlen(szDest))
 398     {
 399       if (szDest[strlen(szDest)-1] == '+')
 400         bAppend = true;
 401
 402       iDest = atoi(szDest);
 403     }
 404
 405     const char *szInput = pReg->Attribute("input");
 406     std::string strInput;
 407     if (szInput)
 408     {
 409       strInput = szInput;
 410       ReplaceBuffers(strInput);
 411     }
 412     else
 413       strInput = m_param[0];
 414
 415     const char* szConditional = pReg->Attribute("conditional");
 416     bool bExecute = true;
 417     if (szConditional)
 418     {
 419       bool bInverse=false;
 420       if (szConditional[0] == '!')
 421       {
 422         bInverse = true;
 423         szConditional++;
 424       }
 425       std::string strSetting;
 426       if (m_scraper && m_scraper->HasSettings())
 427         strSetting = m_scraper->GetSetting(szConditional);
 428       bExecute = bInverse != (strSetting == "true");
 429     }
 430
 431     if (bExecute)
 432     {
 433       if (iDest-1 < MAX_SCRAPER_BUFFERS && iDest-1 > -1)
 434       {
 435 #ifdef HAVE_LIBXSLT
 436         if (pReg->ValueStr() == "XSLT")
 437           ParseXSLT(strInput, m_param[iDest - 1], pReg, bAppend);
 438         else
 439 #endif
 440           ParseExpression(strInput, m_param[iDest - 1],pReg,bAppend);
 441       }
 442       else
 443         CLog::Log(LOGERROR,"CScraperParser::ParseNext: destination buffer "
 444                            "out of bounds, skipping expression");
 445     }
 446     pReg = NextSiblingScraperElement(pReg);
 447   }
 448 }
 449
 450 const std::string CScraperParser::Parse(const std::string& strTag,
 451                                        CScraper* scraper)
 452 {
 453   TiXmlElement* pChildElement = m_pRootElement->FirstChildElement(strTag.c_str());
 454   if(pChildElement == NULL)
 455   {
 456     CLog::Log(LOGERROR, "{}: Could not find scraper function {}", __FUNCTION__, strTag);
 457     return "";
 458   }
 459   int iResult = 1; // default to param 1
 460   pChildElement->QueryIntAttribute("dest",&iResult);
 461   TiXmlElement* pChildStart = FirstChildScraperElement(pChildElement);
 462   m_scraper = scraper;
 463   ParseNext(pChildStart);
 464   std::string tmp = m_param[iResult-1];
 465
 466   const char* szClearBuffers = pChildElement->Attribute("clearbuffers");
 467   if (!szClearBuffers || StringUtils::CompareNoCase(szClearBuffers, "no") != 0)
 468     ClearBuffers();
 469
 470   return tmp;
 471 }
 472
 473 void CScraperParser::Clean(std::string& strDirty)
 474 {
 475   size_t i = 0;
 476   std::string strBuffer;
 477   while ((i = strDirty.find("!!!CLEAN!!!",i)) != std::string::npos)
 478   {
 479     size_t i2;
 480     if ((i2 = strDirty.find("!!!CLEAN!!!",i+11)) != std::string::npos)
 481     {
 482       strBuffer = strDirty.substr(i+11,i2-i-11);
 483       std::string strConverted(strBuffer);
 484       HTML::CHTMLUtil::RemoveTags(strConverted);
 485       StringUtils::Trim(strConverted);
 486       strDirty.replace(i, i2-i+11, strConverted);
 487       i += strConverted.size();
 488     }
 489     else
 490       break;
 491   }
 492   i=0;
 493   while ((i = strDirty.find("!!!TRIM!!!",i)) != std::string::npos)
 494   {
 495     size_t i2;
 496     if ((i2 = strDirty.find("!!!TRIM!!!",i+10)) != std::string::npos)
 497     {
 498       strBuffer = strDirty.substr(i+10,i2-i-10);
 499       StringUtils::Trim(strBuffer);
 500       strDirty.replace(i, i2-i+10, strBuffer);
 501       i += strBuffer.size();
 502     }
 503     else
 504       break;
 505   }
 506   i=0;
 507   while ((i = strDirty.find("!!!FIXCHARS!!!",i)) != std::string::npos)
 508   {
 509     size_t i2;
 510     if ((i2 = strDirty.find("!!!FIXCHARS!!!",i+14)) != std::string::npos)
 511     {
 512       strBuffer = strDirty.substr(i+14,i2-i-14);
 513       std::wstring wbuffer;
 514       g_charsetConverter.utf8ToW(strBuffer, wbuffer, false, false, false);
 515       std::wstring wConverted;
 516       HTML::CHTMLUtil::ConvertHTMLToW(wbuffer,wConverted);
 517       g_charsetConverter.wToUTF8(wConverted, strBuffer, false);
 518       StringUtils::Trim(strBuffer);
 519       ConvertJSON(strBuffer);
 520       strDirty.replace(i, i2-i+14, strBuffer);
 521       i += strBuffer.size();
 522     }
 523     else
 524       break;
 525   }
 526   i=0;
 527   while ((i=strDirty.find("!!!ENCODE!!!",i)) != std::string::npos)
 528   {
 529     size_t i2;
 530     if ((i2 = strDirty.find("!!!ENCODE!!!",i+12)) != std::string::npos)
 531     {
 532       strBuffer = CURL::Encode(strDirty.substr(i + 12, i2 - i - 12));
 533       strDirty.replace(i, i2-i+12, strBuffer);
 534       i += strBuffer.size();
 535     }
 536     else
 537       break;
 538   }
 539 }
 540
 541 void CScraperParser::ConvertJSON(std::string &string)
 542 {
 543   CRegExp reg;
 544   reg.RegComp("\\\\u([0-f]{4})");
 545   while (reg.RegFind(string.c_str()) > -1)
 546   {
 547     int pos = reg.GetSubStart(1);
 548     std::string szReplace(reg.GetMatch(1));
 549
 550     std::string replace = StringUtils::Format("&#x{};", szReplace);
 551     string.replace(string.begin()+pos-2, string.begin()+pos+4, replace);
 552   }
 553
 554   CRegExp reg2;
 555   reg2.RegComp("\\\\x([0-9]{2})([^\\\\]+;)");
 556   while (reg2.RegFind(string.c_str()) > -1)
 557   {
 558     int pos1 = reg2.GetSubStart(1);
 559     int pos2 = reg2.GetSubStart(2);
 560     std::string szHexValue(reg2.GetMatch(1));
 561
 562     std::string replace = std::to_string(std::stol(szHexValue, NULL, 16));
 563     string.replace(string.begin()+pos1-2, string.begin()+pos2+reg2.GetSubLength(2), replace);
 564   }
 565
 566   StringUtils::Replace(string, "\\\"","\"");
 567 }
 568
 569 void CScraperParser::ClearBuffers()
 570 {
 571   //clear all m_param strings
 572   for (std::string& param : m_param)
 573     param.clear();
 574 }
 575
 576 void CScraperParser::GetBufferParams(bool* result, const char* attribute, bool defvalue)
 577 {
 578   for (int iBuf=0;iBuf<MAX_SCRAPER_BUFFERS;++iBuf)
 579     result[iBuf] = defvalue;
 580   if (attribute)
 581   {
 582     std::vector<std::string> vecBufs;
 583     StringUtils::Tokenize(attribute,vecBufs,",");
 584     for (size_t nToken=0; nToken < vecBufs.size(); nToken++)
 585     {
 586       int index = atoi(vecBufs[nToken].c_str())-1;
 587       if (index < MAX_SCRAPER_BUFFERS)
 588         result[index] = !defvalue;
 589     }
 590   }
 591 }
 592
 593 void CScraperParser::InsertToken(std::string& strOutput, int buf, const char* token)
 594 {
 595   char temp[4];
 596   snprintf(temp, sizeof(temp), "\\%i", buf);
 597   size_t i2=0;
 598   while ((i2 = strOutput.find(temp,i2)) != std::string::npos)
 599   {
 600     strOutput.insert(i2,token);
 601     i2 += strlen(token) + strlen(temp);
 602     strOutput.insert(i2,token);
 603   }
 604 }
 605
 606 void CScraperParser::AddDocument(const CXBMCTinyXML* doc)
 607 {
 608   const TiXmlNode* node = doc->RootElement()->FirstChild();
 609   while (node)
 610   {
 611     m_pRootElement->InsertEndChild(*node);
 612     node = node->NextSibling();
 613   }
 614 }
 615