Merge pull request #26220 from 78andyp/blurayfixes
[xbmc.git] / xbmc / utils / ScraperParser.cpp
blob5fdef4d41142bd59de9763f6592bf0f6e4b813f6
1 /*
2 * Copyright (C) 2012-2018 Team Kodi
3 * This file is part of Kodi - https://kodi.tv
5 * SPDX-License-Identifier: GPL-2.0-or-later
6 * See LICENSES/README.md for more information.
7 */
9 #include "ScraperParser.h"
11 #include "guilib/LocalizeStrings.h"
12 #include "RegExp.h"
13 #include "HTMLUtil.h"
14 #include "addons/Scraper.h"
15 #include "URL.h"
16 #include "utils/StringUtils.h"
17 #include "log.h"
18 #include "CharsetConverter.h"
19 #ifdef HAVE_LIBXSLT
20 #include "utils/XSLTUtils.h"
21 #endif
22 #include "utils/XMLUtils.h"
23 #include <sstream>
24 #include <cstring>
26 using namespace ADDON;
27 using namespace XFILE;
29 CScraperParser::CScraperParser()
31 m_pRootElement = NULL;
32 m_document = NULL;
33 m_SearchStringEncoding = "UTF-8";
34 m_scraper = NULL;
35 m_isNoop = true;
38 CScraperParser::CScraperParser(const CScraperParser& parser)
40 m_pRootElement = NULL;
41 m_document = NULL;
42 m_SearchStringEncoding = "UTF-8";
43 m_scraper = NULL;
44 m_isNoop = true;
45 *this = parser;
48 CScraperParser &CScraperParser::operator=(const CScraperParser &parser)
50 if (this != &parser)
52 Clear();
53 if (parser.m_document)
55 m_scraper = parser.m_scraper;
56 m_document = new CXBMCTinyXML(*parser.m_document);
57 LoadFromXML();
59 else
60 m_scraper = NULL;
62 return *this;
65 CScraperParser::~CScraperParser()
67 Clear();
70 void CScraperParser::Clear()
72 m_pRootElement = NULL;
73 delete m_document;
75 m_document = NULL;
76 m_strFile.clear();
79 bool CScraperParser::Load(const std::string& strXMLFile)
81 Clear();
83 m_document = new CXBMCTinyXML();
85 if (!m_document)
86 return false;
88 m_strFile = strXMLFile;
90 if (m_document->LoadFile(strXMLFile))
91 return LoadFromXML();
93 delete m_document;
94 m_document = NULL;
95 return false;
98 bool CScraperParser::LoadFromXML()
100 if (!m_document)
101 return false;
103 m_pRootElement = m_document->RootElement();
104 std::string strValue = m_pRootElement->ValueStr();
105 if (strValue == "scraper")
107 TiXmlElement* pChildElement = m_pRootElement->FirstChildElement("CreateSearchUrl");
108 if (pChildElement)
110 m_isNoop = false;
111 if (!(m_SearchStringEncoding = pChildElement->Attribute("SearchStringEncoding")))
112 m_SearchStringEncoding = "UTF-8";
115 pChildElement = m_pRootElement->FirstChildElement("CreateArtistSearchUrl");
116 if (pChildElement)
118 m_isNoop = false;
119 if (!(m_SearchStringEncoding = pChildElement->Attribute("SearchStringEncoding")))
120 m_SearchStringEncoding = "UTF-8";
122 pChildElement = m_pRootElement->FirstChildElement("CreateAlbumSearchUrl");
123 if (pChildElement)
125 m_isNoop = false;
126 if (!(m_SearchStringEncoding = pChildElement->Attribute("SearchStringEncoding")))
127 m_SearchStringEncoding = "UTF-8";
130 return true;
133 delete m_document;
134 m_document = NULL;
135 m_pRootElement = NULL;
136 return false;
139 void CScraperParser::ReplaceBuffers(std::string& strDest)
141 // insert buffers
142 size_t iIndex;
143 for (int i=MAX_SCRAPER_BUFFERS-1; i>=0; i--)
145 iIndex = 0;
146 std::string temp = StringUtils::Format("$${}", i + 1);
147 while ((iIndex = strDest.find(temp,iIndex)) != std::string::npos)
149 strDest.replace(strDest.begin()+iIndex,strDest.begin()+iIndex+temp.size(),m_param[i]);
150 iIndex += m_param[i].length();
153 // insert settings
154 iIndex = 0;
155 while ((iIndex = strDest.find("$INFO[", iIndex)) != std::string::npos)
157 size_t iEnd = strDest.find(']', iIndex);
158 std::string strInfo = strDest.substr(iIndex+6, iEnd - iIndex - 6);
159 std::string strReplace;
160 if (m_scraper)
161 strReplace = m_scraper->GetSetting(strInfo);
162 strDest.replace(strDest.begin()+iIndex,strDest.begin()+iEnd+1,strReplace);
163 iIndex += strReplace.length();
165 // insert localize strings
166 iIndex = 0;
167 while ((iIndex = strDest.find("$LOCALIZE[", iIndex)) != std::string::npos)
169 size_t iEnd = strDest.find(']', iIndex);
170 std::string strInfo = strDest.substr(iIndex+10, iEnd - iIndex - 10);
171 std::string strReplace;
172 if (m_scraper)
173 strReplace = g_localizeStrings.GetAddonString(m_scraper->ID(), strtol(strInfo.c_str(),NULL,10));
174 strDest.replace(strDest.begin()+iIndex,strDest.begin()+iEnd+1,strReplace);
175 iIndex += strReplace.length();
177 iIndex = 0;
178 while ((iIndex = strDest.find("\\n",iIndex)) != std::string::npos)
179 strDest.replace(strDest.begin()+iIndex,strDest.begin()+iIndex+2,"\n");
182 void CScraperParser::ParseExpression(const std::string& input, std::string& dest, TiXmlElement* element, bool bAppend)
184 std::string strOutput = XMLUtils::GetAttribute(element, "output");
186 TiXmlElement* pExpression = element->FirstChildElement("expression");
187 if (pExpression)
189 bool bInsensitive=true;
190 const char* sensitive = pExpression->Attribute("cs");
191 if (sensitive)
192 if (StringUtils::CompareNoCase(sensitive, "yes") == 0)
193 bInsensitive=false; // match case sensitive
195 CRegExp::utf8Mode eUtf8 = CRegExp::autoUtf8;
196 const char* const strUtf8 = pExpression->Attribute("utf8");
197 if (strUtf8)
199 if (StringUtils::CompareNoCase(strUtf8, "yes") == 0)
200 eUtf8 = CRegExp::forceUtf8;
201 else if (StringUtils::CompareNoCase(strUtf8, "no") == 0)
202 eUtf8 = CRegExp::asciiOnly;
203 else if (StringUtils::CompareNoCase(strUtf8, "auto") == 0)
204 eUtf8 = CRegExp::autoUtf8;
207 CRegExp reg(bInsensitive, eUtf8);
208 std::string strExpression;
209 if (pExpression->FirstChild())
210 strExpression = pExpression->FirstChild()->Value();
211 else
212 strExpression = "(.*)";
213 ReplaceBuffers(strExpression);
214 ReplaceBuffers(strOutput);
216 if (!reg.RegComp(strExpression.c_str()))
218 return;
221 bool bRepeat = false;
222 const char* szRepeat = pExpression->Attribute("repeat");
223 if (szRepeat)
224 if (StringUtils::CompareNoCase(szRepeat, "yes") == 0)
225 bRepeat = true;
227 const char* szClear = pExpression->Attribute("clear");
228 if (szClear)
229 if (StringUtils::CompareNoCase(szClear, "yes") == 0)
230 dest=""; // clear no matter if regexp fails
232 bool bClean[MAX_SCRAPER_BUFFERS];
233 GetBufferParams(bClean,pExpression->Attribute("noclean"),true);
235 bool bTrim[MAX_SCRAPER_BUFFERS];
236 GetBufferParams(bTrim,pExpression->Attribute("trim"),false);
238 bool bFixChars[MAX_SCRAPER_BUFFERS];
239 GetBufferParams(bFixChars,pExpression->Attribute("fixchars"),false);
241 bool bEncode[MAX_SCRAPER_BUFFERS];
242 GetBufferParams(bEncode,pExpression->Attribute("encode"),false);
244 int iOptional = -1;
245 pExpression->QueryIntAttribute("optional",&iOptional);
247 int iCompare = -1;
248 pExpression->QueryIntAttribute("compare",&iCompare);
249 if (iCompare > -1)
250 StringUtils::ToLower(m_param[iCompare-1]);
251 std::string curInput = input;
252 for (int iBuf=0;iBuf<MAX_SCRAPER_BUFFERS;++iBuf)
254 if (bClean[iBuf])
255 InsertToken(strOutput,iBuf+1,"!!!CLEAN!!!");
256 if (bTrim[iBuf])
257 InsertToken(strOutput,iBuf+1,"!!!TRIM!!!");
258 if (bFixChars[iBuf])
259 InsertToken(strOutput,iBuf+1,"!!!FIXCHARS!!!");
260 if (bEncode[iBuf])
261 InsertToken(strOutput,iBuf+1,"!!!ENCODE!!!");
263 int i = reg.RegFind(curInput.c_str());
264 while (i > -1 && (i < (int)curInput.size() || curInput.empty()))
266 if (!bAppend)
268 dest = "";
269 bAppend = true;
271 std::string strCurOutput=strOutput;
273 if (iOptional > -1) // check that required param is there
275 char temp[12];
276 snprintf(temp, sizeof(temp), "\\%i", iOptional);
277 std::string szParam = reg.GetReplaceString(temp);
278 CRegExp reg2;
279 reg2.RegComp("(.*)(\\\\\\(.*\\\\2.*)\\\\\\)(.*)");
280 int i2=reg2.RegFind(strCurOutput.c_str());
281 while (i2 > -1)
283 std::string szRemove(reg2.GetMatch(2));
284 int iRemove = szRemove.size();
285 int i3 = strCurOutput.find(szRemove);
286 if (!szParam.empty())
288 strCurOutput.erase(i3+iRemove,2);
289 strCurOutput.erase(i3,2);
291 else
292 strCurOutput.replace(strCurOutput.begin()+i3,strCurOutput.begin()+i3+iRemove+2,"");
294 i2 = reg2.RegFind(strCurOutput.c_str());
298 int iLen = reg.GetFindLen();
299 // nasty hack #1 - & means \0 in a replace string
300 StringUtils::Replace(strCurOutput, "&","!!!AMPAMP!!!");
301 std::string result = reg.GetReplaceString(strCurOutput);
302 if (!result.empty())
304 std::string strResult(result);
305 StringUtils::Replace(strResult, "!!!AMPAMP!!!","&");
306 Clean(strResult);
307 ReplaceBuffers(strResult);
308 if (iCompare > -1)
310 std::string strResultNoCase = strResult;
311 StringUtils::ToLower(strResultNoCase);
312 if (strResultNoCase.find(m_param[iCompare-1]) != std::string::npos)
313 dest += strResult;
315 else
316 dest += strResult;
318 if (bRepeat && iLen > 0)
320 curInput.erase(0,i+iLen>(int)curInput.size()?curInput.size():i+iLen);
321 i = reg.RegFind(curInput.c_str());
323 else
324 i = -1;
329 void CScraperParser::ParseXSLT(const std::string& input, std::string& dest, TiXmlElement* element, bool bAppend)
331 #ifdef HAVE_LIBXSLT
332 TiXmlElement* pSheet = element->FirstChildElement();
333 if (pSheet)
335 XSLTUtils xsltUtils;
336 std::string strXslt;
337 strXslt << *pSheet;
338 ReplaceBuffers(strXslt);
340 if (!xsltUtils.SetInput(input))
341 CLog::Log(LOGDEBUG, "could not parse input XML");
343 if (!xsltUtils.SetStylesheet(strXslt))
344 CLog::Log(LOGDEBUG, "could not parse stylesheet XML");
346 xsltUtils.XSLTTransform(dest);
348 #endif
351 TiXmlElement *FirstChildScraperElement(TiXmlElement *element)
353 for (TiXmlElement *child = element->FirstChildElement(); child; child = child->NextSiblingElement())
355 #ifdef HAVE_LIBXSLT
356 if (child->ValueStr() == "XSLT")
357 return child;
358 #endif
359 if (child->ValueStr() == "RegExp")
360 return child;
362 return NULL;
365 TiXmlElement *NextSiblingScraperElement(TiXmlElement *element)
367 for (TiXmlElement *next = element->NextSiblingElement(); next; next = next->NextSiblingElement())
369 #ifdef HAVE_LIBXSLT
370 if (next->ValueStr() == "XSLT")
371 return next;
372 #endif
373 if (next->ValueStr() == "RegExp")
374 return next;
376 return NULL;
379 void CScraperParser::ParseNext(TiXmlElement* element)
381 TiXmlElement* pReg = element;
382 while (pReg)
384 TiXmlElement* pChildReg = FirstChildScraperElement(pReg);
385 if (pChildReg)
386 ParseNext(pChildReg);
387 else
389 TiXmlElement* pChildReg = pReg->FirstChildElement("clear");
390 if (pChildReg)
391 ParseNext(pChildReg);
394 int iDest = 1;
395 bool bAppend = false;
396 const char* szDest = pReg->Attribute("dest");
397 if (szDest && strlen(szDest))
399 if (szDest[strlen(szDest)-1] == '+')
400 bAppend = true;
402 iDest = atoi(szDest);
405 const char *szInput = pReg->Attribute("input");
406 std::string strInput;
407 if (szInput)
409 strInput = szInput;
410 ReplaceBuffers(strInput);
412 else
413 strInput = m_param[0];
415 const char* szConditional = pReg->Attribute("conditional");
416 bool bExecute = true;
417 if (szConditional)
419 bool bInverse=false;
420 if (szConditional[0] == '!')
422 bInverse = true;
423 szConditional++;
425 std::string strSetting;
426 if (m_scraper && m_scraper->HasSettings())
427 strSetting = m_scraper->GetSetting(szConditional);
428 bExecute = bInverse != (strSetting == "true");
431 if (bExecute)
433 if (iDest-1 < MAX_SCRAPER_BUFFERS && iDest-1 > -1)
435 #ifdef HAVE_LIBXSLT
436 if (pReg->ValueStr() == "XSLT")
437 ParseXSLT(strInput, m_param[iDest - 1], pReg, bAppend);
438 else
439 #endif
440 ParseExpression(strInput, m_param[iDest - 1],pReg,bAppend);
442 else
443 CLog::Log(LOGERROR,"CScraperParser::ParseNext: destination buffer "
444 "out of bounds, skipping expression");
446 pReg = NextSiblingScraperElement(pReg);
450 const std::string CScraperParser::Parse(const std::string& strTag,
451 CScraper* scraper)
453 TiXmlElement* pChildElement = m_pRootElement->FirstChildElement(strTag.c_str());
454 if(pChildElement == NULL)
456 CLog::Log(LOGERROR, "{}: Could not find scraper function {}", __FUNCTION__, strTag);
457 return "";
459 int iResult = 1; // default to param 1
460 pChildElement->QueryIntAttribute("dest",&iResult);
461 TiXmlElement* pChildStart = FirstChildScraperElement(pChildElement);
462 m_scraper = scraper;
463 ParseNext(pChildStart);
464 std::string tmp = m_param[iResult-1];
466 const char* szClearBuffers = pChildElement->Attribute("clearbuffers");
467 if (!szClearBuffers || StringUtils::CompareNoCase(szClearBuffers, "no") != 0)
468 ClearBuffers();
470 return tmp;
473 void CScraperParser::Clean(std::string& strDirty)
475 size_t i = 0;
476 std::string strBuffer;
477 while ((i = strDirty.find("!!!CLEAN!!!",i)) != std::string::npos)
479 size_t i2;
480 if ((i2 = strDirty.find("!!!CLEAN!!!",i+11)) != std::string::npos)
482 strBuffer = strDirty.substr(i+11,i2-i-11);
483 std::string strConverted(strBuffer);
484 HTML::CHTMLUtil::RemoveTags(strConverted);
485 StringUtils::Trim(strConverted);
486 strDirty.replace(i, i2-i+11, strConverted);
487 i += strConverted.size();
489 else
490 break;
492 i=0;
493 while ((i = strDirty.find("!!!TRIM!!!",i)) != std::string::npos)
495 size_t i2;
496 if ((i2 = strDirty.find("!!!TRIM!!!",i+10)) != std::string::npos)
498 strBuffer = strDirty.substr(i+10,i2-i-10);
499 StringUtils::Trim(strBuffer);
500 strDirty.replace(i, i2-i+10, strBuffer);
501 i += strBuffer.size();
503 else
504 break;
506 i=0;
507 while ((i = strDirty.find("!!!FIXCHARS!!!",i)) != std::string::npos)
509 size_t i2;
510 if ((i2 = strDirty.find("!!!FIXCHARS!!!",i+14)) != std::string::npos)
512 strBuffer = strDirty.substr(i+14,i2-i-14);
513 std::wstring wbuffer;
514 g_charsetConverter.utf8ToW(strBuffer, wbuffer, false, false, false);
515 std::wstring wConverted;
516 HTML::CHTMLUtil::ConvertHTMLToW(wbuffer,wConverted);
517 g_charsetConverter.wToUTF8(wConverted, strBuffer, false);
518 StringUtils::Trim(strBuffer);
519 ConvertJSON(strBuffer);
520 strDirty.replace(i, i2-i+14, strBuffer);
521 i += strBuffer.size();
523 else
524 break;
526 i=0;
527 while ((i=strDirty.find("!!!ENCODE!!!",i)) != std::string::npos)
529 size_t i2;
530 if ((i2 = strDirty.find("!!!ENCODE!!!",i+12)) != std::string::npos)
532 strBuffer = CURL::Encode(strDirty.substr(i + 12, i2 - i - 12));
533 strDirty.replace(i, i2-i+12, strBuffer);
534 i += strBuffer.size();
536 else
537 break;
541 void CScraperParser::ConvertJSON(std::string &string)
543 CRegExp reg;
544 reg.RegComp("\\\\u([0-f]{4})");
545 while (reg.RegFind(string.c_str()) > -1)
547 int pos = reg.GetSubStart(1);
548 std::string szReplace(reg.GetMatch(1));
550 std::string replace = StringUtils::Format("&#x{};", szReplace);
551 string.replace(string.begin()+pos-2, string.begin()+pos+4, replace);
554 CRegExp reg2;
555 reg2.RegComp("\\\\x([0-9]{2})([^\\\\]+;)");
556 while (reg2.RegFind(string.c_str()) > -1)
558 int pos1 = reg2.GetSubStart(1);
559 int pos2 = reg2.GetSubStart(2);
560 std::string szHexValue(reg2.GetMatch(1));
562 std::string replace = std::to_string(std::stol(szHexValue, NULL, 16));
563 string.replace(string.begin()+pos1-2, string.begin()+pos2+reg2.GetSubLength(2), replace);
566 StringUtils::Replace(string, "\\\"","\"");
569 void CScraperParser::ClearBuffers()
571 //clear all m_param strings
572 for (std::string& param : m_param)
573 param.clear();
576 void CScraperParser::GetBufferParams(bool* result, const char* attribute, bool defvalue)
578 for (int iBuf=0;iBuf<MAX_SCRAPER_BUFFERS;++iBuf)
579 result[iBuf] = defvalue;
580 if (attribute)
582 std::vector<std::string> vecBufs;
583 StringUtils::Tokenize(attribute,vecBufs,",");
584 for (size_t nToken=0; nToken < vecBufs.size(); nToken++)
586 int index = atoi(vecBufs[nToken].c_str())-1;
587 if (index < MAX_SCRAPER_BUFFERS)
588 result[index] = !defvalue;
593 void CScraperParser::InsertToken(std::string& strOutput, int buf, const char* token)
595 char temp[4];
596 snprintf(temp, sizeof(temp), "\\%i", buf);
597 size_t i2=0;
598 while ((i2 = strOutput.find(temp,i2)) != std::string::npos)
600 strOutput.insert(i2,token);
601 i2 += strlen(token) + strlen(temp);
602 strOutput.insert(i2,token);
606 void CScraperParser::AddDocument(const CXBMCTinyXML* doc)
608 const TiXmlNode* node = doc->RootElement()->FirstChild();
609 while (node)
611 m_pRootElement->InsertEndChild(*node);
612 node = node->NextSibling();