2 * Copyright (C) 2012-2018 Team Kodi
3 * This file is part of Kodi - https://kodi.tv
5 * SPDX-License-Identifier: GPL-2.0-or-later
6 * See LICENSES/README.md for more information.
9 #include "ScraperParser.h"
11 #include "guilib/LocalizeStrings.h"
14 #include "addons/Scraper.h"
16 #include "utils/StringUtils.h"
18 #include "CharsetConverter.h"
20 #include "utils/XSLTUtils.h"
22 #include "utils/XMLUtils.h"
26 using namespace ADDON
;
27 using namespace XFILE
;
29 CScraperParser::CScraperParser()
31 m_pRootElement
= NULL
;
33 m_SearchStringEncoding
= "UTF-8";
38 CScraperParser::CScraperParser(const CScraperParser
& parser
)
40 m_pRootElement
= NULL
;
42 m_SearchStringEncoding
= "UTF-8";
48 CScraperParser
&CScraperParser::operator=(const CScraperParser
&parser
)
53 if (parser
.m_document
)
55 m_scraper
= parser
.m_scraper
;
56 m_document
= new CXBMCTinyXML(*parser
.m_document
);
65 CScraperParser::~CScraperParser()
70 void CScraperParser::Clear()
72 m_pRootElement
= NULL
;
79 bool CScraperParser::Load(const std::string
& strXMLFile
)
83 m_document
= new CXBMCTinyXML();
88 m_strFile
= strXMLFile
;
90 if (m_document
->LoadFile(strXMLFile
))
98 bool CScraperParser::LoadFromXML()
103 m_pRootElement
= m_document
->RootElement();
104 std::string strValue
= m_pRootElement
->ValueStr();
105 if (strValue
== "scraper")
107 TiXmlElement
* pChildElement
= m_pRootElement
->FirstChildElement("CreateSearchUrl");
111 if (!(m_SearchStringEncoding
= pChildElement
->Attribute("SearchStringEncoding")))
112 m_SearchStringEncoding
= "UTF-8";
115 pChildElement
= m_pRootElement
->FirstChildElement("CreateArtistSearchUrl");
119 if (!(m_SearchStringEncoding
= pChildElement
->Attribute("SearchStringEncoding")))
120 m_SearchStringEncoding
= "UTF-8";
122 pChildElement
= m_pRootElement
->FirstChildElement("CreateAlbumSearchUrl");
126 if (!(m_SearchStringEncoding
= pChildElement
->Attribute("SearchStringEncoding")))
127 m_SearchStringEncoding
= "UTF-8";
135 m_pRootElement
= NULL
;
139 void CScraperParser::ReplaceBuffers(std::string
& strDest
)
143 for (int i
=MAX_SCRAPER_BUFFERS
-1; i
>=0; i
--)
146 std::string temp
= StringUtils::Format("$${}", i
+ 1);
147 while ((iIndex
= strDest
.find(temp
,iIndex
)) != std::string::npos
)
149 strDest
.replace(strDest
.begin()+iIndex
,strDest
.begin()+iIndex
+temp
.size(),m_param
[i
]);
150 iIndex
+= m_param
[i
].length();
155 while ((iIndex
= strDest
.find("$INFO[", iIndex
)) != std::string::npos
)
157 size_t iEnd
= strDest
.find(']', iIndex
);
158 std::string strInfo
= strDest
.substr(iIndex
+6, iEnd
- iIndex
- 6);
159 std::string strReplace
;
161 strReplace
= m_scraper
->GetSetting(strInfo
);
162 strDest
.replace(strDest
.begin()+iIndex
,strDest
.begin()+iEnd
+1,strReplace
);
163 iIndex
+= strReplace
.length();
165 // insert localize strings
167 while ((iIndex
= strDest
.find("$LOCALIZE[", iIndex
)) != std::string::npos
)
169 size_t iEnd
= strDest
.find(']', iIndex
);
170 std::string strInfo
= strDest
.substr(iIndex
+10, iEnd
- iIndex
- 10);
171 std::string strReplace
;
173 strReplace
= g_localizeStrings
.GetAddonString(m_scraper
->ID(), strtol(strInfo
.c_str(),NULL
,10));
174 strDest
.replace(strDest
.begin()+iIndex
,strDest
.begin()+iEnd
+1,strReplace
);
175 iIndex
+= strReplace
.length();
178 while ((iIndex
= strDest
.find("\\n",iIndex
)) != std::string::npos
)
179 strDest
.replace(strDest
.begin()+iIndex
,strDest
.begin()+iIndex
+2,"\n");
182 void CScraperParser::ParseExpression(const std::string
& input
, std::string
& dest
, TiXmlElement
* element
, bool bAppend
)
184 std::string strOutput
= XMLUtils::GetAttribute(element
, "output");
186 TiXmlElement
* pExpression
= element
->FirstChildElement("expression");
189 bool bInsensitive
=true;
190 const char* sensitive
= pExpression
->Attribute("cs");
192 if (StringUtils::CompareNoCase(sensitive
, "yes") == 0)
193 bInsensitive
=false; // match case sensitive
195 CRegExp::utf8Mode eUtf8
= CRegExp::autoUtf8
;
196 const char* const strUtf8
= pExpression
->Attribute("utf8");
199 if (StringUtils::CompareNoCase(strUtf8
, "yes") == 0)
200 eUtf8
= CRegExp::forceUtf8
;
201 else if (StringUtils::CompareNoCase(strUtf8
, "no") == 0)
202 eUtf8
= CRegExp::asciiOnly
;
203 else if (StringUtils::CompareNoCase(strUtf8
, "auto") == 0)
204 eUtf8
= CRegExp::autoUtf8
;
207 CRegExp
reg(bInsensitive
, eUtf8
);
208 std::string strExpression
;
209 if (pExpression
->FirstChild())
210 strExpression
= pExpression
->FirstChild()->Value();
212 strExpression
= "(.*)";
213 ReplaceBuffers(strExpression
);
214 ReplaceBuffers(strOutput
);
216 if (!reg
.RegComp(strExpression
.c_str()))
221 bool bRepeat
= false;
222 const char* szRepeat
= pExpression
->Attribute("repeat");
224 if (StringUtils::CompareNoCase(szRepeat
, "yes") == 0)
227 const char* szClear
= pExpression
->Attribute("clear");
229 if (StringUtils::CompareNoCase(szClear
, "yes") == 0)
230 dest
=""; // clear no matter if regexp fails
232 bool bClean
[MAX_SCRAPER_BUFFERS
];
233 GetBufferParams(bClean
,pExpression
->Attribute("noclean"),true);
235 bool bTrim
[MAX_SCRAPER_BUFFERS
];
236 GetBufferParams(bTrim
,pExpression
->Attribute("trim"),false);
238 bool bFixChars
[MAX_SCRAPER_BUFFERS
];
239 GetBufferParams(bFixChars
,pExpression
->Attribute("fixchars"),false);
241 bool bEncode
[MAX_SCRAPER_BUFFERS
];
242 GetBufferParams(bEncode
,pExpression
->Attribute("encode"),false);
245 pExpression
->QueryIntAttribute("optional",&iOptional
);
248 pExpression
->QueryIntAttribute("compare",&iCompare
);
250 StringUtils::ToLower(m_param
[iCompare
-1]);
251 std::string curInput
= input
;
252 for (int iBuf
=0;iBuf
<MAX_SCRAPER_BUFFERS
;++iBuf
)
255 InsertToken(strOutput
,iBuf
+1,"!!!CLEAN!!!");
257 InsertToken(strOutput
,iBuf
+1,"!!!TRIM!!!");
259 InsertToken(strOutput
,iBuf
+1,"!!!FIXCHARS!!!");
261 InsertToken(strOutput
,iBuf
+1,"!!!ENCODE!!!");
263 int i
= reg
.RegFind(curInput
.c_str());
264 while (i
> -1 && (i
< (int)curInput
.size() || curInput
.empty()))
271 std::string strCurOutput
=strOutput
;
273 if (iOptional
> -1) // check that required param is there
276 snprintf(temp
, sizeof(temp
), "\\%i", iOptional
);
277 std::string szParam
= reg
.GetReplaceString(temp
);
279 reg2
.RegComp("(.*)(\\\\\\(.*\\\\2.*)\\\\\\)(.*)");
280 int i2
=reg2
.RegFind(strCurOutput
.c_str());
283 std::string
szRemove(reg2
.GetMatch(2));
284 int iRemove
= szRemove
.size();
285 int i3
= strCurOutput
.find(szRemove
);
286 if (!szParam
.empty())
288 strCurOutput
.erase(i3
+iRemove
,2);
289 strCurOutput
.erase(i3
,2);
292 strCurOutput
.replace(strCurOutput
.begin()+i3
,strCurOutput
.begin()+i3
+iRemove
+2,"");
294 i2
= reg2
.RegFind(strCurOutput
.c_str());
298 int iLen
= reg
.GetFindLen();
299 // nasty hack #1 - & means \0 in a replace string
300 StringUtils::Replace(strCurOutput
, "&","!!!AMPAMP!!!");
301 std::string result
= reg
.GetReplaceString(strCurOutput
);
304 std::string
strResult(result
);
305 StringUtils::Replace(strResult
, "!!!AMPAMP!!!","&");
307 ReplaceBuffers(strResult
);
310 std::string strResultNoCase
= strResult
;
311 StringUtils::ToLower(strResultNoCase
);
312 if (strResultNoCase
.find(m_param
[iCompare
-1]) != std::string::npos
)
318 if (bRepeat
&& iLen
> 0)
320 curInput
.erase(0,i
+iLen
>(int)curInput
.size()?curInput
.size():i
+iLen
);
321 i
= reg
.RegFind(curInput
.c_str());
329 void CScraperParser::ParseXSLT(const std::string
& input
, std::string
& dest
, TiXmlElement
* element
, bool bAppend
)
332 TiXmlElement
* pSheet
= element
->FirstChildElement();
338 ReplaceBuffers(strXslt
);
340 if (!xsltUtils
.SetInput(input
))
341 CLog::Log(LOGDEBUG
, "could not parse input XML");
343 if (!xsltUtils
.SetStylesheet(strXslt
))
344 CLog::Log(LOGDEBUG
, "could not parse stylesheet XML");
346 xsltUtils
.XSLTTransform(dest
);
351 TiXmlElement
*FirstChildScraperElement(TiXmlElement
*element
)
353 for (TiXmlElement
*child
= element
->FirstChildElement(); child
; child
= child
->NextSiblingElement())
356 if (child
->ValueStr() == "XSLT")
359 if (child
->ValueStr() == "RegExp")
365 TiXmlElement
*NextSiblingScraperElement(TiXmlElement
*element
)
367 for (TiXmlElement
*next
= element
->NextSiblingElement(); next
; next
= next
->NextSiblingElement())
370 if (next
->ValueStr() == "XSLT")
373 if (next
->ValueStr() == "RegExp")
379 void CScraperParser::ParseNext(TiXmlElement
* element
)
381 TiXmlElement
* pReg
= element
;
384 TiXmlElement
* pChildReg
= FirstChildScraperElement(pReg
);
386 ParseNext(pChildReg
);
389 TiXmlElement
* pChildReg
= pReg
->FirstChildElement("clear");
391 ParseNext(pChildReg
);
395 bool bAppend
= false;
396 const char* szDest
= pReg
->Attribute("dest");
397 if (szDest
&& strlen(szDest
))
399 if (szDest
[strlen(szDest
)-1] == '+')
402 iDest
= atoi(szDest
);
405 const char *szInput
= pReg
->Attribute("input");
406 std::string strInput
;
410 ReplaceBuffers(strInput
);
413 strInput
= m_param
[0];
415 const char* szConditional
= pReg
->Attribute("conditional");
416 bool bExecute
= true;
420 if (szConditional
[0] == '!')
425 std::string strSetting
;
426 if (m_scraper
&& m_scraper
->HasSettings())
427 strSetting
= m_scraper
->GetSetting(szConditional
);
428 bExecute
= bInverse
!= (strSetting
== "true");
433 if (iDest
-1 < MAX_SCRAPER_BUFFERS
&& iDest
-1 > -1)
436 if (pReg
->ValueStr() == "XSLT")
437 ParseXSLT(strInput
, m_param
[iDest
- 1], pReg
, bAppend
);
440 ParseExpression(strInput
, m_param
[iDest
- 1],pReg
,bAppend
);
443 CLog::Log(LOGERROR
,"CScraperParser::ParseNext: destination buffer "
444 "out of bounds, skipping expression");
446 pReg
= NextSiblingScraperElement(pReg
);
450 const std::string
CScraperParser::Parse(const std::string
& strTag
,
453 TiXmlElement
* pChildElement
= m_pRootElement
->FirstChildElement(strTag
.c_str());
454 if(pChildElement
== NULL
)
456 CLog::Log(LOGERROR
, "{}: Could not find scraper function {}", __FUNCTION__
, strTag
);
459 int iResult
= 1; // default to param 1
460 pChildElement
->QueryIntAttribute("dest",&iResult
);
461 TiXmlElement
* pChildStart
= FirstChildScraperElement(pChildElement
);
463 ParseNext(pChildStart
);
464 std::string tmp
= m_param
[iResult
-1];
466 const char* szClearBuffers
= pChildElement
->Attribute("clearbuffers");
467 if (!szClearBuffers
|| StringUtils::CompareNoCase(szClearBuffers
, "no") != 0)
473 void CScraperParser::Clean(std::string
& strDirty
)
476 std::string strBuffer
;
477 while ((i
= strDirty
.find("!!!CLEAN!!!",i
)) != std::string::npos
)
480 if ((i2
= strDirty
.find("!!!CLEAN!!!",i
+11)) != std::string::npos
)
482 strBuffer
= strDirty
.substr(i
+11,i2
-i
-11);
483 std::string
strConverted(strBuffer
);
484 HTML::CHTMLUtil::RemoveTags(strConverted
);
485 StringUtils::Trim(strConverted
);
486 strDirty
.replace(i
, i2
-i
+11, strConverted
);
487 i
+= strConverted
.size();
493 while ((i
= strDirty
.find("!!!TRIM!!!",i
)) != std::string::npos
)
496 if ((i2
= strDirty
.find("!!!TRIM!!!",i
+10)) != std::string::npos
)
498 strBuffer
= strDirty
.substr(i
+10,i2
-i
-10);
499 StringUtils::Trim(strBuffer
);
500 strDirty
.replace(i
, i2
-i
+10, strBuffer
);
501 i
+= strBuffer
.size();
507 while ((i
= strDirty
.find("!!!FIXCHARS!!!",i
)) != std::string::npos
)
510 if ((i2
= strDirty
.find("!!!FIXCHARS!!!",i
+14)) != std::string::npos
)
512 strBuffer
= strDirty
.substr(i
+14,i2
-i
-14);
513 std::wstring wbuffer
;
514 g_charsetConverter
.utf8ToW(strBuffer
, wbuffer
, false, false, false);
515 std::wstring wConverted
;
516 HTML::CHTMLUtil::ConvertHTMLToW(wbuffer
,wConverted
);
517 g_charsetConverter
.wToUTF8(wConverted
, strBuffer
, false);
518 StringUtils::Trim(strBuffer
);
519 ConvertJSON(strBuffer
);
520 strDirty
.replace(i
, i2
-i
+14, strBuffer
);
521 i
+= strBuffer
.size();
527 while ((i
=strDirty
.find("!!!ENCODE!!!",i
)) != std::string::npos
)
530 if ((i2
= strDirty
.find("!!!ENCODE!!!",i
+12)) != std::string::npos
)
532 strBuffer
= CURL::Encode(strDirty
.substr(i
+ 12, i2
- i
- 12));
533 strDirty
.replace(i
, i2
-i
+12, strBuffer
);
534 i
+= strBuffer
.size();
541 void CScraperParser::ConvertJSON(std::string
&string
)
544 reg
.RegComp("\\\\u([0-f]{4})");
545 while (reg
.RegFind(string
.c_str()) > -1)
547 int pos
= reg
.GetSubStart(1);
548 std::string
szReplace(reg
.GetMatch(1));
550 std::string replace
= StringUtils::Format("&#x{};", szReplace
);
551 string
.replace(string
.begin()+pos
-2, string
.begin()+pos
+4, replace
);
555 reg2
.RegComp("\\\\x([0-9]{2})([^\\\\]+;)");
556 while (reg2
.RegFind(string
.c_str()) > -1)
558 int pos1
= reg2
.GetSubStart(1);
559 int pos2
= reg2
.GetSubStart(2);
560 std::string
szHexValue(reg2
.GetMatch(1));
562 std::string replace
= std::to_string(std::stol(szHexValue
, NULL
, 16));
563 string
.replace(string
.begin()+pos1
-2, string
.begin()+pos2
+reg2
.GetSubLength(2), replace
);
566 StringUtils::Replace(string
, "\\\"","\"");
569 void CScraperParser::ClearBuffers()
571 //clear all m_param strings
572 for (std::string
& param
: m_param
)
576 void CScraperParser::GetBufferParams(bool* result
, const char* attribute
, bool defvalue
)
578 for (int iBuf
=0;iBuf
<MAX_SCRAPER_BUFFERS
;++iBuf
)
579 result
[iBuf
] = defvalue
;
582 std::vector
<std::string
> vecBufs
;
583 StringUtils::Tokenize(attribute
,vecBufs
,",");
584 for (size_t nToken
=0; nToken
< vecBufs
.size(); nToken
++)
586 int index
= atoi(vecBufs
[nToken
].c_str())-1;
587 if (index
< MAX_SCRAPER_BUFFERS
)
588 result
[index
] = !defvalue
;
593 void CScraperParser::InsertToken(std::string
& strOutput
, int buf
, const char* token
)
596 snprintf(temp
, sizeof(temp
), "\\%i", buf
);
598 while ((i2
= strOutput
.find(temp
,i2
)) != std::string::npos
)
600 strOutput
.insert(i2
,token
);
601 i2
+= strlen(token
) + strlen(temp
);
602 strOutput
.insert(i2
,token
);
606 void CScraperParser::AddDocument(const CXBMCTinyXML
* doc
)
608 const TiXmlNode
* node
= doc
->RootElement()->FirstChild();
611 m_pRootElement
->InsertEndChild(*node
);
612 node
= node
->NextSibling();