/*
 *  Copyright (C) 2005-2018 Team Kodi
 *  This file is part of Kodi - https://kodi.tv
 *
 *  SPDX-License-Identifier: GPL-2.0-or-later
 *  See LICENSES/README.md for more information.
 */
9 #include "XBMCTinyXML.h"
13 #include "filesystem/File.h"
14 #include "utils/CharsetConverter.h"
15 #include "utils/CharsetDetection.h"
16 #include "utils/StringUtils.h"
17 #include "utils/Utf8Utils.h"
18 #include "utils/log.h"
// Size of the largest entity recognised when fixing up '&', e.g. "&#xNNNN;".
#define MAX_ENTITY_LENGTH 8
// Number of bytes requested per fread() call when loading from a FILE*.
#define BUFFER_SIZE 4096
23 CXBMCTinyXML::CXBMCTinyXML()
28 CXBMCTinyXML::CXBMCTinyXML(const char *documentName
)
29 : TiXmlDocument(documentName
)
33 CXBMCTinyXML::CXBMCTinyXML(const std::string
& documentName
)
34 : TiXmlDocument(documentName
)
38 CXBMCTinyXML::CXBMCTinyXML(const std::string
& documentName
, const std::string
& documentCharset
)
39 : TiXmlDocument(documentName
), m_SuggestedCharset(documentCharset
)
41 StringUtils::ToUpper(m_SuggestedCharset
);
44 bool CXBMCTinyXML::LoadFile(TiXmlEncoding encoding
)
46 return LoadFile(value
, encoding
);
49 bool CXBMCTinyXML::LoadFile(const char *_filename
, TiXmlEncoding encoding
)
51 return LoadFile(std::string(_filename
), encoding
);
54 bool CXBMCTinyXML::LoadFile(const std::string
& _filename
, TiXmlEncoding encoding
)
56 value
= _filename
.c_str();
59 std::vector
<uint8_t> buffer
;
61 if (file
.LoadFile(value
, buffer
) <= 0)
63 SetError(TIXML_ERROR_OPENING_FILE
, NULL
, NULL
, TIXML_ENCODING_UNKNOWN
);
67 // Delete the existing data:
71 std::string
data(reinterpret_cast<char*>(buffer
.data()), buffer
.size());
72 buffer
.clear(); // free memory early
74 if (encoding
== TIXML_ENCODING_UNKNOWN
)
75 Parse(data
, file
.GetProperty(XFILE::FILE_PROPERTY_CONTENT_CHARSET
));
77 Parse(data
, encoding
);
84 bool CXBMCTinyXML::LoadFile(const std::string
& _filename
, const std::string
& documentCharset
)
86 m_SuggestedCharset
= documentCharset
;
87 StringUtils::ToUpper(m_SuggestedCharset
);
88 return LoadFile(_filename
, TIXML_ENCODING_UNKNOWN
);
91 bool CXBMCTinyXML::LoadFile(FILE *f
, TiXmlEncoding encoding
)
94 char buf
[BUFFER_SIZE
] = {};
96 while ((result
= fread(buf
, 1, BUFFER_SIZE
, f
)) > 0)
97 data
.append(buf
, result
);
98 return Parse(data
, encoding
);
101 bool CXBMCTinyXML::SaveFile(const char *_filename
) const
103 return SaveFile(std::string(_filename
));
106 bool CXBMCTinyXML::SaveFile(const std::string
& filename
) const
109 if (file
.OpenForWrite(filename
, true))
111 TiXmlPrinter printer
;
113 bool suc
= file
.Write(printer
.CStr(), printer
.Size()) == static_cast<ssize_t
>(printer
.Size());
122 bool CXBMCTinyXML::Parse(const std::string
& data
, const std::string
& dataCharset
)
124 m_SuggestedCharset
= dataCharset
;
125 StringUtils::ToUpper(m_SuggestedCharset
);
126 return Parse(data
, TIXML_ENCODING_UNKNOWN
);
129 bool CXBMCTinyXML::Parse(const std::string
& data
, TiXmlEncoding encoding
/*= TIXML_DEFAULT_ENCODING */)
131 m_UsedCharset
.clear();
132 if (encoding
!= TIXML_ENCODING_UNKNOWN
)
133 { // encoding != TIXML_ENCODING_UNKNOWN means "do not use m_SuggestedCharset and charset detection"
134 m_SuggestedCharset
.clear();
135 if (encoding
== TIXML_ENCODING_UTF8
)
136 m_UsedCharset
= "UTF-8";
138 return InternalParse(data
, encoding
);
141 if (!m_SuggestedCharset
.empty() && TryParse(data
, m_SuggestedCharset
))
144 std::string detectedCharset
;
145 if (CCharsetDetection::DetectXmlEncoding(data
, detectedCharset
) && TryParse(data
, detectedCharset
))
147 if (!m_SuggestedCharset
.empty())
148 CLog::Log(LOGWARNING
,
149 "{}: \"{}\" charset was used instead of suggested charset \"{}\" for {}",
150 __FUNCTION__
, m_UsedCharset
, m_SuggestedCharset
,
151 (value
.empty() ? "XML data" : ("file \"" + value
+ "\"")));
156 // check for valid UTF-8
157 if (m_SuggestedCharset
!= "UTF-8" && detectedCharset
!= "UTF-8" && CUtf8Utils::isValidUtf8(data
) &&
158 TryParse(data
, "UTF-8"))
160 if (!m_SuggestedCharset
.empty())
161 CLog::Log(LOGWARNING
,
162 "{}: \"{}\" charset was used instead of suggested charset \"{}\" for {}",
163 __FUNCTION__
, m_UsedCharset
, m_SuggestedCharset
,
164 (value
.empty() ? "XML data" : ("file \"" + value
+ "\"")));
165 else if (!detectedCharset
.empty())
166 CLog::Log(LOGWARNING
, "{}: \"{}\" charset was used instead of detected charset \"{}\" for {}",
167 __FUNCTION__
, m_UsedCharset
, detectedCharset
,
168 (value
.empty() ? "XML data" : ("file \"" + value
+ "\"")));
172 // fallback: try user GUI charset
173 if (TryParse(data
, g_langInfo
.GetGuiCharSet()))
175 if (!m_SuggestedCharset
.empty())
176 CLog::Log(LOGWARNING
,
177 "{}: \"{}\" charset was used instead of suggested charset \"{}\" for {}",
178 __FUNCTION__
, m_UsedCharset
, m_SuggestedCharset
,
179 (value
.empty() ? "XML data" : ("file \"" + value
+ "\"")));
180 else if (!detectedCharset
.empty())
181 CLog::Log(LOGWARNING
, "{}: \"{}\" charset was used instead of detected charset \"{}\" for {}",
182 __FUNCTION__
, m_UsedCharset
, detectedCharset
,
183 (value
.empty() ? "XML data" : ("file \"" + value
+ "\"")));
187 // can't detect correct data charset, try to process data as is
188 if (InternalParse(data
, TIXML_ENCODING_UNKNOWN
))
190 if (!m_SuggestedCharset
.empty())
191 CLog::Log(LOGWARNING
, "{}: Processed {} as unknown encoding instead of suggested \"{}\"",
192 __FUNCTION__
, (value
.empty() ? "XML data" : ("file \"" + value
+ "\"")),
194 else if (!detectedCharset
.empty())
195 CLog::Log(LOGWARNING
, "{}: Processed {} as unknown encoding instead of detected \"{}\"",
196 __FUNCTION__
, (value
.empty() ? "XML data" : ("file \"" + value
+ "\"")),
204 bool CXBMCTinyXML::TryParse(const std::string
& data
, const std::string
& tryDataCharset
)
206 if (tryDataCharset
== "UTF-8")
207 InternalParse(data
, TIXML_ENCODING_UTF8
); // process data without conversion
208 else if (!tryDataCharset
.empty())
210 std::string converted
;
211 /* some wrong conversions can leave US-ASCII XML header and structure untouched but break non-English data
212 * so conversion must fail on wrong character and then other encodings will be tried */
213 if (!g_charsetConverter
.ToUtf8(tryDataCharset
, data
, converted
, true) || converted
.empty())
214 return false; // can't convert data
216 InternalParse(converted
, TIXML_ENCODING_UTF8
);
219 InternalParse(data
, TIXML_ENCODING_LEGACY
);
221 // 'Error()' contains result of last run of 'TiXmlDocument::Parse()'
230 m_UsedCharset
= tryDataCharset
;
234 bool CXBMCTinyXML::InternalParse(const std::string
& rawdata
, TiXmlEncoding encoding
/*= TIXML_DEFAULT_ENCODING */)
236 // Preprocess string, replacing '&' with '& for invalid XML entities
237 size_t pos
= rawdata
.find('&');
238 if (pos
== std::string::npos
)
239 return (TiXmlDocument::Parse(rawdata
.c_str(), NULL
, encoding
) != NULL
); // nothing to fix, process data directly
241 std::string
data(rawdata
);
242 CRegExp
re(false, CRegExp::asciiOnly
, "^&(amp|lt|gt|quot|apos|#x[a-fA-F0-9]{1,4}|#[0-9]{1,5});.*");
245 if (re
.RegFind(data
, pos
, MAX_ENTITY_LENGTH
) < 0)
246 data
.insert(pos
+ 1, "amp;");
247 pos
= data
.find('&', pos
+ 1);
248 } while (pos
!= std::string::npos
);
250 return (TiXmlDocument::Parse(data
.c_str(), NULL
, encoding
) != NULL
);
253 bool CXBMCTinyXML::Test()
255 // scraper results with unescaped &
257 std::string
data("<details><url function=\"ParseTMDBRating\" "
258 "cache=\"tmdb-en-12244.json\">"
259 "http://api.themoviedb.org/3/movie/12244"
260 "?api_key=57983e31fb435df4df77afb854740ea9"
261 "&language=en???</url></details>");
262 doc
.Parse(data
, TIXML_DEFAULT_ENCODING
);
263 TiXmlNode
*root
= doc
.RootElement();
264 if (root
&& root
->ValueStr() == "details")
266 TiXmlElement
*url
= root
->FirstChildElement("url");
267 if (url
&& url
->FirstChild())
269 return (url
->FirstChild()->ValueStr() == "http://api.themoviedb.org/3/movie/12244?api_key=57983e31fb435df4df77afb854740ea9&language=en???");