2 * Copyright (C) 2013-2018 Team Kodi
3 * This file is part of Kodi - https://kodi.tv
5 * SPDX-License-Identifier: GPL-2.0-or-later
6 * See LICENSES/README.md for more information.
9 #include "CharsetDetection.h"
12 #include "utils/CharsetConverter.h"
13 #include "utils/StringUtils.h"
14 #include "utils/Utf8Utils.h"
15 #include "utils/log.h"
19 /* The XML declaration can be of virtually any size (it may contain a lot of whitespace),
20 * but in the real world there is no need to process megabytes of data,
21 * so the search for the XML declaration is limited to a reasonable value */
22 const size_t CCharsetDetection::m_XmlDeclarationMaxLength
= 250;
24 /* According to http://www.w3.org/TR/2013/CR-html5-20130806/single-page.html#charset
25 * the encoding must be placed in the first 1024 bytes of the document */
26 const size_t CCharsetDetection::m_HtmlCharsetEndSearchPos
= 1024;
28 /* According to http://www.w3.org/TR/2013/CR-html5-20130806/single-page.html#space-character
29 * tab, LF, FF, CR or space can be used as whitespace */
30 const std::string
CCharsetDetection::m_HtmlWhitespaceChars("\x09\x0A\x0C\x0D\x20"); // tab, LF, FF, CR and space
/* Compares the leading bytes of 'content' with a fixed list of byte signatures (Byte Order Marks).
 * Checks visible here, in order:
 *   FE FF; FF FE 00 00 (yields "UTF-32LE"); FF FE; EF BB BF; 00 00 FE FF;
 *   2B 2F 76 plus a test of one more byte; 84 31 95 33.
 * Length guards on 'contentLength' (< 2, < 3, < 4, >= 4, >= 5) precede the checks that read more bytes.
 * NOTE(review): only the "UTF-32LE" result is visible in this listing; presumably every other match
 * returns the corresponding encoding name and the length guards return an empty string - confirm
 * against the complete file. */
32 std::string
CCharsetDetection::GetBomEncoding(const char* const content
, const size_t contentLength
)
34 if (contentLength
< 2)
36 if (content
[0] == (char)0xFE && content
[1] == (char)0xFF)
38 if (contentLength
>= 4 && content
[0] == (char)0xFF && content
[1] == (char)0xFE && content
[2] == (char)0x00 && content
[3] == (char)0x00)
39 return "UTF-32LE"; /* first two bytes are same for UTF-16LE and UTF-32LE, so first check for full UTF-32LE mark */
40 if (content
[0] == (char)0xFF && content
[1] == (char)0xFE)
42 if (contentLength
< 3)
44 if (content
[0] == (char)0xEF && content
[1] == (char)0xBB && content
[2] == (char)0xBF)
46 if (contentLength
< 4)
48 if (content
[0] == (char)0x00 && content
[1] == (char)0x00 && content
[2] == (char)0xFE && content
[3] == (char)0xFF)
// NOTE(review): this check reads content[0..2] and then content[4]; content[3] is never examined.
// The bytes 2B 2F 76 look like the UTF-7 signature "+/v", whose variable byte is normally the
// fourth one (38, 39, 2B or 2F) - confirm whether index 4 and the value 0x32 are intended.
50 if (contentLength
>= 5 && content
[0] == (char)0x2B && content
[1] == (char)0x2F && content
[2] == (char)0x76 &&
51 (content
[4] == (char)0x32 || content
[4] == (char)0x39 || content
[4] == (char)0x2B || content
[4] == (char)0x2F))
53 if (content
[0] == (char)0x84 && content
[1] == (char)0x31 && content
[2] == (char)0x95 && content
[3] == (char)0x33)
/* Tries to determine the encoding of XML data and stores it in 'detectedEncoding'.
 * 'detectedEncoding' is cleared first. Steps visible in this listing:
 *  1. Byte Order Mark lookup through GetBomEncoding() (it has priority over "encoding=").
 *  2. Reading "encoding" from the XML declaration of the raw bytes with GetXmlEncodingFromDeclaration();
 *     the value is upper-cased, "UTF-8" is accepted at once, values starting with "UCS-" or "UTF-"
 *     are examined further and the value may be cleared again.
 *  3. Guessing a basic encoding with GuessXmlEncoding(), converting up to m_XmlDeclarationMaxLength * 4
 *     bytes to UTF-8 with that guess, reading the declaration from the converted text and reconciling
 *     the declared value with the guessed one (byte width, endianness, EBCDIC variant).
 * Returns false when 'contentLength' is below 2, when no encoding can be guessed, or when conversion
 * with the guessed encoding fails or yields an empty string.
 * NOTE(review): most other return statements and the else/brace structure are not visible in this
 * listing; confirm the exact result of each branch against the complete file. */
59 bool CCharsetDetection::DetectXmlEncoding(const char* const xmlContent
, const size_t contentLength
, std::string
& detectedEncoding
)
61 detectedEncoding
.clear();
63 if (contentLength
< 2)
64 return false; // too short for any detection
66 /* Byte Order Mark has priority over "encoding=" parameter */
67 detectedEncoding
= GetBomEncoding(xmlContent
, contentLength
);
68 if (!detectedEncoding
.empty())
71 /* try to read encoding from XML declaration */
72 if (GetXmlEncodingFromDeclaration(xmlContent
, contentLength
, detectedEncoding
))
74 StringUtils::ToUpper(detectedEncoding
);
76 /* make some safety checks */
77 if (detectedEncoding
== "UTF-8")
78 return true; // fast track for most common case
80 if (StringUtils::StartsWith(detectedEncoding
, "UCS-") || StringUtils::StartsWith(detectedEncoding
, "UTF-"))
82 if (detectedEncoding
== "UTF-7")
85 /* XML declaration was detected in UTF-8 mode (by 'GetXmlEncodingFromDeclaration') so we know
86 * that text in single byte encoding, but declaration itself wrongly specify multibyte encoding */
87 detectedEncoding
.clear();
93 /* try to detect basic encoding */
94 std::string guessedEncoding
;
95 if (!GuessXmlEncoding(xmlContent
, contentLength
, guessedEncoding
))
96 return false; /* can't detect any encoding */
98 /* have some guessed encoding, try to use it */
99 std::string convertedXml
;
100 /* use 'm_XmlDeclarationMaxLength * 4' below for UTF-32-like encodings */
101 if (!g_charsetConverter
.ToUtf8(guessedEncoding
, std::string(xmlContent
, std::min(contentLength
, m_XmlDeclarationMaxLength
* 4)), convertedXml
)
102 || convertedXml
.empty())
103 return false; /* can't convert, guessed encoding is wrong */
105 /* text converted, hopefully at least XML declaration is in UTF-8 now */
106 std::string declaredEncoding
;
107 /* try to read real encoding from converted XML declaration */
108 if (!GetXmlEncodingFromDeclaration(convertedXml
.c_str(), convertedXml
.length(), declaredEncoding
))
109 { /* did not find real encoding in XML declaration, use guessed encoding */
110 detectedEncoding
= guessedEncoding
;
114 /* found encoding in converted XML declaration, we know correct endianness and number of bytes per char */
115 /* make some safety checks */
116 StringUtils::ToUpper(declaredEncoding
);
117 if (declaredEncoding
== guessedEncoding
)
120 if (StringUtils::StartsWith(guessedEncoding
, "UCS-4"))
122 if (declaredEncoding
.length() < 5 ||
123 (!StringUtils::StartsWith(declaredEncoding
, "UTF-32") && !StringUtils::StartsWith(declaredEncoding
, "UCS-4")))
124 { /* Guessed encoding was correct because we can convert and read XML declaration, but declaration itself is wrong (not 4-bytes encoding) */
125 detectedEncoding
= guessedEncoding
;
129 else if (StringUtils::StartsWith(guessedEncoding
, "UTF-16"))
131 if (declaredEncoding
.length() < 5 ||
132 (!StringUtils::StartsWith(declaredEncoding
, "UTF-16") && !StringUtils::StartsWith(declaredEncoding
, "UCS-2")))
133 { /* Guessed encoding was correct because we can read XML declaration, but declaration is wrong (not 2-bytes encoding) */
134 detectedEncoding
= guessedEncoding
;
139 if (StringUtils::StartsWith(guessedEncoding
, "UCS-4") || StringUtils::StartsWith(guessedEncoding
, "UTF-16"))
141 /* Check endianness in declared encoding. We already know correct endianness as XML declaration was detected after conversion. */
142 /* Guessed UTF/UCS encoding always ends with endianness */
// the substring constructor below copies everything from position (length - 2), i.e. the last two characters
143 std::string
guessedEndianness(guessedEncoding
, guessedEncoding
.length() - 2);
145 if (!StringUtils::EndsWith(declaredEncoding
, "BE") && !StringUtils::EndsWith(declaredEncoding
, "LE")) /* Declared encoding without endianness */
146 detectedEncoding
= declaredEncoding
+ guessedEndianness
; /* add guessed endianness */
147 else if (!StringUtils::EndsWith(declaredEncoding
, guessedEndianness
)) /* Wrong endianness in declared encoding */
148 detectedEncoding
= declaredEncoding
.substr(0, declaredEncoding
.length() - 2) + guessedEndianness
; /* replace endianness by guessed endianness */
150 detectedEncoding
= declaredEncoding
; /* declared encoding with correct endianness */
154 else if (StringUtils::StartsWith(guessedEncoding
, "EBCDIC"))
156 if (declaredEncoding
.find("EBCDIC") != std::string::npos
)
157 detectedEncoding
= declaredEncoding
; /* Declared encoding is some specific EBCDIC encoding */
159 detectedEncoding
= guessedEncoding
;
164 /* should be unreachable */
/* Extracts the value of the "encoding" parameter from an "<?xml ...>" declaration.
 * 'declaredEncoding' is cleared on entry; when an opening quote (' or ") after "encoding=" has a
 * matching closing quote, the text between the quotes is assigned to it.
 * Only the first min(contentLength, m_XmlDeclarationMaxLength) bytes are searched for "<?xml", and
 * "<?xml" must be the first '<' element in that range.
 * Returns false when there is no usable "<?xml" declaration, when no " encoding" parameter is found,
 * or when only whitespace remains after "encoding".
 * NOTE(review): the statement that follows a successful assignment and the final return are not
 * visible in this listing - presumably 'true' on success and 'false' at the end; confirm. */
168 bool CCharsetDetection::GetXmlEncodingFromDeclaration(const char* const xmlContent
, const size_t contentLength
, std::string
& declaredEncoding
)
170 // the following code is a std::string-processing analog of regular expression processing
171 // regular expression: "<\\?xml([ \n\r\t]+[^ \n\t\r>]+)*[ \n\r\t]+encoding[ \n\r\t]*=[ \n\r\t]*('[^ \n\t\r>']+'|\"[^ \n\t\r>\"]+\")"
172 // on a win32 x86 machine the regular expression is 20-40 times slower than std::string processing and can slow down XML processing several times
173 // it seems that this regular expression is too slow due to its many variable-length parts, the regexp for '&'-fixing is much faster
175 declaredEncoding
.clear();
177 // avoid extra large search
178 std::string
strXml(xmlContent
, std::min(contentLength
, m_XmlDeclarationMaxLength
));
180 size_t pos
= strXml
.find("<?xml");
181 if (pos
== std::string::npos
|| pos
+ 6 > strXml
.length() || pos
> strXml
.find('<'))
182 return false; // no "<?xml" declaration, "<?xml" is not first element or "<?xml" is incomplete
184 pos
+= 5; // 5 is length of "<?xml"
// the declaration text ends at the first '>' after "<?xml", at m_XmlDeclarationMaxLength bytes,
// or at the end of the content, whichever comes first
186 const size_t declLength
= std::min(std::min(m_XmlDeclarationMaxLength
, contentLength
- pos
), strXml
.find('>', pos
) - pos
);
187 const std::string
xmlDecl(xmlContent
+ pos
, declLength
);
188 const char* const xmlDeclC
= xmlDecl
.c_str(); // for faster processing of [] and for null-termination
190 static const char* const whiteSpaceChars
= " \n\r\t"; // according to W3C Recommendation for XML, any of them can be used as separator
// NOTE(review): from here on 'pos' is used as an index into 'xmlDecl'; the statement that
// re-bases it is not visible in this listing - confirm.
193 while (pos
+ 12 <= declLength
) // 12 is minimal length of "encoding='x'"
195 pos
= xmlDecl
.find_first_of(whiteSpaceChars
, pos
);
196 if (pos
== std::string::npos
)
197 return false; // no " encoding=" in declaration
199 pos
= xmlDecl
.find_first_not_of(whiteSpaceChars
, pos
);
200 if (pos
== std::string::npos
)
201 return false; // no "encoding=" in declaration
203 if (xmlDecl
.compare(pos
, 8, "encoding", 8) != 0)
204 continue; // not "encoding" parameter
205 pos
+= 8; // length of "encoding"
207 if (xmlDeclC
[pos
] == ' ' || xmlDeclC
[pos
] == '\n' || xmlDeclC
[pos
] == '\r' || xmlDeclC
[pos
] == '\t') // no buffer overrun as string is null-terminated
209 pos
= xmlDecl
.find_first_not_of(whiteSpaceChars
, pos
);
210 if (pos
== std::string::npos
)
211 return false; // this " encoding" is incomplete, only whitespace chars remains
213 if (xmlDeclC
[pos
] != '=')
214 { // "encoding" without "=", try to find other
215 pos
--; // step back to whitespace
220 if (xmlDeclC
[pos
] == ' ' || xmlDeclC
[pos
] == '\n' || xmlDeclC
[pos
] == '\r' || xmlDeclC
[pos
] == '\t') // no buffer overrun as string is null-terminated
222 pos
= xmlDecl
.find_first_not_of(whiteSpaceChars
, pos
);
223 if (pos
== std::string::npos
)
224 return false; // this " encoding" is incomplete, only whitespace chars remains
226 size_t encNameEndPos
;
227 if (xmlDeclC
[pos
] == '"')
228 encNameEndPos
= xmlDecl
.find('"', ++pos
);
229 else if (xmlDeclC
[pos
] == '\'')
230 encNameEndPos
= xmlDecl
.find('\'', ++pos
);
232 continue; // no quote or double quote after 'encoding=', try to find other
234 if (encNameEndPos
!= std::string::npos
)
236 declaredEncoding
.assign(xmlDecl
, pos
, encNameEndPos
- pos
);
239 // no closing quote or double quote after 'encoding="x', try to find other
/* Guesses a basic encoding from the first four bytes of XML data, using the byte patterns that
 * '<' or "<?" (or "<?xm" for EBCDIC) produce in each encoding.
 * 'supposedEncoding' is cleared first and is set to one of "UCS-4BE", "UCS-4LE", "UTF-16BE",
 * "UTF-16LE" or "EBCDIC-CP-US" when the corresponding pattern matches.
 * Returns false when 'contentLength' is below 4.
 * NOTE(review): the return statements after the pattern chain are not visible in this listing;
 * presumably false is returned when nothing matches and true otherwise - confirm. */
245 bool CCharsetDetection::GuessXmlEncoding(const char* const xmlContent
, const size_t contentLength
, std::string
& supposedEncoding
)
247 supposedEncoding
.clear();
248 if (contentLength
< 4)
249 return false; // too little data to guess
251 if (xmlContent
[0] == 0 && xmlContent
[1] == 0 && xmlContent
[2] == 0 && xmlContent
[3] == (char)0x3C) // '<' == '00 00 00 3C' in UCS-4 (UTF-32) big-endian
252 supposedEncoding
= "UCS-4BE"; // use UCS-4 according to W3C recommendation
253 else if (xmlContent
[0] == (char)0x3C && xmlContent
[1] == 0 && xmlContent
[2] == 0 && xmlContent
[3] == 0) // '<' == '3C 00 00 00' in UCS-4 (UTF-32) little-endian
254 supposedEncoding
= "UCS-4LE"; // use UCS-4 according to W3C recommendation
255 else if (xmlContent
[0] == 0 && xmlContent
[1] == (char)0x3C && xmlContent
[2] == 0 && xmlContent
[3] == (char)0x3F) // "<?" == "00 3C 00 3F" in UTF-16 (UCS-2) big-endian
256 supposedEncoding
= "UTF-16BE";
257 else if (xmlContent
[0] == (char)0x3C && xmlContent
[1] == 0 && xmlContent
[2] == (char)0x3F && xmlContent
[3] == 0) // "<?" == "3C 00 3F 00" in UTF-16 (UCS-2) little-endian
258 supposedEncoding
= "UTF-16LE";
259 else if (xmlContent
[0] == (char)0x4C && xmlContent
[1] == (char)0x6F && xmlContent
[2] == (char)0xA7 && xmlContent
[3] == (char)0x94) // "<?xm" == "4C 6F A7 94" in most EBCDIC encodings
260 supposedEncoding
= "EBCDIC-CP-US"; // guessed value, real value must be read from declaration
/* Converts 'htmlContent' to UTF-8 in 'converted' and reports the charset that was used in
 * 'usedHtmlCharset' (cleared on entry; set to "UTF-8" for empty content).
 * Candidate charsets are tested with checkConversion() in this order:
 *  1. charset from the Byte Order Mark (GetBomEncoding()),
 *  2. 'serverReportedCharset',
 *  3. charset declared in the HTML itself (GetHtmlEncodingFromHead()); any value starting with
 *     "UTF" is replaced by "UTF-8",
 *  4. "UTF-8", unless one of the charsets above already was "UTF-8",
 *  5. the GUI charset reported by g_langInfo,
 *  6. "WINDOWS-1252".
 * For candidates 4-6 the visible return value is false because the charset is only guessed.
 * If no candidate converts, the first non-empty of the BOM, server-reported, declared and GUI
 * charsets (or "WINDOWS-1252") is chosen, a warning is logged and the content is converted with it.
 * NOTE(review): the return statements for candidates 1-3, for empty content and after the fallback
 * conversion are not visible in this listing - confirm against the complete file. */
267 bool CCharsetDetection::ConvertHtmlToUtf8(const std::string
& htmlContent
, std::string
& converted
, const std::string
& serverReportedCharset
, std::string
& usedHtmlCharset
)
270 usedHtmlCharset
.clear();
271 if (htmlContent
.empty())
273 usedHtmlCharset
= "UTF-8"; // any charset can be used for empty content, use UTF-8 as default
277 // this is relaxed implementation of http://www.w3.org/TR/2013/CR-html5-20130806/single-page.html#determining-the-character-encoding
279 // try to get charset from Byte Order Mark
280 std::string
bomCharset(GetBomEncoding(htmlContent
));
281 if (checkConversion(bomCharset
, htmlContent
, converted
))
283 usedHtmlCharset
= bomCharset
;
287 // try charset from HTTP header (or from other out-of-band source)
288 if (checkConversion(serverReportedCharset
, htmlContent
, converted
))
290 usedHtmlCharset
= serverReportedCharset
;
294 // try to find charset in HTML
295 std::string
declaredCharset(GetHtmlEncodingFromHead(htmlContent
));
296 if (!declaredCharset
.empty())
298 if (declaredCharset
.compare(0, 3, "UTF", 3) == 0)
299 declaredCharset
= "UTF-8"; // charset string was found in singlebyte mode, charset can't be multibyte encoding
300 if (checkConversion(declaredCharset
, htmlContent
, converted
))
302 usedHtmlCharset
= declaredCharset
;
307 // try UTF-8 if not tried before
308 if (bomCharset
!= "UTF-8" && serverReportedCharset
!= "UTF-8" && declaredCharset
!= "UTF-8" && checkConversion("UTF-8", htmlContent
, converted
))
310 usedHtmlCharset
= "UTF-8";
311 return false; // only guessed value
// try the charset configured for the GUI
315 std::string
userCharset(g_langInfo
.GetGuiCharSet());
316 if (checkConversion(userCharset
, htmlContent
, converted
))
318 usedHtmlCharset
= userCharset
;
319 return false; // only guessed value
// last candidate: WINDOWS-1252
323 if (checkConversion("WINDOWS-1252", htmlContent
, converted
))
325 usedHtmlCharset
= "WINDOWS-1252";
326 return false; // only guessed value
329 // can't find exact charset
330 // use one of detected as fallback
331 if (!bomCharset
.empty())
332 usedHtmlCharset
= bomCharset
;
333 else if (!serverReportedCharset
.empty())
334 usedHtmlCharset
= serverReportedCharset
;
335 else if (!declaredCharset
.empty())
336 usedHtmlCharset
= declaredCharset
;
337 else if (!userCharset
.empty())
338 usedHtmlCharset
= userCharset
;
340 usedHtmlCharset
= "WINDOWS-1252";
342 CLog::Log(LOGWARNING
, "{}: Can't correctly convert to UTF-8 charset, converting as \"{}\"",
343 __FUNCTION__
, usedHtmlCharset
);
// NOTE(review): the candidates above pass 'true' as the fourth ToUtf8() argument (via checkConversion()),
// this fallback passes 'false'; presumably that makes the conversion tolerate bad characters - confirm.
344 g_charsetConverter
.ToUtf8(usedHtmlCharset
, htmlContent
, converted
, false);
/* Converts 'textContent' to UTF-8 in 'converted' and reports the charset that was used in
 * 'usedCharset' (set to "UTF-8" for empty content).
 * Candidates are tried in this order:
 *  1. charset from the Byte Order Mark (GetBomEncoding()),
 *  2. 'serverReportedCharset',
 *  3. "UTF-8", unless one of the charsets above already was "UTF-8",
 *  4. the GUI charset reported by g_langInfo,
 *  5. the system default charset through g_charsetConverter.systemToUtf8() (reported as "char"),
 *  6. "WINDOWS-1252".
 * If no candidate converts, the first non-empty of the server-reported, BOM and GUI charsets
 * (or "WINDOWS-1252") is chosen, a warning is logged and the content is converted with it.
 * NOTE(review): none of the return statements is visible in this listing - confirm the result of
 * each branch against the complete file. */
349 bool CCharsetDetection::ConvertPlainTextToUtf8(const std::string
& textContent
, std::string
& converted
, const std::string
& serverReportedCharset
, std::string
& usedCharset
)
353 if (textContent
.empty())
355 usedCharset
= "UTF-8"; // any charset can be used for empty content, use UTF-8 as default
359 // try to get charset from Byte Order Mark
360 std::string
bomCharset(GetBomEncoding(textContent
));
361 if (checkConversion(bomCharset
, textContent
, converted
))
363 usedCharset
= bomCharset
;
367 // try charset from HTTP header (or from other out-of-band source)
368 if (checkConversion(serverReportedCharset
, textContent
, converted
))
370 usedCharset
= serverReportedCharset
;
374 // try UTF-8 if not tried before
375 if (bomCharset
!= "UTF-8" && serverReportedCharset
!= "UTF-8" && checkConversion("UTF-8", textContent
, converted
))
377 usedCharset
= "UTF-8";
// try the charset configured for the GUI
382 std::string
userCharset(g_langInfo
.GetGuiCharSet());
383 if (checkConversion(userCharset
, textContent
, converted
))
385 usedCharset
= userCharset
;
389 // try system default charset
390 if (g_charsetConverter
.systemToUtf8(textContent
, converted
, true))
392 usedCharset
= "char"; // synonym to system charset
// last candidate: WINDOWS-1252
397 if (checkConversion("WINDOWS-1252", textContent
, converted
))
399 usedCharset
= "WINDOWS-1252";
403 // can't find correct charset
404 // use one of detected as fallback
405 if (!serverReportedCharset
.empty())
406 usedCharset
= serverReportedCharset
;
407 else if (!bomCharset
.empty())
408 usedCharset
= bomCharset
;
409 else if (!userCharset
.empty())
410 usedCharset
= userCharset
;
412 usedCharset
= "WINDOWS-1252";
414 CLog::Log(LOGWARNING
, "{}: Can't correctly convert to UTF-8 charset, converting as \"{}\"",
415 __FUNCTION__
, usedCharset
);
416 g_charsetConverter
.ToUtf8(usedCharset
, textContent
, converted
, false);
/* Tests whether 'src' can be treated as text in 'srcCharset'.
 * Visible logic: an empty 'srcCharset' is handled first; for any charset other than "UTF-8" the text
 * is converted into 'dst' with g_charsetConverter.ToUtf8(..., true); for "UTF-8" the text is
 * validated with CUtf8Utils::isValidUtf8() instead of being converted.
 * NOTE(review): the statements executed in each branch (return values, and how 'dst' is filled in
 * the "UTF-8" case) are not visible in this listing - confirm against the complete file. */
422 bool CCharsetDetection::checkConversion(const std::string
& srcCharset
, const std::string
& src
, std::string
& dst
)
424 if (srcCharset
.empty())
427 if (srcCharset
!= "UTF-8")
429 if (g_charsetConverter
.ToUtf8(srcCharset
, src
, dst
, true))
432 else if (CUtf8Utils::isValidUtf8(src
))
/* Prescans the beginning of an HTML document for a charset declaration.
 * At most 2 * m_HtmlCharsetEndSearchPos bytes of 'htmlContent' are examined. Inside the loop the code
 *  - skips "<!--" ... "-->" comments,
 *  - for a case-insensitive "<meta" followed by tab, LF, FF, CR, space or '/', reads the attributes
 *    with GetHtmlAttribute() and looks at HTTP-EQUIV="CONTENT-TYPE", at CONTENT (through
 *    ExtractEncodingFromHtmlMeta()) and at CHARSET (trimmed of HTML whitespace),
 *  - for any other tag that starts with an ASCII letter, skips the tag name and reads its attributes,
 *  - for "<!", "</" and "<?" jumps to the next '>'.
 * Returns the charset taken from CONTENT when 'gotPragma' is set, and "" when no charset was found.
 * Attribute names and values are compared in upper case.
 * NOTE(review): the declaration of 'pos', the handling of a non-empty CHARSET value and the
 * statements that advance 'pos' between iterations are not visible in this listing - confirm. */
441 std::string
CCharsetDetection::GetHtmlEncodingFromHead(const std::string
& htmlContent
)
443 std::string smallerHtmlContent
;
444 if (htmlContent
.length() > 2 * m_HtmlCharsetEndSearchPos
)
445 smallerHtmlContent
.assign(htmlContent
, 0, 2 * m_HtmlCharsetEndSearchPos
); // use twice more bytes to search for charset for safety
447 const std::string
& html
= smallerHtmlContent
.empty() ? htmlContent
: smallerHtmlContent
; // limit search
448 const char* const htmlC
= html
.c_str(); // for null-termination
449 const size_t len
= html
.length();
451 // this is an implementation of http://www.w3.org/TR/2013/CR-html5-20130806/single-page.html#prescan-a-byte-stream-to-determine-its-encoding
452 // labels in comments correspond to the labels in HTML5 standard
453 // note: opposite to standard, everything is converted to uppercase instead of lower case
455 while (pos
< len
) // "loop" label
457 if (html
.compare(pos
, 4, "<!--", 4) == 0)
459 pos
= html
.find("-->", pos
+ 2);
460 if (pos
== std::string::npos
)
// the look-ahead reads htmlC[pos + 1] .. htmlC[pos + 5]; the '&&' chain stops at the first
// mismatch, which includes the terminating null of the c_str() buffer
464 else if (htmlC
[pos
] == '<' && (htmlC
[pos
+ 1] == 'm' || htmlC
[pos
+ 1] == 'M') && (htmlC
[pos
+ 2] == 'e' || htmlC
[pos
+ 2] == 'E')
465 && (htmlC
[pos
+ 3] == 't' || htmlC
[pos
+ 3] == 'T') && (htmlC
[pos
+ 4] == 'a' || htmlC
[pos
+ 4] == 'A')
466 && (htmlC
[pos
+ 5] == 0x09 || htmlC
[pos
+ 5] == 0x0A || htmlC
[pos
+ 5] == 0x0C || htmlC
[pos
+ 5] == 0x0D || htmlC
[pos
+ 5] == 0x20 || htmlC
[pos
+ 5] == 0x2F))
467 { // this is case insensitive "<meta" and one of tab, LF, FF, CR, space or slash
468 pos
+= 5; // "pos" points to symbol after "<meta"
469 std::string attrName
, attrValue
;
470 bool gotPragma
= false;
471 std::string contentCharset
;
472 do // "attributes" label
474 pos
= GetHtmlAttribute(html
, pos
, attrName
, attrValue
);
475 if (attrName
== "HTTP-EQUIV" && attrValue
== "CONTENT-TYPE")
477 else if (attrName
== "CONTENT")
478 contentCharset
= ExtractEncodingFromHtmlMeta(attrValue
);
479 else if (attrName
== "CHARSET")
481 StringUtils::Trim(attrValue
, m_HtmlWhitespaceChars
.c_str()); // tab, LF, FF, CR, space
482 if (!attrValue
.empty())
485 } while (!attrName
.empty() && pos
< len
);
487 // "processing" label
488 if (gotPragma
&& !contentCharset
.empty())
489 return contentCharset
;
491 else if (htmlC
[pos
] == '<' && ((htmlC
[pos
+ 1] >= 'A' && htmlC
[pos
+ 1] <= 'Z') || (htmlC
[pos
+ 1] >= 'a' && htmlC
[pos
+ 1] <= 'z')))
493 pos
= html
.find_first_of("\x09\x0A\x0C\x0D >", pos
); // tab, LF, FF, CR, space or '>'
494 std::string attrName
, attrValue
;
497 pos
= GetHtmlAttribute(html
, pos
, attrName
, attrValue
);
498 } while (pos
< len
&& !attrName
.empty());
500 else if (html
.compare(pos
, 2, "<!", 2) == 0 || html
.compare(pos
, 2, "</", 2) == 0 || html
.compare(pos
, 2, "<?", 2) == 0)
501 pos
= html
.find('>', pos
);
503 if (pos
== std::string::npos
)
510 return ""; // no charset was found
/* Reads one HTML attribute from 'htmlContent' starting at 'pos'.
 * Leading whitespace and slashes are skipped. Characters are then appended to 'attrName' in ASCII
 * upper case until '=' is reached; a '/' or '>' ends the attribute without a value, and whitespace
 * after the name must be followed by '='. After '=' and optional whitespace the value is read:
 * either quoted with '"' or '\'' or unquoted up to whitespace or '>'; it is appended to 'attrValue'
 * in ASCII upper case.
 * Returns a position inside 'htmlContent', or std::string::npos when the content ends before the
 * attribute is complete (no '=', '/' or '>' found, missing closing quote, value runs to the end).
 * NOTE(review): any clearing of 'attrName'/'attrValue' on entry, the loop constructs of the value
 * scanning and the values returned after a complete value are not visible in this listing - confirm. */
513 size_t CCharsetDetection::GetHtmlAttribute(const std::string
& htmlContent
, size_t pos
, std::string
& attrName
, std::string
& attrValue
)
517 static const char* const htmlWhitespaceSlash
= "\x09\x0A\x0C\x0D\x20\x2F"; // tab, LF, FF, CR, space or slash
518 const char* const htmlC
= htmlContent
.c_str();
519 const size_t len
= htmlContent
.length();
521 // this is an implementation of http://www.w3.org/TR/2013/CR-html5-20130806/single-page.html#concept-get-attributes-when-sniffing
522 // labels in comments correspond to the labels in HTML5 standard
523 // note: opposite to standard, everything is converted to uppercase instead of lower case
524 pos
= htmlContent
.find_first_not_of(htmlWhitespaceSlash
, pos
);
525 if (pos
== std::string::npos
|| htmlC
[pos
] == '>')
526 return pos
; // only white spaces or slashes up to the end of the htmlContent or no more attributes
// read the attribute name up to '='
528 while (pos
< len
&& htmlC
[pos
] != '=')
530 const char chr
= htmlC
[pos
];
531 if (chr
== '/' || chr
== '>')
532 return pos
; // no attributes or empty attribute value
533 else if (m_HtmlWhitespaceChars
.find(chr
) != std::string::npos
) // chr is one of whitespaces
535 pos
= htmlContent
.find_first_not_of(m_HtmlWhitespaceChars
, pos
); // "spaces" label
536 if (pos
== std::string::npos
|| htmlC
[pos
] != '=')
537 return pos
; // only white spaces up to the end or no attribute value
541 appendCharAsAsciiUpperCase(attrName
, chr
);
547 return std::string::npos
; // no '=', '/' or '>' were found up to the end of htmlContent
549 pos
++; // advance pos to character after '='
551 pos
= htmlContent
.find_first_not_of(m_HtmlWhitespaceChars
, pos
); // "value" label
552 if (pos
== std::string::npos
)
553 return pos
; // only white spaces remain in htmlContent
555 if (htmlC
[pos
] == '>')
556 return pos
; // empty attribute value
557 else if (htmlC
[pos
] == '"' || htmlC
[pos
] == '\'')
559 const char qChr
= htmlC
[pos
];
560 // "quote loop" label
563 const char chr
= htmlC
[pos
];
567 appendCharAsAsciiUpperCase(attrValue
, chr
);
569 return std::string::npos
; // no closing quote is found
// unquoted value: the first character is taken as is, the rest is read up to whitespace or '>'
572 appendCharAsAsciiUpperCase(attrValue
, htmlC
[pos
]);
577 const char chr
= htmlC
[pos
];
578 if (m_HtmlWhitespaceChars
.find(chr
) != std::string::npos
|| chr
== '>')
581 appendCharAsAsciiUpperCase(attrValue
, chr
);
586 return std::string::npos
; // rest of htmlContent was attribute value
/* Extracts a charset name from the value of a <meta> "content" attribute.
 * Searches 'metaContent' for "CHARSET" (the argument is expected in upper case), skips HTML
 * whitespace, requires '=', skips whitespace again and then takes either the text between matching
 * quotes (' or ") or the unquoted text that follows. The result is trimmed of HTML whitespace.
 * NOTE(review): the rest of the parameter list (the signature ends with a ',' here), the
 * declarations of 'pos' and 'charset', the loop around the "CHARSET" search and the return
 * statements are not visible in this listing - confirm against the complete file. */
589 std::string
CCharsetDetection::ExtractEncodingFromHtmlMeta(const std::string
& metaContent
,
592 size_t len
= metaContent
.length();
596 const char* const metaContentC
= metaContent
.c_str();
598 // this is an implementation of http://www.w3.org/TR/2013/CR-html5-20130806/single-page.html#algorithm-for-extracting-a-character-encoding-from-a-meta-element
599 // labels in comments correspond to the labels in HTML5 standard
600 // note: opposite to standard, case sensitive match is used as argument is always in uppercase
605 pos
= metaContent
.find("CHARSET", pos
);
606 if (pos
== std::string::npos
)
609 pos
= metaContent
.find_first_not_of(m_HtmlWhitespaceChars
, pos
+ 7); // '7' is the length of 'CHARSET'
610 if (pos
!= std::string::npos
&& metaContentC
[pos
] == '=')
612 pos
= metaContent
.find_first_not_of(m_HtmlWhitespaceChars
, pos
+ 1);
613 if (pos
!= std::string::npos
)
615 if (metaContentC
[pos
] == '\'' || metaContentC
[pos
] == '"')
617 const char qChr
= metaContentC
[pos
];
619 const size_t closeQpos
= metaContent
.find(qChr
, pos
);
620 if (closeQpos
!= std::string::npos
)
621 charset
.assign(metaContent
, pos
, closeQpos
- pos
);
// NOTE(review): std::string::find() below searches for the whole 6-character sequence
// "\x09\x0A\x0C\x0D ;", not for any single one of these characters, so it will normally
// return npos and the rest of the string is assigned. find_first_of() would match what the
// trailing comment describes - confirm which behavior is intended.
624 charset
.assign(metaContent
, pos
, metaContent
.find("\x09\x0A\x0C\x0D ;", pos
) - pos
); // assign content up to the next tab, LF, FF, CR, space, semicolon or end of string
630 static const char* const htmlWhitespaceCharsC
= m_HtmlWhitespaceChars
.c_str();
631 StringUtils::Trim(charset
, htmlWhitespaceCharsC
);
/* Appends 'chr' to 'str', converting ASCII lower case letters ('a'..'z') to upper case by
 * subtracting the distance between 'a' and 'A'.
 * NOTE(review): the branch for all other characters is not visible in this listing; presumably
 * they are appended unchanged - confirm. */
636 inline void CCharsetDetection::appendCharAsAsciiUpperCase(std::string
& str
, const char chr
)
638 if (chr
>= 'a' && chr
<= 'z')
639 str
.push_back(chr
- ('a' - 'A')); // convert to upper case