xbmc/utils/CharsetDetection.cpp

   1 /*
   2  *  Copyright (C) 2013-2018 Team Kodi
   3  *  This file is part of Kodi - https://kodi.tv
   4  *
   5  *  SPDX-License-Identifier: GPL-2.0-or-later
   6  *  See LICENSES/README.md for more information.
   7  */
   8
   9 #include "CharsetDetection.h"
  10
  11 #include "LangInfo.h"
  12 #include "utils/CharsetConverter.h"
  13 #include "utils/StringUtils.h"
  14 #include "utils/Utf8Utils.h"
  15 #include "utils/log.h"
  16
  17 #include <algorithm>
  18
/* An XML declaration can be of virtually any size (e.g. padded with lots of whitespace),
 * but in the real world there is no need to process megabytes of data,
 * so the search for the XML declaration is limited to a reasonable value */
const size_t CCharsetDetection::m_XmlDeclarationMaxLength = 250;

/* According to http://www.w3.org/TR/2013/CR-html5-20130806/single-page.html#charset
 * the encoding declaration must be placed within the first 1024 bytes of the document */
const size_t CCharsetDetection::m_HtmlCharsetEndSearchPos = 1024;

/* According to http://www.w3.org/TR/2013/CR-html5-20130806/single-page.html#space-character
 * tab, LF, FF, CR or space can be used as whitespace */
const std::string CCharsetDetection::m_HtmlWhitespaceChars("\x09\x0A\x0C\x0D\x20");    // tab, LF, FF, CR and space
  31
  32 std::string CCharsetDetection::GetBomEncoding(const char* const content, const size_t contentLength)
  33 {
  34   if (contentLength < 2)
  35     return "";
  36   if (content[0] == (char)0xFE && content[1] == (char)0xFF)
  37     return "UTF-16BE";
  38   if (contentLength >= 4 && content[0] == (char)0xFF && content[1] == (char)0xFE && content[2] == (char)0x00 && content[3] == (char)0x00)
  39     return "UTF-32LE";  /* first two bytes are same for UTF-16LE and UTF-32LE, so first check for full UTF-32LE mark */
  40   if (content[0] == (char)0xFF && content[1] == (char)0xFE)
  41    return "UTF-16LE";
  42   if (contentLength < 3)
  43     return "";
  44   if (content[0] == (char)0xEF && content[1] == (char)0xBB && content[2] == (char)0xBF)
  45     return "UTF-8";
  46   if (contentLength < 4)
  47     return "";
  48   if (content[0] == (char)0x00 && content[1] == (char)0x00 && content[2] == (char)0xFE && content[3] == (char)0xFF)
  49     return "UTF-32BE";
  50   if (contentLength >= 5 && content[0] == (char)0x2B && content[1] == (char)0x2F && content[2] == (char)0x76 &&
  51             (content[4] == (char)0x32 || content[4] == (char)0x39 || content[4] == (char)0x2B || content[4] == (char)0x2F))
  52     return "UTF-7";
  53   if (content[0] == (char)0x84 && content[1] == (char)0x31 && content[2] == (char)0x95 && content[3] == (char)0x33)
  54     return "GB18030";
  55
  56   return "";
  57 }
  58
  59 bool CCharsetDetection::DetectXmlEncoding(const char* const xmlContent, const size_t contentLength, std::string& detectedEncoding)
  60 {
  61   detectedEncoding.clear();
  62
  63   if (contentLength < 2)
  64     return false; // too short for any detection
  65
  66   /* Byte Order Mark has priority over "encoding=" parameter */
  67   detectedEncoding = GetBomEncoding(xmlContent, contentLength);
  68   if (!detectedEncoding.empty())
  69     return true;
  70
  71   /* try to read encoding from XML declaration */
  72   if (GetXmlEncodingFromDeclaration(xmlContent, contentLength, detectedEncoding))
  73   {
  74     StringUtils::ToUpper(detectedEncoding);
  75
  76     /* make some safety checks */
  77     if (detectedEncoding == "UTF-8")
  78       return true; // fast track for most common case
  79
  80     if (StringUtils::StartsWith(detectedEncoding, "UCS-") || StringUtils::StartsWith(detectedEncoding, "UTF-"))
  81     {
  82       if (detectedEncoding == "UTF-7")
  83         return true;
  84
  85       /* XML declaration was detected in UTF-8 mode (by 'GetXmlEncodingFromDeclaration') so we know
  86        * that text in single byte encoding, but declaration itself wrongly specify multibyte encoding */
  87       detectedEncoding.clear();
  88       return false;
  89     }
  90     return true;
  91   }
  92
  93   /* try to detect basic encoding */
  94   std::string guessedEncoding;
  95   if (!GuessXmlEncoding(xmlContent, contentLength, guessedEncoding))
  96     return false; /* can't detect any encoding */
  97
  98   /* have some guessed encoding, try to use it */
  99   std::string convertedXml;
 100   /* use 'm_XmlDeclarationMaxLength * 4' below for UTF-32-like encodings */
 101   if (!g_charsetConverter.ToUtf8(guessedEncoding, std::string(xmlContent, std::min(contentLength, m_XmlDeclarationMaxLength * 4)), convertedXml)
 102       || convertedXml.empty())
 103     return false;  /* can't convert, guessed encoding is wrong */
 104
 105   /* text converted, hopefully at least XML declaration is in UTF-8 now */
 106   std::string declaredEncoding;
 107   /* try to read real encoding from converted XML declaration */
 108   if (!GetXmlEncodingFromDeclaration(convertedXml.c_str(), convertedXml.length(), declaredEncoding))
 109   { /* did not find real encoding in XML declaration, use guessed encoding */
 110     detectedEncoding = guessedEncoding;
 111     return true;
 112   }
 113
 114   /* found encoding in converted XML declaration, we know correct endianness and number of bytes per char */
 115   /* make some safety checks */
 116   StringUtils::ToUpper(declaredEncoding);
 117   if (declaredEncoding == guessedEncoding)
 118     return true;
 119
 120   if (StringUtils::StartsWith(guessedEncoding, "UCS-4"))
 121   {
 122     if (declaredEncoding.length() < 5 ||
 123         (!StringUtils::StartsWith(declaredEncoding, "UTF-32") && !StringUtils::StartsWith(declaredEncoding, "UCS-4")))
 124     { /* Guessed encoding was correct because we can convert and read XML declaration, but declaration itself is wrong (not 4-bytes encoding) */
 125       detectedEncoding = guessedEncoding;
 126       return true;
 127     }
 128   }
 129   else if (StringUtils::StartsWith(guessedEncoding, "UTF-16"))
 130   {
 131     if (declaredEncoding.length() < 5 ||
 132         (!StringUtils::StartsWith(declaredEncoding, "UTF-16") && !StringUtils::StartsWith(declaredEncoding, "UCS-2")))
 133     { /* Guessed encoding was correct because we can read XML declaration, but declaration is wrong (not 2-bytes encoding) */
 134       detectedEncoding = guessedEncoding;
 135       return true;
 136     }
 137   }
 138
 139   if (StringUtils::StartsWith(guessedEncoding, "UCS-4") || StringUtils::StartsWith(guessedEncoding, "UTF-16"))
 140   {
 141     /* Check endianness in declared encoding. We already know correct endianness as XML declaration was detected after conversion. */
 142     /* Guessed UTF/UCS encoding always ends with endianness */
 143     std::string guessedEndianness(guessedEncoding, guessedEncoding.length() - 2);
 144
 145     if (!StringUtils::EndsWith(declaredEncoding, "BE") && !StringUtils::EndsWith(declaredEncoding, "LE")) /* Declared encoding without endianness */
 146       detectedEncoding = declaredEncoding + guessedEndianness; /* add guessed endianness */
 147     else if (!StringUtils::EndsWith(declaredEncoding, guessedEndianness)) /* Wrong endianness in declared encoding */
 148       detectedEncoding = declaredEncoding.substr(0, declaredEncoding.length() - 2) + guessedEndianness; /* replace endianness by guessed endianness */
 149     else
 150       detectedEncoding = declaredEncoding; /* declared encoding with correct endianness */
 151
 152     return true;
 153   }
 154   else if (StringUtils::StartsWith(guessedEncoding, "EBCDIC"))
 155   {
 156     if (declaredEncoding.find("EBCDIC") != std::string::npos)
 157       detectedEncoding = declaredEncoding; /* Declared encoding is some specific EBCDIC encoding */
 158     else
 159       detectedEncoding = guessedEncoding;
 160
 161     return true;
 162   }
 163
 164   /* should be unreachable */
 165   return false;
 166 }
 167
 168 bool CCharsetDetection::GetXmlEncodingFromDeclaration(const char* const xmlContent, const size_t contentLength, std::string& declaredEncoding)
 169 {
 170   // following code is std::string-processing analog of regular expression-processing
 171   // regular expression: "<\\?xml([ \n\r\t]+[^ \n\t\r>]+)*[ \n\r\t]+encoding[ \n\r\t]*=[ \n\r\t]*('[^ \n\t\r>']+'|\"[^ \n\t\r>\"]+\")"
 172   // on win32 x86 machine regular expression is slower that std::string 20-40 times and can slowdown XML processing for several times
 173   // seems that this regular expression is too slow due to many variable length parts, regexp for '&amp;'-fixing is much faster
 174
 175   declaredEncoding.clear();
 176
 177   // avoid extra large search
 178   std::string strXml(xmlContent, std::min(contentLength, m_XmlDeclarationMaxLength));
 179
 180   size_t pos = strXml.find("<?xml");
 181   if (pos == std::string::npos || pos + 6 > strXml.length() || pos > strXml.find('<'))
 182     return false; // no "<?xml" declaration, "<?xml" is not first element or "<?xml" is incomplete
 183
 184   pos += 5; // 5 is length of "<?xml"
 185
 186   const size_t declLength = std::min(std::min(m_XmlDeclarationMaxLength, contentLength - pos), strXml.find('>', pos) - pos);
 187   const std::string xmlDecl(xmlContent + pos, declLength);
 188   const char* const xmlDeclC = xmlDecl.c_str(); // for faster processing of [] and for null-termination
 189
 190   static const char* const whiteSpaceChars = " \n\r\t"; // according to W3C Recommendation for XML, any of them can be used as separator
 191   pos = 0;
 192
 193   while (pos + 12 <= declLength) // 12 is minimal length of "encoding='x'"
 194   {
 195     pos = xmlDecl.find_first_of(whiteSpaceChars, pos);
 196     if (pos == std::string::npos)
 197       return false; // no " encoding=" in declaration
 198
 199     pos = xmlDecl.find_first_not_of(whiteSpaceChars, pos);
 200     if (pos == std::string::npos)
 201       return false; // no "encoding=" in declaration
 202
 203     if (xmlDecl.compare(pos, 8, "encoding", 8) != 0)
 204       continue; // not "encoding" parameter
 205     pos += 8; // length of "encoding"
 206
 207     if (xmlDeclC[pos] == ' ' || xmlDeclC[pos] == '\n' || xmlDeclC[pos] == '\r' || xmlDeclC[pos] == '\t') // no buffer overrun as string is null-terminated
 208     {
 209       pos = xmlDecl.find_first_not_of(whiteSpaceChars, pos);
 210       if (pos == std::string::npos)
 211         return false; // this " encoding" is incomplete, only whitespace chars remains
 212     }
 213     if (xmlDeclC[pos] != '=')
 214     { // "encoding" without "=", try to find other
 215       pos--; // step back to whitespace
 216       continue;
 217     }
 218
 219     pos++; // skip '='
 220     if (xmlDeclC[pos] == ' ' || xmlDeclC[pos] == '\n' || xmlDeclC[pos] == '\r' || xmlDeclC[pos] == '\t') // no buffer overrun as string is null-terminated
 221     {
 222       pos = xmlDecl.find_first_not_of(whiteSpaceChars, pos);
 223       if (pos == std::string::npos)
 224         return false; // this " encoding" is incomplete, only whitespace chars remains
 225     }
 226     size_t encNameEndPos;
 227     if (xmlDeclC[pos] == '"')
 228       encNameEndPos = xmlDecl.find('"', ++pos);
 229     else if (xmlDeclC[pos] == '\'')
 230       encNameEndPos = xmlDecl.find('\'', ++pos);
 231     else
 232       continue; // no quote or double quote after 'encoding=', try to find other
 233
 234     if (encNameEndPos != std::string::npos)
 235     {
 236       declaredEncoding.assign(xmlDecl, pos, encNameEndPos - pos);
 237       return true;
 238     }
 239     // no closing quote or double quote after 'encoding="x', try to find other
 240   }
 241
 242   return false;
 243 }
 244
 245 bool CCharsetDetection::GuessXmlEncoding(const char* const xmlContent, const size_t contentLength, std::string& supposedEncoding)
 246 {
 247   supposedEncoding.clear();
 248   if (contentLength < 4)
 249     return false; // too little data to guess
 250
 251   if (xmlContent[0] == 0 && xmlContent[1] == 0 && xmlContent[2] == 0 && xmlContent[3] == (char)0x3C) // '<' == '00 00 00 3C' in UCS-4 (UTF-32) big-endian
 252     supposedEncoding = "UCS-4BE"; // use UCS-4 according to W3C recommendation
 253   else if (xmlContent[0] == (char)0x3C && xmlContent[1] == 0 && xmlContent[2] == 0 && xmlContent[3] == 0) // '<' == '3C 00 00 00' in UCS-4 (UTF-32) little-endian
 254     supposedEncoding = "UCS-4LE"; // use UCS-4 according to W3C recommendation
 255   else if (xmlContent[0] == 0 && xmlContent[1] == (char)0x3C && xmlContent[2] == 0 && xmlContent[3] == (char)0x3F) // "<?" == "00 3C 00 3F" in UTF-16 (UCS-2) big-endian
 256     supposedEncoding = "UTF-16BE";
 257   else if (xmlContent[0] == (char)0x3C && xmlContent[1] == 0 && xmlContent[2] == (char)0x3F && xmlContent[3] == 0) // "<?" == "3C 00 3F 00" in UTF-16 (UCS-2) little-endian
 258     supposedEncoding = "UTF-16LE";
 259   else if (xmlContent[0] == (char)0x4C && xmlContent[1] == (char)0x6F && xmlContent[2] == (char)0xA7 && xmlContent[3] == (char)0x94) // "<?xm" == "4C 6F A7 94" in most EBCDIC encodings
 260     supposedEncoding = "EBCDIC-CP-US"; // guessed value, real value must be read from declaration
 261   else
 262     return false;
 263
 264   return true;
 265 }
 266
 267 bool CCharsetDetection::ConvertHtmlToUtf8(const std::string& htmlContent, std::string& converted, const std::string& serverReportedCharset, std::string& usedHtmlCharset)
 268 {
 269   converted.clear();
 270   usedHtmlCharset.clear();
 271   if (htmlContent.empty())
 272   {
 273     usedHtmlCharset = "UTF-8"; // any charset can be used for empty content, use UTF-8 as default
 274     return false;
 275   }
 276
 277   // this is relaxed implementation of http://www.w3.org/TR/2013/CR-html5-20130806/single-page.html#determining-the-character-encoding
 278
 279   // try to get charset from Byte Order Mark
 280   std::string bomCharset(GetBomEncoding(htmlContent));
 281   if (checkConversion(bomCharset, htmlContent, converted))
 282   {
 283     usedHtmlCharset = bomCharset;
 284     return true;
 285   }
 286
 287   // try charset from HTTP header (or from other out-of-band source)
 288   if (checkConversion(serverReportedCharset, htmlContent, converted))
 289   {
 290     usedHtmlCharset = serverReportedCharset;
 291     return true;
 292   }
 293
 294   // try to find charset in HTML
 295   std::string declaredCharset(GetHtmlEncodingFromHead(htmlContent));
 296   if (!declaredCharset.empty())
 297   {
 298     if (declaredCharset.compare(0, 3, "UTF", 3) == 0)
 299       declaredCharset = "UTF-8"; // charset string was found in singlebyte mode, charset can't be multibyte encoding
 300     if (checkConversion(declaredCharset, htmlContent, converted))
 301     {
 302       usedHtmlCharset = declaredCharset;
 303       return true;
 304     }
 305   }
 306
 307   // try UTF-8 if not tried before
 308   if (bomCharset != "UTF-8" && serverReportedCharset != "UTF-8" && declaredCharset != "UTF-8" && checkConversion("UTF-8", htmlContent, converted))
 309   {
 310     usedHtmlCharset = "UTF-8";
 311     return false; // only guessed value
 312   }
 313
 314   // try user charset
 315   std::string userCharset(g_langInfo.GetGuiCharSet());
 316   if (checkConversion(userCharset, htmlContent, converted))
 317   {
 318     usedHtmlCharset = userCharset;
 319     return false; // only guessed value
 320   }
 321
 322   // try WINDOWS-1252
 323   if (checkConversion("WINDOWS-1252", htmlContent, converted))
 324   {
 325     usedHtmlCharset = "WINDOWS-1252";
 326     return false; // only guessed value
 327   }
 328
 329   // can't find exact charset
 330   // use one of detected as fallback
 331   if (!bomCharset.empty())
 332     usedHtmlCharset = bomCharset;
 333   else if (!serverReportedCharset.empty())
 334     usedHtmlCharset = serverReportedCharset;
 335   else if (!declaredCharset.empty())
 336     usedHtmlCharset = declaredCharset;
 337   else if (!userCharset.empty())
 338     usedHtmlCharset = userCharset;
 339   else
 340     usedHtmlCharset = "WINDOWS-1252";
 341
 342   CLog::Log(LOGWARNING, "{}: Can't correctly convert to UTF-8 charset, converting as \"{}\"",
 343             __FUNCTION__, usedHtmlCharset);
 344   g_charsetConverter.ToUtf8(usedHtmlCharset, htmlContent, converted, false);
 345
 346   return false;
 347 }
 348
 349 bool CCharsetDetection::ConvertPlainTextToUtf8(const std::string& textContent, std::string& converted, const std::string& serverReportedCharset, std::string& usedCharset)
 350 {
 351   converted.clear();
 352   usedCharset.clear();
 353   if (textContent.empty())
 354   {
 355     usedCharset = "UTF-8"; // any charset can be used for empty content, use UTF-8 as default
 356     return true;
 357   }
 358
 359   // try to get charset from Byte Order Mark
 360   std::string bomCharset(GetBomEncoding(textContent));
 361   if (checkConversion(bomCharset, textContent, converted))
 362   {
 363     usedCharset = bomCharset;
 364     return true;
 365   }
 366
 367   // try charset from HTTP header (or from other out-of-band source)
 368   if (checkConversion(serverReportedCharset, textContent, converted))
 369   {
 370     usedCharset = serverReportedCharset;
 371     return true;
 372   }
 373
 374   // try UTF-8 if not tried before
 375   if (bomCharset != "UTF-8" && serverReportedCharset != "UTF-8" && checkConversion("UTF-8", textContent, converted))
 376   {
 377     usedCharset = "UTF-8";
 378     return true;
 379   }
 380
 381   // try user charset
 382   std::string userCharset(g_langInfo.GetGuiCharSet());
 383   if (checkConversion(userCharset, textContent, converted))
 384   {
 385     usedCharset = userCharset;
 386     return true;
 387   }
 388
 389   // try system default charset
 390   if (g_charsetConverter.systemToUtf8(textContent, converted, true))
 391   {
 392     usedCharset = "char"; // synonym to system charset
 393     return true;
 394   }
 395
 396   // try WINDOWS-1252
 397   if (checkConversion("WINDOWS-1252", textContent, converted))
 398   {
 399     usedCharset = "WINDOWS-1252";
 400     return true;
 401   }
 402
 403   // can't find correct charset
 404   // use one of detected as fallback
 405   if (!serverReportedCharset.empty())
 406     usedCharset = serverReportedCharset;
 407   else if (!bomCharset.empty())
 408     usedCharset = bomCharset;
 409   else if (!userCharset.empty())
 410     usedCharset = userCharset;
 411   else
 412     usedCharset = "WINDOWS-1252";
 413
 414   CLog::Log(LOGWARNING, "{}: Can't correctly convert to UTF-8 charset, converting as \"{}\"",
 415             __FUNCTION__, usedCharset);
 416   g_charsetConverter.ToUtf8(usedCharset, textContent, converted, false);
 417
 418   return false;
 419 }
 420
 421
 422 bool CCharsetDetection::checkConversion(const std::string& srcCharset, const std::string& src, std::string& dst)
 423 {
 424   if (srcCharset.empty())
 425     return false;
 426
 427   if (srcCharset != "UTF-8")
 428   {
 429     if (g_charsetConverter.ToUtf8(srcCharset, src, dst, true))
 430       return true;
 431   }
 432   else if (CUtf8Utils::isValidUtf8(src))
 433   {
 434     dst = src;
 435     return true;
 436   }
 437
 438   return false;
 439 }
 440
 441 std::string CCharsetDetection::GetHtmlEncodingFromHead(const std::string& htmlContent)
 442 {
 443   std::string smallerHtmlContent;
 444   if (htmlContent.length() > 2 * m_HtmlCharsetEndSearchPos)
 445     smallerHtmlContent.assign(htmlContent, 0, 2 * m_HtmlCharsetEndSearchPos); // use twice more bytes to search for charset for safety
 446
 447   const std::string& html = smallerHtmlContent.empty() ? htmlContent : smallerHtmlContent; // limit search
 448   const char* const htmlC = html.c_str(); // for null-termination
 449   const size_t len = html.length();
 450
 451   // this is an implementation of http://www.w3.org/TR/2013/CR-html5-20130806/single-page.html#prescan-a-byte-stream-to-determine-its-encoding
 452   // labels in comments correspond to the labels in HTML5 standard
 453   // note: opposite to standard, everything is converted to uppercase instead of lower case
 454   size_t pos = 0;
 455   while (pos < len) // "loop" label
 456   {
 457     if (html.compare(pos, 4, "<!--", 4) == 0)
 458     {
 459       pos = html.find("-->", pos + 2);
 460       if (pos == std::string::npos)
 461         return "";
 462       pos += 2;
 463     }
 464     else if (htmlC[pos] == '<' && (htmlC[pos + 1] == 'm' || htmlC[pos + 1] == 'M') && (htmlC[pos + 2] == 'e' || htmlC[pos + 2] == 'E')
 465              && (htmlC[pos + 3] == 't' || htmlC[pos + 3] == 'T') && (htmlC[pos + 4] == 'a' || htmlC[pos + 4] == 'A')
 466              && (htmlC[pos + 5] == 0x09 || htmlC[pos + 5] == 0x0A || htmlC[pos + 5] == 0x0C || htmlC[pos + 5] == 0x0D || htmlC[pos + 5] == 0x20 || htmlC[pos + 5] == 0x2F))
 467     { // this is case insensitive "<meta" and one of tab, LF, FF, CR, space or slash
 468       pos += 5; // "pos" points to symbol after "<meta"
 469       std::string attrName, attrValue;
 470       bool gotPragma = false;
 471       std::string contentCharset;
 472       do // "attributes" label
 473       {
 474         pos = GetHtmlAttribute(html, pos, attrName, attrValue);
 475         if (attrName == "HTTP-EQUIV" && attrValue == "CONTENT-TYPE")
 476           gotPragma = true;
 477         else if (attrName == "CONTENT")
 478           contentCharset = ExtractEncodingFromHtmlMeta(attrValue);
 479         else if (attrName == "CHARSET")
 480         {
 481           StringUtils::Trim(attrValue, m_HtmlWhitespaceChars.c_str()); // tab, LF, FF, CR, space
 482           if (!attrValue.empty())
 483             return attrValue;
 484         }
 485       } while (!attrName.empty() && pos < len);
 486
 487       // "processing" label
 488       if (gotPragma && !contentCharset.empty())
 489         return contentCharset;
 490     }
 491     else if (htmlC[pos] == '<' && ((htmlC[pos + 1] >= 'A' && htmlC[pos + 1] <= 'Z') || (htmlC[pos + 1] >= 'a' && htmlC[pos + 1] <= 'z')))
 492     {
 493       pos = html.find_first_of("\x09\x0A\x0C\x0D >", pos); // tab, LF, FF, CR, space or '>'
 494       std::string attrName, attrValue;
 495       do
 496       {
 497         pos = GetHtmlAttribute(html, pos, attrName, attrValue);
 498       } while (pos < len && !attrName.empty());
 499     }
 500     else if (html.compare(pos, 2, "<!", 2) == 0 || html.compare(pos, 2, "</", 2) == 0 || html.compare(pos, 2, "<?", 2) == 0)
 501       pos = html.find('>', pos);
 502
 503     if (pos == std::string::npos)
 504       return "";
 505
 506     // "next byte" label
 507     pos++;
 508   }
 509
 510   return ""; // no charset was found
 511 }
 512
 513 size_t CCharsetDetection::GetHtmlAttribute(const std::string& htmlContent, size_t pos, std::string& attrName, std::string& attrValue)
 514 {
 515   attrName.clear();
 516   attrValue.clear();
 517   static const char* const htmlWhitespaceSlash = "\x09\x0A\x0C\x0D\x20\x2F"; // tab, LF, FF, CR, space or slash
 518   const char* const htmlC = htmlContent.c_str();
 519   const size_t len = htmlContent.length();
 520
 521   // this is an implementation of http://www.w3.org/TR/2013/CR-html5-20130806/single-page.html#concept-get-attributes-when-sniffing
 522   // labels in comments correspond to the labels in HTML5 standard
 523   // note: opposite to standard, everything is converted to uppercase instead of lower case
 524   pos = htmlContent.find_first_not_of(htmlWhitespaceSlash, pos);
 525   if (pos == std::string::npos || htmlC[pos] == '>')
 526     return pos; // only white spaces or slashes up to the end of the htmlContent or no more attributes
 527
 528   while (pos < len && htmlC[pos] != '=')
 529   {
 530     const char chr = htmlC[pos];
 531     if (chr == '/' || chr == '>')
 532       return pos; // no attributes or empty attribute value
 533     else if (m_HtmlWhitespaceChars.find(chr) != std::string::npos) // chr is one of whitespaces
 534     {
 535       pos = htmlContent.find_first_not_of(m_HtmlWhitespaceChars, pos); // "spaces" label
 536       if (pos == std::string::npos || htmlC[pos] != '=')
 537         return pos; // only white spaces up to the end or no attribute value
 538       break;
 539     }
 540     else
 541       appendCharAsAsciiUpperCase(attrName, chr);
 542
 543     pos++;
 544   }
 545
 546   if (pos >= len)
 547     return std::string::npos; // no '=', '/' or '>' were found up to the end of htmlContent
 548
 549   pos++; // advance pos to character after '='
 550
 551   pos = htmlContent.find_first_not_of(m_HtmlWhitespaceChars, pos); // "value" label
 552   if (pos == std::string::npos)
 553     return pos; // only white spaces remain in htmlContent
 554
 555   if (htmlC[pos] == '>')
 556     return pos; // empty attribute value
 557   else if (htmlC[pos] == '"' || htmlC[pos] == '\'')
 558   {
 559     const char qChr = htmlC[pos];
 560     // "quote loop" label
 561     while (++pos < len)
 562     {
 563       const char chr = htmlC[pos];
 564       if (chr == qChr)
 565         return pos + 1;
 566       else
 567         appendCharAsAsciiUpperCase(attrValue, chr);
 568     }
 569     return std::string::npos; // no closing quote is found
 570   }
 571
 572   appendCharAsAsciiUpperCase(attrValue, htmlC[pos]);
 573   pos++;
 574
 575   while (pos < len)
 576   {
 577     const char chr = htmlC[pos];
 578     if (m_HtmlWhitespaceChars.find(chr) != std::string::npos || chr == '>')
 579       return pos;
 580     else
 581       appendCharAsAsciiUpperCase(attrValue, chr);
 582
 583     pos++;
 584   }
 585
 586   return std::string::npos; // rest of htmlContent was attribute value
 587 }
 588
 589 std::string CCharsetDetection::ExtractEncodingFromHtmlMeta(const std::string& metaContent,
 590                                                            size_t pos /*= 0*/)
 591 {
 592   size_t len = metaContent.length();
 593   if (pos >= len)
 594     return "";
 595
 596   const char* const metaContentC = metaContent.c_str();
 597
 598   // this is an implementation of http://www.w3.org/TR/2013/CR-html5-20130806/single-page.html#algorithm-for-extracting-a-character-encoding-from-a-meta-element
 599   // labels in comments correspond to the labels in HTML5 standard
 600   // note: opposite to standard, case sensitive match is used as argument is always in uppercase
 601   std::string charset;
 602   do
 603   {
 604     // "loop" label
 605     pos = metaContent.find("CHARSET", pos);
 606     if (pos == std::string::npos)
 607       return "";
 608
 609     pos = metaContent.find_first_not_of(m_HtmlWhitespaceChars, pos + 7); // '7' is the length of 'CHARSET'
 610     if (pos != std::string::npos && metaContentC[pos] == '=')
 611     {
 612       pos = metaContent.find_first_not_of(m_HtmlWhitespaceChars, pos + 1);
 613       if (pos != std::string::npos)
 614       {
 615         if (metaContentC[pos] == '\'' || metaContentC[pos] == '"')
 616         {
 617           const char qChr = metaContentC[pos];
 618           pos++;
 619           const size_t closeQpos = metaContent.find(qChr, pos);
 620           if (closeQpos != std::string::npos)
 621             charset.assign(metaContent, pos, closeQpos - pos);
 622         }
 623         else
 624           charset.assign(metaContent, pos, metaContent.find("\x09\x0A\x0C\x0D ;", pos) - pos); // assign content up to the next tab, LF, FF, CR, space, semicolon or end of string
 625       }
 626       break;
 627     }
 628   } while (pos < len);
 629
 630   static const char* const htmlWhitespaceCharsC = m_HtmlWhitespaceChars.c_str();
 631   StringUtils::Trim(charset, htmlWhitespaceCharsC);
 632
 633   return charset;
 634 }
 635
 636 inline void CCharsetDetection::appendCharAsAsciiUpperCase(std::string& str, const char chr)
 637 {
 638   if (chr >= 'a' && chr <= 'z')
 639     str.push_back(chr - ('a' - 'A')); // convert to upper case
 640   else
 641     str.push_back(chr);
 642 }