hphp/zend/zend-html.cpp

   1 /*
   2    +----------------------------------------------------------------------+
   3    | HipHop for PHP                                                       |
   4    +----------------------------------------------------------------------+
   5    | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com)  |
   6    | Copyright (c) 1998-2010 Zend Technologies Ltd. (http://www.zend.com) |
   7    +----------------------------------------------------------------------+
   8    | This source file is subject to version 2.00 of the Zend license,     |
   9    | that is bundled with this package in the file LICENSE, and is        |
  10    | available through the world-wide-web at the following url:           |
  11    | http://www.zend.com/license/2_00.txt.                                |
  12    | If you did not receive a copy of the Zend license and are unable to  |
  13    | obtain it through the world-wide-web, please send a note to          |
  14    | license@zend.com so we can mail you a copy immediately.              |
  15    +----------------------------------------------------------------------+
  16 */
  17 #include "hphp/zend/zend-html.h"
  18
  19 #include <unicode/uchar.h>
  20 #include <unicode/utf8.h>
  21
  22 #include "hphp/util/lock.h"
  23 #include "hphp/util/functional.h"
  24 #include "hphp/util/hash-map.h"
  25
  26 namespace HPHP {
  27
  28 ///////////////////////////////////////////////////////////////////////////////
  29 // UTF-8 entity tables
  30
  31 using namespace entity_charset_enum;
  32
  33 static entity_table_t ent_cp_866[] = {
       /* Entity names for the byte range that entity_map pairs with cs_cp866
        * (0x80-0xff). Slot i names byte 0x80 + i; a nullptr slot has no named
        * entity and is skipped by init_entity_table(). */
  34   "Acy", "Bcy", "Vcy", "Gcy", "Dcy", "IEcy", "ZHcy", "Zcy",
  35   "Icy", "Jcy", "Kcy", "Lcy", "Mcy", "Ncy", "Ocy", "Pcy",
  36   "Rcy", "Scy", "Tcy", "Ucy", "Fcy", "KHcy", "TScy", "CHcy",
  37   "SHcy", "SHCHcy", "HARDcy", "Ycy", "SOFTcy", "Ecy", "YUcy", "YAcy",
  38   "acy", "bcy", "vcy", "gcy", "dcy", "iecy", "zhcy", "zcy",
  39   "icy", "jcy", "kcy", "lcy", "mcy", "ncy", "ocy", "pcy",
  40   "blk14", "blk12", "blk34", "boxv", "boxvl", "boxvL", "boxVl", "boxDl",
  41   "boxdL", "boxVL", "boxV", "boxDL", "boxUL", "boxUl", "boxuL", "boxdl",
  42   "boxur", "boxhu", "boxhd", "boxvr", "boxh", "boxvh", "boxvR", "boxVr",
  43   "boxUR", "boxDR", "boxHU", "boxHD", "boxVR", "boxH", "boxVH", "boxHu",
  44   "boxhU", "boxHd", "boxhD", "boxUr", "boxuR", "boxdR", "boxDr", "boxVh",
  45   "boxvH", "boxul", "boxdr", "block", "lhblk", nullptr, nullptr, "uhblk",
  46   "rcy", "scy", "tcy", "ucy", "fcy", "khcy", "tscy", "chcy",
  47   "shcy", "shchcy", "hardcy", "ycy", "softcy", "ecy", "yucy", "yacy",
  48   "IOcy", "iocy", "Jukcy", "jukcy", "YIcy", "yicy", "Ubrcy", "ubrcy",
  49   "deg", nullptr, "middot", "Sqrt", "numero", "curren", nullptr, "nbsp"
  50 };
  51
  52 static entity_table_t ent_cp_1251[] = {
       /* Entity names for the byte range that entity_map pairs with cs_cp1251
        * (0x80-0xff). Slot i names byte 0x80 + i; a nullptr slot has no named
        * entity and is skipped by init_entity_table(). */
  53   "DJcy", "GJcy", "sbquo", "gjcy", "bdquo", "hellip", "dagger", "Dagger",
  54   "euro", "permil", "LJcy", "lsaquo", "NJcy", "KJcy", "TSHcy", "DZcy",
  55   "djcy", "lsquo", "rsquo", "ldquo", "rdquo", "bull", "ndash", "mdash",
  56   nullptr, "trade", "ljcy", "rsaquo", "njcy", "kjcy", "tshcy", "dzcy",
  57   "nbsp", "Ubrcy", "ubrcy", "Jsercy", "curren", nullptr, "brvbar", "sect",
  58   "IOcy", "copy", "Jukcy", "laquo", "not", "shy", "reg", "YIcy",
  59   "deg", "pm", "Iukcy", "iukcy", nullptr, "micro", "para", "middot",
  60   "iocy", "numero", "jukcy", "raquo", "jsercy", "DScy", "dscy", "yicy",
  61   "Acy", "Bcy", "Vcy", "Gcy", "Dcy", "IEcy", "ZHcy", "Zcy",
  62   "Icy", "Jcy", "Kcy", "Lcy", "Mcy", "Ncy", "Ocy", "Pcy",
  63   "Rcy", "Scy", "Tcy", "Ucy", "Fcy", "KHcy", "TScy", "CHcy",
  64   "SHcy", "SHCHcy", "HARDcy", "Ycy", "SOFTcy", "Ecy", "YUcy", "YAcy",
  65   "acy", "bcy", "vcy", "gcy", "dcy", "iecy", "zhcy", "zcy",
  66   "icy", "jcy", "kcy", "lcy", "mcy", "ncy", "ocy", "pcy",
  67   "rcy", "scy", "tcy", "ucy", "fcy", "khcy", "tscy", "chcy",
  68   "shcy", "shchcy", "hardcy", "ycy", "softcy", "ecy", "yucy", "yacy"
  69 };
  70
  71 /* codepage 1252 is a Windows extension to iso-8859-1.
      * This table only covers bytes 0x80-0x9f; entity_map sends 0xa0-0xff of
      * cs_cp1252 to ent_iso_8859_1 instead. Slot i names byte 0x80 + i, and a
      * nullptr slot has no named entity. */
  72 static entity_table_t ent_cp_1252[] = {
  73   "euro", nullptr, "sbquo", "fnof", "bdquo", "hellip", "dagger",
  74   "Dagger", "circ", "permil", "Scaron", "lsaquo", "OElig",
  75   nullptr, nullptr, nullptr, nullptr, "lsquo", "rsquo", "ldquo", "rdquo",
  76   "bull", "ndash", "mdash", "tilde", "trade", "scaron", "rsaquo",
  77   "oelig", nullptr, nullptr, "Yuml"
  78 };
  79
  80 static entity_table_t ent_iso_8859_1[] = {
       /* Entity names for values 0xa0-0xff; slot i names value 0xa0 + i.
        * entity_map reuses this table for cs_8859_1, the upper half of
        * cs_cp1252, code points 0xa0-0xff of cs_utf_8, and the big5 / gb2312 /
        * big5hkscs / sjis / eucjp rows. string_html_encode() also indexes it
        * directly with (byte - 160), so the order here must not change. */
  81   "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar",
  82   "sect", "uml", "copy", "ordf", "laquo", "not", "shy", "reg",
  83   "macr", "deg", "plusmn", "sup2", "sup3", "acute", "micro",
  84   "para", "middot", "cedil", "sup1", "ordm", "raquo", "frac14",
  85   "frac12", "frac34", "iquest", "Agrave", "Aacute", "Acirc",
  86   "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
  87   "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
  88   "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
  89   "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
  90   "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
  91   "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
  92   "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
  93   "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
  94   "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
  95   "uuml", "yacute", "thorn", "yuml"
  96 };
  97
  98 static entity_table_t ent_iso_8859_5[] = {
       /* Entity names for the byte range that entity_map pairs with cs_8859_5
        * (0xa0-0xff). Slot i names byte 0xa0 + i. */
  99   "nbsp", "IOcy", "DJcy", "GJcy", "Jukcy", "DScy", "Iukcy", "YIcy",
 100   "Jsercy", "LJcy", "NJcy", "TSHcy", "KJcy", "shy", "Ubrcy", "DZcy",
 101   "Acy", "Bcy", "Vcy", "Gcy", "Dcy", "IEcy", "ZHcy", "Zcy",
 102   "Icy", "Jcy", "Kcy", "Lcy", "Mcy", "Ncy", "Ocy", "Pcy",
 103   "Rcy", "Scy", "Tcy", "Ucy", "Fcy", "KHcy", "TScy", "CHcy",
 104   "SHcy", "SHCHcy", "HARDcy", "Ycy", "SOFTcy", "Ecy", "YUcy", "YAcy",
 105   "acy", "bcy", "vcy", "gcy", "dcy", "iecy", "zhcy", "zcy",
 106   "icy", "jcy", "kcy", "lcy", "mcy", "ncy", "ocy", "pcy",
 107   "rcy", "scy", "tcy", "ucy", "fcy", "khcy", "tscy", "chcy",
 108   "shcy", "shchcy", "hardcy", "ycy", "softcy", "ecy", "yucy", "yacy",
 109   "numero", "iocy", "djcy", "gjcy", "jukcy", "dscy", "iukcy", "yicy",
 110   "jsercy", "ljcy", "njcy", "tshcy", "kjcy", "sect", "ubrcy", "dzcy"
 111 };
 112
 113 static entity_table_t ent_iso_8859_15[] = {
       /* Entity names for the byte range that entity_map pairs with cs_8859_15
        * (0xa0-0xff). Slot i names byte 0xa0 + i. The two nullptr slots are the
        * positions annotated Zcaron / zcaron below, which get no entity name. */
 114   "nbsp", "iexcl", "cent", "pound", "euro", "yen", "Scaron",
 115   "sect", "scaron", "copy", "ordf", "laquo", "not", "shy", "reg",
 116   "macr", "deg", "plusmn", "sup2", "sup3", nullptr, /* Zcaron */
 117   "micro", "para", "middot", nullptr, /* zcaron */ "sup1", "ordm",
 118   "raquo", "OElig", "oelig", "Yuml", "iquest", "Agrave", "Aacute",
 119   "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
 120   "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
 121   "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
 122   "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
 123   "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
 124   "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
 125   "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
 126   "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
 127   "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
 128   "uuml", "yacute", "thorn", "yuml"
 129 };
 130
 131 static entity_table_t ent_uni_338_402[] = {
       /* Entity names for code points 338-402, the range entity_map pairs with
        * this table for cs_utf_8. Slot i names code point 338 + i; nullptr
        * slots are padding for code points that have no named entity. */
 132   /* 338 (0x0152) */
 133   "OElig", "oelig", nullptr, nullptr, nullptr, nullptr,
 134   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 135   /* 352 (0x0160) */
 136   "Scaron", "scaron", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 137   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 138   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 139   /* 376 (0x0178) */
 140   "Yuml", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 141   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 142   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 143   /* 400 (0x0190) */
 144   nullptr, nullptr, "fnof"
 145 };
 146
 147 static entity_table_t ent_uni_spacing[] = {
       /* Entity names for code points 710-732 (cs_utf_8 row of entity_map).
        * Slot i names code point 710 + i; only the first and last slots carry
        * a name. */
 148   /* 710 */
 149   "circ",
 150   /* 711 - 730 */
 151   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 152   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 153   /* 731 - 732 */
 154   nullptr, "tilde"
 155 };
 156
 157 static entity_table_t ent_uni_greek[] = {
       /* Entity names for code points 913-982 (cs_utf_8 row of entity_map).
        * Slot i names code point 913 + i; nullptr slots have no named entity. */
 158   /* 913 */
 159   "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta",
 160   "Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho",
 161   nullptr, "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi", "Omega",
 162   /* 938 - 944 are not mapped */
 163   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 164   "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta",
 165   "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho",
 166   "sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega",
 167   /* 970 - 976 are not mapped */
 168   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
       /* 977 - 978 */
 169   "thetasym", "upsih",
       /* 979 - 981 are not mapped */
 170   nullptr, nullptr, nullptr,
       /* 982 */
 171   "piv"
 172 };
 173
 174 static entity_table_t ent_uni_punct[] = {
       /* Entity names for code points 8194-8260 (cs_utf_8 row of entity_map).
        * Slot i names code point 8194 + i; nullptr slots have no named entity. */
 175   /* 8194 */
 176   "ensp", "emsp", nullptr, nullptr, nullptr, nullptr, nullptr,
 177   "thinsp", nullptr, nullptr, "zwnj", "zwj", "lrm", "rlm",
 178   nullptr, nullptr, nullptr, "ndash", "mdash", nullptr, nullptr, nullptr,
 179   /* 8216 */
 180   "lsquo", "rsquo", "sbquo", nullptr, "ldquo", "rdquo", "bdquo", nullptr,
 181   "dagger", "Dagger", "bull", nullptr, nullptr, nullptr, "hellip",
 182   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, "permil", nullptr,
 183   /* 8242 */
 184   "prime", "Prime", nullptr, nullptr, nullptr, nullptr, nullptr, "lsaquo", "rsaquo", nullptr,
 185   nullptr, nullptr, "oline", nullptr, nullptr, nullptr, nullptr, nullptr,
       /* 8260 */
 186   "frasl"
 187 };
 188
 189 static entity_table_t ent_uni_euro[] = {
       /* Single-slot table: entity_map pairs it with code point 8364 only. */
 190   "euro"
 191 };
 192
 193 static entity_table_t ent_uni_8465_8501[] = {
       /* Entity names for code points 8465-8501 (cs_utf_8 row of entity_map).
        * Slot i names code point 8465 + i; nullptr slots have no named entity. */
 194   /* 8465 */
 195   "image", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 196   /* 8472 */
 197   "weierp", nullptr, nullptr, nullptr,
 198   /* 8476 */
 199   "real", nullptr, nullptr, nullptr, nullptr, nullptr,
 200   /* 8482 */
 201   "trade", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 202   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 203   /* 8501 */
 204   "alefsym",
 205 };
 206
 207 static entity_table_t ent_uni_8592_9002[] = {
       /* Entity names for code points 8592-9002 (cs_utf_8 row of entity_map).
        * Slot i names code point 8592 + i; nullptr slots have no named entity.
        * Each commented group below starts at the code point in its comment.
        *
        * NOTE(review): "asymp" appears twice, at 8776 and at 8781. Both slots
        * are inserted under the same name by init_entity_table(), so if the map
        * compares keys by content the later slot (8781) is the one that
        * "&asymp;" decodes to -- confirm which code point is intended. */
 208   /* 8592 (0x2190) */
 209   "larr", "uarr", "rarr", "darr", "harr", nullptr, nullptr, nullptr,
 210   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 211   /* 8608 (0x21a0) */
 212   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 213   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 214   /* 8624 (0x21b0) */
 215   nullptr, nullptr, nullptr, nullptr, nullptr, "crarr", nullptr, nullptr,
 216   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 217   /* 8640 (0x21c0) */
 218   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 219   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 220   /* 8656 (0x21d0) */
 221   "lArr", "uArr", "rArr", "dArr", "hArr", "vArr", nullptr, nullptr,
 222   nullptr, nullptr, "lAarr", "rAarr", nullptr, "rarrw", nullptr, nullptr,
 223   /* 8672 (0x21e0) - 8703 (0x21ff): four rows, all unmapped */
 224   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 225   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 226   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 227   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 228   /* 8704 (0x2200) */
 229   "forall", "comp", "part", "exist", "nexist", "empty", nullptr, "nabla",
 230   "isin", "notin", "epsis", "ni", "notni", "bepsi", nullptr, "prod",
 231   /* 8720 (0x2210) */
 232   "coprod", "sum", "minus", "mnplus", "plusdo", nullptr, "setmn", "lowast",
 233   "compfn", nullptr, "radic", nullptr, nullptr, "prop", "infin", "ang90",
 234   /* 8736 (0x2220) */
 235   "ang", "angmsd", "angsph", "mid", "nmid", "par", "npar", "and",
 236   "or", "cap", "cup", "int", nullptr, nullptr, "conint", nullptr,
 237   /* 8752 (0x2230) */
 238   nullptr, nullptr, nullptr, nullptr, "there4", "becaus", nullptr, nullptr,
 239   nullptr, nullptr, nullptr, nullptr, "sim", "bsim", nullptr, nullptr,
 240   /* 8768 (0x2240) */
 241   "wreath", "nsim", nullptr, "sime", "nsime", "cong", nullptr, "ncong",
 242   "asymp", "nap", "ape", nullptr, "bcong", "asymp", "bump", "bumpe",
 243   /* 8784 (0x2250) */
 244   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 245   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 246   /* 8800 (0x2260) */
 247   "ne", "equiv", nullptr, nullptr, "le", "ge", "lE", "gE",
 248   "lnE", "gnE", "Lt", "Gt", "twixt", nullptr, "nlt", "ngt",
 249   /* 8816 (0x2270) */
 250   "nles", "nges", "lsim", "gsim", nullptr, nullptr, "lg", "gl",
 251   nullptr, nullptr, "pr", "sc", "cupre", "sscue", "prsim", "scsim",
 252   /* 8832 (0x2280) */
 253   "npr", "nsc", "sub", "sup", "nsub", "nsup", "sube", "supe",
 254   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 255   /* 8848 (0x2290) */
 256   nullptr, nullptr, nullptr, nullptr, nullptr, "oplus", nullptr, "otimes",
 257   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 258   /* 8864 (0x22a0) */
 259   nullptr, nullptr, nullptr, nullptr, nullptr, "perp", nullptr, nullptr,
 260   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 261   /* 8880 (0x22b0) */
 262   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 263   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 264   /* 8896 (0x22c0) */
 265   nullptr, nullptr, nullptr, nullptr, nullptr, "sdot", nullptr, nullptr,
 266   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 267   /* 8912 (0x22d0) */
 268   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 269   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 270   /* 8928 (0x22e0) */
 271   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 272   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 273   /* 8944 (0x22f0) */
 274   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 275   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 276   /* 8960 (0x2300) */
 277   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 278   "lceil", "rceil", "lfloor", "rfloor", nullptr, nullptr, nullptr, nullptr,
 279   /* 8976 (0x2310) */
 280   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 281   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 282   /* 8992 (0x2320) */
 283   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 284   nullptr, "lang", "rang"
 285 };
 286
 287 static entity_table_t ent_uni_9674[] = {
       /* Single-slot table: entity_map pairs it with code point 9674 only. */
 288   /* 9674 */
 289   "loz"
 290 };
 291
 292 static entity_table_t ent_uni_9824_9830[] = {
       /* Entity names for code points 9824-9830 (cs_utf_8 row of entity_map).
        * Slot i names code point 9824 + i; nullptr slots have no named entity. */
 293   /* 9824 */
 294   "spades", nullptr, nullptr, "clubs", nullptr, "hearts", "diams"
 295 };
 296
 297 static const struct html_entity_map entity_map[] = {
       /* One row per (charset, value range): { charset, basechar, endchar,
        * table }. init_entity_table() walks each row from basechar to endchar
        * inclusive and reads table[value - basechar], so every table must hold
        * exactly endchar - basechar + 1 slots. A charset may appear in several
        * rows. The list ends at the cs_terminator row. */
 298   { cs_cp866,     0x80, 0xff, ent_cp_866 },
 299   { cs_cp1251,    0x80, 0xff, ent_cp_1251 },
 300   { cs_cp1252,    0x80, 0x9f, ent_cp_1252 },
 301   { cs_cp1252,    0xa0, 0xff, ent_iso_8859_1 },
 302   { cs_8859_1,    0xa0, 0xff, ent_iso_8859_1 },
 303   { cs_8859_5,    0xa0, 0xff, ent_iso_8859_5 },
 304   { cs_8859_15,   0xa0, 0xff, ent_iso_8859_15 },
 305   { cs_utf_8,     0xa0, 0xff, ent_iso_8859_1 },
 306   { cs_utf_8,     338,  402,  ent_uni_338_402 },
 307   { cs_utf_8,     710,  732,  ent_uni_spacing },
 308   { cs_utf_8,     913,  982,  ent_uni_greek },
 309   { cs_utf_8,     8194, 8260, ent_uni_punct },
 310   { cs_utf_8,     8364, 8364, ent_uni_euro },
 311   { cs_utf_8,     8465, 8501, ent_uni_8465_8501 },
 312   { cs_utf_8,     8592, 9002, ent_uni_8592_9002 },
 313   { cs_utf_8,     9674, 9674, ent_uni_9674 },
 314   { cs_utf_8,     9824, 9830, ent_uni_9824_9830 },
       /* For the multibyte charsets below, init_entity_table() inserts none of
        * the table entries (its switch skips them); only the basic
        * quot/lt/gt/amp entries end up in their maps. */
 315   { cs_big5,      0xa0, 0xff, ent_iso_8859_1 },
 316   { cs_gb2312,    0xa0, 0xff, ent_iso_8859_1 },
 317   { cs_big5hkscs, 0xa0, 0xff, ent_iso_8859_1 },
 318   { cs_sjis,      0xa0, 0xff, ent_iso_8859_1 },
 319   { cs_eucjp,     0xa0, 0xff, ent_iso_8859_1 },
 320   /* Missing support for these at the moment
 321   { cs_koi8r,     0xa3, 0xff, ent_koi8r },
 322   { cs_macroman,  0x0b, 0xff, ent_macroman },
 323   */
 324   { cs_terminator }
 325 };
 326
 327 static const struct {
       /* Charset names accepted by determine_charset(), which compares the
        * whole name case-insensitively and returns the paired charset for the
        * first match. The list ends at the entry whose codeset is nullptr. */
 328   const char *codeset;
 329   entity_charset charset;
 330 } charset_map[] = {
 331   { "ISO-8859-1",     cs_8859_1 },
 332   { "ISO8859-1",      cs_8859_1 },
 333   { "ISO-8859-5",     cs_8859_5 },
 334   { "ISO8859-5",      cs_8859_5 },
 335   { "ISO-8859-15",    cs_8859_15 },
 336   { "ISO8859-15",     cs_8859_15 },
 337   { "utf-8",          cs_utf_8 },
 338   { "cp866",          cs_cp866 },
 339   { "866",            cs_cp866 },
 340   { "ibm866",         cs_cp866 },
 341   { "cp1251",         cs_cp1251 },
 342   { "Windows-1251",   cs_cp1251 },
 343   { "win-1251",       cs_cp1251 },
 344   { "cp1252",         cs_cp1252 },
 345   { "Windows-1252",   cs_cp1252 },
 346   { "1252",           cs_cp1252 },
 347   { "BIG5",           cs_big5 },
 348   { "950",            cs_big5 },
 349   { "GB2312",         cs_gb2312 },
 350   { "936",            cs_gb2312 },
 351   { "BIG5-HKSCS",     cs_big5hkscs },
 352   { "Shift_JIS",      cs_sjis },
 353   { "SJIS",           cs_sjis },
 354   { "932",            cs_sjis },
 355   { "EUCJP",          cs_eucjp },
       /* NOTE(review): "EUC-JP" sits in the disabled block below even though
        * cs_eucjp itself is enabled via "EUCJP" above -- confirm whether the
        * hyphenated spelling is meant to be rejected. */
 356   /* Missing support for these at the moment
 357   { "EUC-JP",         cs_eucjp },
 358   { "KOI8-R",         cs_koi8r },
 359   { "koi8-ru",        cs_koi8r },
 360   { "koi8r",          cs_koi8r },
 361   { "MacRoman",       cs_macroman },
 362   */
 363   { nullptr }
 364 };
 365
 366 ///////////////////////////////////////////////////////////////////////////////
 367
 368 entity_charset determine_charset(const char *charset_hint) {
 369   entity_charset charset = cs_unknown;
 370
 371   if (charset_hint == nullptr) {
 372     // default to utf-8
 373     return cs_utf_8;
 374   }
 375
 376   size_t len = strlen(charset_hint);
 377
 378   /* now walk the charset map and look for the codeset */
 379   for (int i = 0; charset_map[i].codeset; i++) {
 380     if (len == strlen(charset_map[i].codeset) &&
 381       strncasecmp(charset_hint, charset_map[i].codeset, len) == 0) {
 382       charset = charset_map[i].charset;
 383       break;
 384     }
 385   }
 386
 387   return charset;
 388 }
 389
 390 static int utf32_to_utf8(unsigned char *buf, int k) {
 391   int retval = 0;
 392
 393   if (k < 0x80) {
 394     buf[0] = k;
 395     retval = 1;
 396   } else if (k < 0x800) {
 397     buf[0] = 0xc0 | (k >> 6);
 398     buf[1] = 0x80 | (k & 0x3f);
 399     retval = 2;
 400   } else if (k < 0x10000) {
 401     buf[0] = 0xe0 | (k >> 12);
 402     buf[1] = 0x80 | ((k >> 6) & 0x3f);
 403     buf[2] = 0x80 | (k & 0x3f);
 404     retval = 3;
 405   } else if (k < 0x200000) {
 406     buf[0] = 0xf0 | (k >> 18);
 407     buf[1] = 0x80 | ((k >> 12) & 0x3f);
 408     buf[2] = 0x80 | ((k >> 6) & 0x3f);
 409     buf[3] = 0x80 | (k & 0x3f);
 410     retval = 4;
 411   } else if (k < 0x4000000) {
 412     buf[0] = 0xf8 | (k >> 24);
 413     buf[1] = 0x80 | ((k >> 18) & 0x3f);
 414     buf[2] = 0x80 | ((k >> 12) & 0x3f);
 415     buf[3] = 0x80 | ((k >> 6) & 0x3f);
 416     buf[4] = 0x80 | (k & 0x3f);
 417     retval = 5;
 418   } else {
 419     buf[0] = 0xfc | (k >> 30);
 420     buf[1] = 0x80 | ((k >> 24) & 0x3f);
 421     buf[2] = 0x80 | ((k >> 18) & 0x3f);
 422     buf[3] = 0x80 | ((k >> 12) & 0x3f);
 423     buf[4] = 0x80 | ((k >> 6) & 0x3f);
 424     buf[5] = 0x80 | (k & 0x3f);
 425     retval = 6;
 426   }
 427   buf[retval] = '\0';
 428
 429   return retval;
 430 }
 431
 432 using HtmlEntityMap = hphp_const_char_map<std::string>;  // entity name -> replacement bytes
 433
     // NOTE(review): neither EntityMapInited nor EntityMapMutex is used in the
     // part of the file reviewed here; presumably they guard one-time
     // initialisation in html_get_entity_map() -- confirm. If the flag is read
     // outside the mutex, note that `volatile` alone does not provide
     // inter-thread ordering.
 434 static volatile bool EntityMapInited = false;
 435 static Mutex EntityMapMutex;
     // Both arrays are indexed by entity_charset and filled by
     // init_entity_table(). The cs_terminator slot is the table decode_entity()
     // uses when it is called with all == false.
 436 static HtmlEntityMap EntityMap[cs_end];
 437 static HtmlEntityMap XHPEntityMap[cs_end];
 438
 439 static void init_entity_table() {
 440   for (unsigned int i = 0; entity_map[i].charset != cs_terminator; i++) {
 441     const html_entity_map &em = entity_map[i];
 442     const entity_charset charset = entity_map[i].charset;
 443
 444     int index = 0;
 445     for (int ch = em.basechar; ch <= em.endchar; ch++, index++) {
 446       const char *entity = em.table[index];
 447       if (entity == nullptr) {
 448         continue;
 449       }
 450       unsigned char buf[10];
 451       switch (charset) {
 452         case cs_8859_1:
 453         case cs_cp1252:
 454         case cs_8859_15:
 455         case cs_cp1251:
 456         case cs_8859_5:
 457         case cs_cp866:
 458         case cs_koi8r:
 459           buf[0] = ch;
 460           buf[1] = '\0';
 461           break;
 462
 463         case cs_utf_8:
 464           utf32_to_utf8(buf, ch);
 465           break;
 466
 467         default:
 468           continue;
 469       }
 470       EntityMap[charset][entity] = (const char *)buf;
 471       XHPEntityMap[charset][entity] = (const char *)buf;
 472     }
 473
 474     EntityMap[charset]["quot"] = "\"";
 475     EntityMap[charset]["lt"] = "<";
 476     EntityMap[charset]["gt"] = ">";
 477     EntityMap[charset]["amp"] = "&";
 478
 479     XHPEntityMap[charset]["quot"] = "\"";
 480     XHPEntityMap[charset]["lt"] = "<";
 481     XHPEntityMap[charset]["gt"] = ">";
 482     XHPEntityMap[charset]["amp"] = "&";
 483     // XHP-specific entities
 484     XHPEntityMap[charset]["apos"] = "\'";
 485     XHPEntityMap[charset]["cloud"] = (const char *)u8"\u2601";
 486     XHPEntityMap[charset]["umbrella"] = (const char *)u8"\u2602";
 487     XHPEntityMap[charset]["snowman"] = (const char *)u8"\u2603";
 488     XHPEntityMap[charset]["snowflake"] = (const char *)u8"\u2745";
 489     XHPEntityMap[charset]["comet"] = (const char *)u8"\u2604";
 490     XHPEntityMap[charset]["thunderstorm"] = (const char *)u8"\u2608";
 491   }
 492
 493   // the first element is an empty table
 494   EntityMap[cs_terminator]["quot"] = "\"";
 495   EntityMap[cs_terminator]["lt"] = "<";
 496   EntityMap[cs_terminator]["gt"] = ">";
 497   EntityMap[cs_terminator]["amp"] = "&";
 498   // XHP-specific entities
 499   XHPEntityMap[cs_terminator]["apos"] = "\'";
 500   XHPEntityMap[cs_terminator]["cloud"] = (const char *)u8"\u2601";
 501   XHPEntityMap[cs_terminator]["umbrella"] = (const char *)u8"\u2602";
 502   XHPEntityMap[cs_terminator]["snowman"] = (const char *)u8"\u2603";
 503   XHPEntityMap[cs_terminator]["snowflake"] = (const char *)u8"\u2745";
 504   XHPEntityMap[cs_terminator]["comet"] = (const char *)u8"\u2604";
 505   XHPEntityMap[cs_terminator]["thunderstorm"] = (const char *)u8"\u2608";
 506 }
 507
 508 ///////////////////////////////////////////////////////////////////////////////
 509 inline static bool decode_entity(char *entity, int *len,
 510                                  bool decode_double_quote,
 511                                  bool decode_single_quote,
 512                                  entity_charset charset, bool all,
 513                                  bool xhp = false) {
 514   // entity is 16 bytes, allocated statically below
 515   // default in PHP
 516   assert(entity && *entity);
 517   if (entity[0] == '#') {
 518     int code;
 519     if (entity[1] == 'x' || entity[1] == 'X') {
 520       if (!isxdigit(entity[2])) return false;
 521       code = strtol(entity + 2, nullptr, 16);
 522     } else {
 523       if (!isdigit(entity[1])) return false;
 524       code = strtol(entity + 1, nullptr, 10);
 525     }
 526
 527     // since we don't support multibyte chars other than utf-8
 528     int l = 1;
 529
 530     if (code == 39 && decode_single_quote) {
 531       entity[0] = code;
 532       entity[1] = '\0';
 533       *len = l;
 534       return true;
 535     }
 536
 537     if (!all          && (code != '&') &&
 538         (code != '<') && (code != '>') &&
 539         (code != '"') && (code != '\'')) {
 540       // htmlspecialchars_decode() does not parse numeric
 541       // entities other than & < > " '
 542       return false;
 543     }
 544
 545     switch (charset) {
 546       case cs_utf_8:
 547       {
 548         unsigned char buf[10];
 549         int size = utf32_to_utf8(buf, code);
 550         memcpy(entity, buf, size + 1);
 551         l = size;
 552         break;
 553       }
 554
 555       case cs_8859_1:
 556       case cs_8859_5:
 557       case cs_8859_15:
 558         if ((code >= 0x80 && code < 0xa0) || code > 0xff) {
 559           return false;
 560         } else {
 561           if (code == 39) {
 562             return false;
 563           }
 564           entity[0] = code;
 565           entity[1] = '\0';
 566         }
 567         break;
 568
 569       case cs_cp1252:
 570       case cs_cp1251:
 571       case cs_cp866:
 572         if (code > 0xff) {
 573           return false;
 574         }
 575         entity[0] = code;
 576         entity[1] = '\0';
 577         break;
 578
 579       case cs_big5:
 580       case cs_big5hkscs:
 581       case cs_sjis:
 582       case cs_eucjp:
 583         if (code >= 0x80) {
 584           return false;
 585         }
 586         entity[0] = code;
 587         entity[1] = '\0';
 588         break;
 589
 590       case cs_gb2312:
 591         if (code >= 0x81) {
 592           return false;
 593         }
 594         entity[0] = code;
 595         entity[1] = '\0';
 596         break;
 597
 598       default:
 599         return false;
 600         break;
 601     }
 602     *len = l;
 603     return true;
 604   } else {
 605     HtmlEntityMap *entityMap;
 606
 607     if (strncasecmp(entity, "quot", 4) == 0 && !decode_double_quote) {
 608       return false;
 609     }
 610
 611     if (all) {
 612       entityMap = xhp ? &XHPEntityMap[charset] : &EntityMap[charset];
 613     } else {
 614       entityMap = xhp ? &XHPEntityMap[cs_terminator]
 615                       : &EntityMap[cs_terminator];
 616     }
 617     HtmlEntityMap::const_iterator iter = entityMap->find(entity);
 618     if (iter != entityMap->end()) {
 619       memcpy(entity, iter->second.c_str(), iter->second.length() + 1);
 620       *len = iter->second.length();
 621       return true;
 622     }
 623   }
 624
 625   return false;
 626 }
 627
 628 inline static bool encode_entity(char* buf, int* buflen,
 629                                  const char* entity, bool utf8) {
 630   entity_charset charset = cs_utf_8;
 631   if (!utf8){ charset = cs_8859_1; }
 632
 633   HtmlEntityMap *entityMap = &EntityMap[charset];
 634
 635   for(HtmlEntityMap::const_iterator iter = entityMap->begin();
 636       iter != entityMap->end(); iter++) {
 637     if (strcmp(iter->second.c_str(), entity) == 0) {
 638       memcpy(buf, iter->first, strlen(iter->first));
 639       *buflen = strlen(iter->first);
 640       return true;
 641     }
 642   }
 643   return false;
 644 }
 645
 646 char *string_html_encode(const char *input, int &len,
 647                          const int64_t qsBitmask, bool utf8,
 648                          bool dEncode, bool htmlEnt) {
 649   assert(input);
 650   /**
 651    * Though seems to be wasting memory a lot, we have to realize most of the
 652    * time this function is called with small strings, or fragments of HTMLs.
 653    * Allocating/deallocating anything less than 1K is trivial these days, and
 654    * we want avoid string copying as much as possible. Of course, the return
 655    * char * is really sent back at large, occupying unnecessary space for
 656    * potentially longer time than we need, we have to realize the two closest
 657    * solutions are not that much better, either:
 658    *
 659    * 1. pre-calculate size by iterating through the string once: too time
 660    *    consuming;
 661    * 2. take a guess and double buffer size when over: still wasting, and
 662    *    it may not save that much.
 663    *
 664    * Note: Amount of  allocation per character to be encoded may have to be
 665    * increased as larger HTML Entities are implemented.
 666    */
 667   char *ret = (char *)malloc(len * 14uL + 1);
 668   if (!ret) {
 669     return nullptr;
 670   }
 671   char *q = ret;
 672   for (const char *p = input, *end = input + len; p < end; p++) {
 673     unsigned char c = *p;
 674     char entity[5];
 675     int codeLength = 0;
 676     switch (c) {
 677     case '"':
 678       if (qsBitmask & static_cast<int64_t>(EntBitmask::ENT_BM_DOUBLE)) {
 679         *q++ = '&'; *q++ = 'q'; *q++ = 'u'; *q++ = 'o'; *q++ = 't'; *q++ = ';';
 680       } else {
 681         *q++ = c;
 682       }
 683       break;
 684     case '\'':
 685       if (qsBitmask & static_cast<int64_t>(EntBitmask::ENT_BM_SINGLE)) {
 686         *q++ = '&';
 687         if ((qsBitmask & static_cast<int64_t>(EntBitmask::ENT_BM_XML1))) {
 688           *q++ = 'a'; *q++ = 'p'; *q++ = 'o'; *q++ = 's';
 689         } else {
 690           *q++ = '#'; *q++ = '0'; *q++ = '3'; *q++ = '9';
 691         }
 692         *q++ = ';';
 693       } else {
 694         *q++ = c;
 695       }
 696       break;
 697     case '<':
 698       *q++ = '&'; *q++ = 'l'; *q++ = 't'; *q++ = ';';
 699       break;
 700     case '>':
 701       *q++ = '&'; *q++ = 'g'; *q++ = 't'; *q++ = ';';
 702       break;
 703     case '&':
 704       if (!dEncode) {
 705         p++;
 706
 707         html_get_entity_map();
 708
 709         bool found = false;
 710         for (const char *t = p; *t; t++) {
 711           if (*t == ';') {
 712             int l = t - p;
 713             if (l > 0) {
 714               char sbuf[16] = {0};
 715               char *buf;
 716               if (l > 10) {
 717                 buf = (char* )malloc(l + 1);
 718               } else {
 719                 buf = sbuf;
 720               }
 721               memcpy(buf, p, l);
 722               buf[l] = '\0';
 723               if (decode_entity(buf, &l, true, true,
 724                 cs_utf_8, true)) {
 725                 found = true;
 726                 *q++ = '&';
 727                 for(const char *s = p; s <= t; s++) {
 728                   *q++ = *s;
 729                 }
 730                 p = t;
 731               }
 732               if (buf != sbuf) {
 733                 free(buf);
 734               }
 735             }
 736             break;
 737           }
 738         }
 739         if (!found) {
 740           p--;
 741           *q++ = '&'; *q++ = 'a'; *q++ = 'm'; *q++ = 'p'; *q++ = ';';
 742         }
 743       } else {
 744         *q++ = '&'; *q++ = 'a'; *q++ = 'm'; *q++ = 'p'; *q++ = ';';
 745       }
 746       break;
 747     case static_cast<unsigned char>('\xc2'):
 748       if (htmlEnt && utf8 && p != end && *(p+1) == '\xa0') {
 749         *q++ = '&'; *q++ = 'n'; *q++ = 'b'; *q++ = 's'; *q++ = 'p'; *q++ = ';';
 750         p++;
 751         break;
 752       }
 753       [[fallthrough]];
 754     default: {
 755       if (LIKELY(c < 0x80)) {
 756         *q++ = c;
 757         break;
 758       } else if (htmlEnt && !utf8 && (c - 160) < sizeof(ent_iso_8859_1) - 1) {
 759         /**
 760           * https://github.com/facebook/hhvm/issues/2186
 761           * If not UTF8, and we are converting to HTML entities, use known
 762           * entity equivalent of the character, if possible.
 763           * Since we only support ISO-8859-1 or UTF8 right now, and they use
 764           * the same mapping array, use it.
 765           * Start at 0xA0 = 160
 766           */
 767         *q++ = '&';
 768         const char *s = ent_iso_8859_1[c - 160];
 769         int len_2 = strlen(s);
 770         for (int n = 0; n < len_2; n++) {
 771           *q++ = *s++;
 772         }
 773         *q++ = ';';
 774         break;
 775       }
 776
 777       bool should_skip =
 778         qsBitmask & static_cast<int64_t>(EntBitmask::ENT_BM_IGNORE);
 779       bool should_replace =
 780         qsBitmask & static_cast<int64_t>(EntBitmask::ENT_BM_SUBSTITUTE);
 781
 782       if (!utf8 && should_skip) {
 783         *q++ = c;
 784         break;
 785       }
 786
 787       auto avail = end - p;
 788       auto utf8_trail = [](unsigned char c) { return c >= 0x80 && c <= 0xbf; };
 789       auto utf8_lead = [](unsigned char c) {
 790         return c < 0x80 || (c >= 0xC2 && c <= 0xF4);
 791       };
 792
 793       // This has to be a macro since it needs to be able to break away from
 794       // the for loop we're in.
 795       // ENT_IGNORE has higher precedence than ENT_SUBSTITUTE
 796       // \uFFFD is Unicode Replacement Character (U+FFFD)
 797       #define UTF8_ERROR_IF_LEN(cond, len) \
 798         if (cond) { \
 799           p += (len) - 1; \
 800           if (should_skip) { break; } \
 801           else if (should_replace) { strcpy(q, (const char *)u8"\uFFFD"); q += 3; break; } \
 802           else { goto exit_error; } \
 803         }
 804
 805       #define UTF8_ERROR_IF(cond) UTF8_ERROR_IF_LEN(cond, 1)
 806
 807       if (utf8) {
 808         if (c < 0xc2) {
 809           UTF8_ERROR_IF(true);
 810         } else if (c < 0xe0) {
 811           UTF8_ERROR_IF(avail < 2);
 812           UTF8_ERROR_IF_LEN(!utf8_trail(*(p + 1)), utf8_lead(*(p + 1)) ? 1 : 2);
 813
 814           uint16_t tc = ((c & 0x1f) << 6) | (p[1] & 0x3f);
 815           UTF8_ERROR_IF_LEN(tc < 0x80, 2); // non-shortest form
 816
 817           codeLength = 2;
 818           entity[0] = *p;
 819           entity[1] = *(p + 1);
 820           entity[2] = '\0';
 821         } else if (c < 0xf0) {
 822           if (avail < 3 || !utf8_trail(*(p + 1)) || !utf8_trail(*(p + 2))) {
 823             UTF8_ERROR_IF_LEN(avail < 2 || utf8_lead(*(p + 1)), 1);
 824             UTF8_ERROR_IF_LEN(avail < 3 || utf8_lead(*(p + 2)), 2);
 825             UTF8_ERROR_IF_LEN(true, 3);
 826           }
 827
 828           uint32_t tc = ((c & 0x0f) << 12) |
 829                         ((*(p+1) & 0x3f) << 6) |
 830                         (*(p+2) & 0x3f);
 831           UTF8_ERROR_IF_LEN(tc < 0x800, 3); // non-shortest form
 832           UTF8_ERROR_IF_LEN(tc >= 0xd800 && tc <= 0xdfff, 3); // surrogate
 833
 834           codeLength = 3;
 835           entity[0] = *p;
 836           entity[1] = *(p + 1);
 837           entity[2] = *(p + 2);
 838           entity[3] = '\0';
 839         } else if (c < 0xf5) {
 840           if (avail < 4 || !utf8_trail(*(p + 1)) || !utf8_trail(*(p + 2)) ||
 841               !utf8_trail(*(p + 3))) {
 842             UTF8_ERROR_IF_LEN(avail < 2 || utf8_lead(*(p + 1)), 1);
 843             UTF8_ERROR_IF_LEN(avail < 3 || utf8_lead(*(p + 2)), 2);
 844             UTF8_ERROR_IF_LEN(avail < 4 || utf8_lead(*(p + 3)), 3);
 845             UTF8_ERROR_IF_LEN(true, 4);
 846           }
 847
 848           uint32_t tc = ((c & 0x07) << 18) |
 849                         ((*(p+1) & 0x3f) << 12) |
 850                         ((*(p+2) & 0x3f) << 6) |
 851                         (*(p+3) & 0x3f);
 852
 853           // non-shortest form or outside range
 854           UTF8_ERROR_IF_LEN(tc < 0x10000 || tc > 0x10ffff, 4);
 855
 856           codeLength = 4;
 857           entity[0] = *p;
 858           entity[1] = *(p + 1);
 859           entity[2] = *(p + 2);
 860           entity[3] = *(p + 3);
 861           entity[4] = '\0';
 862         } else {
 863           UTF8_ERROR_IF(true);
 864         }
 865       } else {
 866         codeLength = 1;
 867         entity[0] = *p;
 868         entity[1] = '\0';
 869       }
 870
 871       if (htmlEnt) {
 872         html_get_entity_map();
 873
 874         char buf[16] = {0};
 875         buf[0] = c;
 876         int len_2 = 1;
 877
 878         if (encode_entity(buf, &len_2, const_cast<char*>(entity), utf8)) {
 879           *q++ = '&';
 880           const char *s = buf;
 881           for (int n = 0; n < len_2; n++) {
 882             *q++ = *s++;
 883           }
 884           *q++ = ';';
 885         } else {
 886           memcpy(q, p, codeLength);
 887           q += codeLength;
 888         }
 889       } else {
 890         memcpy(q, p, codeLength);
 891         q += codeLength;
 892       }
 893       p += codeLength - 1;
 894
 895       break;
 896     }
 897     }
 898
 899   }
 900
 901   #undef UTF8_ERROR_IF
 902   #undef UTF8_ERROR_IF_LEN
 903
 904   if (q - ret > INT_MAX) {
 905     goto exit_error;
 906   }
 907   *q = 0;
 908   len = q - ret;
 909   return ret;
 910
 911 exit_error:
 912   free(ret);
 913   return nullptr;
 914 }
 915
 916 char *string_html_encode_extra(const char *input, int &len,
 917                                StringHtmlEncoding flags,
 918                                const AsciiMap *asciiMap) {
 919   assert(input);
 920   /**
 921    * Though seems to be wasting memory a lot, we have to realize most of the
 922    * time this function is called with small strings, or fragments of HTMLs.
 923    * Allocating/deallocating anything less than 1K is trivial these days, and
 924    * we want avoid string copying as much as possible. Of course, the return
 925    * char * is really sent back at large, occupying unnecessary space for
 926    * potentially longer time than we need, we have to realize the two closest
 927    * solutions are not that much better, either:
 928    *
 929    * 1. pre-calculate size by iterating through the string once: too time
 930    *    consuming;
 931    * 2. take a guess and double buffer size when over: still wasting, and
 932    *    it may not save that much.
 933    */
 934   char *ret = (char *)malloc(len * 8uL + 1);
 935   if (!ret) {
 936     return nullptr;
 937   }
 938   char *q = ret;
 939   const char *rep = (const char *)u8"\ufffd";
 940   int32_t srcPosBytes;
 941   for (srcPosBytes = 0; srcPosBytes < len; /* incremented in-loop */) {
 942     unsigned char c = input[srcPosBytes];
 943     if (c && c < 128) {
 944       srcPosBytes++; // Optimize US-ASCII case
 945       if ((asciiMap->map[c & 64 ? 1 : 0] >> (c & 63)) & 1) {
 946         switch (c) {
 947           case '"':
 948             *q++ = '&'; *q++ = 'q'; *q++ = 'u';
 949             *q++ = 'o'; *q++ = 't'; *q++ = ';';
 950             break;
 951           case '\'':
 952             *q++ = '&'; *q++ = '#'; *q++ = '0';
 953             *q++ = '3'; *q++ = '9'; *q++ = ';';
 954             break;
 955           case '<':
 956             *q++ = '&'; *q++ = 'l'; *q++ = 't'; *q++ = ';';
 957             break;
 958           case '>':
 959             *q++ = '&'; *q++ = 'g'; *q++ = 't'; *q++ = ';';
 960             break;
 961           case '&':
 962             *q++ = '&'; *q++ = 'a'; *q++ = 'm'; *q++ = 'p'; *q++ = ';';
 963             break;
 964           default:
 965             *q++ = '&'; *q++ = '#';
 966             *q++ = c >= 100 ? '1' : '0';
 967             *q++ = ((c / 10) % 10) + '0';
 968             *q++ = (c % 10) + '0';
 969             *q++ = ';';
 970             break;
 971         }
 972       } else {
 973         *q++ = c;
 974       }
 975     } else if (flags & STRING_HTML_ENCODE_UTF8) {
 976       UChar32 curCodePoint;
 977       U8_NEXT(input, srcPosBytes, len, curCodePoint);
 978       if ((flags & STRING_HTML_ENCODE_NBSP) && curCodePoint == 0xC2A0) {
 979         *q++ = '&'; *q++ = 'n'; *q++ = 'b'; *q++ = 's'; *q++ = 'p'; *q++ = ';';
 980       } else if (curCodePoint <= 0) {
 981         if (flags & STRING_HTML_ENCODE_UTF8IZE_REPLACE) {
 982           if (flags & STRING_HTML_ENCODE_HIGH) {
 983             *q++ = '&'; *q++ = '#'; *q++ = 'x';
 984             *q++ = 'f'; *q++ = 'f'; *q++ = 'f'; *q++ = 'd';
 985             *q++ = ';';
 986           } else {
 987             const char *r = rep;
 988             while (*r) *q++ = *r++;
 989           }
 990         }
 991       } else if (flags & STRING_HTML_ENCODE_HIGH) {
 992         q += sprintf(q, "&#x%x;", curCodePoint);
 993       } else {
 994         int32_t pos = 0;
 995         U8_APPEND_UNSAFE(q, pos, curCodePoint);
 996         q += pos;
 997       }
 998     } else {
 999       srcPosBytes++; // Optimize US-ASCII case
1000       if (c == 0xa0) {
1001         *q++ = '&'; *q++ = 'n'; *q++ = 'b'; *q++ = 's'; *q++ = 'p'; *q++ = ';';
1002       } else if (flags & STRING_HTML_ENCODE_HIGH) {
1003         *q++ = '&'; *q++ = '#';
1004         *q++ = c >= 200 ? '2' : '1';
1005         *q++ = ((c / 10) % 10) + '0';
1006         *q++ = (c % 10) + '0';
1007         *q++ = ';';
1008       } else {
1009         *q++ = c;
1010       }
1011     }
1012   }
1013   if (q - ret > INT_MAX) {
1014     free(ret);
1015     return nullptr;
1016   }
1017   *q = 0;
1018   len = q - ret;
1019   return ret;
1020 }
1021
1022 char *string_html_decode(const char *input, int &len,
1023                          bool decode_double_quote, bool decode_single_quote,
1024                          const char *charset_hint, bool all,
1025                          bool xhp /* = false */) {
1026   assert(input);
1027
1028   if (!EntityMapInited) {
1029     Lock lock(EntityMapMutex);
1030     if (!EntityMapInited) {
1031       init_entity_table();
1032       EntityMapInited = true;
1033     }
1034   }
1035
1036   entity_charset charset = determine_charset(charset_hint);
1037   if (charset == cs_unknown) {
1038     return nullptr;
1039   }
1040
1041   char *ret = (char *)malloc(len + 1);
1042   char *q = ret;
1043   for (const char *p = input; *p || UNLIKELY(p - input < len); p++) {
1044     char ch = *p;
1045     if (ch != '&') {
1046       *q++ = ch;
1047       continue;
1048     }
1049     p++;
1050
1051     bool found = false;
1052     for (const char *t = p; *t; t++) {
1053       if (*t == ';') {
1054         int l = t - p;
1055         if (l > 0) {
1056           char sbuf[16] = {0};
1057           char *buf;
1058           if (l > 10) {
1059             buf = (char* )malloc(l + 1);
1060           } else {
1061             buf = sbuf;
1062           }
1063           memcpy(buf, p, l);
1064           buf[l] = '\0';
1065           if (decode_entity(buf, &l, decode_double_quote, decode_single_quote,
1066                             charset, all, xhp)) {
1067             memcpy(q, buf, l);
1068             found = true;
1069             p = t;
1070             q += l;
1071           }
1072           if (buf != sbuf) {
1073             free(buf);
1074           }
1075         }
1076         break;
1077       }
1078     }
1079     if (!found) {
1080       p--;
1081       *q++ = '&'; // not an entity
1082     }
1083   }
1084   *q = '\0';
1085   len = q - ret;
1086   return ret;
1087 }
1088
1089 const html_entity_map* html_get_entity_map() {
1090   if (!EntityMapInited) {
1091     Lock lock(EntityMapMutex);
1092     if (!EntityMapInited) {
1093       init_entity_table();
1094       EntityMapInited = true;
1095     }
1096   }
1097   return entity_map;
1098 }
1099
1100 ///////////////////////////////////////////////////////////////////////////////
1101 }