Remove footer from spec
[hiphop-php.git] / hphp / zend / zend-html.cpp
bloba207491911ae8a6f1694d91478bd5197ef9cb844
1 /*
2 +----------------------------------------------------------------------+
3 | HipHop for PHP |
4 +----------------------------------------------------------------------+
5 | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com) |
6 | Copyright (c) 1998-2010 Zend Technologies Ltd. (http://www.zend.com) |
7 +----------------------------------------------------------------------+
8 | This source file is subject to version 2.00 of the Zend license, |
9 | that is bundled with this package in the file LICENSE, and is |
10 | available through the world-wide-web at the following url: |
11 | http://www.zend.com/license/2_00.txt. |
12 | If you did not receive a copy of the Zend license and are unable to |
13 | obtain it through the world-wide-web, please send a note to |
14 | license@zend.com so we can mail you a copy immediately. |
15 +----------------------------------------------------------------------+
17 #include "hphp/zend/zend-html.h"
19 #include <unicode/uchar.h>
20 #include <unicode/utf8.h>
22 #include "hphp/util/lock.h"
23 #include "hphp/util/functional.h"
24 #include "hphp/util/hash-map.h"
26 namespace HPHP {
28 ///////////////////////////////////////////////////////////////////////////////
29 // UTF-8 entity tables
31 using namespace entity_charset_enum;
33 static entity_table_t ent_cp_866[] = {
34 "Acy", "Bcy", "Vcy", "Gcy", "Dcy", "IEcy", "ZHcy", "Zcy",
35 "Icy", "Jcy", "Kcy", "Lcy", "Mcy", "Ncy", "Ocy", "Pcy",
36 "Rcy", "Scy", "Tcy", "Ucy", "Fcy", "KHcy", "TScy", "CHcy",
37 "SHcy", "SHCHcy", "HARDcy", "Ycy", "SOFTcy", "Ecy", "YUcy", "YAcy",
38 "acy", "bcy", "vcy", "gcy", "dcy", "iecy", "zhcy", "zcy",
39 "icy", "jcy", "kcy", "lcy", "mcy", "ncy", "ocy", "pcy",
40 "blk14", "blk12", "blk34", "boxv", "boxvl", "boxvL", "boxVl", "boxDl",
41 "boxdL", "boxVL", "boxV", "boxDL", "boxUL", "boxUl", "boxuL", "boxdl",
42 "boxur", "boxhu", "boxhd", "boxvr", "boxh", "boxvh", "boxvR", "boxVr",
43 "boxUR", "boxDR", "boxHU", "boxHD", "boxVR", "boxH", "boxVH", "boxHu",
44 "boxhU", "boxHd", "boxhD", "boxUr", "boxuR", "boxdR", "boxDr", "boxVh",
45 "boxvH", "boxul", "boxdr", "block", "lhblk", nullptr, nullptr, "uhblk",
46 "rcy", "scy", "tcy", "ucy", "fcy", "khcy", "tscy", "chcy",
47 "shcy", "shchcy", "hardcy", "ycy", "softcy", "ecy", "yucy", "yacy",
48 "IOcy", "iocy", "Jukcy", "jukcy", "YIcy", "yicy", "Ubrcy", "ubrcy",
49 "deg", nullptr, "middot", "Sqrt", "numero", "curren", nullptr, "nbsp"
52 static entity_table_t ent_cp_1251[] = {
53 "DJcy", "GJcy", "sbquo", "gjcy", "bdquo", "hellip", "dagger", "Dagger",
54 "euro", "permil", "LJcy", "lsaquo", "NJcy", "KJcy", "TSHcy", "DZcy",
55 "djcy", "lsquo", "rsquo", "ldquo", "rdquo", "bull", "ndash", "mdash",
56 nullptr, "trade", "ljcy", "rsaquo", "njcy", "kjcy", "tshcy", "dzcy",
57 "nbsp", "Ubrcy", "ubrcy", "Jsercy", "curren", nullptr, "brvbar", "sect",
58 "IOcy", "copy", "Jukcy", "laquo", "not", "shy", "reg", "YIcy",
59 "deg", "pm", "Iukcy", "iukcy", nullptr, "micro", "para", "middot",
60 "iocy", "numero", "jukcy", "raquo", "jsercy", "DScy", "dscy", "yicy",
61 "Acy", "Bcy", "Vcy", "Gcy", "Dcy", "IEcy", "ZHcy", "Zcy",
62 "Icy", "Jcy", "Kcy", "Lcy", "Mcy", "Ncy", "Ocy", "Pcy",
63 "Rcy", "Scy", "Tcy", "Ucy", "Fcy", "KHcy", "TScy", "CHcy",
64 "SHcy", "SHCHcy", "HARDcy", "Ycy", "SOFTcy", "Ecy", "YUcy", "YAcy",
65 "acy", "bcy", "vcy", "gcy", "dcy", "iecy", "zhcy", "zcy",
66 "icy", "jcy", "kcy", "lcy", "mcy", "ncy", "ocy", "pcy",
67 "rcy", "scy", "tcy", "ucy", "fcy", "khcy", "tscy", "chcy",
68 "shcy", "shchcy", "hardcy", "ycy", "softcy", "ecy", "yucy", "yacy"
71 /* codepage 1252 is a Windows extension to iso-8859-1. */
72 static entity_table_t ent_cp_1252[] = {
73 "euro", nullptr, "sbquo", "fnof", "bdquo", "hellip", "dagger",
74 "Dagger", "circ", "permil", "Scaron", "lsaquo", "OElig",
75 nullptr, nullptr, nullptr, nullptr, "lsquo", "rsquo", "ldquo", "rdquo",
76 "bull", "ndash", "mdash", "tilde", "trade", "scaron", "rsaquo",
77 "oelig", nullptr, nullptr, "Yuml"
80 static entity_table_t ent_iso_8859_1[] = {
81 "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar",
82 "sect", "uml", "copy", "ordf", "laquo", "not", "shy", "reg",
83 "macr", "deg", "plusmn", "sup2", "sup3", "acute", "micro",
84 "para", "middot", "cedil", "sup1", "ordm", "raquo", "frac14",
85 "frac12", "frac34", "iquest", "Agrave", "Aacute", "Acirc",
86 "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
87 "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
88 "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
89 "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
90 "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
91 "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
92 "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
93 "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
94 "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
95 "uuml", "yacute", "thorn", "yuml"
98 static entity_table_t ent_iso_8859_5[] = {
99 "nbsp", "IOcy", "DJcy", "GJcy", "Jukcy", "DScy", "Iukcy", "YIcy",
100 "Jsercy", "LJcy", "NJcy", "TSHcy", "KJcy", "shy", "Ubrcy", "DZcy",
101 "Acy", "Bcy", "Vcy", "Gcy", "Dcy", "IEcy", "ZHcy", "Zcy",
102 "Icy", "Jcy", "Kcy", "Lcy", "Mcy", "Ncy", "Ocy", "Pcy",
103 "Rcy", "Scy", "Tcy", "Ucy", "Fcy", "KHcy", "TScy", "CHcy",
104 "SHcy", "SHCHcy", "HARDcy", "Ycy", "SOFTcy", "Ecy", "YUcy", "YAcy",
105 "acy", "bcy", "vcy", "gcy", "dcy", "iecy", "zhcy", "zcy",
106 "icy", "jcy", "kcy", "lcy", "mcy", "ncy", "ocy", "pcy",
107 "rcy", "scy", "tcy", "ucy", "fcy", "khcy", "tscy", "chcy",
108 "shcy", "shchcy", "hardcy", "ycy", "softcy", "ecy", "yucy", "yacy",
109 "numero", "iocy", "djcy", "gjcy", "jukcy", "dscy", "iukcy", "yicy",
110 "jsercy", "ljcy", "njcy", "tshcy", "kjcy", "sect", "ubrcy", "dzcy"
113 static entity_table_t ent_iso_8859_15[] = {
114 "nbsp", "iexcl", "cent", "pound", "euro", "yen", "Scaron",
115 "sect", "scaron", "copy", "ordf", "laquo", "not", "shy", "reg",
116 "macr", "deg", "plusmn", "sup2", "sup3", nullptr, /* Zcaron */
117 "micro", "para", "middot", nullptr, /* zcaron */ "sup1", "ordm",
118 "raquo", "OElig", "oelig", "Yuml", "iquest", "Agrave", "Aacute",
119 "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
120 "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
121 "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
122 "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
123 "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
124 "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
125 "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
126 "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
127 "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
128 "uuml", "yacute", "thorn", "yuml"
131 static entity_table_t ent_uni_338_402[] = {
132 /* 338 (0x0152) */
133 "OElig", "oelig", nullptr, nullptr, nullptr, nullptr,
134 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
135 /* 352 (0x0160) */
136 "Scaron", "scaron", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
137 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
138 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
139 /* 376 (0x0178) */
140 "Yuml", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
141 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
142 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
143 /* 400 (0x0190) */
144 nullptr, nullptr, "fnof"
147 static entity_table_t ent_uni_spacing[] = {
148 /* 710 */
149 "circ",
150 /* 711 - 730 */
151 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
152 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
153 /* 731 - 732 */
154 nullptr, "tilde"
157 static entity_table_t ent_uni_greek[] = {
158 /* 913 */
159 "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta",
160 "Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho",
161 nullptr, "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi", "Omega",
162 /* 938 - 944 are not mapped */
163 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
164 "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta",
165 "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho",
166 "sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega",
167 /* 970 - 976 are not mapped */
168 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
169 "thetasym", "upsih",
170 nullptr, nullptr, nullptr,
171 "piv"
174 static entity_table_t ent_uni_punct[] = {
175 /* 8194 */
176 "ensp", "emsp", nullptr, nullptr, nullptr, nullptr, nullptr,
177 "thinsp", nullptr, nullptr, "zwnj", "zwj", "lrm", "rlm",
178 nullptr, nullptr, nullptr, "ndash", "mdash", nullptr, nullptr, nullptr,
179 /* 8216 */
180 "lsquo", "rsquo", "sbquo", nullptr, "ldquo", "rdquo", "bdquo", nullptr,
181 "dagger", "Dagger", "bull", nullptr, nullptr, nullptr, "hellip",
182 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, "permil", nullptr,
183 /* 8242 */
184 "prime", "Prime", nullptr, nullptr, nullptr, nullptr, nullptr, "lsaquo", "rsaquo", nullptr,
185 nullptr, nullptr, "oline", nullptr, nullptr, nullptr, nullptr, nullptr,
186 "frasl"
189 static entity_table_t ent_uni_euro[] = {
190 "euro"
193 static entity_table_t ent_uni_8465_8501[] = {
194 /* 8465 */
195 "image", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
196 /* 8472 */
197 "weierp", nullptr, nullptr, nullptr,
198 /* 8476 */
199 "real", nullptr, nullptr, nullptr, nullptr, nullptr,
200 /* 8482 */
201 "trade", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
202 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
203 /* 8501 */
204 "alefsym",
207 static entity_table_t ent_uni_8592_9002[] = {
208 /* 8592 (0x2190) */
209 "larr", "uarr", "rarr", "darr", "harr", nullptr, nullptr, nullptr,
210 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
211 /* 8608 (0x21a0) */
212 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
213 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
214 /* 8624 (0x21b0) */
215 nullptr, nullptr, nullptr, nullptr, nullptr, "crarr", nullptr, nullptr,
216 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
217 /* 8640 (0x21c0) */
218 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
219 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
220 /* 8656 (0x21d0) */
221 "lArr", "uArr", "rArr", "dArr", "hArr", "vArr", nullptr, nullptr,
222 nullptr, nullptr, "lAarr", "rAarr", nullptr, "rarrw", nullptr, nullptr,
223 /* 8672 (0x21e0) */
224 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
225 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
226 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
227 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
228 /* 8704 (0x2200) */
229 "forall", "comp", "part", "exist", "nexist", "empty", nullptr, "nabla",
230 "isin", "notin", "epsis", "ni", "notni", "bepsi", nullptr, "prod",
231 /* 8720 (0x2210) */
232 "coprod", "sum", "minus", "mnplus", "plusdo", nullptr, "setmn", "lowast",
233 "compfn", nullptr, "radic", nullptr, nullptr, "prop", "infin", "ang90",
234 /* 8736 (0x2220) */
235 "ang", "angmsd", "angsph", "mid", "nmid", "par", "npar", "and",
236 "or", "cap", "cup", "int", nullptr, nullptr, "conint", nullptr,
237 /* 8752 (0x2230) */
238 nullptr, nullptr, nullptr, nullptr, "there4", "becaus", nullptr, nullptr,
239 nullptr, nullptr, nullptr, nullptr, "sim", "bsim", nullptr, nullptr,
240 /* 8768 (0x2240) */
241 "wreath", "nsim", nullptr, "sime", "nsime", "cong", nullptr, "ncong",
242 "asymp", "nap", "ape", nullptr, "bcong", "asymp", "bump", "bumpe",
243 /* 8784 (0x2250) */
244 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
245 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
246 /* 8800 (0x2260) */
247 "ne", "equiv", nullptr, nullptr, "le", "ge", "lE", "gE",
248 "lnE", "gnE", "Lt", "Gt", "twixt", nullptr, "nlt", "ngt",
249 /* 8816 (0x2270) */
250 "nles", "nges", "lsim", "gsim", nullptr, nullptr, "lg", "gl",
251 nullptr, nullptr, "pr", "sc", "cupre", "sscue", "prsim", "scsim",
252 /* 8832 (0x2280) */
253 "npr", "nsc", "sub", "sup", "nsub", "nsup", "sube", "supe",
254 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
255 /* 8848 (0x2290) */
256 nullptr, nullptr, nullptr, nullptr, nullptr, "oplus", nullptr, "otimes",
257 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
258 /* 8864 (0x22a0) */
259 nullptr, nullptr, nullptr, nullptr, nullptr, "perp", nullptr, nullptr,
260 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
261 /* 8880 (0x22b0) */
262 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
263 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
264 /* 8896 (0x22c0) */
265 nullptr, nullptr, nullptr, nullptr, nullptr, "sdot", nullptr, nullptr,
266 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
267 /* 8912 (0x22d0) */
268 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
269 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
270 /* 8928 (0x22e0) */
271 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
272 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
273 /* 8944 (0x22f0) */
274 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
275 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
276 /* 8960 (0x2300) */
277 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
278 "lceil", "rceil", "lfloor", "rfloor", nullptr, nullptr, nullptr, nullptr,
279 /* 8976 (0x2310) */
280 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
281 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
282 /* 8992 (0x2320) */
283 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
284 nullptr, "lang", "rang"
287 static entity_table_t ent_uni_9674[] = {
288 /* 9674 */
289 "loz"
292 static entity_table_t ent_uni_9824_9830[] = {
293 /* 9824 */
294 "spades", nullptr, nullptr, "clubs", nullptr, "hearts", "diams"
297 static const struct html_entity_map entity_map[] = {
298 { cs_cp866, 0x80, 0xff, ent_cp_866 },
299 { cs_cp1251, 0x80, 0xff, ent_cp_1251 },
300 { cs_cp1252, 0x80, 0x9f, ent_cp_1252 },
301 { cs_cp1252, 0xa0, 0xff, ent_iso_8859_1 },
302 { cs_8859_1, 0xa0, 0xff, ent_iso_8859_1 },
303 { cs_8859_5, 0xa0, 0xff, ent_iso_8859_5 },
304 { cs_8859_15, 0xa0, 0xff, ent_iso_8859_15 },
305 { cs_utf_8, 0xa0, 0xff, ent_iso_8859_1 },
306 { cs_utf_8, 338, 402, ent_uni_338_402 },
307 { cs_utf_8, 710, 732, ent_uni_spacing },
308 { cs_utf_8, 913, 982, ent_uni_greek },
309 { cs_utf_8, 8194, 8260, ent_uni_punct },
310 { cs_utf_8, 8364, 8364, ent_uni_euro },
311 { cs_utf_8, 8465, 8501, ent_uni_8465_8501 },
312 { cs_utf_8, 8592, 9002, ent_uni_8592_9002 },
313 { cs_utf_8, 9674, 9674, ent_uni_9674 },
314 { cs_utf_8, 9824, 9830, ent_uni_9824_9830 },
315 { cs_big5, 0xa0, 0xff, ent_iso_8859_1 },
316 { cs_gb2312, 0xa0, 0xff, ent_iso_8859_1 },
317 { cs_big5hkscs, 0xa0, 0xff, ent_iso_8859_1 },
318 { cs_sjis, 0xa0, 0xff, ent_iso_8859_1 },
319 { cs_eucjp, 0xa0, 0xff, ent_iso_8859_1 },
320 /* Missing support for these at the moment
321 { cs_koi8r, 0xa3, 0xff, ent_koi8r },
322 { cs_macroman, 0x0b, 0xff, ent_macroman },
324 { cs_terminator }
327 static const struct {
328 const char *codeset;
329 entity_charset charset;
330 } charset_map[] = {
331 { "ISO-8859-1", cs_8859_1 },
332 { "ISO8859-1", cs_8859_1 },
333 { "ISO-8859-5", cs_8859_5 },
334 { "ISO8859-5", cs_8859_5 },
335 { "ISO-8859-15", cs_8859_15 },
336 { "ISO8859-15", cs_8859_15 },
337 { "utf-8", cs_utf_8 },
338 { "cp866", cs_cp866 },
339 { "866", cs_cp866 },
340 { "ibm866", cs_cp866 },
341 { "cp1251", cs_cp1251 },
342 { "Windows-1251", cs_cp1251 },
343 { "win-1251", cs_cp1251 },
344 { "cp1252", cs_cp1252 },
345 { "Windows-1252", cs_cp1252 },
346 { "1252", cs_cp1252 },
347 { "BIG5", cs_big5 },
348 { "950", cs_big5 },
349 { "GB2312", cs_gb2312 },
350 { "936", cs_gb2312 },
351 { "BIG5-HKSCS", cs_big5hkscs },
352 { "Shift_JIS", cs_sjis },
353 { "SJIS", cs_sjis },
354 { "932", cs_sjis },
355 { "EUCJP", cs_eucjp },
356 /* Missing support for these at the moment
357 { "EUC-JP", cs_eucjp },
358 { "KOI8-R", cs_koi8r },
359 { "koi8-ru", cs_koi8r },
360 { "koi8r", cs_koi8r },
361 { "MacRoman", cs_macroman },
363 { nullptr }
366 ///////////////////////////////////////////////////////////////////////////////
368 entity_charset determine_charset(const char *charset_hint) {
369 entity_charset charset = cs_unknown;
371 if (charset_hint == nullptr) {
372 // default to utf-8
373 return cs_utf_8;
376 size_t len = strlen(charset_hint);
378 /* now walk the charset map and look for the codeset */
379 for (int i = 0; charset_map[i].codeset; i++) {
380 if (len == strlen(charset_map[i].codeset) &&
381 strncasecmp(charset_hint, charset_map[i].codeset, len) == 0) {
382 charset = charset_map[i].charset;
383 break;
387 return charset;
/**
 * Encode code point k as a NUL-terminated UTF-8 style byte sequence.
 *
 * @param buf  output buffer; needs room for up to 6 bytes plus the NUL.
 * @param k    code point to encode.
 * @return number of bytes written, not counting the terminating NUL (1-6).
 *
 * NOTE: values of 0x200000 and above are written using the legacy 5- and
 * 6-byte forms, and no check is made for surrogates or values above
 * 0x10FFFF; callers that need strictly valid UTF-8 must validate k first.
 */
static int utf32_to_utf8(unsigned char *buf, int k) {
  int retval = 0;

  if (k < 0x80) {
    buf[0] = k;
    retval = 1;
  } else if (k < 0x800) {
    buf[0] = 0xc0 | (k >> 6);
    buf[1] = 0x80 | (k & 0x3f);
    retval = 2;
  } else if (k < 0x10000) {
    buf[0] = 0xe0 | (k >> 12);
    buf[1] = 0x80 | ((k >> 6) & 0x3f);
    buf[2] = 0x80 | (k & 0x3f);
    retval = 3;
  } else if (k < 0x200000) {
    buf[0] = 0xf0 | (k >> 18);
    buf[1] = 0x80 | ((k >> 12) & 0x3f);
    buf[2] = 0x80 | ((k >> 6) & 0x3f);
    buf[3] = 0x80 | (k & 0x3f);
    retval = 4;
  } else if (k < 0x4000000) {
    buf[0] = 0xf8 | (k >> 24);
    buf[1] = 0x80 | ((k >> 18) & 0x3f);
    buf[2] = 0x80 | ((k >> 12) & 0x3f);
    buf[3] = 0x80 | ((k >> 6) & 0x3f);
    buf[4] = 0x80 | (k & 0x3f);
    retval = 5;
  } else {
    buf[0] = 0xfc | (k >> 30);
    buf[1] = 0x80 | ((k >> 24) & 0x3f);
    buf[2] = 0x80 | ((k >> 18) & 0x3f);
    buf[3] = 0x80 | ((k >> 12) & 0x3f);
    buf[4] = 0x80 | ((k >> 6) & 0x3f);
    buf[5] = 0x80 | (k & 0x3f);
    retval = 6;
  }
  buf[retval] = '\0';

  return retval;
}
432 using HtmlEntityMap = hphp_const_char_map<std::string>;
434 static volatile bool EntityMapInited = false;
435 static Mutex EntityMapMutex;
436 static HtmlEntityMap EntityMap[cs_end];
437 static HtmlEntityMap XHPEntityMap[cs_end];
439 static void init_entity_table() {
440 for (unsigned int i = 0; entity_map[i].charset != cs_terminator; i++) {
441 const html_entity_map &em = entity_map[i];
442 const entity_charset charset = entity_map[i].charset;
444 int index = 0;
445 for (int ch = em.basechar; ch <= em.endchar; ch++, index++) {
446 const char *entity = em.table[index];
447 if (entity == nullptr) {
448 continue;
450 unsigned char buf[10];
451 switch (charset) {
452 case cs_8859_1:
453 case cs_cp1252:
454 case cs_8859_15:
455 case cs_cp1251:
456 case cs_8859_5:
457 case cs_cp866:
458 case cs_koi8r:
459 buf[0] = ch;
460 buf[1] = '\0';
461 break;
463 case cs_utf_8:
464 utf32_to_utf8(buf, ch);
465 break;
467 default:
468 continue;
470 EntityMap[charset][entity] = (const char *)buf;
471 XHPEntityMap[charset][entity] = (const char *)buf;
474 EntityMap[charset]["quot"] = "\"";
475 EntityMap[charset]["lt"] = "<";
476 EntityMap[charset]["gt"] = ">";
477 EntityMap[charset]["amp"] = "&";
479 XHPEntityMap[charset]["quot"] = "\"";
480 XHPEntityMap[charset]["lt"] = "<";
481 XHPEntityMap[charset]["gt"] = ">";
482 XHPEntityMap[charset]["amp"] = "&";
483 // XHP-specific entities
484 XHPEntityMap[charset]["apos"] = "\'";
485 XHPEntityMap[charset]["cloud"] = (const char *)u8"\u2601";
486 XHPEntityMap[charset]["umbrella"] = (const char *)u8"\u2602";
487 XHPEntityMap[charset]["snowman"] = (const char *)u8"\u2603";
488 XHPEntityMap[charset]["snowflake"] = (const char *)u8"\u2745";
489 XHPEntityMap[charset]["comet"] = (const char *)u8"\u2604";
490 XHPEntityMap[charset]["thunderstorm"] = (const char *)u8"\u2608";
493 // the first element is an empty table
494 EntityMap[cs_terminator]["quot"] = "\"";
495 EntityMap[cs_terminator]["lt"] = "<";
496 EntityMap[cs_terminator]["gt"] = ">";
497 EntityMap[cs_terminator]["amp"] = "&";
498 // XHP-specific entities
499 XHPEntityMap[cs_terminator]["apos"] = "\'";
500 XHPEntityMap[cs_terminator]["cloud"] = (const char *)u8"\u2601";
501 XHPEntityMap[cs_terminator]["umbrella"] = (const char *)u8"\u2602";
502 XHPEntityMap[cs_terminator]["snowman"] = (const char *)u8"\u2603";
503 XHPEntityMap[cs_terminator]["snowflake"] = (const char *)u8"\u2745";
504 XHPEntityMap[cs_terminator]["comet"] = (const char *)u8"\u2604";
505 XHPEntityMap[cs_terminator]["thunderstorm"] = (const char *)u8"\u2608";
508 ///////////////////////////////////////////////////////////////////////////////
509 inline static bool decode_entity(char *entity, int *len,
510 bool decode_double_quote,
511 bool decode_single_quote,
512 entity_charset charset, bool all,
513 bool xhp = false) {
514 // entity is 16 bytes, allocated statically below
515 // default in PHP
516 assert(entity && *entity);
517 if (entity[0] == '#') {
518 int code;
519 if (entity[1] == 'x' || entity[1] == 'X') {
520 if (!isxdigit(entity[2])) return false;
521 code = strtol(entity + 2, nullptr, 16);
522 } else {
523 if (!isdigit(entity[1])) return false;
524 code = strtol(entity + 1, nullptr, 10);
527 // since we don't support multibyte chars other than utf-8
528 int l = 1;
530 if (code == 39 && decode_single_quote) {
531 entity[0] = code;
532 entity[1] = '\0';
533 *len = l;
534 return true;
537 if (!all && (code != '&') &&
538 (code != '<') && (code != '>') &&
539 (code != '"') && (code != '\'')) {
540 // htmlspecialchars_decode() does not parse numeric
541 // entities other than & < > " '
542 return false;
545 switch (charset) {
546 case cs_utf_8:
548 unsigned char buf[10];
549 int size = utf32_to_utf8(buf, code);
550 memcpy(entity, buf, size + 1);
551 l = size;
552 break;
555 case cs_8859_1:
556 case cs_8859_5:
557 case cs_8859_15:
558 if ((code >= 0x80 && code < 0xa0) || code > 0xff) {
559 return false;
560 } else {
561 if (code == 39) {
562 return false;
564 entity[0] = code;
565 entity[1] = '\0';
567 break;
569 case cs_cp1252:
570 case cs_cp1251:
571 case cs_cp866:
572 if (code > 0xff) {
573 return false;
575 entity[0] = code;
576 entity[1] = '\0';
577 break;
579 case cs_big5:
580 case cs_big5hkscs:
581 case cs_sjis:
582 case cs_eucjp:
583 if (code >= 0x80) {
584 return false;
586 entity[0] = code;
587 entity[1] = '\0';
588 break;
590 case cs_gb2312:
591 if (code >= 0x81) {
592 return false;
594 entity[0] = code;
595 entity[1] = '\0';
596 break;
598 default:
599 return false;
600 break;
602 *len = l;
603 return true;
604 } else {
605 HtmlEntityMap *entityMap;
607 if (strncasecmp(entity, "quot", 4) == 0 && !decode_double_quote) {
608 return false;
611 if (all) {
612 entityMap = xhp ? &XHPEntityMap[charset] : &EntityMap[charset];
613 } else {
614 entityMap = xhp ? &XHPEntityMap[cs_terminator]
615 : &EntityMap[cs_terminator];
617 HtmlEntityMap::const_iterator iter = entityMap->find(entity);
618 if (iter != entityMap->end()) {
619 memcpy(entity, iter->second.c_str(), iter->second.length() + 1);
620 *len = iter->second.length();
621 return true;
625 return false;
628 inline static bool encode_entity(char* buf, int* buflen,
629 const char* entity, bool utf8) {
630 entity_charset charset = cs_utf_8;
631 if (!utf8){ charset = cs_8859_1; }
633 HtmlEntityMap *entityMap = &EntityMap[charset];
635 for(HtmlEntityMap::const_iterator iter = entityMap->begin();
636 iter != entityMap->end(); iter++) {
637 if (strcmp(iter->second.c_str(), entity) == 0) {
638 memcpy(buf, iter->first, strlen(iter->first));
639 *buflen = strlen(iter->first);
640 return true;
643 return false;
646 char *string_html_encode(const char *input, int &len,
647 const int64_t qsBitmask, bool utf8,
648 bool dEncode, bool htmlEnt) {
649 assert(input);
651 * Though seems to be wasting memory a lot, we have to realize most of the
652 * time this function is called with small strings, or fragments of HTMLs.
653 * Allocating/deallocating anything less than 1K is trivial these days, and
654 * we want avoid string copying as much as possible. Of course, the return
655 * char * is really sent back at large, occupying unnecessary space for
656 * potentially longer time than we need, we have to realize the two closest
657 * solutions are not that much better, either:
659 * 1. pre-calculate size by iterating through the string once: too time
660 * consuming;
661 * 2. take a guess and double buffer size when over: still wasting, and
662 * it may not save that much.
664 * Note: Amount of allocation per character to be encoded may have to be
665 * increased as larger HTML Entities are implemented.
667 char *ret = (char *)malloc(len * 14uL + 1);
668 if (!ret) {
669 return nullptr;
671 char *q = ret;
672 for (const char *p = input, *end = input + len; p < end; p++) {
673 unsigned char c = *p;
674 char entity[5];
675 int codeLength = 0;
676 switch (c) {
677 case '"':
678 if (qsBitmask & static_cast<int64_t>(EntBitmask::ENT_BM_DOUBLE)) {
679 *q++ = '&'; *q++ = 'q'; *q++ = 'u'; *q++ = 'o'; *q++ = 't'; *q++ = ';';
680 } else {
681 *q++ = c;
683 break;
684 case '\'':
685 if (qsBitmask & static_cast<int64_t>(EntBitmask::ENT_BM_SINGLE)) {
686 *q++ = '&';
687 if ((qsBitmask & static_cast<int64_t>(EntBitmask::ENT_BM_XML1))) {
688 *q++ = 'a'; *q++ = 'p'; *q++ = 'o'; *q++ = 's';
689 } else {
690 *q++ = '#'; *q++ = '0'; *q++ = '3'; *q++ = '9';
692 *q++ = ';';
693 } else {
694 *q++ = c;
696 break;
697 case '<':
698 *q++ = '&'; *q++ = 'l'; *q++ = 't'; *q++ = ';';
699 break;
700 case '>':
701 *q++ = '&'; *q++ = 'g'; *q++ = 't'; *q++ = ';';
702 break;
703 case '&':
704 if (!dEncode) {
705 p++;
707 html_get_entity_map();
709 bool found = false;
710 for (const char *t = p; *t; t++) {
711 if (*t == ';') {
712 int l = t - p;
713 if (l > 0) {
714 char sbuf[16] = {0};
715 char *buf;
716 if (l > 10) {
717 buf = (char* )malloc(l + 1);
718 } else {
719 buf = sbuf;
721 memcpy(buf, p, l);
722 buf[l] = '\0';
723 if (decode_entity(buf, &l, true, true,
724 cs_utf_8, true)) {
725 found = true;
726 *q++ = '&';
727 for(const char *s = p; s <= t; s++) {
728 *q++ = *s;
730 p = t;
732 if (buf != sbuf) {
733 free(buf);
736 break;
739 if (!found) {
740 p--;
741 *q++ = '&'; *q++ = 'a'; *q++ = 'm'; *q++ = 'p'; *q++ = ';';
743 } else {
744 *q++ = '&'; *q++ = 'a'; *q++ = 'm'; *q++ = 'p'; *q++ = ';';
746 break;
747 case static_cast<unsigned char>('\xc2'):
748 if (htmlEnt && utf8 && p != end && *(p+1) == '\xa0') {
749 *q++ = '&'; *q++ = 'n'; *q++ = 'b'; *q++ = 's'; *q++ = 'p'; *q++ = ';';
750 p++;
751 break;
753 [[fallthrough]];
754 default: {
755 if (LIKELY(c < 0x80)) {
756 *q++ = c;
757 break;
758 } else if (htmlEnt && !utf8 && (c - 160) < sizeof(ent_iso_8859_1) - 1) {
760 * https://github.com/facebook/hhvm/issues/2186
761 * If not UTF8, and we are converting to HTML entities, use known
762 * entity equivalent of the character, if possible.
763 * Since we only support ISO-8859-1 or UTF8 right now, and they use
764 * the same mapping array, use it.
765 * Start at 0xA0 = 160
767 *q++ = '&';
768 const char *s = ent_iso_8859_1[c - 160];
769 int len_2 = strlen(s);
770 for (int n = 0; n < len_2; n++) {
771 *q++ = *s++;
773 *q++ = ';';
774 break;
777 bool should_skip =
778 qsBitmask & static_cast<int64_t>(EntBitmask::ENT_BM_IGNORE);
779 bool should_replace =
780 qsBitmask & static_cast<int64_t>(EntBitmask::ENT_BM_SUBSTITUTE);
782 if (!utf8 && should_skip) {
783 *q++ = c;
784 break;
787 auto avail = end - p;
788 auto utf8_trail = [](unsigned char c) { return c >= 0x80 && c <= 0xbf; };
789 auto utf8_lead = [](unsigned char c) {
790 return c < 0x80 || (c >= 0xC2 && c <= 0xF4);
793 // This has to be a macro since it needs to be able to break away from
794 // the for loop we're in.
795 // ENT_IGNORE has higher precedence than ENT_SUBSTITUTE
796 // \uFFFD is Unicode Replacement Character (U+FFFD)
797 #define UTF8_ERROR_IF_LEN(cond, len) \
798 if (cond) { \
799 p += (len) - 1; \
800 if (should_skip) { break; } \
801 else if (should_replace) { strcpy(q, (const char *)u8"\uFFFD"); q += 3; break; } \
802 else { goto exit_error; } \
805 #define UTF8_ERROR_IF(cond) UTF8_ERROR_IF_LEN(cond, 1)
807 if (utf8) {
808 if (c < 0xc2) {
809 UTF8_ERROR_IF(true);
810 } else if (c < 0xe0) {
811 UTF8_ERROR_IF(avail < 2);
812 UTF8_ERROR_IF_LEN(!utf8_trail(*(p + 1)), utf8_lead(*(p + 1)) ? 1 : 2);
814 uint16_t tc = ((c & 0x1f) << 6) | (p[1] & 0x3f);
815 UTF8_ERROR_IF_LEN(tc < 0x80, 2); // non-shortest form
817 codeLength = 2;
818 entity[0] = *p;
819 entity[1] = *(p + 1);
820 entity[2] = '\0';
821 } else if (c < 0xf0) {
822 if (avail < 3 || !utf8_trail(*(p + 1)) || !utf8_trail(*(p + 2))) {
823 UTF8_ERROR_IF_LEN(avail < 2 || utf8_lead(*(p + 1)), 1);
824 UTF8_ERROR_IF_LEN(avail < 3 || utf8_lead(*(p + 2)), 2);
825 UTF8_ERROR_IF_LEN(true, 3);
828 uint32_t tc = ((c & 0x0f) << 12) |
829 ((*(p+1) & 0x3f) << 6) |
830 (*(p+2) & 0x3f);
831 UTF8_ERROR_IF_LEN(tc < 0x800, 3); // non-shortest form
832 UTF8_ERROR_IF_LEN(tc >= 0xd800 && tc <= 0xdfff, 3); // surrogate
834 codeLength = 3;
835 entity[0] = *p;
836 entity[1] = *(p + 1);
837 entity[2] = *(p + 2);
838 entity[3] = '\0';
839 } else if (c < 0xf5) {
840 if (avail < 4 || !utf8_trail(*(p + 1)) || !utf8_trail(*(p + 2)) ||
841 !utf8_trail(*(p + 3))) {
842 UTF8_ERROR_IF_LEN(avail < 2 || utf8_lead(*(p + 1)), 1);
843 UTF8_ERROR_IF_LEN(avail < 3 || utf8_lead(*(p + 2)), 2);
844 UTF8_ERROR_IF_LEN(avail < 4 || utf8_lead(*(p + 3)), 3);
845 UTF8_ERROR_IF_LEN(true, 4);
848 uint32_t tc = ((c & 0x07) << 18) |
849 ((*(p+1) & 0x3f) << 12) |
850 ((*(p+2) & 0x3f) << 6) |
851 (*(p+3) & 0x3f);
853 // non-shortest form or outside range
854 UTF8_ERROR_IF_LEN(tc < 0x10000 || tc > 0x10ffff, 4);
856 codeLength = 4;
857 entity[0] = *p;
858 entity[1] = *(p + 1);
859 entity[2] = *(p + 2);
860 entity[3] = *(p + 3);
861 entity[4] = '\0';
862 } else {
863 UTF8_ERROR_IF(true);
865 } else {
866 codeLength = 1;
867 entity[0] = *p;
868 entity[1] = '\0';
871 if (htmlEnt) {
872 html_get_entity_map();
874 char buf[16] = {0};
875 buf[0] = c;
876 int len_2 = 1;
878 if (encode_entity(buf, &len_2, const_cast<char*>(entity), utf8)) {
879 *q++ = '&';
880 const char *s = buf;
881 for (int n = 0; n < len_2; n++) {
882 *q++ = *s++;
884 *q++ = ';';
885 } else {
886 memcpy(q, p, codeLength);
887 q += codeLength;
889 } else {
890 memcpy(q, p, codeLength);
891 q += codeLength;
893 p += codeLength - 1;
895 break;
901 #undef UTF8_ERROR_IF
902 #undef UTF8_ERROR_IF_LEN
904 if (q - ret > INT_MAX) {
905 goto exit_error;
907 *q = 0;
908 len = q - ret;
909 return ret;
911 exit_error:
912 free(ret);
913 return nullptr;
916 char *string_html_encode_extra(const char *input, int &len,
917 StringHtmlEncoding flags,
918 const AsciiMap *asciiMap) {
919 assert(input);
921 * Though seems to be wasting memory a lot, we have to realize most of the
922 * time this function is called with small strings, or fragments of HTMLs.
923 * Allocating/deallocating anything less than 1K is trivial these days, and
924 * we want avoid string copying as much as possible. Of course, the return
925 * char * is really sent back at large, occupying unnecessary space for
926 * potentially longer time than we need, we have to realize the two closest
927 * solutions are not that much better, either:
929 * 1. pre-calculate size by iterating through the string once: too time
930 * consuming;
931 * 2. take a guess and double buffer size when over: still wasting, and
932 * it may not save that much.
934 char *ret = (char *)malloc(len * 8uL + 1);
935 if (!ret) {
936 return nullptr;
938 char *q = ret;
939 const char *rep = (const char *)u8"\ufffd";
940 int32_t srcPosBytes;
941 for (srcPosBytes = 0; srcPosBytes < len; /* incremented in-loop */) {
942 unsigned char c = input[srcPosBytes];
943 if (c && c < 128) {
944 srcPosBytes++; // Optimize US-ASCII case
945 if ((asciiMap->map[c & 64 ? 1 : 0] >> (c & 63)) & 1) {
946 switch (c) {
947 case '"':
948 *q++ = '&'; *q++ = 'q'; *q++ = 'u';
949 *q++ = 'o'; *q++ = 't'; *q++ = ';';
950 break;
951 case '\'':
952 *q++ = '&'; *q++ = '#'; *q++ = '0';
953 *q++ = '3'; *q++ = '9'; *q++ = ';';
954 break;
955 case '<':
956 *q++ = '&'; *q++ = 'l'; *q++ = 't'; *q++ = ';';
957 break;
958 case '>':
959 *q++ = '&'; *q++ = 'g'; *q++ = 't'; *q++ = ';';
960 break;
961 case '&':
962 *q++ = '&'; *q++ = 'a'; *q++ = 'm'; *q++ = 'p'; *q++ = ';';
963 break;
964 default:
965 *q++ = '&'; *q++ = '#';
966 *q++ = c >= 100 ? '1' : '0';
967 *q++ = ((c / 10) % 10) + '0';
968 *q++ = (c % 10) + '0';
969 *q++ = ';';
970 break;
972 } else {
973 *q++ = c;
975 } else if (flags & STRING_HTML_ENCODE_UTF8) {
976 UChar32 curCodePoint;
977 U8_NEXT(input, srcPosBytes, len, curCodePoint);
978 if ((flags & STRING_HTML_ENCODE_NBSP) && curCodePoint == 0xC2A0) {
979 *q++ = '&'; *q++ = 'n'; *q++ = 'b'; *q++ = 's'; *q++ = 'p'; *q++ = ';';
980 } else if (curCodePoint <= 0) {
981 if (flags & STRING_HTML_ENCODE_UTF8IZE_REPLACE) {
982 if (flags & STRING_HTML_ENCODE_HIGH) {
983 *q++ = '&'; *q++ = '#'; *q++ = 'x';
984 *q++ = 'f'; *q++ = 'f'; *q++ = 'f'; *q++ = 'd';
985 *q++ = ';';
986 } else {
987 const char *r = rep;
988 while (*r) *q++ = *r++;
991 } else if (flags & STRING_HTML_ENCODE_HIGH) {
992 q += sprintf(q, "&#x%x;", curCodePoint);
993 } else {
994 int32_t pos = 0;
995 U8_APPEND_UNSAFE(q, pos, curCodePoint);
996 q += pos;
998 } else {
999 srcPosBytes++; // Optimize US-ASCII case
1000 if (c == 0xa0) {
1001 *q++ = '&'; *q++ = 'n'; *q++ = 'b'; *q++ = 's'; *q++ = 'p'; *q++ = ';';
1002 } else if (flags & STRING_HTML_ENCODE_HIGH) {
1003 *q++ = '&'; *q++ = '#';
1004 *q++ = c >= 200 ? '2' : '1';
1005 *q++ = ((c / 10) % 10) + '0';
1006 *q++ = (c % 10) + '0';
1007 *q++ = ';';
1008 } else {
1009 *q++ = c;
1013 if (q - ret > INT_MAX) {
1014 free(ret);
1015 return nullptr;
1017 *q = 0;
1018 len = q - ret;
1019 return ret;
1022 char *string_html_decode(const char *input, int &len,
1023 bool decode_double_quote, bool decode_single_quote,
1024 const char *charset_hint, bool all,
1025 bool xhp /* = false */) {
1026 assert(input);
1028 if (!EntityMapInited) {
1029 Lock lock(EntityMapMutex);
1030 if (!EntityMapInited) {
1031 init_entity_table();
1032 EntityMapInited = true;
1036 entity_charset charset = determine_charset(charset_hint);
1037 if (charset == cs_unknown) {
1038 return nullptr;
1041 char *ret = (char *)malloc(len + 1);
1042 char *q = ret;
1043 for (const char *p = input; *p || UNLIKELY(p - input < len); p++) {
1044 char ch = *p;
1045 if (ch != '&') {
1046 *q++ = ch;
1047 continue;
1049 p++;
1051 bool found = false;
1052 for (const char *t = p; *t; t++) {
1053 if (*t == ';') {
1054 int l = t - p;
1055 if (l > 0) {
1056 char sbuf[16] = {0};
1057 char *buf;
1058 if (l > 10) {
1059 buf = (char* )malloc(l + 1);
1060 } else {
1061 buf = sbuf;
1063 memcpy(buf, p, l);
1064 buf[l] = '\0';
1065 if (decode_entity(buf, &l, decode_double_quote, decode_single_quote,
1066 charset, all, xhp)) {
1067 memcpy(q, buf, l);
1068 found = true;
1069 p = t;
1070 q += l;
1072 if (buf != sbuf) {
1073 free(buf);
1076 break;
1079 if (!found) {
1080 p--;
1081 *q++ = '&'; // not an entity
1084 *q = '\0';
1085 len = q - ret;
1086 return ret;
1089 const html_entity_map* html_get_entity_map() {
1090 if (!EntityMapInited) {
1091 Lock lock(EntityMapMutex);
1092 if (!EntityMapInited) {
1093 init_entity_table();
1094 EntityMapInited = true;
1097 return entity_map;
1100 ///////////////////////////////////////////////////////////////////////////////