2 +----------------------------------------------------------------------+
4 +----------------------------------------------------------------------+
5 | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com) |
6 | Copyright (c) 1998-2010 Zend Technologies Ltd. (http://www.zend.com) |
7 +----------------------------------------------------------------------+
8 | This source file is subject to version 2.00 of the Zend license, |
9 | that is bundled with this package in the file LICENSE, and is |
10 | available through the world-wide-web at the following url: |
11 | http://www.zend.com/license/2_00.txt. |
12 | If you did not receive a copy of the Zend license and are unable to |
13 | obtain it through the world-wide-web, please send a note to |
14 | license@zend.com so we can mail you a copy immediately. |
15 +----------------------------------------------------------------------+
#include "hphp/zend/zend-html.h"

#include <atomic>

#include <unicode/uchar.h>
#include <unicode/utf8.h>

#include "hphp/util/lock.h"
#include "hphp/util/functional.h"
#include "hphp/util/hash-map.h"
28 ///////////////////////////////////////////////////////////////////////////////
29 // UTF-8 entity tables
31 using namespace entity_charset_enum
;
33 static entity_table_t ent_cp_866
[] = {
34 "Acy", "Bcy", "Vcy", "Gcy", "Dcy", "IEcy", "ZHcy", "Zcy",
35 "Icy", "Jcy", "Kcy", "Lcy", "Mcy", "Ncy", "Ocy", "Pcy",
36 "Rcy", "Scy", "Tcy", "Ucy", "Fcy", "KHcy", "TScy", "CHcy",
37 "SHcy", "SHCHcy", "HARDcy", "Ycy", "SOFTcy", "Ecy", "YUcy", "YAcy",
38 "acy", "bcy", "vcy", "gcy", "dcy", "iecy", "zhcy", "zcy",
39 "icy", "jcy", "kcy", "lcy", "mcy", "ncy", "ocy", "pcy",
40 "blk14", "blk12", "blk34", "boxv", "boxvl", "boxvL", "boxVl", "boxDl",
41 "boxdL", "boxVL", "boxV", "boxDL", "boxUL", "boxUl", "boxuL", "boxdl",
42 "boxur", "boxhu", "boxhd", "boxvr", "boxh", "boxvh", "boxvR", "boxVr",
43 "boxUR", "boxDR", "boxHU", "boxHD", "boxVR", "boxH", "boxVH", "boxHu",
44 "boxhU", "boxHd", "boxhD", "boxUr", "boxuR", "boxdR", "boxDr", "boxVh",
45 "boxvH", "boxul", "boxdr", "block", "lhblk", nullptr, nullptr, "uhblk",
46 "rcy", "scy", "tcy", "ucy", "fcy", "khcy", "tscy", "chcy",
47 "shcy", "shchcy", "hardcy", "ycy", "softcy", "ecy", "yucy", "yacy",
48 "IOcy", "iocy", "Jukcy", "jukcy", "YIcy", "yicy", "Ubrcy", "ubrcy",
49 "deg", nullptr, "middot", "Sqrt", "numero", "curren", nullptr, "nbsp"
52 static entity_table_t ent_cp_1251
[] = {
53 "DJcy", "GJcy", "sbquo", "gjcy", "bdquo", "hellip", "dagger", "Dagger",
54 "euro", "permil", "LJcy", "lsaquo", "NJcy", "KJcy", "TSHcy", "DZcy",
55 "djcy", "lsquo", "rsquo", "ldquo", "rdquo", "bull", "ndash", "mdash",
56 nullptr, "trade", "ljcy", "rsaquo", "njcy", "kjcy", "tshcy", "dzcy",
57 "nbsp", "Ubrcy", "ubrcy", "Jsercy", "curren", nullptr, "brvbar", "sect",
58 "IOcy", "copy", "Jukcy", "laquo", "not", "shy", "reg", "YIcy",
59 "deg", "pm", "Iukcy", "iukcy", nullptr, "micro", "para", "middot",
60 "iocy", "numero", "jukcy", "raquo", "jsercy", "DScy", "dscy", "yicy",
61 "Acy", "Bcy", "Vcy", "Gcy", "Dcy", "IEcy", "ZHcy", "Zcy",
62 "Icy", "Jcy", "Kcy", "Lcy", "Mcy", "Ncy", "Ocy", "Pcy",
63 "Rcy", "Scy", "Tcy", "Ucy", "Fcy", "KHcy", "TScy", "CHcy",
64 "SHcy", "SHCHcy", "HARDcy", "Ycy", "SOFTcy", "Ecy", "YUcy", "YAcy",
65 "acy", "bcy", "vcy", "gcy", "dcy", "iecy", "zhcy", "zcy",
66 "icy", "jcy", "kcy", "lcy", "mcy", "ncy", "ocy", "pcy",
67 "rcy", "scy", "tcy", "ucy", "fcy", "khcy", "tscy", "chcy",
68 "shcy", "shchcy", "hardcy", "ycy", "softcy", "ecy", "yucy", "yacy"
71 /* codepage 1252 is a Windows extension to iso-8859-1. */
72 static entity_table_t ent_cp_1252
[] = {
73 "euro", nullptr, "sbquo", "fnof", "bdquo", "hellip", "dagger",
74 "Dagger", "circ", "permil", "Scaron", "lsaquo", "OElig",
75 nullptr, nullptr, nullptr, nullptr, "lsquo", "rsquo", "ldquo", "rdquo",
76 "bull", "ndash", "mdash", "tilde", "trade", "scaron", "rsaquo",
77 "oelig", nullptr, nullptr, "Yuml"
80 static entity_table_t ent_iso_8859_1
[] = {
81 "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar",
82 "sect", "uml", "copy", "ordf", "laquo", "not", "shy", "reg",
83 "macr", "deg", "plusmn", "sup2", "sup3", "acute", "micro",
84 "para", "middot", "cedil", "sup1", "ordm", "raquo", "frac14",
85 "frac12", "frac34", "iquest", "Agrave", "Aacute", "Acirc",
86 "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
87 "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
88 "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
89 "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
90 "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
91 "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
92 "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
93 "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
94 "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
95 "uuml", "yacute", "thorn", "yuml"
98 static entity_table_t ent_iso_8859_5
[] = {
99 "nbsp", "IOcy", "DJcy", "GJcy", "Jukcy", "DScy", "Iukcy", "YIcy",
100 "Jsercy", "LJcy", "NJcy", "TSHcy", "KJcy", "shy", "Ubrcy", "DZcy",
101 "Acy", "Bcy", "Vcy", "Gcy", "Dcy", "IEcy", "ZHcy", "Zcy",
102 "Icy", "Jcy", "Kcy", "Lcy", "Mcy", "Ncy", "Ocy", "Pcy",
103 "Rcy", "Scy", "Tcy", "Ucy", "Fcy", "KHcy", "TScy", "CHcy",
104 "SHcy", "SHCHcy", "HARDcy", "Ycy", "SOFTcy", "Ecy", "YUcy", "YAcy",
105 "acy", "bcy", "vcy", "gcy", "dcy", "iecy", "zhcy", "zcy",
106 "icy", "jcy", "kcy", "lcy", "mcy", "ncy", "ocy", "pcy",
107 "rcy", "scy", "tcy", "ucy", "fcy", "khcy", "tscy", "chcy",
108 "shcy", "shchcy", "hardcy", "ycy", "softcy", "ecy", "yucy", "yacy",
109 "numero", "iocy", "djcy", "gjcy", "jukcy", "dscy", "iukcy", "yicy",
110 "jsercy", "ljcy", "njcy", "tshcy", "kjcy", "sect", "ubrcy", "dzcy"
113 static entity_table_t ent_iso_8859_15
[] = {
114 "nbsp", "iexcl", "cent", "pound", "euro", "yen", "Scaron",
115 "sect", "scaron", "copy", "ordf", "laquo", "not", "shy", "reg",
116 "macr", "deg", "plusmn", "sup2", "sup3", nullptr, /* Zcaron */
117 "micro", "para", "middot", nullptr, /* zcaron */ "sup1", "ordm",
118 "raquo", "OElig", "oelig", "Yuml", "iquest", "Agrave", "Aacute",
119 "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
120 "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
121 "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
122 "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
123 "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
124 "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
125 "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
126 "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
127 "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
128 "uuml", "yacute", "thorn", "yuml"
131 static entity_table_t ent_uni_338_402
[] = {
133 "OElig", "oelig", nullptr, nullptr, nullptr, nullptr,
134 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
136 "Scaron", "scaron", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
137 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
138 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
140 "Yuml", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
141 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
142 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
144 nullptr, nullptr, "fnof"
147 static entity_table_t ent_uni_spacing
[] = {
151 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
152 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
157 static entity_table_t ent_uni_greek
[] = {
159 "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta",
160 "Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho",
161 nullptr, "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi", "Omega",
162 /* 938 - 944 are not mapped */
163 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
164 "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta",
165 "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho",
166 "sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega",
167 /* 970 - 976 are not mapped */
168 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
170 nullptr, nullptr, nullptr,
174 static entity_table_t ent_uni_punct
[] = {
176 "ensp", "emsp", nullptr, nullptr, nullptr, nullptr, nullptr,
177 "thinsp", nullptr, nullptr, "zwnj", "zwj", "lrm", "rlm",
178 nullptr, nullptr, nullptr, "ndash", "mdash", nullptr, nullptr, nullptr,
180 "lsquo", "rsquo", "sbquo", nullptr, "ldquo", "rdquo", "bdquo", nullptr,
181 "dagger", "Dagger", "bull", nullptr, nullptr, nullptr, "hellip",
182 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, "permil", nullptr,
184 "prime", "Prime", nullptr, nullptr, nullptr, nullptr, nullptr, "lsaquo", "rsaquo", nullptr,
185 nullptr, nullptr, "oline", nullptr, nullptr, nullptr, nullptr, nullptr,
189 static entity_table_t ent_uni_euro
[] = {
193 static entity_table_t ent_uni_8465_8501
[] = {
195 "image", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
197 "weierp", nullptr, nullptr, nullptr,
199 "real", nullptr, nullptr, nullptr, nullptr, nullptr,
201 "trade", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
202 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
207 static entity_table_t ent_uni_8592_9002
[] = {
209 "larr", "uarr", "rarr", "darr", "harr", nullptr, nullptr, nullptr,
210 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
212 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
213 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
215 nullptr, nullptr, nullptr, nullptr, nullptr, "crarr", nullptr, nullptr,
216 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
218 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
219 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
221 "lArr", "uArr", "rArr", "dArr", "hArr", "vArr", nullptr, nullptr,
222 nullptr, nullptr, "lAarr", "rAarr", nullptr, "rarrw", nullptr, nullptr,
224 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
225 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
226 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
227 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
229 "forall", "comp", "part", "exist", "nexist", "empty", nullptr, "nabla",
230 "isin", "notin", "epsis", "ni", "notni", "bepsi", nullptr, "prod",
232 "coprod", "sum", "minus", "mnplus", "plusdo", nullptr, "setmn", "lowast",
233 "compfn", nullptr, "radic", nullptr, nullptr, "prop", "infin", "ang90",
235 "ang", "angmsd", "angsph", "mid", "nmid", "par", "npar", "and",
236 "or", "cap", "cup", "int", nullptr, nullptr, "conint", nullptr,
238 nullptr, nullptr, nullptr, nullptr, "there4", "becaus", nullptr, nullptr,
239 nullptr, nullptr, nullptr, nullptr, "sim", "bsim", nullptr, nullptr,
241 "wreath", "nsim", nullptr, "sime", "nsime", "cong", nullptr, "ncong",
242 "asymp", "nap", "ape", nullptr, "bcong", "asymp", "bump", "bumpe",
244 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
245 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
247 "ne", "equiv", nullptr, nullptr, "le", "ge", "lE", "gE",
248 "lnE", "gnE", "Lt", "Gt", "twixt", nullptr, "nlt", "ngt",
250 "nles", "nges", "lsim", "gsim", nullptr, nullptr, "lg", "gl",
251 nullptr, nullptr, "pr", "sc", "cupre", "sscue", "prsim", "scsim",
253 "npr", "nsc", "sub", "sup", "nsub", "nsup", "sube", "supe",
254 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
256 nullptr, nullptr, nullptr, nullptr, nullptr, "oplus", nullptr, "otimes",
257 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
259 nullptr, nullptr, nullptr, nullptr, nullptr, "perp", nullptr, nullptr,
260 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
262 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
263 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
265 nullptr, nullptr, nullptr, nullptr, nullptr, "sdot", nullptr, nullptr,
266 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
268 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
269 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
271 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
272 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
274 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
275 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
277 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
278 "lceil", "rceil", "lfloor", "rfloor", nullptr, nullptr, nullptr, nullptr,
280 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
281 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
283 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
284 nullptr, "lang", "rang"
287 static entity_table_t ent_uni_9674
[] = {
292 static entity_table_t ent_uni_9824_9830
[] = {
294 "spades", nullptr, nullptr, "clubs", nullptr, "hearts", "diams"
297 static const struct html_entity_map entity_map
[] = {
298 { cs_cp866
, 0x80, 0xff, ent_cp_866
},
299 { cs_cp1251
, 0x80, 0xff, ent_cp_1251
},
300 { cs_cp1252
, 0x80, 0x9f, ent_cp_1252
},
301 { cs_cp1252
, 0xa0, 0xff, ent_iso_8859_1
},
302 { cs_8859_1
, 0xa0, 0xff, ent_iso_8859_1
},
303 { cs_8859_5
, 0xa0, 0xff, ent_iso_8859_5
},
304 { cs_8859_15
, 0xa0, 0xff, ent_iso_8859_15
},
305 { cs_utf_8
, 0xa0, 0xff, ent_iso_8859_1
},
306 { cs_utf_8
, 338, 402, ent_uni_338_402
},
307 { cs_utf_8
, 710, 732, ent_uni_spacing
},
308 { cs_utf_8
, 913, 982, ent_uni_greek
},
309 { cs_utf_8
, 8194, 8260, ent_uni_punct
},
310 { cs_utf_8
, 8364, 8364, ent_uni_euro
},
311 { cs_utf_8
, 8465, 8501, ent_uni_8465_8501
},
312 { cs_utf_8
, 8592, 9002, ent_uni_8592_9002
},
313 { cs_utf_8
, 9674, 9674, ent_uni_9674
},
314 { cs_utf_8
, 9824, 9830, ent_uni_9824_9830
},
315 { cs_big5
, 0xa0, 0xff, ent_iso_8859_1
},
316 { cs_gb2312
, 0xa0, 0xff, ent_iso_8859_1
},
317 { cs_big5hkscs
, 0xa0, 0xff, ent_iso_8859_1
},
318 { cs_sjis
, 0xa0, 0xff, ent_iso_8859_1
},
319 { cs_eucjp
, 0xa0, 0xff, ent_iso_8859_1
},
320 /* Missing support for these at the moment
321 { cs_koi8r, 0xa3, 0xff, ent_koi8r },
322 { cs_macroman, 0x0b, 0xff, ent_macroman },
327 static const struct {
329 entity_charset charset
;
331 { "ISO-8859-1", cs_8859_1
},
332 { "ISO8859-1", cs_8859_1
},
333 { "ISO-8859-5", cs_8859_5
},
334 { "ISO8859-5", cs_8859_5
},
335 { "ISO-8859-15", cs_8859_15
},
336 { "ISO8859-15", cs_8859_15
},
337 { "utf-8", cs_utf_8
},
338 { "cp866", cs_cp866
},
340 { "ibm866", cs_cp866
},
341 { "cp1251", cs_cp1251
},
342 { "Windows-1251", cs_cp1251
},
343 { "win-1251", cs_cp1251
},
344 { "cp1252", cs_cp1252
},
345 { "Windows-1252", cs_cp1252
},
346 { "1252", cs_cp1252
},
349 { "GB2312", cs_gb2312
},
350 { "936", cs_gb2312
},
351 { "BIG5-HKSCS", cs_big5hkscs
},
352 { "Shift_JIS", cs_sjis
},
355 { "EUCJP", cs_eucjp
},
356 /* Missing support for these at the moment
357 { "EUC-JP", cs_eucjp },
358 { "KOI8-R", cs_koi8r },
359 { "koi8-ru", cs_koi8r },
360 { "koi8r", cs_koi8r },
361 { "MacRoman", cs_macroman },
366 ///////////////////////////////////////////////////////////////////////////////
368 entity_charset
determine_charset(const char *charset_hint
) {
369 entity_charset charset
= cs_unknown
;
371 if (charset_hint
== nullptr) {
376 size_t len
= strlen(charset_hint
);
378 /* now walk the charset map and look for the codeset */
379 for (int i
= 0; charset_map
[i
].codeset
; i
++) {
380 if (len
== strlen(charset_map
[i
].codeset
) &&
381 strncasecmp(charset_hint
, charset_map
[i
].codeset
, len
) == 0) {
382 charset
= charset_map
[i
].charset
;
/*
 * Encode the code point k as a NUL-terminated UTF-8 style byte sequence.
 *
 * buf receives the encoded bytes followed by a '\0'; it must have room for
 * at least 7 bytes (the callers in this file pass a 10-byte buffer). The
 * return value is the number of encoded bytes, not counting the terminator.
 *
 * No validation is performed: values above 0x10ffff are still encoded using
 * the historical 5- and 6-byte forms, and surrogates are encoded as-is.
 * Any k below 0x80 (including a negative k) is stored as a single byte.
 */
static int utf32_to_utf8(unsigned char *buf, int k) {
  int retval = 0;

  if (k < 0x80) {
    /* 0xxxxxxx */
    buf[0] = k;
    retval = 1;
  } else if (k < 0x800) {
    /* 110xxxxx 10xxxxxx */
    buf[0] = 0xc0 | (k >> 6);
    buf[1] = 0x80 | (k & 0x3f);
    retval = 2;
  } else if (k < 0x10000) {
    /* 1110xxxx 10xxxxxx 10xxxxxx */
    buf[0] = 0xe0 | (k >> 12);
    buf[1] = 0x80 | ((k >> 6) & 0x3f);
    buf[2] = 0x80 | (k & 0x3f);
    retval = 3;
  } else if (k < 0x200000) {
    /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
    buf[0] = 0xf0 | (k >> 18);
    buf[1] = 0x80 | ((k >> 12) & 0x3f);
    buf[2] = 0x80 | ((k >> 6) & 0x3f);
    buf[3] = 0x80 | (k & 0x3f);
    retval = 4;
  } else if (k < 0x4000000) {
    /* 5-byte form (outside the Unicode range) */
    buf[0] = 0xf8 | (k >> 24);
    buf[1] = 0x80 | ((k >> 18) & 0x3f);
    buf[2] = 0x80 | ((k >> 12) & 0x3f);
    buf[3] = 0x80 | ((k >> 6) & 0x3f);
    buf[4] = 0x80 | (k & 0x3f);
    retval = 5;
  } else {
    /* 6-byte form (outside the Unicode range) */
    buf[0] = 0xfc | (k >> 30);
    buf[1] = 0x80 | ((k >> 24) & 0x3f);
    buf[2] = 0x80 | ((k >> 18) & 0x3f);
    buf[3] = 0x80 | ((k >> 12) & 0x3f);
    buf[4] = 0x80 | ((k >> 6) & 0x3f);
    buf[5] = 0x80 | (k & 0x3f);
    retval = 6;
  }

  /* Callers copy retval + 1 bytes or treat buf as a C string. */
  buf[retval] = '\0';
  return retval;
}
432 using HtmlEntityMap
= hphp_const_char_map
<std::string
>;
434 static volatile bool EntityMapInited
= false;
435 static Mutex EntityMapMutex
;
436 static HtmlEntityMap EntityMap
[cs_end
];
437 static HtmlEntityMap XHPEntityMap
[cs_end
];
439 static void init_entity_table() {
440 for (unsigned int i
= 0; entity_map
[i
].charset
!= cs_terminator
; i
++) {
441 const html_entity_map
&em
= entity_map
[i
];
442 const entity_charset charset
= entity_map
[i
].charset
;
445 for (int ch
= em
.basechar
; ch
<= em
.endchar
; ch
++, index
++) {
446 const char *entity
= em
.table
[index
];
447 if (entity
== nullptr) {
450 unsigned char buf
[10];
464 utf32_to_utf8(buf
, ch
);
470 EntityMap
[charset
][entity
] = (const char *)buf
;
471 XHPEntityMap
[charset
][entity
] = (const char *)buf
;
474 EntityMap
[charset
]["quot"] = "\"";
475 EntityMap
[charset
]["lt"] = "<";
476 EntityMap
[charset
]["gt"] = ">";
477 EntityMap
[charset
]["amp"] = "&";
479 XHPEntityMap
[charset
]["quot"] = "\"";
480 XHPEntityMap
[charset
]["lt"] = "<";
481 XHPEntityMap
[charset
]["gt"] = ">";
482 XHPEntityMap
[charset
]["amp"] = "&";
483 // XHP-specific entities
484 XHPEntityMap
[charset
]["apos"] = "\'";
485 XHPEntityMap
[charset
]["cloud"] = (const char *)u8
"\u2601";
486 XHPEntityMap
[charset
]["umbrella"] = (const char *)u8
"\u2602";
487 XHPEntityMap
[charset
]["snowman"] = (const char *)u8
"\u2603";
488 XHPEntityMap
[charset
]["snowflake"] = (const char *)u8
"\u2745";
489 XHPEntityMap
[charset
]["comet"] = (const char *)u8
"\u2604";
490 XHPEntityMap
[charset
]["thunderstorm"] = (const char *)u8
"\u2608";
493 // the first element is an empty table
494 EntityMap
[cs_terminator
]["quot"] = "\"";
495 EntityMap
[cs_terminator
]["lt"] = "<";
496 EntityMap
[cs_terminator
]["gt"] = ">";
497 EntityMap
[cs_terminator
]["amp"] = "&";
498 // XHP-specific entities
499 XHPEntityMap
[cs_terminator
]["apos"] = "\'";
500 XHPEntityMap
[cs_terminator
]["cloud"] = (const char *)u8
"\u2601";
501 XHPEntityMap
[cs_terminator
]["umbrella"] = (const char *)u8
"\u2602";
502 XHPEntityMap
[cs_terminator
]["snowman"] = (const char *)u8
"\u2603";
503 XHPEntityMap
[cs_terminator
]["snowflake"] = (const char *)u8
"\u2745";
504 XHPEntityMap
[cs_terminator
]["comet"] = (const char *)u8
"\u2604";
505 XHPEntityMap
[cs_terminator
]["thunderstorm"] = (const char *)u8
"\u2608";
508 ///////////////////////////////////////////////////////////////////////////////
509 inline static bool decode_entity(char *entity
, int *len
,
510 bool decode_double_quote
,
511 bool decode_single_quote
,
512 entity_charset charset
, bool all
,
514 // entity is 16 bytes, allocated statically below
516 assert(entity
&& *entity
);
517 if (entity
[0] == '#') {
519 if (entity
[1] == 'x' || entity
[1] == 'X') {
520 if (!isxdigit(entity
[2])) return false;
521 code
= strtol(entity
+ 2, nullptr, 16);
523 if (!isdigit(entity
[1])) return false;
524 code
= strtol(entity
+ 1, nullptr, 10);
527 // since we don't support multibyte chars other than utf-8
530 if (code
== 39 && decode_single_quote
) {
537 if (!all
&& (code
!= '&') &&
538 (code
!= '<') && (code
!= '>') &&
539 (code
!= '"') && (code
!= '\'')) {
540 // htmlspecialchars_decode() does not parse numeric
541 // entities other than & < > " '
548 unsigned char buf
[10];
549 int size
= utf32_to_utf8(buf
, code
);
550 memcpy(entity
, buf
, size
+ 1);
558 if ((code
>= 0x80 && code
< 0xa0) || code
> 0xff) {
605 HtmlEntityMap
*entityMap
;
607 if (strncasecmp(entity
, "quot", 4) == 0 && !decode_double_quote
) {
612 entityMap
= xhp
? &XHPEntityMap
[charset
] : &EntityMap
[charset
];
614 entityMap
= xhp
? &XHPEntityMap
[cs_terminator
]
615 : &EntityMap
[cs_terminator
];
617 HtmlEntityMap::const_iterator iter
= entityMap
->find(entity
);
618 if (iter
!= entityMap
->end()) {
619 memcpy(entity
, iter
->second
.c_str(), iter
->second
.length() + 1);
620 *len
= iter
->second
.length();
628 inline static bool encode_entity(char* buf
, int* buflen
,
629 const char* entity
, bool utf8
) {
630 entity_charset charset
= cs_utf_8
;
631 if (!utf8
){ charset
= cs_8859_1
; }
633 HtmlEntityMap
*entityMap
= &EntityMap
[charset
];
635 for(HtmlEntityMap::const_iterator iter
= entityMap
->begin();
636 iter
!= entityMap
->end(); iter
++) {
637 if (strcmp(iter
->second
.c_str(), entity
) == 0) {
638 memcpy(buf
, iter
->first
, strlen(iter
->first
));
639 *buflen
= strlen(iter
->first
);
646 char *string_html_encode(const char *input
, int &len
,
647 const int64_t qsBitmask
, bool utf8
,
648 bool dEncode
, bool htmlEnt
) {
651 * Though seems to be wasting memory a lot, we have to realize most of the
652 * time this function is called with small strings, or fragments of HTMLs.
653 * Allocating/deallocating anything less than 1K is trivial these days, and
654 * we want avoid string copying as much as possible. Of course, the return
655 * char * is really sent back at large, occupying unnecessary space for
656 * potentially longer time than we need, we have to realize the two closest
657 * solutions are not that much better, either:
659 * 1. pre-calculate size by iterating through the string once: too time
661 * 2. take a guess and double buffer size when over: still wasting, and
662 * it may not save that much.
664 * Note: Amount of allocation per character to be encoded may have to be
665 * increased as larger HTML Entities are implemented.
667 char *ret
= (char *)malloc(len
* 14uL + 1);
672 for (const char *p
= input
, *end
= input
+ len
; p
< end
; p
++) {
673 unsigned char c
= *p
;
678 if (qsBitmask
& static_cast<int64_t>(EntBitmask::ENT_BM_DOUBLE
)) {
679 *q
++ = '&'; *q
++ = 'q'; *q
++ = 'u'; *q
++ = 'o'; *q
++ = 't'; *q
++ = ';';
685 if (qsBitmask
& static_cast<int64_t>(EntBitmask::ENT_BM_SINGLE
)) {
687 if ((qsBitmask
& static_cast<int64_t>(EntBitmask::ENT_BM_XML1
))) {
688 *q
++ = 'a'; *q
++ = 'p'; *q
++ = 'o'; *q
++ = 's';
690 *q
++ = '#'; *q
++ = '0'; *q
++ = '3'; *q
++ = '9';
698 *q
++ = '&'; *q
++ = 'l'; *q
++ = 't'; *q
++ = ';';
701 *q
++ = '&'; *q
++ = 'g'; *q
++ = 't'; *q
++ = ';';
707 html_get_entity_map();
710 for (const char *t
= p
; *t
; t
++) {
717 buf
= (char* )malloc(l
+ 1);
723 if (decode_entity(buf
, &l
, true, true,
727 for(const char *s
= p
; s
<= t
; s
++) {
741 *q
++ = '&'; *q
++ = 'a'; *q
++ = 'm'; *q
++ = 'p'; *q
++ = ';';
744 *q
++ = '&'; *q
++ = 'a'; *q
++ = 'm'; *q
++ = 'p'; *q
++ = ';';
747 case static_cast<unsigned char>('\xc2'):
748 if (htmlEnt
&& utf8
&& p
!= end
&& *(p
+1) == '\xa0') {
749 *q
++ = '&'; *q
++ = 'n'; *q
++ = 'b'; *q
++ = 's'; *q
++ = 'p'; *q
++ = ';';
755 if (LIKELY(c
< 0x80)) {
758 } else if (htmlEnt
&& !utf8
&& (c
- 160) < sizeof(ent_iso_8859_1
) - 1) {
760 * https://github.com/facebook/hhvm/issues/2186
761 * If not UTF8, and we are converting to HTML entities, use known
762 * entity equivalent of the character, if possible.
763 * Since we only support ISO-8859-1 or UTF8 right now, and they use
764 * the same mapping array, use it.
765 * Start at 0xA0 = 160
768 const char *s
= ent_iso_8859_1
[c
- 160];
769 int len_2
= strlen(s
);
770 for (int n
= 0; n
< len_2
; n
++) {
778 qsBitmask
& static_cast<int64_t>(EntBitmask::ENT_BM_IGNORE
);
779 bool should_replace
=
780 qsBitmask
& static_cast<int64_t>(EntBitmask::ENT_BM_SUBSTITUTE
);
782 if (!utf8
&& should_skip
) {
787 auto avail
= end
- p
;
788 auto utf8_trail
= [](unsigned char c
) { return c
>= 0x80 && c
<= 0xbf; };
789 auto utf8_lead
= [](unsigned char c
) {
790 return c
< 0x80 || (c
>= 0xC2 && c
<= 0xF4);
793 // This has to be a macro since it needs to be able to break away from
794 // the for loop we're in.
795 // ENT_IGNORE has higher precedence than ENT_SUBSTITUTE
796 // \uFFFD is Unicode Replacement Character (U+FFFD)
797 #define UTF8_ERROR_IF_LEN(cond, len) \
800 if (should_skip) { break; } \
801 else if (should_replace) { strcpy(q, (const char *)u8"\uFFFD"); q += 3; break; } \
802 else { goto exit_error; } \
805 #define UTF8_ERROR_IF(cond) UTF8_ERROR_IF_LEN(cond, 1)
810 } else if (c
< 0xe0) {
811 UTF8_ERROR_IF(avail
< 2);
812 UTF8_ERROR_IF_LEN(!utf8_trail(*(p
+ 1)), utf8_lead(*(p
+ 1)) ? 1 : 2);
814 uint16_t tc
= ((c
& 0x1f) << 6) | (p
[1] & 0x3f);
815 UTF8_ERROR_IF_LEN(tc
< 0x80, 2); // non-shortest form
819 entity
[1] = *(p
+ 1);
821 } else if (c
< 0xf0) {
822 if (avail
< 3 || !utf8_trail(*(p
+ 1)) || !utf8_trail(*(p
+ 2))) {
823 UTF8_ERROR_IF_LEN(avail
< 2 || utf8_lead(*(p
+ 1)), 1);
824 UTF8_ERROR_IF_LEN(avail
< 3 || utf8_lead(*(p
+ 2)), 2);
825 UTF8_ERROR_IF_LEN(true, 3);
828 uint32_t tc
= ((c
& 0x0f) << 12) |
829 ((*(p
+1) & 0x3f) << 6) |
831 UTF8_ERROR_IF_LEN(tc
< 0x800, 3); // non-shortest form
832 UTF8_ERROR_IF_LEN(tc
>= 0xd800 && tc
<= 0xdfff, 3); // surrogate
836 entity
[1] = *(p
+ 1);
837 entity
[2] = *(p
+ 2);
839 } else if (c
< 0xf5) {
840 if (avail
< 4 || !utf8_trail(*(p
+ 1)) || !utf8_trail(*(p
+ 2)) ||
841 !utf8_trail(*(p
+ 3))) {
842 UTF8_ERROR_IF_LEN(avail
< 2 || utf8_lead(*(p
+ 1)), 1);
843 UTF8_ERROR_IF_LEN(avail
< 3 || utf8_lead(*(p
+ 2)), 2);
844 UTF8_ERROR_IF_LEN(avail
< 4 || utf8_lead(*(p
+ 3)), 3);
845 UTF8_ERROR_IF_LEN(true, 4);
848 uint32_t tc
= ((c
& 0x07) << 18) |
849 ((*(p
+1) & 0x3f) << 12) |
850 ((*(p
+2) & 0x3f) << 6) |
853 // non-shortest form or outside range
854 UTF8_ERROR_IF_LEN(tc
< 0x10000 || tc
> 0x10ffff, 4);
858 entity
[1] = *(p
+ 1);
859 entity
[2] = *(p
+ 2);
860 entity
[3] = *(p
+ 3);
872 html_get_entity_map();
878 if (encode_entity(buf
, &len_2
, const_cast<char*>(entity
), utf8
)) {
881 for (int n
= 0; n
< len_2
; n
++) {
886 memcpy(q
, p
, codeLength
);
890 memcpy(q
, p
, codeLength
);
902 #undef UTF8_ERROR_IF_LEN
904 if (q
- ret
> INT_MAX
) {
916 char *string_html_encode_extra(const char *input
, int &len
,
917 StringHtmlEncoding flags
,
918 const AsciiMap
*asciiMap
) {
921 * Though seems to be wasting memory a lot, we have to realize most of the
922 * time this function is called with small strings, or fragments of HTMLs.
923 * Allocating/deallocating anything less than 1K is trivial these days, and
924 * we want avoid string copying as much as possible. Of course, the return
925 * char * is really sent back at large, occupying unnecessary space for
926 * potentially longer time than we need, we have to realize the two closest
927 * solutions are not that much better, either:
929 * 1. pre-calculate size by iterating through the string once: too time
931 * 2. take a guess and double buffer size when over: still wasting, and
932 * it may not save that much.
934 char *ret
= (char *)malloc(len
* 8uL + 1);
939 const char *rep
= (const char *)u8
"\ufffd";
941 for (srcPosBytes
= 0; srcPosBytes
< len
; /* incremented in-loop */) {
942 unsigned char c
= input
[srcPosBytes
];
944 srcPosBytes
++; // Optimize US-ASCII case
945 if ((asciiMap
->map
[c
& 64 ? 1 : 0] >> (c
& 63)) & 1) {
948 *q
++ = '&'; *q
++ = 'q'; *q
++ = 'u';
949 *q
++ = 'o'; *q
++ = 't'; *q
++ = ';';
952 *q
++ = '&'; *q
++ = '#'; *q
++ = '0';
953 *q
++ = '3'; *q
++ = '9'; *q
++ = ';';
956 *q
++ = '&'; *q
++ = 'l'; *q
++ = 't'; *q
++ = ';';
959 *q
++ = '&'; *q
++ = 'g'; *q
++ = 't'; *q
++ = ';';
962 *q
++ = '&'; *q
++ = 'a'; *q
++ = 'm'; *q
++ = 'p'; *q
++ = ';';
965 *q
++ = '&'; *q
++ = '#';
966 *q
++ = c
>= 100 ? '1' : '0';
967 *q
++ = ((c
/ 10) % 10) + '0';
968 *q
++ = (c
% 10) + '0';
975 } else if (flags
& STRING_HTML_ENCODE_UTF8
) {
976 UChar32 curCodePoint
;
977 U8_NEXT(input
, srcPosBytes
, len
, curCodePoint
);
978 if ((flags
& STRING_HTML_ENCODE_NBSP
) && curCodePoint
== 0xC2A0) {
979 *q
++ = '&'; *q
++ = 'n'; *q
++ = 'b'; *q
++ = 's'; *q
++ = 'p'; *q
++ = ';';
980 } else if (curCodePoint
<= 0) {
981 if (flags
& STRING_HTML_ENCODE_UTF8IZE_REPLACE
) {
982 if (flags
& STRING_HTML_ENCODE_HIGH
) {
983 *q
++ = '&'; *q
++ = '#'; *q
++ = 'x';
984 *q
++ = 'f'; *q
++ = 'f'; *q
++ = 'f'; *q
++ = 'd';
988 while (*r
) *q
++ = *r
++;
991 } else if (flags
& STRING_HTML_ENCODE_HIGH
) {
992 q
+= sprintf(q
, "&#x%x;", curCodePoint
);
995 U8_APPEND_UNSAFE(q
, pos
, curCodePoint
);
999 srcPosBytes
++; // Optimize US-ASCII case
1001 *q
++ = '&'; *q
++ = 'n'; *q
++ = 'b'; *q
++ = 's'; *q
++ = 'p'; *q
++ = ';';
1002 } else if (flags
& STRING_HTML_ENCODE_HIGH
) {
1003 *q
++ = '&'; *q
++ = '#';
1004 *q
++ = c
>= 200 ? '2' : '1';
1005 *q
++ = ((c
/ 10) % 10) + '0';
1006 *q
++ = (c
% 10) + '0';
1013 if (q
- ret
> INT_MAX
) {
1022 char *string_html_decode(const char *input
, int &len
,
1023 bool decode_double_quote
, bool decode_single_quote
,
1024 const char *charset_hint
, bool all
,
1025 bool xhp
/* = false */) {
1028 if (!EntityMapInited
) {
1029 Lock
lock(EntityMapMutex
);
1030 if (!EntityMapInited
) {
1031 init_entity_table();
1032 EntityMapInited
= true;
1036 entity_charset charset
= determine_charset(charset_hint
);
1037 if (charset
== cs_unknown
) {
1041 char *ret
= (char *)malloc(len
+ 1);
1043 for (const char *p
= input
; *p
|| UNLIKELY(p
- input
< len
); p
++) {
1052 for (const char *t
= p
; *t
; t
++) {
1056 char sbuf
[16] = {0};
1059 buf
= (char* )malloc(l
+ 1);
1065 if (decode_entity(buf
, &l
, decode_double_quote
, decode_single_quote
,
1066 charset
, all
, xhp
)) {
1081 *q
++ = '&'; // not an entity
1089 const html_entity_map
* html_get_entity_map() {
1090 if (!EntityMapInited
) {
1091 Lock
lock(EntityMapMutex
);
1092 if (!EntityMapInited
) {
1093 init_entity_table();
1094 EntityMapInited
= true;
1100 ///////////////////////////////////////////////////////////////////////////////