2 +----------------------------------------------------------------------+
4 +----------------------------------------------------------------------+
5 | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com) |
6 | Copyright (c) 1998-2010 Zend Technologies Ltd. (http://www.zend.com) |
7 +----------------------------------------------------------------------+
8 | This source file is subject to version 2.00 of the Zend license, |
9 | that is bundled with this package in the file LICENSE, and is |
10 | available through the world-wide-web at the following url: |
11 | http://www.zend.com/license/2_00.txt. |
12 | If you did not receive a copy of the Zend license and are unable to |
13 | obtain it through the world-wide-web, please send a note to |
14 | license@zend.com so we can mail you a copy immediately. |
15 +----------------------------------------------------------------------+
18 #ifndef incl_HPHP_ZEND_HTML_H_
19 #define incl_HPHP_ZEND_HTML_H_
23 // Avoid dragging in the icu namespace.
24 #ifndef U_USING_ICU_NAMESPACE
25 #define U_USING_ICU_NAMESPACE 0
29 ///////////////////////////////////////////////////////////////////////////////
/**
 * Major departures from Zend:
 *
 * 1. We only support the UTF-8 and ISO-8859-1 encodings.
 *    The major reason for this is that the original get_next_char() bothers
 *    me, sacrificing performance for some character sets that people rarely
 *    use or that people shouldn't use. UTF-8 should really be the standard
 *    string format everywhere, and we ought to write code specialized for it
 *    to take full advantage of it: one example would be the new html encoding
 *    function that simply does *p one at a time, iterating through the string
 *    to look for special characters for entity escaping.
 *
 * 2. The HTML encoding function no longer encodes entities other than the
 *    basic ones. There is no need to encode them, since all browsers support
 *    UTF-8 natively, and we are ok to send out UTF-8 encoded characters
 *    without turning them into printable ASCII. Basic entities are encoded
 *    for a different reason! In fact, I personally don't see why the HTML
 *    spec has that extended list of entities, other than as a historical
 *    artifact.
 *
 * 3. The double encoding parameter is not supported. That really sounds like
 *    a workaround for buggy code. I haven't found a legit use for it yet.
 */
/**
 * Flag values for the `flags` argument of string_html_encode_extra().
 *
 * Each enumerator occupies a distinct bit, so values can be OR-ed together.
 * This stays a plain (unscoped) enum so the enumerators remain visible
 * without qualification and convert implicitly to integers.
 */
enum StringHtmlEncoding {
  STRING_HTML_ENCODE_UTF8            = 1 << 0,  // 1
  STRING_HTML_ENCODE_NBSP            = 1 << 1,  // 2
  STRING_HTML_ENCODE_HIGH            = 1 << 2,  // 4
  STRING_HTML_ENCODE_UTF8IZE_REPLACE = 1 << 3   // 8
};
/**
 * Quote-handling and document-mode bits for HTML entity encoding.
 *
 * This is a scoped enum, so combining or testing bits requires an explicit
 * cast to an integer type.
 * NOTE(review): presumably these are the bits carried by the `qsBitmask`
 * argument of string_html_encode() — confirm against the definition.
 */
enum class EntBitmask {
  ENT_BM_NOQUOTES   = 0,   /* leave all quotes alone */
  ENT_BM_SINGLE     = 1,   /* escape single quotes only */
  ENT_BM_DOUBLE     = 2,   /* escape double quotes only */
  ENT_BM_IGNORE     = 4,   /* silently discard invalid chars */
  ENT_BM_SUBSTITUTE = 8,   /* replace invalid chars with U+FFFD */
  ENT_BM_XML1       = 16,  /* XML1 mode */
  ENT_BM_XHTML      = 32,  /* XHTML mode */
};
// Character-set identifiers. The enum is wrapped in its own namespace so the
// cs_* enumerators do not spill into the enclosing scope; cs_terminator is
// the first enumerator and therefore has value 0.
// NOTE(review): this block is cut off in this view. The enumerator list ends
// on a dangling comma, the closing braces are missing, and cs_unknown
// (mentioned in the comment on determine_charset) does not appear. Restore
// the remaining enumerators from the upstream header; do not just close the
// enum as it stands.
74 namespace entity_charset_enum
{
75 enum entity_charset_impl
{
76 cs_terminator
, cs_8859_1
, cs_cp1252
,
77 cs_8859_15
, cs_utf_8
, cs_big5
, cs_gb2312
,
78 cs_big5hkscs
, cs_sjis
, cs_eucjp
, cs_koi8r
,
79 cs_cp1251
, cs_8859_5
, cs_cp866
, cs_macroman
,
84 typedef entity_charset_enum::entity_charset_impl entity_charset
;
// Record describing an HTML entity. `charcode` is presumably the character
// code the entity stands for — TODO confirm.
// NOTE(review): this struct is cut off in this view. Only the first member is
// visible and the closing brace is missing. Restore the rest from the
// upstream header; do not just close it as it stands.
86 struct HtmlBasicEntity
{
87 unsigned short charcode
;
/**
 * Element type of an entity table: a const pointer to const characters.
 * Neither the pointer nor the text it points to may be modified.
 */
using entity_table_t = const char* const;
95 struct html_entity_map
{
96 entity_charset charset
; /* charset identifier */
97 unsigned short basechar
; /* char code at start of table */
98 unsigned short endchar
; /* last char code in the table */
99 entity_table_t
*table
; /* the table of mappings */
102 const html_entity_map
* html_get_entity_map();
105 * returns cs_unknown iff not found;
106 * if input null, returns default charset of cs_utf_8
108 entity_charset
determine_charset(const char*);
/**
 * HTML-encodes `input`.
 *
 * @param input     text to encode
 * @param len       length, passed by reference. Presumably the input length
 *                  on entry and the result length on return — TODO confirm.
 * @param qsBitmask quote/mode bitmask. Presumably a combination of
 *                  EntBitmask values — TODO confirm.
 * @param utf8      NOTE(review): looks like it selects UTF-8 handling of
 *                  `input` — confirm.
 * @param dEncode   NOTE(review): looks like a double-encode switch — confirm.
 * @param htmlEnt   NOTE(review): meaning not visible from this header.
 * @return the encoded text. NOTE(review): buffer ownership and the
 *         failure value are not visible from this header.
 */
char* string_html_encode(const char* input, int& len,
                         const int64_t qsBitmask, bool utf8,
                         bool dEncode, bool htmlEnt);
113 char *string_html_encode_extra(const char *input
, int &len
,
114 StringHtmlEncoding flags
,
115 const AsciiMap
*asciiMap
);
/**
 * Decodes HTML entities in `input` and returns the decoded string.
 *
 * Can return nullptr if the charset could not be detected from the given
 * charset_hint. Pass nullptr as charset_hint to use the default charset
 * (UTF-8); see determine_charset.
 *
 * @param input               text to decode
 * @param len                 length, passed by reference. Presumably the
 *                            input length on entry and the result length on
 *                            return — TODO confirm.
 * @param decode_double_quote NOTE(review): presumably enables decoding of
 *                            double-quote entities — confirm.
 * @param decode_single_quote NOTE(review): presumably enables decoding of
 *                            single-quote entities — confirm.
 * @param charset_hint        charset name, or nullptr for the default
 * @param all                 NOTE(review): meaning not visible from this
 *                            header.
 * @param xhp                 NOTE(review): meaning not visible from this
 *                            header; defaults to false.
 */
char* string_html_decode(const char* input, int& len,
                         bool decode_double_quote, bool decode_single_quote,
                         const char* charset_hint,
                         bool all, bool xhp = false);
129 ///////////////////////////////////////////////////////////////////////////////