hphp/zend/zend-html.h

   1 /*
   2    +----------------------------------------------------------------------+
   3    | HipHop for PHP                                                       |
   4    +----------------------------------------------------------------------+
   5    | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com)  |
   6    | Copyright (c) 1998-2010 Zend Technologies Ltd. (http://www.zend.com) |
   7    +----------------------------------------------------------------------+
   8    | This source file is subject to version 2.00 of the Zend license,     |
   9    | that is bundled with this package in the file LICENSE, and is        |
  10    | available through the world-wide-web at the following url:           |
  11    | http://www.zend.com/license/2_00.txt.                                |
  12    | If you did not receive a copy of the Zend license and are unable to  |
  13    | obtain it through the world-wide-web, please send a note to          |
  14    | license@zend.com so we can mail you a copy immediately.              |
  15    +----------------------------------------------------------------------+
  16 */
  17
  18 #ifndef incl_HPHP_ZEND_HTML_H_
  19 #define incl_HPHP_ZEND_HTML_H_
  20
  21 #include <cstdint>
  22
  23 // Avoid dragging in the icu namespace.
  24 #ifndef U_USING_ICU_NAMESPACE
  25 #define U_USING_ICU_NAMESPACE 0
  26 #endif
  27
  28 namespace HPHP {
  29 ///////////////////////////////////////////////////////////////////////////////
  30 /**
  31  * Major departures from Zend:
  32  *
 * 1. We only support the UTF-8 and ISO-8859-1 encodings.
 *    The major reason is that the original get_next_char() bothers me:
 *    it sacrifices performance for character sets that people rarely use
 *    or shouldn't use. UTF-8 should really be the standard string
 *    format everywhere, and we ought to write code specialized for it to
 *    take full advantage of it: one example is the new HTML encoding
 *    function, which simply dereferences *p one at a time while iterating
 *    through the string to look for special characters to escape.
  41  *
 * 2. The HTML encoding function no longer encodes entities other than the
 *    basic ones. There is no need to encode them, since all browsers support
 *    UTF-8 natively, and we are ok to send out UTF-8 encoded characters
 *    without turning them into printable ASCII. Basic entities are encoded
 *    for a different reason! In fact, I personally don't see why the HTML spec
 *    has that extended list of entities, other than as a historical artifact.
  48  *
  49  * 3. Double encoding parameter is not supported. That really sounds like
  50  *    a workaround of buggy coding. I don't find a legit use for that yet.
  51  */
  52
  53 struct AsciiMap {
  54   uint64_t map[2];
  55 };
  56
  57 enum StringHtmlEncoding {
  58   STRING_HTML_ENCODE_UTF8 = 1,
  59   STRING_HTML_ENCODE_NBSP = 2,
  60   STRING_HTML_ENCODE_HIGH = 4,
  61   STRING_HTML_ENCODE_UTF8IZE_REPLACE = 8
  62 };
  63
  64 enum class EntBitmask {
  65   ENT_BM_NOQUOTES = 0,   /* leave all quotes alone */
  66   ENT_BM_SINGLE = 1,     /* escape single quotes only */
  67   ENT_BM_DOUBLE = 2,     /* escape double quotes only */
  68   ENT_BM_IGNORE = 4,     /* silently discard invalid chars */
  69   ENT_BM_SUBSTITUTE = 8, /* replace invalid chars with U+FFFD */
  70   ENT_BM_XML1 = 16,      /* XML1 mode*/
  71   ENT_BM_XHTML = 32,     /* XHTML mode */
  72 };
  73
  74 namespace entity_charset_enum {
  75 enum entity_charset_impl {
  76   cs_terminator, cs_8859_1, cs_cp1252,
  77   cs_8859_15, cs_utf_8, cs_big5, cs_gb2312,
  78   cs_big5hkscs, cs_sjis, cs_eucjp, cs_koi8r,
  79   cs_cp1251, cs_8859_5, cs_cp866, cs_macroman,
  80   cs_unknown,
  81   cs_end
  82 };
  83 }
  84 typedef entity_charset_enum::entity_charset_impl entity_charset;
  85
  86 struct HtmlBasicEntity {
  87   unsigned short charcode;
  88   const char *entity;
  89   int entitylen;
  90   int flags;
  91 };
  92
  93 typedef const char *const entity_table_t;
  94
  95 struct html_entity_map {
  96   entity_charset charset; /* charset identifier */
  97   unsigned short basechar; /* char code at start of table */
  98   unsigned short endchar;  /* last char code in the table */
  99   entity_table_t *table;   /* the table of mappings */
 100 };
 101
 102 const html_entity_map* html_get_entity_map();
 103
 104 /*
 105  * returns cs_unknown iff not found;
 106  * if input null, returns default charset of cs_utf_8
 107  */
 108 entity_charset determine_charset(const char*);
 109
 110 char *string_html_encode(const char *input, int &len,
 111                          const int64_t qsBitmask, bool utf8,
 112                          bool dEncode, bool htmlEnt);
 113 char *string_html_encode_extra(const char *input, int &len,
 114                                StringHtmlEncoding flags,
 115                                const AsciiMap *asciiMap);
 116
 117 /**
 118  * returns decoded string;
 119  * note, can return nullptr if the charset could not be detected
 120  * using the given charset_hint; can also pass in nullptr
 121  * for the charset_hint to use the default one (UTF-8).
 122  * (see determine_charset).
 123  */
 124 char *string_html_decode(const char *input, int &len,
 125                          bool decode_double_quote, bool decode_single_quote,
 126                          const char *charset_hint,
 127                          bool all, bool xhp = false );
 128
 129 ///////////////////////////////////////////////////////////////////////////////
 130 }
 131
 132 #endif