src/intl/charsets.h

   1 #ifndef EL__INTL_CHARSETS_H
   2 #define EL__INTL_CHARSETS_H
   3
   4 struct hash;
   5
   6 /* The TRE check in configure.in assumes unicode_val_T is uint32_t.  */
   7 typedef uint32_t unicode_val_T;
   8
   9 /* U+0020 SPACE.  Normally the same as ' ' or L' ' but perhaps ELinks
  10  * shouldn't rely on that.  */
  11 #define UCS_SPACE ((unicode_val_T) 0x0020)
  12
  13 /* U+00A0 NO-BREAK SPACE.  */
  14 #define UCS_NO_BREAK_SPACE ((unicode_val_T) 0x00A0)
  15
  16 /* U+00AD SOFT HYPHEN.  */
  17 #define UCS_SOFT_HYPHEN ((unicode_val_T) 0x00AD)
  18
  19 /* U+FFFD REPLACEMENT CHARACTER.  Used when no Unicode mapping is
  20  * known for a byte in a codepage, or when invalid UTF-8 is received
  21  * from a terminal.  After generating the character, ELinks then
  22  * treats it like any other Unicode character.  The user can also type
  23  * this character directly, and it can occur in documents.  */
  24 #define UCS_REPLACEMENT_CHARACTER ((unicode_val_T) 0xFFFD)
  25
  26 /* A special value that fits in unicode_val_T but is outside the range
  27  * of Unicode characters.  utf8_to_unicode and cp_to_unicode return
  28  * this if the input is too short.  This is also used as a placeholder
  29  * for the second cell of a double-cell character.  */
  30 #define UCS_NO_CHAR ((unicode_val_T) 0xFFFFFFFD)
  31
  32 #ifdef CONFIG_COMBINE
     /* Upper end of a block of values reserved near the top of
      * unicode_val_T: one below UCS_NO_CHAR (0xFFFFFFFD) and far outside
      * the range of Unicode characters.
      * NOTE(review): presumably these values stand for combined character
      * sequences handed out by get_combined() -- confirm in charsets.c.  */
  33 #define UCS_END_COMBINED ((unicode_val_T) 0xFFFFFFFC)
  34
     /* Lower end of that block: 10000 below UCS_END_COMBINED.  */
  35 #define UCS_BEGIN_COMBINED ((unicode_val_T) (UCS_END_COMBINED - (unicode_val_T) 10000))
  36
  37 /* Base character and up to 5 combining characters. */
  38 #define UCS_MAX_LENGTH_COMBINED 6
  39 #endif /* CONFIG_COMBINE */
  40
  41 /* If ELinks should display a double-cell character but there is only
  42  * one cell available, it displays this character instead.  This must
  43  * be a single-cell character but need not be unique.  Possible values
  44  * might be U+0020 SPACE or U+303F IDEOGRAPHIC HALF FILL SPACE.
  45  *
  46  * Some BFU widgets (at least input fields and list boxes) currently
  47  * ignore this setting and use U+0020 instead.  (They first draw spaces
  48  * and then overwrite with text; look for utf8_cells2bytes calls.)
  49  * We should fix that if we ever change the value.  */
  50 #define UCS_ORPHAN_CELL ((unicode_val_T) 0x20)
  51
  52 /* &nbsp; replacement character. See u2cp().
  53  * UTF-8 strings should use the encoding of U+00A0 instead. */
  54 #define NBSP_CHAR ((unsigned char) 1)
  55 #define NBSP_CHAR_STRING "\001"
  56
  57 /* How to convert a byte from a source charset.  This is used in an
  58  * array (struct conv_table[256]) indexed by the byte value.  */
  59 struct conv_table {
  60         /* 0 if this is the final byte of a character, or 1 if more
  61          * bytes are needed.  Selects which member of @u is valid.  */
  62         int t;
  63         union {
  64                 /* If @t==0: a null-terminated string that is the
  65                  * corresponding character in the target charset.
  66                  * Normally, the string is statically allocated.
  67                  * However, if the conversion table is to UTF-8, then
  68                  * the strings in elements 0x80 to 0xFF are allocated
  69                  * with @mem_alloc and owned by the table.  */
  70                 const unsigned char *str;
  71                 /* If @t==1: a pointer to a nested conversion table
  72                  * (with 256 elements) that describes how to convert
  73                  * each possible subsequent byte.  The conversion
  74                  * table owns the nested conversion table.  */
  75                 struct conv_table *tbl;
  76         } u;
             /* NOTE(review): not documented in this header.  The name
              * suggests an iconv-related codepage value -- confirm its
              * meaning and valid range in charsets.c.  */
  77         int iconv_cp;
  78 };
  79
  80 enum convert_string_mode {
             /* Passed as the @mode argument of convert_string() to select
              * which characters get converted.
              * NOTE(review): this header does not show how CSM_QUERY and
              * CSM_FORM differ from each other -- confirm in charsets.c.  */
  81         CSM_DEFAULT, /* Convert any char. */
  82         CSM_QUERY, /* Special handling of '&' and '=' chars. */
  83         CSM_FORM, /* Special handling of '&' and '=' chars in forms. */
  84         CSM_NONE, /* Convert nothing. */
  85 };
  86
  87 /* How to translate U+00A0 NO-BREAK SPACE.  If u2cp_ is converting to
  88  * UTF-8, it ignores this choice and just encodes the U+00A0.
      * This is the last argument of u2cp_(); the u2cp() and u2cp_no_nbsp()
      * macros pass NBSP_MODE_HACK and NBSP_MODE_ASCII respectively.  */
  89 enum nbsp_mode {
  90         /* Convert to NBSP_CHAR.  This lets the HTML renderer
  91          * recognize nbsp even if the codepage doesn't support
  92          * nbsp.  (VISCII doesn't.)  */
  93         NBSP_MODE_HACK = 0,
  94
  95         /* Convert to normal ASCII space.  */
  96         NBSP_MODE_ASCII = 1
  97 };
  98
  99 struct conv_table *get_translation_table(int, int);
 100 const unsigned char *get_entity_string(const unsigned char *str,
 101                                        const int strlen, int encoding);
 102
 103 /* The convert_string() name is also used by Samba (version 3.0.3), which
 104  * provides libnss_wins.so.2, which is called somewhere inside
 105  * _nss_wins_gethostbyname_r(). This name clash causes the elinks hostname
 106  * lookup thread to crash so we need to rename the symbol. */
 107 /* Reported by Derek Poon and filed as bug 453 */
 108 #undef convert_string
 109 #define convert_string convert_string_elinks
 110
 111 /* This routine converts a string from one charset to another according to the
 112  * passed @convert_table, potentially also decoding SGML (HTML) entities along
 113  * the way (according to @mode). It either returns a dynamically allocated
 114  * converted string and stores its length in *@length, or, if @callback is
 115  * non-NULL, calls it every few bytes instead and always returns NULL (*@length
 116  * is then undefined). It's ok not to care and pass NULL as @length. */
 117 unsigned char *convert_string(struct conv_table *convert_table,
 118                               unsigned char *chars, int charslen, int cp,
 119                               enum convert_string_mode mode, int *length,
 120                               void (*callback)(void *data, unsigned char *buf, int buflen),
 121                               void *callback_data);
 122
 123 int get_cp_index(const unsigned char *);
 124 unsigned char *get_cp_name(int);
 125 unsigned char *get_cp_config_name(int);
 126 unsigned char *get_cp_mime_name(int);
 127 const uint16_t *get_cp_highhalf(const unsigned char *);
 128
 129 int is_cp_utf8(int);
 130 void free_conv_table(void);
 131 unsigned char *encode_utf8(unicode_val_T);
 132 #ifdef CONFIG_UTF8
 133 unsigned char *utf8_prevchar(unsigned char *, int, unsigned char *);
 134 int utf8charlen(const unsigned char *);
 135 int utf8_char2cells(unsigned char *, unsigned char *);
 136 int utf8_ptr2cells(unsigned char *, unsigned char *);
 137 int utf8_ptr2chars(unsigned char *, unsigned char *);
 138 int utf8_cells2bytes(unsigned char *, int, unsigned char *);
 139 /* How utf8_step_forward and utf8_step_backward count steps (4th argument).  */
 140 enum utf8_step {
 141         /* Each step is one character, even if it is a combining or
 142          * double-width character.  */
 143         UTF8_STEP_CHARACTERS,
 144
 145         /* Each step is one cell.  If the specified number of steps
 146          * would end in the middle of a double-width character, do not
 147          * include the character.  */
 148         UTF8_STEP_CELLS_FEWER,
 149
 150         /* Each step is one cell.  If the specified number of steps
 151          * would end in the middle of a double-width character,
 152          * include the whole character.  */
 153         UTF8_STEP_CELLS_MORE
 154 };
 155 unsigned char *utf8_step_forward(unsigned char *, unsigned char *,
 156                                  int, enum utf8_step, int *);
 157 unsigned char *utf8_step_backward(unsigned char *, unsigned char *,
 158                                   int, enum utf8_step, int *);
 159 int unicode_to_cell(unicode_val_T);
 160 unicode_val_T unicode_fold_label_case(unicode_val_T);
 161 int strlen_utf8(unsigned char **);
 162 #endif /* CONFIG_UTF8 */
 163 unicode_val_T utf8_to_unicode(unsigned char **, const unsigned char *);
 164 unicode_val_T cp_to_unicode(int, unsigned char **, const unsigned char *);
     /* Both of the above return UCS_NO_CHAR if the input is too short.
      * NOTE(review): the pointer pair looks like a read cursor plus an
      * end-of-buffer pointer, and the int like a codepage index --
      * confirm in charsets.c.  */
 165
 166 #ifdef CONFIG_COMBINE
 167 extern unicode_val_T last_combined;
 168 extern unicode_val_T **combined;
 169 extern struct hash *combined_hash;
 170 unicode_val_T get_combined(unicode_val_T *, int);
 171 void free_combined(void); /* (void): a real prototype, takes no arguments */
 172 #endif /* CONFIG_COMBINE */
 173
 174 unicode_val_T cp2u(int, unsigned char);
 175 const unsigned char *cp2utf8(int, int);
 176
 177 const unsigned char *u2cp_(unicode_val_T, int, enum nbsp_mode);
     /* Wrappers around u2cp_() that fix its nbsp_mode argument: u2cp()
      * passes NBSP_MODE_HACK (U+00A0 becomes NBSP_CHAR), u2cp_no_nbsp()
      * passes NBSP_MODE_ASCII (U+00A0 becomes a normal ASCII space).
      * NOTE(review): @to is presumably the index of the target codepage
      * -- confirm in charsets.c.  */
 178 #define u2cp(u, to) u2cp_(u, to, NBSP_MODE_HACK)
 179 #define u2cp_no_nbsp(u, to) u2cp_(u, to, NBSP_MODE_ASCII)
 180
 181 void init_charsets_lookup(void);
 182 void free_charsets_lookup(void);
 183
 184 /* UTF-16 encodes each Unicode character U+0000...U+FFFF as a single
 185  * 16-bit code unit, and each character U+10000...U+10FFFF as a pair
 186  * of two code units: a high surrogate followed by a low surrogate.
 187  * The range U+D800...U+DFFF is reserved for these surrogates.
      *
      * is_utf16_surrogate       - nonzero for U+D800...U+DFFF
      * is_utf16_high_surrogate  - nonzero for U+D800...U+DBFF
      * is_utf16_low_surrogate   - nonzero for U+DC00...U+DFFF
      * join_utf16_surrogates    - the character encoded by a high/low pair
      * needs_utf16_surrogates   - nonzero for U+10000...U+10FFFF; values
      *                            below 0x10000 wrap around in the uint32_t
      *                            subtraction, so a single comparison
      *                            checks both bounds
      * get_utf16_high_surrogate,
      * get_utf16_low_surrogate  - the two code units of such a character
      *                            (0x10000 has its low 10 bits clear, so
      *                            the low surrogate can mask @u directly)
      *
      * The is_* tests assume a value of at most 32 bits.  None of the
      * macros validates its arguments; each argument is evaluated
      * exactly once.  */
 188 #define is_utf16_surrogate(u)           (((u) & 0xFFFFF800) == 0xD800)
 189 #define is_utf16_high_surrogate(u)      (((u) & 0xFFFFFC00) == 0xD800)
 190 #define is_utf16_low_surrogate(u)       (((u) & 0xFFFFFC00) == 0xDC00)
 191 #define join_utf16_surrogates(high,low) (0x10000 + (((high) - 0xD800L) << 10) + ((low) - 0xDC00))
 192 #define needs_utf16_surrogates(u)       ((uint32_t) ((u) - 0x10000) < 0x100000)
 193 #define get_utf16_high_surrogate(u)     (0xD800 + (((u) - 0x10000) >> 10))
 194 #define get_utf16_low_surrogate(u)      (0xDC00 + ((u) & 0x3FF))
 195
 196 #endif