1 #ifndef EL__INTL_CHARSETS_H
2 #define EL__INTL_CHARSETS_H
/* Type used for Unicode character values.  Keep it uint32_t: the TRE
 * check in configure.in assumes exactly that. */
typedef uint32_t unicode_val_T;
/* U+0020 SPACE.  Usually equal to ' ' and L' ', but ELinks had better
 * not depend on that. */
#define UCS_SPACE ((unicode_val_T) 0x0020)

/* U+00A0 NO-BREAK SPACE. */
#define UCS_NO_BREAK_SPACE ((unicode_val_T) 0x00A0)

/* U+00AD SOFT HYPHEN. */
#define UCS_SOFT_HYPHEN ((unicode_val_T) 0x00AD)
/* U+FFFD REPLACEMENT CHARACTER.  Generated when a codepage byte has
 * no known Unicode mapping, or when a terminal sends invalid UTF-8.
 * Once generated, ELinks handles it like any other Unicode character.
 * It can also be typed directly by the user and can appear in
 * documents. */
#define UCS_REPLACEMENT_CHARACTER ((unicode_val_T) 0xFFFD)

/* A special value that fits in unicode_val_T yet lies outside the
 * range of Unicode characters.  utf8_to_unicode and cp_to_unicode
 * return it when the input is too short.  It also serves as the
 * placeholder for the second cell of a double-cell character. */
#define UCS_NO_CHAR ((unicode_val_T) 0xFFFFFFFD)
#ifdef CONFIG_COMBINE
/* UCS_END_COMBINED is the value immediately below UCS_NO_CHAR, and
 * UCS_BEGIN_COMBINED lies 10000 below that.
 * NOTE(review): presumably the values from UCS_BEGIN_COMBINED to
 * UCS_END_COMBINED stand for combined characters (see get_combined)
 * -- confirm against the implementation. */
#define UCS_END_COMBINED ((unicode_val_T) 0xFFFFFFFC)

#define UCS_BEGIN_COMBINED ((unicode_val_T) (UCS_END_COMBINED - (unicode_val_T) 10000))

/* Base character and up to 5 combining characters. */
#define UCS_MAX_LENGTH_COMBINED 6
#endif /* CONFIG_COMBINE */
/* Displayed in place of a double-cell character when only a single
 * cell is available.  The value must be a single-cell character but
 * does not have to be unique.  Possible values might be U+0020 SPACE
 * or U+303F IDEOGRAPHIC HALF FILL SPACE.
 *
 * Some BFU widgets (at least input fields and list boxes) currently
 * ignore this setting and use U+0020 instead: they first draw spaces
 * and then overwrite them with text (look for utf8_cells2bytes
 * calls).  That should be fixed if this value is ever changed. */
#define UCS_ORPHAN_CELL ((unicode_val_T) 0x20)
/* Replacement character standing in for a no-break space (nbsp).
 * See u2cp().  UTF-8 strings should use the encoding of U+00A0
 * instead. */
#define NBSP_CHAR ((unsigned char) 1)
/* The same replacement character as a one-character string literal. */
#define NBSP_CHAR_STRING "\001"
57 /* How to convert a byte from a source charset. This is used in an
58 * array (struct conv_table[256]) indexed by the byte value. */
60 /* 0 if this is the final byte of a character, or 1 if more
61 * bytes are needed. */
64 /* If @t==0: a null-terminated string that is the
65 * corresponding character in the target charset.
66 * Normally, the string is statically allocated.
67 * However, if the conversion table is to UTF-8, then
68 * the strings in elements 0x80 to 0xFF are allocated
69 * with @mem_alloc and owned by the table. */
70 const unsigned char *str
;
71 /* If @t==1: a pointer to a nested conversion table
72 * (with 256 elements) that describes how to convert
73 * each possible subsequent byte. The conversion
74 * table owns the nested conversion table. */
75 struct conv_table
*tbl
;
/* Conversion modes; convert_string() takes one of these as its @mode
 * argument. */
enum convert_string_mode {
	CSM_DEFAULT, /* Convert any char. */
	CSM_QUERY, /* Special handling of '&' and '=' chars. */
	CSM_FORM, /* Special handling of '&' and '=' chars in forms. */
	CSM_NONE, /* Convert nothing. */
};
/* How to translate U+00A0 NO-BREAK SPACE.  If u2cp_ is converting to
 * UTF-8, it ignores this choice and just encodes the U+00A0. */
enum nbsp_mode {
	/* Convert to NBSP_CHAR.  This lets the HTML renderer
	 * recognize nbsp even if the codepage doesn't support
	 * nbsp.  (VISCII doesn't.)  The u2cp() macro selects this. */
	NBSP_MODE_HACK,

	/* Convert to normal ASCII space.  The u2cp_no_nbsp() macro
	 * selects this. */
	NBSP_MODE_ASCII
};
/* Prototypes only; both functions are defined outside this header. */
struct conv_table *get_translation_table(int, int);
const unsigned char *get_entity_string(const unsigned char *str,
				       const int strlen,
				       int encoding);
/* Samba (version 3.0.3) also uses the name convert_string().  Samba
 * provides libnss_wins.so.2, which gets called somewhere inside
 * _nss_wins_gethostbyname_r(), and the resulting name clash makes the
 * elinks hostname lookup thread crash.  The symbol is therefore
 * renamed here. */
/* Reported by Derek Poon and filed as bug 453 */
#undef convert_string
#define convert_string convert_string_elinks
/* Convert a string from one charset to another as described by
 * @convert_table.  Depending on @mode, SGML (HTML) entities may be
 * decoded along the way as well.
 *
 * When @callback is NULL, the result is a dynamically allocated
 * converted string whose length is stored in @length.  When @callback
 * is non-NULL, it is called each few bytes instead, the function
 * always returns NULL, and @length is undefined.
 *
 * Callers that do not need the length may pass NULL as @length. */
unsigned char *convert_string(struct conv_table *convert_table,
			      unsigned char *chars,
			      int charslen,
			      int cp,
			      enum convert_string_mode mode,
			      int *length,
			      void (*callback)(void *data, unsigned char *buf, int buflen),
			      void *callback_data);
/* Prototypes for the get_cp_* lookups ("cp" presumably stands for
 * codepage -- confirm with the definitions). */
int get_cp_index(const unsigned char *);
unsigned char *get_cp_name(int);
unsigned char *get_cp_config_name(int);
unsigned char *get_cp_mime_name(int);
const uint16_t *get_cp_highhalf(const unsigned char *);

void free_conv_table(void);
/* Takes a unicode_val_T and returns an unsigned char pointer
 * (presumably the UTF-8 encoding of the value -- confirm with the
 * definition). */
unsigned char *encode_utf8(unicode_val_T);
/* Prototypes of the UTF-8 string helpers; see the definitions for
 * what each argument means. */
unsigned char *utf8_prevchar(unsigned char *, int, unsigned char *);
int utf8charlen(const unsigned char *);

int utf8_char2cells(unsigned char *, unsigned char *);
int utf8_ptr2cells(unsigned char *, unsigned char *);
int utf8_ptr2chars(unsigned char *, unsigned char *);
int utf8_cells2bytes(unsigned char *, int, unsigned char *);
139 /* How utf8_step_forward and utf8_step_backward count steps. */
141 /* Each step is one character, even if it is a combining or
142 * double-width character. */
143 UTF8_STEP_CHARACTERS
,
145 /* Each step is one cell. If the specified number of steps
146 * would end in the middle of a double-width character, do not
147 * include the character. */
148 UTF8_STEP_CELLS_FEWER
,
150 /* Each step is one cell. If the specified number of steps
151 * would end in the middle of a double-width character,
152 * include the whole character. */
/* The enum utf8_step argument selects how these two functions count
 * steps. */
unsigned char *utf8_step_forward(unsigned char *, unsigned char *, int,
				 enum utf8_step, int *);
unsigned char *utf8_step_backward(unsigned char *, unsigned char *, int,
				  enum utf8_step, int *);
159 int unicode_to_cell(unicode_val_T
);
160 unicode_val_T
unicode_fold_label_case(unicode_val_T
);
161 int strlen_utf8(unsigned char **);
162 #endif /* CONFIG_UTF8 */
163 unicode_val_T
utf8_to_unicode(unsigned char **, const unsigned char *);
164 unicode_val_T
cp_to_unicode(int, unsigned char **, const unsigned char *);
#ifdef CONFIG_COMBINE
/* State and helpers for combined characters; declared here, defined
 * elsewhere. */
extern unicode_val_T last_combined;
extern unicode_val_T **combined;
extern struct hash *combined_hash;

unicode_val_T get_combined(unicode_val_T *, int);
/* Declared with (void) so that this is a real prototype and calls
 * with stray arguments are rejected by the compiler. */
void free_combined(void);
#endif /* CONFIG_COMBINE */
174 unicode_val_T
cp2u(int, unsigned char);
175 const unsigned char *cp2utf8(int, int);
177 const unsigned char *u2cp_(unicode_val_T
, int, enum nbsp_mode
);
178 #define u2cp(u, to) u2cp_(u, to, NBSP_MODE_HACK)
179 #define u2cp_no_nbsp(u, to) u2cp_(u, to, NBSP_MODE_ASCII)
/* Paired init/free entry points for the charsets lookup; both take no
 * arguments and return nothing. */
void init_charsets_lookup(void);
void free_charsets_lookup(void);
184 /* UTF-16 encodes each Unicode character U+0000...U+FFFF as a single
185 * 16-bit code unit, and each character U+10000...U+10FFFF as a pair
186 * of two code units: a high surrogate followed by a low surrogate.
187 * The range U+D800...U+DFFF is reserved for these surrogates. */
188 #define is_utf16_surrogate(u) (((u) & 0xFFFFF800) == 0xD800)
189 #define is_utf16_high_surrogate(u) (((u) & 0xFFFFFC00) == 0xD800)
190 #define is_utf16_low_surrogate(u) (((u) & 0xFFFFFC00) == 0xDC00)
191 #define join_utf16_surrogates(high,low) (0x10000 + (((high) - 0xD800L) << 10) + ((low) - 0xDC00))
192 #define needs_utf16_surrogates(u) ((uint32_t) ((u) - 0x10000) < 0x100000)
193 #define get_utf16_high_surrogate(u) (0xD800 + (((u) - 0x10000) >> 10))
194 #define get_utf16_low_surrogate(u) (0xDC00 + ((u) & 0x3FF))