1 /* Charsets convertor */
4 #define _GNU_SOURCE /* strcasecmp() */
11 #if HAVE_LANGINFO_CODESET
28 #include "document/options.h"
29 #include "intl/charsets.h"
30 #include "util/conv.h"
31 #include "util/error.h"
32 #include "util/fastfind.h"
33 #include "util/hash.h"
34 #include "util/memory.h"
35 #include "util/string.h"
38 /* Fix namespace clash on MacOS. */
39 #define table table_elinks
43 /* This should in principle be unicode_val_T, but because all
44 * the values currently in codepage.inc fit in 16 bits, we can
45 * as well use uint16_t and halve sizeof(struct table_entry)
46 * from 8 bytes to 4. Should other characters ever be needed,
47 * unicode_val_T u : 24 might be a possibility, although it
48 * seems a little unportable as bitfields are in principle
49 * restricted to int, which may be 16-bit. */
53 struct codepage_desc
{
55 unsigned char *const *aliases
;
57 /* The Unicode mappings of codepage bytes 0x80...0xFF.
58 * (0x00...0x7F are assumed to be ASCII in all codepages.)
59 * Because all current values fit in 16 bits, we store them as
60 * uint16_t rather than unicode_val_T. If the codepage does
61 * not use some byte, then @highhalf maps that byte to 0xFFFF,
62 * which C code converts to UCS_REPLACEMENT_CHARACTER where
63 * appropriate. (U+FFFF is reserved and will never be
64 * assigned as a character.) */
65 const uint16_t *highhalf
;
67 /* If some byte in the codepage corresponds to multiple Unicode
68 * characters, then the preferred character is in @highhalf
69 * above, and the rest are listed here in @table. This table
70 * is not used for translating from the codepage to Unicode. */
71 const struct table_entry
*table
;
73 /* Whether to use iconv for translation. */
77 #include "intl/codepage.inc"
78 #include "intl/uni_7b.inc"
79 #include "intl/entity.inc"
81 /* Declare the external-linkage inline functions defined in this file.
82 * Avoid the GCC 4.3.1 warning: `foo' declared inline after being
83 * called. The functions are not declared inline in charsets.h
84 * because C99 6.7.4p6 says that every external-linkage function
85 * declared inline shall be defined in the same translation unit.
86 * The non-inline declarations in charsets.h also make sure that the
87 * compiler emits global definitions for the symbols so that the
88 * functions can be called from other translation units. */
89 NONSTATIC_INLINE
unsigned char *encode_utf8(unicode_val_T u
);
90 NONSTATIC_INLINE
int utf8charlen(const unsigned char *p
);
91 NONSTATIC_INLINE
int unicode_to_cell(unicode_val_T c
);
92 NONSTATIC_INLINE unicode_val_T
utf8_to_unicode(unsigned char **string
,
93 const unsigned char *end
);
95 static const char strings
[256][2] = {
96 "\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007",
97 "\010", "\011", "\012", "\013", "\014", "\015", "\016", "\017",
98 "\020", "\021", "\022", "\023", "\024", "\025", "\026", "\033",
99 "\030", "\031", "\032", "\033", "\034", "\035", "\036", "\033",
100 "\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047",
101 "\050", "\051", "\052", "\053", "\054", "\055", "\056", "\057",
102 "\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067",
103 "\070", "\071", "\072", "\073", "\074", "\075", "\076", "\077",
104 "\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",
105 "\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117",
106 "\120", "\121", "\122", "\123", "\124", "\125", "\126", "\127",
107 "\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",
108 "\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147",
109 "\150", "\151", "\152", "\153", "\154", "\155", "\156", "\157",
110 "\160", "\161", "\162", "\163", "\164", "\165", "\166", "\167",
111 "\170", "\171", "\172", "\173", "\174", "\175", "\176", "\177",
112 "\200", "\201", "\202", "\203", "\204", "\205", "\206", "\207",
113 "\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",
114 "\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227",
115 "\230", "\231", "\232", "\233", "\234", "\235", "\236", "\237",
116 "\240", "\241", "\242", "\243", "\244", "\245", "\246", "\247",
117 "\250", "\251", "\252", "\253", "\254", "\255", "\256", "\257",
118 "\260", "\261", "\262", "\263", "\264", "\265", "\266", "\267",
119 "\270", "\271", "\272", "\273", "\274", "\275", "\276", "\277",
120 "\300", "\301", "\302", "\303", "\304", "\305", "\306", "\307",
121 "\310", "\311", "\312", "\313", "\314", "\315", "\316", "\317",
122 "\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",
123 "\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337",
124 "\340", "\341", "\342", "\343", "\344", "\345", "\346", "\347",
125 "\350", "\351", "\352", "\353", "\354", "\355", "\356", "\357",
126 "\360", "\361", "\362", "\363", "\364", "\365", "\366", "\367",
127 "\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377",
/* Shared iconv conversion descriptor. The value (iconv_t)-1 means that no
 * descriptor is currently open; the code in this file resets it to that
 * value after calling iconv_close(). */
131 static iconv_t iconv_cd
= (iconv_t
)-1;
135 free_translation_table(struct conv_table
*p
)
139 for (i
= 0; i
< 256; i
++)
141 free_translation_table(p
[i
].u
.tbl
);
146 /* A string used in conversion tables when there is no correct
147 * conversion. This is compared by address and therefore should be a
148 * named array rather than a pointer so that it won't share storage
149 * with any other string literal that happens to have the same
151 static const unsigned char no_str
[] = "*";
154 new_translation_table(struct conv_table
*p
)
158 for (i
= 0; i
< 256; i
++)
160 free_translation_table(p
[i
].u
.tbl
);
161 for (i
= 0; i
< 128; i
++) {
163 p
[i
].u
.str
= strings
[i
];
165 for (; i
< 256; i
++) {
172 #define BIN_SEARCH(table, entry, entries, key, result) \
174 long _s = 0, _e = (entries) - 1; \
176 while (_s <= _e || !((result) = -1)) { \
177 long _m = (_s + _e) / 2; \
179 if ((table)[_m].entry == (key)) { \
183 if ((table)[_m].entry > (key)) _e = _m - 1; \
184 if ((table)[_m].entry < (key)) _s = _m + 1; \
188 static const unicode_val_T strange_chars[32] = {
189 0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
190 0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
191 0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,
192 0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 0x0000,
/* Flag bit that may be set in a codepage number. The functions in this file
 * clear it (cp &= ~SYSTEM_CHARSET_FLAG) before indexing codepages[]. */
195 #define SYSTEM_CHARSET_FLAG 128
/* True when @cp_ptr describes UTF-8. The test compares the descriptor's
 * alias list pointer against the aliases_utf8 array, i.e. by address. */
196 #define is_cp_ptr_utf8(cp_ptr) ((cp_ptr)->aliases == aliases_utf8)
198 const unsigned char *
199 u2cp_(unicode_val_T u
, int to
, enum nbsp_mode nbsp_mode
)
204 if (u
< 128) return strings
[u
];
207 u
= strange_chars
[u
- 0x80];
211 to
&= ~SYSTEM_CHARSET_FLAG
;
213 if (is_cp_ptr_utf8(&codepages
[to
]))
214 return encode_utf8(u
);
216 /* To mark non breaking spaces in non-UTF-8 strings, we use a
217 * special char NBSP_CHAR. */
218 if (u
== UCS_NO_BREAK_SPACE
) {
219 if (nbsp_mode
== NBSP_MODE_HACK
) return NBSP_CHAR_STRING
;
220 else /* NBSP_MODE_ASCII */ return " ";
222 if (u
== UCS_SOFT_HYPHEN
) return "";
225 for (j
= 0; j
< 0x80; j
++)
226 if (codepages
[to
].highhalf
[j
] == u
)
227 return strings
[0x80 + j
];
228 for (j
= 0; codepages
[to
].table
[j
].c
; j
++)
229 if (codepages
[to
].table
[j
].u
== u
)
230 return strings
[codepages
[to
].table
[j
].c
];
232 BIN_SEARCH(unicode_7b
, x
, N_UNICODE_7B
, u
, s
);
233 if (s
!= -1) return unicode_7b
[s
].s
;
/* Static scratch buffer filled by encode_utf8(): it is cleared with
 * memset(utf_buffer, 0, 7) and then receives at most six encoded bytes,
 * so the seventh byte stays zero. */
238 static unsigned char utf_buffer
[7];
240 NONSTATIC_INLINE
unsigned char *
241 encode_utf8(unicode_val_T u
)
243 memset(utf_buffer
, 0, 7);
248 utf_buffer
[0] = 0xc0 | ((u
>> 6) & 0x1f),
249 utf_buffer
[1] = 0x80 | (u
& 0x3f);
250 else if (u
< 0x10000)
251 utf_buffer
[0] = 0xe0 | ((u
>> 12) & 0x0f),
252 utf_buffer
[1] = 0x80 | ((u
>> 6) & 0x3f),
253 utf_buffer
[2] = 0x80 | (u
& 0x3f);
254 else if (u
< 0x200000)
255 utf_buffer
[0] = 0xf0 | ((u
>> 18) & 0x0f),
256 utf_buffer
[1] = 0x80 | ((u
>> 12) & 0x3f),
257 utf_buffer
[2] = 0x80 | ((u
>> 6) & 0x3f),
258 utf_buffer
[3] = 0x80 | (u
& 0x3f);
259 else if (u
< 0x4000000)
260 utf_buffer
[0] = 0xf8 | ((u
>> 24) & 0x0f),
261 utf_buffer
[1] = 0x80 | ((u
>> 18) & 0x3f),
262 utf_buffer
[2] = 0x80 | ((u
>> 12) & 0x3f),
263 utf_buffer
[3] = 0x80 | ((u
>> 6) & 0x3f),
264 utf_buffer
[4] = 0x80 | (u
& 0x3f);
265 else utf_buffer
[0] = 0xfc | ((u
>> 30) & 0x01),
266 utf_buffer
[1] = 0x80 | ((u
>> 24) & 0x3f),
267 utf_buffer
[2] = 0x80 | ((u
>> 18) & 0x3f),
268 utf_buffer
[3] = 0x80 | ((u
>> 12) & 0x3f),
269 utf_buffer
[4] = 0x80 | ((u
>> 6) & 0x3f),
270 utf_buffer
[5] = 0x80 | (u
& 0x3f);
275 /* Length in bytes of a UTF-8 character, indexed by its first byte. Illegal
276 * lead bytes are given length one and are handled separately. */
277 static const char utf8char_len_tab
[256] = {
278 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
279 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
280 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
281 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
282 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
283 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
284 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
285 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 5,5,5,5,6,6,1,1,
290 utf8charlen(const unsigned char *p
)
292 return p
? utf8char_len_tab
[*p
] : 0;
296 strlen_utf8(unsigned char **str
)
298 unsigned char *s
= *str
;
299 unsigned char *end
= strchr(s
, '\0');
303 for (x
= 0;; x
++, s
+= len
) {
304 len
= utf8charlen(s
);
305 if (s
+ len
> end
) break;
/* True when byte @p has its high bit clear, i.e. it is a complete
 * single-byte (ASCII) character. */
311 #define utf8_issingle(p) (((p) & 0x80) == 0)
/* True when byte @p can begin a character: either it is a single-byte
 * character, or its two top bits are both set (a multibyte lead byte).
 * Bytes of the form 10xxxxxx (continuation bytes) yield false. */
312 #define utf8_islead(p) (utf8_issingle(p) || ((p) & 0xc0) == 0xc0)
314 /* Start from @current and move back @pos characters. Return the resulting
315 * pointer. The leftmost position that may be returned is @start. */
317 utf8_prevchar(unsigned char *current
, int pos
, unsigned char *start
)
319 if (current
== NULL
|| start
== NULL
|| pos
< 0)
321 while (pos
> 0 && current
!= start
) {
323 if (utf8_islead(*current
))
329 /* Count number of standard terminal cells needed for displaying UTF-8
332 utf8_char2cells(unsigned char *utf8_char
, unsigned char *end
)
337 end
= strchr(utf8_char
, '\0');
339 if(!utf8_char
|| !end
)
342 u
= utf8_to_unicode(&utf8_char
, end
);
344 return unicode_to_cell(u
);
347 /* Count number of standard terminal cells needed for displaying string
348 * with UTF-8 characters. */
350 utf8_ptr2cells(unsigned char *string
, unsigned char *end
)
352 int charlen
, cell
, cells
= 0;
355 end
= strchr(string
, '\0');
361 charlen
= utf8charlen(string
);
362 if (string
+ charlen
> end
)
365 cell
= utf8_char2cells(string
, end
);
376 /* Count number of characters in string. */
378 utf8_ptr2chars(unsigned char *string
, unsigned char *end
)
380 int charlen
, chars
= 0;
383 end
= strchr(string
, '\0');
389 charlen
= utf8charlen(string
);
390 if (string
+ charlen
> end
)
401 * Count number of bytes from beginning of the string needed for displaying
402 * specified number of cells.
405 utf8_cells2bytes(unsigned char *string
, int max_cells
, unsigned char *end
)
407 unsigned int bytes
= 0, cells
= 0;
409 assert(max_cells
>=0);
412 end
= strchr(string
, '\0');
418 int cell
= utf8_char2cells(&string
[bytes
], end
);
423 if (cells
> max_cells
)
426 bytes
+= utf8charlen(&string
[bytes
]);
428 if (string
+ bytes
> end
) {
429 bytes
= end
- string
;
437 /* Take @max steps forward from @string in the specified @way, but
438 * not going past @end. Return the resulting address. Store the
439 * number of steps taken to *@count, unless @count is NULL.
441 * This assumes the text is valid UTF-8, and @string and @end point to
442 * character boundaries. If not, it doesn't crash but the results may
445 * This function can do some of the same jobs as utf8charlen(),
446 * utf8_cells2bytes(), and strlen_utf8(). */
448 utf8_step_forward(unsigned char *string
, unsigned char *end
,
449 int max
, enum utf8_step way
, int *count
)
452 unsigned char *current
= string
;
456 if_assert_failed
goto invalid_arg
;
458 end
= strchr(string
, '\0');
461 case UTF8_STEP_CHARACTERS
:
462 while (steps
< max
&& current
< end
) {
464 if (utf8_islead(*current
))
469 case UTF8_STEP_CELLS_FEWER
:
470 case UTF8_STEP_CELLS_MORE
:
471 while (steps
< max
&& current
< end
) {
473 unsigned char *prev
= current
;
/* Decode the character at @current. The address of @current is passed
 * because utf8_to_unicode() advances the pointer it is given. The
 * argument had been corrupted into a stray currency sign followed by
 * "t", which is not a valid C token. */
476 u
= utf8_to_unicode(&current
, end
);
477 if (u
== UCS_NO_CHAR
) {
478 /* Assume the incomplete sequence
485 width
= unicode_to_cell(u
);
486 if (way
== UTF8_STEP_CELLS_FEWER
487 && steps
+ width
> max
) {
497 INTERNAL("impossible enum utf8_step");
506 /* Take @max steps backward from @string in the specified @way, but
507 * not going past @start. Return the resulting address. Store the
508 * number of steps taken to *@count, unless @count is NULL.
510 * This assumes the text is valid UTF-8, and @string and @start point
511 * to character boundaries. If not, it doesn't crash but the results
512 * may be inconsistent.
514 * This function can do some of the same jobs as utf8_prevchar(). */
516 utf8_step_backward(unsigned char *string
, unsigned char *start
,
517 int max
, enum utf8_step way
, int *count
)
520 unsigned char *current
= string
;
525 if_assert_failed
goto invalid_arg
;
528 case UTF8_STEP_CHARACTERS
:
529 while (steps
< max
&& current
> start
) {
531 if (utf8_islead(*current
))
536 case UTF8_STEP_CELLS_FEWER
:
537 case UTF8_STEP_CELLS_MORE
:
538 while (steps
< max
) {
539 unsigned char *prev
= current
;
544 if (current
<= start
)
548 } while (current
> start
&& !utf8_islead(*current
));
551 u
= utf8_to_unicode(&look
, prev
);
552 if (u
== UCS_NO_CHAR
) {
553 /* Assume the incomplete sequence
557 width
= unicode_to_cell(u
);
559 if (way
== UTF8_STEP_CELLS_FEWER
560 && steps
+ width
> max
) {
570 INTERNAL("impossible enum utf8_step");
580 * Find out number of standard terminal columns needed for displaying symbol
581 * (glyph) which represents Unicode character c.
583 * TODO: Use wcwidth when it is available. This seems to require:
584 * - Make the configure script check whether <wchar.h> and wcwidth exist.
585 * - Define _XOPEN_SOURCE and include <wchar.h>.
586 * - Test that __STDC_ISO_10646__ is defined. (This macro means wchar_t
587 * matches ISO 10646 in all locales.)
588 * However, these do not suffice, because wcwidth depends on LC_CTYPE
589 * in glibc-2.3.6. For instance, wcwidth(0xff20) is -1 when LC_CTYPE
590 * is "fi_FI.ISO-8859-1" or "C", but 2 when LC_CTYPE is "fi_FI.UTF-8".
591 * <features.h> defines __STDC_ISO_10646__ as 200009L, so 0xff20 means
592 * U+FF20 FULLWIDTH COMMERCIAL AT regardless of LC_CTYPE; but this
593 * character is apparently not supported in all locales. Why is that?
594 * - Perhaps there is standardese that requires supported characters
595 * to be convertible to multibyte form. Then ELinks could just pick
596 * some UTF-8 locale for its wcwidth purposes.
597 * - Perhaps wcwidth can even return different nonnegative values for
598 * the same ISO 10646 character in different locales. Then ELinks
599 * would have to set LC_CTYPE to match at least the terminal's
600 * charset (which may differ from the LC_CTYPE environment variable,
601 * especially when the master process is serving a slave terminal).
602 * But there is no guarantee that the libc supports all the same
603 * charsets as ELinks does.
604 * For now, it seems safest to avoid the potentially locale-dependent
605 * libc version of wcwidth, and instead use a hardcoded mapping.
607 * @return 2 for double-width glyph, 1 for others.
608 * TODO: May be extended to return 0 for zero-width glyphs
609 * (like composing, maybe unprintable too).
612 unicode_to_cell(unicode_val_T c
)
615 && (c
<= 0x115f /* Hangul Jamo */
618 || (c
>= 0x2e80 && c
<= 0xa4cf
619 && c
!= 0x303f) /* CJK ... Yi */
620 || (c
>= 0xac00 && c
<= 0xd7a3) /* Hangul Syllables */
621 || (c
>= 0xf900 && c
<= 0xfaff) /* CJK Compatibility
623 || (c
>= 0xfe30 && c
<= 0xfe6f) /* CJK Compatibility Forms */
624 || (c
>= 0xff00 && c
<= 0xff60) /* Fullwidth Forms */
625 || (c
>= 0xffe0 && c
<= 0xffe6)
626 || (c
>= 0x20000 && c
<= 0x2fffd)
627 || (c
>= 0x30000 && c
<= 0x3fffd)))
633 /* Fold the case of a Unicode character, so that hotkeys in labels can
634 * be compared case-insensitively. It is unspecified whether the
635 * result will be in upper or lower case. */
637 unicode_fold_label_case(unicode_val_T c
)
639 #if __STDC_ISO_10646__ && HAVE_WCTYPE_H
641 #else /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
642 /* For now, this supports only ASCII. It would be possible to
643 * use code generated from CaseFolding.txt of Unicode if the
644 * acknowledgements required by http://www.unicode.org/copyright.html
645 * were added to associated documentation of ELinks. */
646 if (c
>= 0x41 && c
<= 0x5A)
650 #endif /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
652 #endif /* CONFIG_UTF8 */
654 NONSTATIC_INLINE unicode_val_T
655 utf8_to_unicode(unsigned char **string
, const unsigned char *end
)
657 unsigned char *str
= *string
;
661 length
= utf8char_len_tab
[str
[0]];
663 if (str
+ length
> end
) {
668 case 1: /* U+0000 to U+007F */
669 if (str
[0] >= 0x80) {
672 return UCS_REPLACEMENT_CHARACTER
;
676 case 2: /* U+0080 to U+07FF */
677 if ((str
[1] & 0xc0) != 0x80)
679 u
= (str
[0] & 0x1f) << 6;
680 u
+= (str
[1] & 0x3f);
684 case 3: /* U+0800 to U+FFFF, except surrogates */
685 if ((str
[1] & 0xc0) != 0x80 || (str
[2] & 0xc0) != 0x80)
687 u
= (str
[0] & 0x0f) << 12;
688 u
+= ((str
[1] & 0x3f) << 6);
689 u
+= (str
[2] & 0x3f);
690 if (u
< 0x800 || is_utf16_surrogate(u
))
693 case 4: /* U+10000 to U+1FFFFF */
694 if ((str
[1] & 0xc0) != 0x80 || (str
[2] & 0xc0) != 0x80
695 || (str
[3] & 0xc0) != 0x80)
697 u
= (str
[0] & 0x0f) << 18;
698 u
+= ((str
[1] & 0x3f) << 12);
699 u
+= ((str
[2] & 0x3f) << 6);
700 u
+= (str
[3] & 0x3f);
704 case 5: /* U+200000 to U+3FFFFFF */
705 if ((str
[1] & 0xc0) != 0x80 || (str
[2] & 0xc0) != 0x80
706 || (str
[3] & 0xc0) != 0x80 || (str
[4] & 0xc0) != 0x80)
708 u
= (str
[0] & 0x0f) << 24;
709 u
+= ((str
[1] & 0x3f) << 18);
710 u
+= ((str
[2] & 0x3f) << 12);
711 u
+= ((str
[3] & 0x3f) << 6);
712 u
+= (str
[4] & 0x3f);
716 case 6: /* U+4000000 to U+7FFFFFFF */
717 if ((str
[1] & 0xc0) != 0x80 || (str
[2] & 0xc0) != 0x80
718 || (str
[3] & 0xc0) != 0x80 || (str
[4] & 0xc0) != 0x80
719 || (str
[5] & 0xc0) != 0x80)
721 u
= (str
[0] & 0x01) << 30;
722 u
+= ((str
[1] & 0x3f) << 24);
723 u
+= ((str
[2] & 0x3f) << 18);
724 u
+= ((str
[3] & 0x3f) << 12);
725 u
+= ((str
[4] & 0x3f) << 6);
726 u
+= (str
[5] & 0x3f);
731 INTERNAL("utf8char_len_tab out of range");
734 *string
= str
+ length
;
738 /* The common part of cp2u and cp2utf_8. */
740 cp2u_shared(const struct codepage_desc
*from
, unsigned char c
)
742 unicode_val_T u
= from
->highhalf
[c
- 0x80];
744 if (u
== 0xFFFF) u
= UCS_REPLACEMENT_CHARACTER
;
748 /* Used for converting input from the terminal. */
750 cp2u(int from
, unsigned char c
)
752 from
&= ~SYSTEM_CHARSET_FLAG
;
754 /* UTF-8 is a multibyte codepage and cannot be handled with
756 assert(!is_cp_ptr_utf8(&codepages
[from
]));
757 if_assert_failed
return UCS_REPLACEMENT_CHARACTER
;
759 if (c
< 0x80) return c
;
760 else return cp2u_shared(&codepages
[from
], c
);
763 /* This slow and ugly code is used by the terminal utf_8_io */
764 const unsigned char *
765 cp2utf8(int from
, int c
)
767 from
&= ~SYSTEM_CHARSET_FLAG
;
769 if (is_cp_ptr_utf8(&codepages
[from
]) || c
< 128)
772 return encode_utf8(cp2u_shared(&codepages
[from
], c
));
776 cp_to_unicode(int codepage
, unsigned char **string
, const unsigned char *end
)
780 if (is_cp_utf8(codepage
))
781 return utf8_to_unicode(string
, end
);
786 ret
= cp2u(codepage
, **string
);
792 #ifdef CONFIG_COMBINE
793 unicode_val_T last_combined
= UCS_BEGIN_COMBINED
- 1;
794 unicode_val_T
**combined
;
795 struct hash
*combined_hash
;
798 get_combined(unicode_val_T
*data
, int length
)
800 struct hash_item
*item
;
804 assert(length
>= 1 && length
<= UCS_MAX_LENGTH_COMBINED
);
805 if_assert_failed
return UCS_NO_CHAR
;
807 if (!combined_hash
) combined_hash
= init_hash8();
808 if (!combined_hash
) return UCS_NO_CHAR
;
809 item
= get_hash_item(combined_hash
, (unsigned char *)data
, length
* sizeof(*data
));
811 if (item
) return (unicode_val_T
)(long)item
->value
;
812 if (last_combined
>= UCS_END_COMBINED
) return UCS_NO_CHAR
;
814 key
= mem_alloc((length
+ 1) * sizeof(*key
));
815 if (!key
) return UCS_NO_CHAR
;
816 for (i
= 0; i
< length
; i
++)
818 key
[i
] = UCS_END_COMBINED
;
821 indeks
= last_combined
- UCS_BEGIN_COMBINED
;
823 combined
= mem_realloc(combined
, sizeof(*combined
) * (indeks
+ 1));
829 combined
[indeks
] = key
;
830 item
= add_hash_item(combined_hash
, (unsigned char *)key
,
831 length
* sizeof(*data
), (void *)(long)(last_combined
));
837 return last_combined
;
843 int i
, end
= last_combined
- UCS_BEGIN_COMBINED
+ 1;
846 free_hash(&combined_hash
);
847 for (i
= 0; i
< end
; i
++)
848 mem_free(combined
[i
]);
849 mem_free_if(combined
);
851 #endif /* CONFIG_COMBINE */
855 add_utf8(struct conv_table
*ct
, unicode_val_T u
, const unsigned char *str
)
857 unsigned char *p
= encode_utf8(u
);
860 if (ct
[*p
].t
) ct
= ct
[*p
].u
.tbl
;
862 struct conv_table
*nct
;
864 assertm(ct
[*p
].u
.str
== no_str
, "bad utf encoding #1");
865 if_assert_failed
return;
867 nct
= mem_calloc(256, sizeof(*nct
));
869 new_translation_table(nct
);
877 assertm(!ct
[*p
].t
, "bad utf encoding #2");
878 if_assert_failed
return;
880 if (ct
[*p
].u
.str
== no_str
)
884 /* A conversion table from some charset to UTF-8.
885 * If it is from UTF-8 to UTF-8, it converts each byte separately.
886 * Unlike in other translation tables, the strings in elements 0x80 to
887 * 0xFF are allocated dynamically. */
888 struct conv_table utf_table
[256];
889 int utf_table_init
= 1;
896 /* Cast away const. */
897 for (i
= 128; i
< 256; i
++)
898 mem_free((unsigned char *) utf_table
[i
].u
.str
);
901 static struct conv_table
*
902 get_translation_table_to_utf8(int from
)
907 if (from
== -1) return NULL
;
908 from
&= ~SYSTEM_CHARSET_FLAG
;
909 if (from
== lfr
) return utf_table
;
911 if (utf_table_init
) {
912 memset(utf_table
, 0, sizeof(utf_table
));
917 for (i
= 0; i
< 128; i
++)
918 utf_table
[i
].u
.str
= strings
[i
];
920 if (is_cp_ptr_utf8(&codepages
[from
])) {
921 for (i
= 128; i
< 256; i
++)
922 utf_table
[i
].u
.str
= stracpy(strings
[i
]);
926 for (i
= 128; i
< 256; i
++) {
927 unicode_val_T u
= codepages
[from
].highhalf
[i
- 0x80];
930 utf_table
[i
].u
.str
= NULL
;
932 utf_table
[i
].u
.str
= stracpy(encode_utf8(u
));
935 for (i
= 0; codepages
[from
].table
[i
].c
; i
++) {
936 unicode_val_T u
= codepages
[from
].table
[i
].u
;
938 if (!utf_table
[codepages
[from
].table
[i
].c
].u
.str
)
939 utf_table
[codepages
[from
].table
[i
].c
].u
.str
=
940 stracpy(encode_utf8(u
));
943 for (i
= 128; i
< 256; i
++)
944 if (!utf_table
[i
].u
.str
)
945 utf_table
[i
].u
.str
= stracpy(no_str
);
950 /* A conversion table between two charsets, where the target is not UTF-8. */
951 static struct conv_table table
[256];
952 static int first
= 1;
955 free_conv_table(void)
957 if (!utf_table_init
) free_utf_table();
959 memset(table
, 0, sizeof(table
));
962 new_translation_table(table
);
964 if (iconv_cd
!= (iconv_t
)-1) {
965 iconv_close(iconv_cd
);
966 iconv_cd
= (iconv_t
)-1;
973 get_translation_table(int from
, int to
)
978 from
&= ~SYSTEM_CHARSET_FLAG
;
979 to
&= ~SYSTEM_CHARSET_FLAG
;
981 memset(table
, 0, sizeof(table
));
985 if (codepages
[from
].iconv
) {
986 struct conv_table
*table2
= get_translation_table_to_utf8(34);
988 if (table2
) table2
->iconv_cp
= from
;
992 if (/*from == to ||*/ from
== -1 || to
== -1)
994 if (is_cp_ptr_utf8(&codepages
[to
])) {
995 struct conv_table
*table2
= get_translation_table_to_utf8(from
);
997 if (table2
) table2
->iconv_cp
= -1;
1000 if (from
== lfr
&& to
== lto
)
1004 new_translation_table(table
);
1006 if (is_cp_ptr_utf8(&codepages
[from
])) {
1009 /* Map U+00A0 and U+00AD the same way as u2cp() would. */
1010 add_utf8(table
, UCS_NO_BREAK_SPACE
, strings
[NBSP_CHAR
]);
1011 add_utf8(table
, UCS_SOFT_HYPHEN
, "");
1013 for (i
= 0x80; i
<= 0xFF; i
++)
1014 if (codepages
[to
].highhalf
[i
- 0x80] != 0xFFFF)
1016 codepages
[to
].highhalf
[i
- 0x80],
1019 for (i
= 0; codepages
[to
].table
[i
].c
; i
++)
1020 add_utf8(table
, codepages
[to
].table
[i
].u
,
1021 strings
[codepages
[to
].table
[i
].c
]);
1023 for (i
= 0; unicode_7b
[i
].x
!= -1; i
++)
1024 if (unicode_7b
[i
].x
>= 0x80)
1025 add_utf8(table
, unicode_7b
[i
].x
,
1031 for (i
= 128; i
< 256; i
++) {
1032 if (codepages
[from
].highhalf
[i
- 0x80] != 0xFFFF) {
1033 const unsigned char *u
;
1035 u
= u2cp(codepages
[from
].highhalf
[i
- 0x80], to
);
1036 if (u
) table
[i
].u
.str
= u
;
1045 xxstrcmp(unsigned char *s1
, unsigned char *s2
, int l2
)
1048 if (*s1
> *s2
) return 1;
1049 if (*s1
< *s2
) return -1;
1055 return *s2
? -1 : 0;
1058 /* Entity cache debugging purpose. */
1060 #define DEBUG_ENTITY_CACHE
1062 #undef DEBUG_ENTITY_CACHE
1065 struct entity_cache
{
1069 const unsigned char *result
;
1070 unsigned char str
[20]; /* Suffice in any case. */
1073 /* comparison function for qsort() */
1075 hits_cmp(const void *v1
, const void *v2
)
1077 const struct entity_cache
*a
= v1
, *b
= v2
;
1079 if (a
->hits
== b
->hits
) return 0;
1080 if (a
->hits
> b
->hits
) return -1;
1085 compare_entities(const void *key_
, const void *element_
)
1087 struct string
*key
= (struct string
*) key_
;
1088 struct entity
*element
= (struct entity
*) element_
;
1089 int length
= key
->length
;
1090 unsigned char *first
= key
->source
;
1091 unsigned char *second
= element
->s
;
1093 return xxstrcmp(first
, second
, length
);
1096 const unsigned char *
1097 get_entity_string(const unsigned char *str
, const int strlen
, int encoding
)
1099 #define ENTITY_CACHE_SIZE 10 /* 10 seems a good value. */
1100 #define ENTITY_CACHE_MAXLEN 9 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1
1101 will go in [0] table */
1102 static struct entity_cache entity_cache
[ENTITY_CACHE_MAXLEN
][ENTITY_CACHE_SIZE
];
1103 static unsigned int nb_entity_cache
[ENTITY_CACHE_MAXLEN
];
1104 unsigned int slen
= 0;
1105 const unsigned char *result
= NULL
;
1107 /* Note that an object of static storage duration is automatically
1108 * initialised to zero in C. */
1110 if (strlen
<= 0) return NULL
;
1113 /* TODO: caching UTF-8 */
1114 encoding
&= ~SYSTEM_CHARSET_FLAG
;
1115 if (is_cp_ptr_utf8(&codepages
[encoding
]))
1117 #endif /* CONFIG_UTF8 */
1119 /* Check if cached. A test on many websites (freshmeat.net + whole ELinks website
1120 * + google + slashdot + websites that result from a search for test on google,
1121 * + various ones) shows quite an impressive improvement:
1123 * 0: hits=2459 l=4 st='nbsp'
1124 * 1: hits=2152 l=6 st='eacute'
1125 * 2: hits=235 l=6 st='egrave'
1126 * 3: hits=136 l=6 st='agrave'
1127 * 4: hits=100 l=3 st='amp'
1128 * 5: hits=40 l=5 st='laquo'
1129 * 6: hits=8 l=4 st='copy'
1130 * 7: hits=5 l=2 st='gt'
1131 * 8: hits=2 l=2 st='lt'
1132 * 9: hits=1 l=6 st='middot'
1134 * Most of the time cache hit ratio is near 95%.
1136 * A long test shows: 15186 hits vs. 24 misses and mean iteration
1137 * count is kept < 2 (worst case 1.58). Not so bad ;)
1141 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1 will go in [0] table */
1142 slen
= (strlen
> 1 && strlen
< ENTITY_CACHE_MAXLEN
) ? strlen
: 0;
1144 if (strlen
< ENTITY_CACHE_MAXLEN
&& nb_entity_cache
[slen
] > 0) {
1147 for (i
= 0; i
< nb_entity_cache
[slen
]; i
++) {
1148 if (entity_cache
[slen
][i
].encoding
== encoding
1149 && !memcmp(str
, entity_cache
[slen
][i
].str
, strlen
)) {
1150 #ifdef DEBUG_ENTITY_CACHE
1151 static double total_iter
= 0;
1152 static unsigned long hit_count
= 0;
1154 total_iter
+= i
+ 1;
1156 fprintf(stderr
, "hit after %d iter. (mean = %0.2f)\n", i
+ 1, total_iter
/ (double) hit_count
);
1158 if (entity_cache
[slen
][i
].hits
< (unsigned int) ~0)
1159 entity_cache
[slen
][i
].hits
++;
1160 return entity_cache
[slen
][i
].result
;
1163 #ifdef DEBUG_ENTITY_CACHE
1164 fprintf(stderr
, "miss\n");
1169 #endif /* CONFIG_UTF8 */
1170 if (*str
== '#') { /* Numeric entity. */
1171 int l
= (int) strlen
;
1172 unsigned char *st
= (unsigned char *) str
;
1173 unicode_val_T n
= 0;
1175 if (l
== 1) goto end
; /* &#; ? */
1177 if ((*st
| 32) == 'x') { /* Hexadecimal */
1179 if (l
== 1 || l
> 9) goto end
; /* xFFFFFFFF max. */
1182 unsigned char c
= (*(st
++) | 32);
1185 n
= (n
<< 4) | (c
- '0');
1186 else if (isxdigit(c
))
1187 n
= (n
<< 4) | (c
- 'a' + 10);
1189 goto end
; /* Bad char. */
1191 } else { /* Decimal */
1192 if (l
> 10) goto end
; /* 4294967295 max. */
1194 unsigned char c
= *(st
++);
1197 n
= n
* 10 + c
- '0';
1199 goto end
; /* Bad char. */
1200 /* Limit to 0xFFFFFFFF. */
1201 if (n
>= (unicode_val_T
) 0xFFFFFFFFu
)
1206 result
= u2cp(n
, encoding
);
1208 #ifdef DEBUG_ENTITY_CACHE
1209 fprintf(stderr
, "%lu %016x %s\n", (unsigned long) n
, n
, result
);
1211 } else { /* Text entity. */
1212 struct string key
= INIT_STRING((unsigned char *) str
, strlen
);
1213 struct entity
*element
= bsearch((void *) &key
, entities
,
1218 if (element
) result
= u2cp(element
->c
, encoding
);
1222 if (is_cp_ptr_utf8(&codepages
[encoding
])) {
1225 #endif /* CONFIG_UTF8 */
1227 /* Take care of potential buffer overflow. */
1228 if (strlen
< sizeof(entity_cache
[slen
][0].str
)) {
1229 struct entity_cache
*ece
;
1231 /* Sort entries by hit order. */
1232 if (nb_entity_cache
[slen
] > 1)
1233 qsort(&entity_cache
[slen
][0], nb_entity_cache
[slen
],
1234 sizeof(entity_cache
[slen
][0]), hits_cmp
);
1236 /* Increment number of cache entries if possible.
1237 * Else, just replace the least used entry. */
1238 if (nb_entity_cache
[slen
] < ENTITY_CACHE_SIZE
) nb_entity_cache
[slen
]++;
1239 ece
= &entity_cache
[slen
][nb_entity_cache
[slen
] - 1];
1241 /* Copy new entry to cache. */
1243 ece
->strlen
= strlen
;
1244 ece
->encoding
= encoding
;
1245 ece
->result
= result
;
1246 memcpy(ece
->str
, str
, strlen
);
1247 ece
->str
[strlen
] = '\0';
1250 #ifdef DEBUG_ENTITY_CACHE
1251 fprintf(stderr
, "Added in [%u]: l=%d st='%s'\n", slen
,
1252 entity_cache
[slen
][0].strlen
, entity_cache
[slen
][0].str
);
1257 fprintf(stderr
, "- Cache entries [%u] -\n", slen
);
1258 for (i
= 0; i
< nb_entity_cache
[slen
] ; i
++)
1259 fprintf(stderr
, "%d: hits=%u l=%d st='%s'\n", i
,
1260 entity_cache
[slen
][i
].hits
, entity_cache
[slen
][i
].strlen
,
1261 entity_cache
[slen
][i
].str
);
1262 fprintf(stderr
, "-----------------\n");
1264 #endif /* DEBUG_ENTITY_CACHE */
// convert_string: translate the charslen2 bytes at chars2 into a buffer
// obtained from mem_alloc(), mapping bytes >= 128 through convert_table and
// expanding entity references introduced by an ampersand, depending on mode.
//   convert_table  translation table; may be NULL (bytes are then copied as is)
//   cp             codepage index; replaced by convert_table->iconv_cp when
//                  that field is positive
//   length         when non-NULL, receives the number of bytes produced
//   callback       receives (callback_data, buf, buflen) chunks of output
// NOTE(review): several original lines are missing from this excerpt
// (declarations, braces, the branches choosing between callback and return
// value), so the comments below describe only the statements that are visible.
1270 convert_string(struct conv_table
*convert_table
,
1271 unsigned char *chars2
, int charslen2
, int cp
,
1272 enum convert_string_mode mode
, int *length
,
1273 void (*callback
)(void *data
, unsigned char *buf
, int buflen
),
1274 void *callback_data
)
1276 unsigned char *buffer
;
// chars/charslen describe the data currently being scanned; they start as
// the caller input and are redirected to iconv_output in the iconv path.
1279 unsigned char *chars
= chars2
;
1280 int charslen
= charslen2
;
// iconv state. The static objects keep their contents between calls:
// iconv_input holds up to 256 pending input bytes, iconv_output receives up
// to 256 * 8 converted bytes, iconv_offset counts bytes carried over at the
// front of iconv_input, and iconv_cp is compared with cp below to decide
// whether the conversion descriptor must be reopened.
1283 static char iconv_input
[256];
1284 static char iconv_output
[256 * 8];
1285 static size_t iconv_offset
;
1286 static int iconv_cp
;
1287 static size_t iconv_inleft
;
1288 size_t iconv_outleft
= 256 * 8;
// Number of bytes of chars2 already consumed by iconv().
1291 int chars_offset
= 0;
// Fast path: with no table and no ampersand in the input nothing can change,
// so the input is handed to the callback and/or returned as a memacpy() copy
// (the lines selecting between the two are not visible here).
1293 if (!convert_table
&& !memchr(chars
, '&', charslen
)) {
1295 if (charslen
) callback(callback_data
, chars
, charslen
);
1298 return memacpy(chars
, charslen
);
// A positive iconv_cp in the table overrides the cp argument.
1303 if (convert_table
&& convert_table
->iconv_cp
> 0) {
1305 cp
= convert_table
->iconv_cp
;
// Look up whether this codepage is flagged for iconv translation; the
// SYSTEM_CHARSET_FLAG bit is masked off before indexing codepages[].
1307 is_iconv
= codepages
[cp
& ~SYSTEM_CHARSET_FLAG
].iconv
;
1312 /* Buffer allocation */
1314 buffer
= mem_alloc(ALLOC_GR
+ 1 /* trailing \0 */);
1315 if (!buffer
) return NULL
;
1320 size_t before
, to_copy
;
// NOTE(review): iconv_cd is tested with >= 0 here but against (iconv_t)-1
// just below; if iconv_cd is an iconv_t (declaration not visible), the >= 0
// test does not exclude the (iconv_t)-1 sentinel on platforms where iconv_t
// is a pointer -- confirm and consider != (iconv_t)-1.
1323 if (iconv_cd
>= 0) {
// The open descriptor was created for a different codepage: close it.
1324 if (cp
!= iconv_cp
) {
1325 iconv_close(iconv_cd
);
1326 iconv_cd
= (iconv_t
)-1;
// (Re)open a descriptor converting from the MIME name of cp to utf-8.
1329 if (iconv_cd
== (iconv_t
)-1) {
1331 iconv_cd
= iconv_open("utf-8", get_cp_mime_name(cp
));
1332 if (iconv_cd
== (iconv_t
)-1) {
// Append as much unread input as fits after the iconv_offset carried-over
// bytes; iconv_input holds 256 bytes in total.
1339 to_copy
= charslen2
- chars_offset
;
1340 if (to_copy
> 256 - iconv_offset
) to_copy
= 256 - iconv_offset
;
1341 memcpy(iconv_input
+ iconv_offset
, chars
+ chars_offset
, to_copy
);
1342 iconv_outleft
= 256 * 8;
1343 iconv_inleft
= iconv_offset
+ to_copy
;
1345 outp
= iconv_output
;
1346 before
= iconv_inleft
;
1348 v
= iconv(iconv_cd
, &inp
, &iconv_inleft
, &outp
, &iconv_outleft
);
// Advance by the number of input bytes iconv() consumed, and make the
// scanning loop below read the converted bytes from iconv_output.
1349 chars_offset
+= before
- iconv_inleft
;
1350 charslen
= 256 * 8 - iconv_outleft
;
1352 chars
= (unsigned char *)iconv_output
;
// Move the iconv_inleft unconsumed bytes to the front of iconv_input and
// remember how many there are for the next round.
1358 memcpy(iconv_input
, inp
, iconv_inleft
);
1359 iconv_offset
= iconv_inleft
;
// Another round is needed while caller input remains unconsumed.
1372 loop
= chars_offset
< charslen2
;
// Main scan over the current chunk.
1378 while (charspos
< charslen
) {
// Replacement string for the current input position, when one applies.
1379 const unsigned char *translit
;
// Copies one input byte to the output unchanged; the trailing backslash
// suggests this is the body of the PUTC macro used below (its define line
// is not visible here).
1382 buffer[bufferpos++] = chars[charspos++]; \
1387 if (chars
[charspos
] != '&') {
1388 struct conv_table
*t
;
// Bytes below 128, or any byte when there is no table, are copied as is.
1391 if (chars
[charspos
] < 128 || !convert_table
) PUTC
;
// While the entry for the current byte is flagged .t, descend into its
// nested table (.u.tbl) and move to the next input byte; running out of
// input in the middle falls back to PUTC.
1396 while (t
[chars
[i
]].t
) {
1397 t
= t
[chars
[i
++]].u
.tbl
;
1398 if (i
>= charslen
) PUTC
;
// The final entry carries the replacement string.
1401 translit
= t
[chars
[i
]].u
.str
;
1404 } else if (mode
== CSM_FORM
|| mode
== CSM_NONE
) {
// Entity candidate: the name starts right after the ampersand.
1408 int start
= charspos
+ 1;
// Part of the scan condition: the name may contain ASCII letters, digits
// and the hash sign.
1412 && (isasciialpha(chars
[i
])
1413 || isdigit(chars
[i
])
1414 || (chars
[i
] == '#')))
1417 /* This prevents bug 213: we were expanding "entities"
1418 * in URL query strings. */
1419 /* XXX: But this disables &nbsp&nbsp usage, which
1420 * appears to be relatively common! --pasky */
1421 if ((mode
== CSM_DEFAULT
|| (chars
[i
] != '&' && chars
[i
] != '='))
1423 && !isasciialpha(chars
[i
]) && !isdigit(chars
[i
])) {
// Look up the name between start and i.
1424 translit
= get_entity_string(&chars
[start
], i
- start
,
1426 if (chars
[i
] != ';') {
1427 /* Eat &nbsp &nbsp<foo> happily, but
1428 * pull back from the character after
1429 * entity string if it is not the valid
// Unknown entity: copy the ampersand through unchanged.
1434 if (!translit
) PUTC
;
// Continue after the entity; the character at i is skipped as well unless
// i already reached the end of the chunk.
1435 charspos
= i
+ (i
< charslen
);
// An empty replacement produces no output.
1439 if (!translit
[0]) continue;
1442 buffer
[bufferpos
++] = translit
[0];
1450 buffer
[bufferpos
++] = *(translit
++);
// Nothing more to do unless bufferpos reached a multiple of ALLOC_GR
// (assumes ALLOC_GR is a power of two -- TODO confirm).
1452 if (bufferpos
& (ALLOC_GR
- 1)) continue;
// Terminate the chunk and hand it to the callback.
1455 buffer
[bufferpos
] = 0;
1456 callback(callback_data
, buffer
, bufferpos
);
// Make room for another ALLOC_GR bytes of output.
1459 new = mem_realloc(buffer
, bufferpos
+ ALLOC_GR
);
// More iconv input pending: run another conversion round.
1471 if (loop
) goto repeat
;
// Terminate the result and report its length when requested.
1475 buffer
[bufferpos
] = 0;
1476 if (length
) *length
= bufferpos
;
// Flush whatever is left to the callback.
1479 if (bufferpos
) callback(callback_data
, buffer
, bufferpos
);
1488 #ifndef USE_FASTFIND
// get_cp_index (variant compiled when USE_FASTFIND is not defined): find the
// codepage one of whose aliases equals name, ignoring case, by scanning
// codepages[] linearly. The result is OR-ed with syscp, which is set to
// SYSTEM_CHARSET_FLAG when the name System was resolved via nl_langinfo().
// NOTE(review): declarations, braces and the not-found return are missing
// from this excerpt.
1490 get_cp_index(const unsigned char *name
)
// The name System stands for the charset of the current locale.
1495 if (!c_strcasecmp(name
, "System")) {
1496 #if HAVE_LANGINFO_CODESET
1497 name
= nl_langinfo(CODESET
);
1498 syscp
= SYSTEM_CHARSET_FLAG
;
// The codepages[] array ends at an entry whose name is NULL; each aliases
// list ends at a NULL pointer.
1504 for (i
= 0; codepages
[i
].name
; i
++) {
1505 for (a
= 0; codepages
[i
].aliases
[a
]; a
++) {
1506 /* In the past, we looked for the longest substring
1507 * in all the names; it is way too expensive, though:
1509 * % cumulative self self total
1510 * time seconds seconds calls us/call us/call name
1511 * 3.00 0.66 0.03 1325 22.64 22.64 get_cp_index
1513 * Anything called from redraw_screen() is in fact
1514 * relatively expensive, even if it's called just
1515 * once. So we will do a simple strcasecmp() here.
1518 if (!c_strcasecmp(name
, codepages
[i
].aliases
[a
]))
// Fallback: use the index of us-ascii, keeping the system flag (the
// condition guarding this return is not visible here).
1524 return get_cp_index("us-ascii") | syscp
;
// Cursor used by charsets_list_next(): i_name indexes codepages[] and
// i_alias indexes the aliases list of that codepage.
1532 static unsigned int i_name
= 0;
1533 static unsigned int i_alias
= 0;
1535 /* Reset internal list pointer */
// NOTE(review): the body of this function is missing from this excerpt.
1537 charsets_list_reset(void)
1543 /* Returns a pointer to a struct that contains current key and data pointers
1544 * and increment internal pointer. It returns NULL when key is NULL. */
1545 struct fastfind_key_value
*
1546 charsets_list_next(void)
// kv has static storage duration, so the same object is refilled by every
// call.
1548 static struct fastfind_key_value kv
;
// A NULL name marks the end of codepages[]: the enumeration is finished.
1550 if (!codepages
[i_name
].name
) return NULL
;
// Key is the current alias; data is the codepage descriptor it belongs to.
1552 kv
.key
= codepages
[i_name
].aliases
[i_alias
];
1553 kv
.data
= (void *) &codepages
[i_name
]; /* cast away const */
// Does this codepage have another alias after the current one? (The lines
// that advance i_alias or i_name are not visible in this excerpt.)
1555 if (codepages
[i_name
].aliases
[i_alias
+ 1])
// Fastfind index over all charset aliases; it is filled by calling
// charsets_list_reset() and charsets_list_next(), and is labelled
// charsets_lookup.
1565 static struct fastfind_index ff_charsets_index
1566 = INIT_FASTFIND_INDEX("charsets_lookup", charsets_list_reset
, charsets_list_next
);
1568 /* It searches for a charset named @name or one of its aliases and
1569 * returns index for it or -1 if not found. */
// Fastfind-based variant. NOTE(review): declarations, braces and the
// not-found return are missing from this excerpt.
1571 get_cp_index(const unsigned char *name
)
1573 const struct codepage_desc
*codepage
;
// The name System stands for the charset of the current locale; syscp
// records that so the flag can be OR-ed into the result.
1576 if (!c_strcasecmp(name
, "System")) {
1577 #if HAVE_LANGINFO_CODESET
1578 name
= nl_langinfo(CODESET
);
1579 syscp
= SYSTEM_CHARSET_FLAG
;
1585 codepage
= fastfind_search(&ff_charsets_index
, name
, strlen(name
));
// The hit must point into codepages[]; its index is the pointer difference.
1587 assert(codepages
<= codepage
&& codepage
< codepages
+ N_CODEPAGES
);
1588 return (codepage
- codepages
) | syscp
;
// Fallback: use the index of us-ascii, keeping the system flag (the
// condition guarding this return is not visible here).
1591 return get_cp_index("us-ascii") | syscp
;
1598 #endif /* USE_FASTFIND */
// Build the fastfind index of charset aliases, passing the FF_COMPRESS flag.
1601 init_charsets_lookup(void)
1604 fastfind_index(&ff_charsets_index
, FF_COMPRESS
);
// Tear down the fastfind index of charset aliases via fastfind_done().
1609 free_charsets_lookup(void)
1612 fastfind_done(&ff_charsets_index
);
// NOTE(review): the end of the comment below and the return-type line are
// missing from this excerpt.
1616 /* Get the codepage's name for displaying to the user, or NULL if
1617 * @cp_index is one past the end. In the future, we might want to
1618 * localize these with gettext. So it may be best not to use this
1619 * function if the name will have to be converted back to an
1622 get_cp_name(int cp_index
)
// Negative indexes and indexes carrying SYSTEM_CHARSET_FLAG get fixed names;
// everything else is looked up in codepages[].
1624 if (cp_index
< 0) return "none";
1625 if (cp_index
& SYSTEM_CHARSET_FLAG
) return "System";
1627 return codepages
[cp_index
].name
;
// NOTE(review): the end of the comment below and the return-type line are
// missing from this excerpt.
1630 /* Get the codepage's name for saving to a configuration file. These
1631 * names can be converted back to indexes, even in future versions of
1634 get_cp_config_name(int cp_index
)
1636 if (cp_index
< 0) return "none";
1637 if (cp_index
& SYSTEM_CHARSET_FLAG
) return "System";
// An entry without an aliases list yields NULL.
1638 if (!codepages
[cp_index
].aliases
) return NULL
;
// Otherwise the first alias is the name that gets saved.
1640 return codepages
[cp_index
].aliases
[0];
1643 /* Get the codepage's name for sending to a library or server that
1644 * understands MIME charset names. This function irreversibly maps
1645 * the "System" codepage to the underlying charset. */
1647 get_cp_mime_name(int cp_index
)
1649 if (cp_index
< 0) return "none";
// Drop the system flag so the index refers to the underlying codepage.
1650 cp_index
&= ~SYSTEM_CHARSET_FLAG
;
// An entry without an aliases list yields NULL; otherwise the first alias
// is returned.
1651 if (!codepages
[cp_index
].aliases
) return NULL
;
1653 return codepages
[cp_index
].aliases
[0];
// Return the result of is_cp_ptr_utf8() for the codepage at cp_index, after
// masking off SYSTEM_CHARSET_FLAG. No range check on cp_index is visible
// here.
1657 is_cp_utf8(int cp_index
)
1659 cp_index
&= ~SYSTEM_CHARSET_FLAG
;
1660 return is_cp_ptr_utf8(&codepages
[cp_index
]);
1663 /* This function will be used by the xhtml parser. */
// Return the highhalf mapping table of the codepage named name, or NULL when
// get_cp_index() reports a negative index.
1665 get_cp_highhalf(const unsigned char *name
)
1667 int cp
= get_cp_index(name
);
1669 if (cp
< 0) return NULL
;
// Drop the system flag before indexing codepages[].
1670 cp
&= ~SYSTEM_CHARSET_FLAG
;
1671 return codepages
[cp
].highhalf
;