src/intl/charsets.c

   1 /* Charsets convertor */
   2
   3 #ifndef _GNU_SOURCE
   4 #define _GNU_SOURCE /* strcasecmp() */
   5 #endif
   6
   7 #ifdef HAVE_CONFIG_H
   8 #include "config.h"
   9 #endif
  10
  11 #if HAVE_LANGINFO_CODESET
  12 #include <langinfo.h>
  13 #endif
  14
  15 #include <ctype.h>
  16 #include <stdlib.h>
  17 #if HAVE_WCTYPE_H
  18 #include <wctype.h>
  19 #endif
  20
  21 #ifdef HAVE_ICONV
  22 #include <errno.h>
  23 #include <iconv.h>
  24 #endif
  25
  26 #include "elinks.h"
  27
  28 #include "document/options.h"
  29 #include "intl/charsets.h"
  30 #include "util/conv.h"
  31 #include "util/error.h"
  32 #include "util/fastfind.h"
  33 #include "util/hash.h"
  34 #include "util/memory.h"
  35 #include "util/string.h"
  36
  37
  38 /* Fix namespace clash on MacOS. */
  39 #define table table_elinks
  40
/* One additional Unicode-to-codepage mapping: the Unicode character @u
 * can be written as the codepage byte @c.  Arrays of these entries are
 * scanned until an entry with @c == 0 is found. */
struct table_entry {
        /* The codepage byte; 0 terminates the array. */
        unsigned char c;
        /* This should in principle be unicode_val_T, but because all
         * the values currently in codepage.inc fit in 16 bits, we can
         * as well use uint16_t and halve sizeof(struct table_entry)
         * from 8 bytes to 4.  Should other characters ever be needed,
         * unicode_val_T u : 24 might be a possibility, although it
         * seems a little unportable as bitfields are in principle
         * restricted to int, which may be 16-bit.  */
        uint16_t u;
};
  52
/* Description of one codepage known to this file. */
struct codepage_desc {
        /* NOTE(review): presumably the primary, human-readable name of
         * the codepage; it is not used in the code visible here. */
        unsigned char *name;
        /* Alias list.  The UTF-8 codepage is recognised by comparing
         * this pointer with aliases_utf8 (see is_cp_ptr_utf8). */
        unsigned char *const *aliases;

        /* The Unicode mappings of codepage bytes 0x80...0xFF.
         * (0x00...0x7F are assumed to be ASCII in all codepages.)
         * Because all current values fit in 16 bits, we store them as
         * uint16_t rather than unicode_val_T.  If the codepage does
         * not use some byte, then @highhalf maps that byte to 0xFFFF,
         * which C code converts to UCS_REPLACEMENT_CHARACTER where
         * appropriate.  (U+FFFF is reserved and will never be
         * assigned as a character.)  */
        const uint16_t *highhalf;

        /* If some byte in the codepage corresponds to multiple Unicode
         * characters, then the preferred character is in @highhalf
         * above, and the rest are listed here in @table.  This table
         * is not used for translating from the codepage to Unicode.  */
        const struct table_entry *table;

        /* Whether to use iconv for translation. */
        unsigned int iconv:1;
};
  76
  77 #include "intl/codepage.inc"
  78 #include "intl/uni_7b.inc"
  79 #include "intl/entity.inc"
  80
  81 /* Declare the external-linkage inline functions defined in this file.
  82  * Avoid the GCC 4.3.1 warning: `foo' declared inline after being
  83  * called.  The functions are not declared inline in charsets.h
  84  * because C99 6.7.4p6 says that every external-linkage function
  85  * declared inline shall be defined in the same translation unit.
  86  * The non-inline declarations in charsets.h also make sure that the
  87  * compiler emits global definitions for the symbols so that the
  88  * functions can be called from other translation units.  */
  89 NONSTATIC_INLINE unsigned char *encode_utf8(unicode_val_T u);
  90 NONSTATIC_INLINE int utf8charlen(const unsigned char *p);
  91 NONSTATIC_INLINE int unicode_to_cell(unicode_val_T c);
  92 NONSTATIC_INLINE unicode_val_T utf8_to_unicode(unsigned char **string,
  93                                                const unsigned char *end);
  94
/* One-character, NUL-terminated strings indexed by byte value, so that
 * functions returning a string can return a single byte without
 * allocating anything.
 *
 * NOTE(review): entries 027 and 037 contain "\033" rather than "\027"
 * and "\037".  Whether this is deliberate cannot be told from this
 * file alone -- confirm before changing. */
static const char strings[256][2] = {
        "\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007",
        "\010", "\011", "\012", "\013", "\014", "\015", "\016", "\017",
        "\020", "\021", "\022", "\023", "\024", "\025", "\026", "\033",
        "\030", "\031", "\032", "\033", "\034", "\035", "\036", "\033",
        "\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047",
        "\050", "\051", "\052", "\053", "\054", "\055", "\056", "\057",
        "\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067",
        "\070", "\071", "\072", "\073", "\074", "\075", "\076", "\077",
        "\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",
        "\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117",
        "\120", "\121", "\122", "\123", "\124", "\125", "\126", "\127",
        "\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",
        "\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147",
        "\150", "\151", "\152", "\153", "\154", "\155", "\156", "\157",
        "\160", "\161", "\162", "\163", "\164", "\165", "\166", "\167",
        "\170", "\171", "\172", "\173", "\174", "\175", "\176", "\177",
        "\200", "\201", "\202", "\203", "\204", "\205", "\206", "\207",
        "\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",
        "\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227",
        "\230", "\231", "\232", "\233", "\234", "\235", "\236", "\237",
        "\240", "\241", "\242", "\243", "\244", "\245", "\246", "\247",
        "\250", "\251", "\252", "\253", "\254", "\255", "\256", "\257",
        "\260", "\261", "\262", "\263", "\264", "\265", "\266", "\267",
        "\270", "\271", "\272", "\273", "\274", "\275", "\276", "\277",
        "\300", "\301", "\302", "\303", "\304", "\305", "\306", "\307",
        "\310", "\311", "\312", "\313", "\314", "\315", "\316", "\317",
        "\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",
        "\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337",
        "\340", "\341", "\342", "\343", "\344", "\345", "\346", "\347",
        "\350", "\351", "\352", "\353", "\354", "\355", "\356", "\357",
        "\360", "\361", "\362", "\363", "\364", "\365", "\366", "\367",
        "\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377",
};
 129
 130 #ifdef HAVE_ICONV
 131 static iconv_t iconv_cd = (iconv_t)-1;
 132 #endif
 133
 134 static void
 135 free_translation_table(struct conv_table *p)
 136 {
 137         int i;
 138
 139         for (i = 0; i < 256; i++)
 140                 if (p[i].t)
 141                         free_translation_table(p[i].u.tbl);
 142
 143         mem_free(p);
 144 }
 145
 146 /* A string used in conversion tables when there is no correct
 147  * conversion.  This is compared by address and therefore should be a
 148  * named array rather than a pointer so that it won't share storage
 149  * with any other string literal that happens to have the same
 150  * characters.  */
 151 static const unsigned char no_str[] = "*";
 152
 153 static void
 154 new_translation_table(struct conv_table *p)
 155 {
 156         int i;
 157
 158         for (i = 0; i < 256; i++)
 159                 if (p[i].t)
 160                         free_translation_table(p[i].u.tbl);
 161         for (i = 0; i < 128; i++) {
 162                 p[i].t = 0;
 163                 p[i].u.str = strings[i];
 164         }
 165         for (; i < 256; i++) {
 166                 p[i].t = 0;
 167                 p[i].u.str = no_str;
 168         }
 169         p->iconv_cp = -1;
 170 }
 171
 172 #define BIN_SEARCH(table, entry, entries, key, result)                                  \
 173 {                                                                                       \
 174         long _s = 0, _e = (entries) - 1;                                                \
 175                                                                                         \
 176         while (_s <= _e || !((result) = -1)) {                                          \
 177                 long _m = (_s + _e) / 2;                                                \
 178                                                                                         \
 179                 if ((table)[_m].entry == (key)) {                                       \
 180                         (result) = _m;                                                  \
 181                         break;                                                          \
 182                 }                                                                       \
 183                 if ((table)[_m].entry > (key)) _e = _m - 1;                             \
 184                 if ((table)[_m].entry < (key)) _s = _m + 1;                             \
 185         }                                                                               \
 186 }                                                                                       \
 187
/* Replacements for the code points 0x80...0x9F, indexed by
 * (code point - 0x80).  A zero entry means there is no replacement;
 * u2cp_() then returns NULL.
 * NOTE(review): the values look like the Windows-1252 meanings of
 * those bytes, with ASCII approximations for some of them -- confirm. */
static const unicode_val_T strange_chars[32] = {
0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,
0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 0x0000,
};
 194
 195 #define SYSTEM_CHARSET_FLAG 128
 196 #define is_cp_ptr_utf8(cp_ptr) ((cp_ptr)->aliases == aliases_utf8)
 197
 198 const unsigned char *
 199 u2cp_(unicode_val_T u, int to, enum nbsp_mode nbsp_mode)
 200 {
 201         int j;
 202         int s;
 203
 204         if (u < 128) return strings[u];
 205
 206         if (u < 0xa0) {
 207                 u = strange_chars[u - 0x80];
 208                 if (!u) return NULL;
 209         }
 210
 211         to &= ~SYSTEM_CHARSET_FLAG;
 212
 213         if (is_cp_ptr_utf8(&codepages[to]))
 214                 return encode_utf8(u);
 215
 216         /* To mark non breaking spaces in non-UTF-8 strings, we use a
 217          * special char NBSP_CHAR. */
 218         if (u == UCS_NO_BREAK_SPACE) {
 219                 if (nbsp_mode == NBSP_MODE_HACK) return NBSP_CHAR_STRING;
 220                 else /* NBSP_MODE_ASCII */ return " ";
 221         }
 222         if (u == UCS_SOFT_HYPHEN) return "";
 223
 224         if (u < 0xFFFF)
 225                 for (j = 0; j < 0x80; j++)
 226                         if (codepages[to].highhalf[j] == u)
 227                                 return strings[0x80 + j];
 228         for (j = 0; codepages[to].table[j].c; j++)
 229                 if (codepages[to].table[j].u == u)
 230                         return strings[codepages[to].table[j].c];
 231
 232         BIN_SEARCH(unicode_7b, x, N_UNICODE_7B, u, s);
 233         if (s != -1) return unicode_7b[s].s;
 234
 235         return no_str;
 236 }
 237
 238 static unsigned char utf_buffer[7];
 239
 240 NONSTATIC_INLINE unsigned char *
 241 encode_utf8(unicode_val_T u)
 242 {
 243         memset(utf_buffer, 0, 7);
 244
 245         if (u < 0x80)
 246                 utf_buffer[0] = u;
 247         else if (u < 0x800)
 248                 utf_buffer[0] = 0xc0 | ((u >> 6) & 0x1f),
 249                 utf_buffer[1] = 0x80 | (u & 0x3f);
 250         else if (u < 0x10000)
 251                 utf_buffer[0] = 0xe0 | ((u >> 12) & 0x0f),
 252                 utf_buffer[1] = 0x80 | ((u >> 6) & 0x3f),
 253                 utf_buffer[2] = 0x80 | (u & 0x3f);
 254         else if (u < 0x200000)
 255                 utf_buffer[0] = 0xf0 | ((u >> 18) & 0x0f),
 256                 utf_buffer[1] = 0x80 | ((u >> 12) & 0x3f),
 257                 utf_buffer[2] = 0x80 | ((u >> 6) & 0x3f),
 258                 utf_buffer[3] = 0x80 | (u & 0x3f);
 259         else if (u < 0x4000000)
 260                 utf_buffer[0] = 0xf8 | ((u >> 24) & 0x0f),
 261                 utf_buffer[1] = 0x80 | ((u >> 18) & 0x3f),
 262                 utf_buffer[2] = 0x80 | ((u >> 12) & 0x3f),
 263                 utf_buffer[3] = 0x80 | ((u >> 6) & 0x3f),
 264                 utf_buffer[4] = 0x80 | (u & 0x3f);
 265         else    utf_buffer[0] = 0xfc | ((u >> 30) & 0x01),
 266                 utf_buffer[1] = 0x80 | ((u >> 24) & 0x3f),
 267                 utf_buffer[2] = 0x80 | ((u >> 18) & 0x3f),
 268                 utf_buffer[3] = 0x80 | ((u >> 12) & 0x3f),
 269                 utf_buffer[4] = 0x80 | ((u >> 6) & 0x3f),
 270                 utf_buffer[5] = 0x80 | (u & 0x3f);
 271
 272         return utf_buffer;
 273 }
 274
/* Length in bytes of a UTF-8 sequence, indexed by its first byte.
 * Bytes that cannot start a sequence (0x80...0xBF, 0xFE and 0xFF) have
 * length 1 here; utf8_to_unicode() rejects them in its length-1 case. */
static const char utf8char_len_tab[256] = {
        1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
        2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
        3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 5,5,5,5,6,6,1,1,
};
 287
 288 #ifdef CONFIG_UTF8
 289 NONSTATIC_INLINE int
 290 utf8charlen(const unsigned char *p)
 291 {
 292         return p ? utf8char_len_tab[*p] : 0;
 293 }
 294
 295 int
 296 strlen_utf8(unsigned char **str)
 297 {
 298         unsigned char *s = *str;
 299         unsigned char *end = strchr(s, '\0');
 300         int x;
 301         int len;
 302
 303         for (x = 0;; x++, s += len) {
 304                 len = utf8charlen(s);
 305                 if (s + len > end) break;
 306         }
 307         *str = s;
 308         return x;
 309 }
 310
 311 #define utf8_issingle(p) (((p) & 0x80) == 0)
 312 #define utf8_islead(p) (utf8_issingle(p) || ((p) & 0xc0) == 0xc0)
 313
 314 /* Start from @current and move back to @pos char. This pointer return. The
 315  * most left pointer is @start. */
 316 unsigned char *
 317 utf8_prevchar(unsigned char *current, int pos, unsigned char *start)
 318 {
 319         if (current == NULL || start == NULL || pos < 0)
 320                 return NULL;
 321         while (pos > 0 && current != start) {
 322                 current--;
 323                 if (utf8_islead(*current))
 324                         pos--;
 325         }
 326         return current;
 327 }
 328
 329 /* Count number of standard terminal cells needed for displaying UTF-8
 330  * character. */
 331 int
 332 utf8_char2cells(unsigned char *utf8_char, unsigned char *end)
 333 {
 334         unicode_val_T u;
 335
 336         if (end == NULL)
 337                 end = strchr(utf8_char, '\0');
 338
 339         if(!utf8_char || !end)
 340                 return -1;
 341
 342         u = utf8_to_unicode(&utf8_char, end);
 343
 344         return unicode_to_cell(u);
 345 }
 346
 347 /* Count number of standard terminal cells needed for displaying string
 348  * with UTF-8 characters. */
 349 int
 350 utf8_ptr2cells(unsigned char *string, unsigned char *end)
 351 {
 352         int charlen, cell, cells = 0;
 353
 354         if (end == NULL)
 355                 end = strchr(string, '\0');
 356
 357         if(!string || !end)
 358                 return -1;
 359
 360         do {
 361                 charlen = utf8charlen(string);
 362                 if (string + charlen > end)
 363                         break;
 364
 365                 cell = utf8_char2cells(string, end);
 366                 if  (cell < 0)
 367                         return -1;
 368
 369                 cells += cell;
 370                 string += charlen;
 371         } while (1);
 372
 373         return cells;
 374 }
 375
 376 /* Count number of characters in string. */
 377 int
 378 utf8_ptr2chars(unsigned char *string, unsigned char *end)
 379 {
 380         int charlen, chars = 0;
 381
 382         if (end == NULL)
 383                 end = strchr(string, '\0');
 384
 385         if(!string || !end)
 386                 return -1;
 387
 388         do {
 389                 charlen = utf8charlen(string);
 390                 if (string + charlen > end)
 391                         break;
 392
 393                 chars++;
 394                 string += charlen;
 395         } while (1);
 396
 397         return chars;
 398 }
 399
 400 /*
 401  * Count number of bytes from begining of the string needed for displaying
 402  * specified number of cells.
 403  */
 404 int
 405 utf8_cells2bytes(unsigned char *string, int max_cells, unsigned char *end)
 406 {
 407         unsigned int bytes = 0, cells = 0;
 408
 409         assert(max_cells>=0);
 410
 411         if (end == NULL)
 412                 end = strchr(string, '\0');
 413
 414         if(!string || !end)
 415                 return -1;
 416
 417         do {
 418                 int cell = utf8_char2cells(&string[bytes], end);
 419                 if (cell < 0)
 420                         return -1;
 421
 422                 cells += cell;
 423                 if (cells > max_cells)
 424                         break;
 425
 426                 bytes += utf8charlen(&string[bytes]);
 427
 428                 if (string + bytes > end) {
 429                         bytes = end - string;
 430                         break;
 431                 }
 432         } while(1);
 433
 434         return bytes;
 435 }
 436
 437 /* Take @max steps forward from @string in the specified @way, but
 438  * not going past @end.  Return the resulting address.  Store the
 439  * number of steps taken to *@count, unless @count is NULL.
 440  *
 441  * This assumes the text is valid UTF-8, and @string and @end point to
 442  * character boundaries.  If not, it doesn't crash but the results may
 443  * be inconsistent.
 444  *
 445  * This function can do some of the same jobs as utf8charlen(),
 446  * utf8_cells2bytes(), and strlen_utf8().  */
 447 unsigned char *
 448 utf8_step_forward(unsigned char *string, unsigned char *end,
 449                   int max, enum utf8_step way, int *count)
 450 {
 451         int steps = 0;
 452         unsigned char *current = string;
 453
 454         assert(string);
 455         assert(max >= 0);
 456         if_assert_failed goto invalid_arg;
 457         if (end == NULL)
 458                 end = strchr(string, '\0');
 459
 460         switch (way) {
 461         case UTF8_STEP_CHARACTERS:
 462                 while (steps < max && current < end) {
 463                         ++current;
 464                         if (utf8_islead(*current))
 465                                 ++steps;
 466                 }
 467                 break;
 468
 469         case UTF8_STEP_CELLS_FEWER:
 470         case UTF8_STEP_CELLS_MORE:
 471                 while (steps < max && current < end) {
 472                         unicode_val_T u;
 473                         unsigned char *prev = current;
 474                         int width;
 475
 476                         u = utf8_to_unicode(&current, end);
 477                         if (u == UCS_NO_CHAR) {
 478                                 /* Assume the incomplete sequence
 479                                  * costs one cell.  */
 480                                 current = end;
 481                                 ++steps;
 482                                 break;
 483                         }
 484
 485                         width = unicode_to_cell(u);
 486                         if (way == UTF8_STEP_CELLS_FEWER
 487                             && steps + width > max) {
 488                                 /* Back off.  */
 489                                 current = prev;
 490                                 break;
 491                         }
 492                         steps += width;
 493                 }
 494                 break;
 495
 496         default:
 497                 INTERNAL("impossible enum utf8_step");
 498         }
 499
 500 invalid_arg:
 501         if (count)
 502                 *count = steps;
 503         return current;
 504 }
 505
/* Take @max steps backward from @string in the specified @way, but
 * not going past @start.  Return the resulting address.  Store the
 * number of steps taken to *@count, unless @count is NULL.
 *
 * If an argument assertion fails, @string is returned unchanged and
 * the number of steps is 0.
 *
 * This assumes the text is valid UTF-8, and @string and @start point
 * to character boundaries.  If not, it doesn't crash but the results
 * may be inconsistent.
 *
 * This function can do some of the same jobs as utf8_prevchar().  */
unsigned char *
utf8_step_backward(unsigned char *string, unsigned char *start,
                   int max, enum utf8_step way, int *count)
{
        int steps = 0;
        unsigned char *current = string;

        assert(string);
        assert(start);
        assert(max >= 0);
        if_assert_failed goto invalid_arg;

        switch (way) {
        case UTF8_STEP_CHARACTERS:
                /* Walk back byte by byte; only a byte that can begin
                 * a character counts as a step.  */
                while (steps < max && current > start) {
                        --current;
                        if (utf8_islead(*current))
                                ++steps;
                }
                break;

        case UTF8_STEP_CELLS_FEWER:
        case UTF8_STEP_CELLS_MORE:
                while (steps < max) {
                        /* End of the character being examined. */
                        unsigned char *prev = current;
                        unsigned char *look;
                        unicode_val_T u;
                        int width;

                        if (current <= start)
                                break;
                        /* Find the beginning of the previous
                         * character.  */
                        do {
                                --current;
                        } while (current > start && !utf8_islead(*current));

                        /* Decode via a copy so that @current stays at
                         * the beginning of the character.  */
                        look = current;
                        u = utf8_to_unicode(&look, prev);
                        if (u == UCS_NO_CHAR) {
                                /* Assume the incomplete sequence
                                 * costs one cell.  */
                                width = 1;
                        } else
                                width = unicode_to_cell(u);

                        if (way == UTF8_STEP_CELLS_FEWER
                            && steps + width > max) {
                                /* Back off.  */
                                current = prev;
                                break;
                        }
                        steps += width;
                }
                break;

        default:
                INTERNAL("impossible enum utf8_step");
        }

invalid_arg:
        if (count)
                *count = steps;
        return current;
}
 578
 579 /*
 * Find out number of standard terminal columns needed for displaying symbol
 581  * (glyph) which represents Unicode character c.
 582  *
 583  * TODO: Use wcwidth when it is available. This seems to require:
 584  * - Make the configure script check whether <wchar.h> and wcwidth exist.
 585  * - Define _XOPEN_SOURCE and include <wchar.h>.
 586  * - Test that __STDC_ISO_10646__ is defined.  (This macro means wchar_t
 587  *   matches ISO 10646 in all locales.)
 588  * However, these do not suffice, because wcwidth depends on LC_CTYPE
 589  * in glibc-2.3.6.  For instance, wcwidth(0xff20) is -1 when LC_CTYPE
 590  * is "fi_FI.ISO-8859-1" or "C", but 2 when LC_CTYPE is "fi_FI.UTF-8".
 591  * <features.h> defines __STDC_ISO_10646__ as 200009L, so 0xff20 means
 592  * U+FF20 FULLWIDTH COMMERCIAL AT regardless of LC_CTYPE; but this
 593  * character is apparently not supported in all locales.  Why is that?
 594  * - Perhaps there is standardese that requires supported characters
 595  *   to be convertable to multibyte form.  Then ELinks could just pick
 596  *   some UTF-8 locale for its wcwidth purposes.
 597  * - Perhaps wcwidth can even return different nonnegative values for
 598  *   the same ISO 10646 character in different locales.  Then ELinks
 599  *   would have to set LC_CTYPE to match at least the terminal's
 600  *   charset (which may differ from the LC_CTYPE environment variable,
 601  *   especially when the master process is serving a slave terminal).
 602  *   But there is no guarantee that the libc supports all the same
 603  *   charsets as ELinks does.
 604  * For now, it seems safest to avoid the potentially locale-dependent
 605  * libc version of wcwidth, and instead use a hardcoded mapping.
 606  *
 607  * @return      2 for double-width glyph, 1 for others.
 608  *              TODO: May be extended to return 0 for zero-width glyphs
 609  *              (like composing, maybe unprintable too).
 610  */
 611 NONSTATIC_INLINE int
 612 unicode_to_cell(unicode_val_T c)
 613 {
 614         if (c >= 0x1100
 615                 && (c <= 0x115f                 /* Hangul Jamo */
 616                 || c == 0x2329
 617                 || c == 0x232a
 618                 || (c >= 0x2e80 && c <= 0xa4cf
 619                         && c != 0x303f)         /* CJK ... Yi */
 620                 || (c >= 0xac00 && c <= 0xd7a3) /* Hangul Syllables */
 621                 || (c >= 0xf900 && c <= 0xfaff) /* CJK Compatibility
 622                                                                 Ideographs */
 623                 || (c >= 0xfe30 && c <= 0xfe6f) /* CJK Compatibility Forms */
 624                 || (c >= 0xff00 && c <= 0xff60) /* Fullwidth Forms */
 625                 || (c >= 0xffe0 && c <= 0xffe6)
 626                 || (c >= 0x20000 && c <= 0x2fffd)
 627                 || (c >= 0x30000 && c <= 0x3fffd)))
 628                 return 2;
 629
 630         return 1;
 631 }
 632
 633 /* Fold the case of a Unicode character, so that hotkeys in labels can
 634  * be compared case-insensitively.  It is unspecified whether the
 635  * result will be in upper or lower case.  */
 636 unicode_val_T
 637 unicode_fold_label_case(unicode_val_T c)
 638 {
 639 #if __STDC_ISO_10646__ && HAVE_WCTYPE_H
 640         return towlower(c);
 641 #else  /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
 642         /* For now, this supports only ASCII.  It would be possible to
 643          * use code generated from CaseFolding.txt of Unicode if the
 644          * acknowledgements required by http://www.unicode.org/copyright.html
 645          * were added to associated documentation of ELinks.  */
 646         if (c >= 0x41 && c <= 0x5A)
 647                 return c + 0x20;
 648         else
 649                 return c;
 650 #endif /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
 651 }
 652 #endif /* CONFIG_UTF8 */
 653
 654 NONSTATIC_INLINE unicode_val_T
 655 utf8_to_unicode(unsigned char **string, const unsigned char *end)
 656 {
 657         unsigned char *str = *string;
 658         unicode_val_T u;
 659         int length;
 660
 661         length = utf8char_len_tab[str[0]];
 662
 663         if (str + length > end) {
 664                 return UCS_NO_CHAR;
 665         }
 666
 667         switch (length) {
 668                 case 1:         /* U+0000 to U+007F */
 669                         if (str[0] >= 0x80) {
 670 invalid_utf8:
 671                                 ++*string;
 672                                 return UCS_REPLACEMENT_CHARACTER;
 673                         }
 674                         u = str[0];
 675                         break;
 676                 case 2:         /* U+0080 to U+07FF */
 677                         if ((str[1] & 0xc0) != 0x80)
 678                                 goto invalid_utf8;
 679                         u = (str[0] & 0x1f) << 6;
 680                         u += (str[1] & 0x3f);
 681                         if (u < 0x80)
 682                                 goto invalid_utf8;
 683                         break;
 684                 case 3:         /* U+0800 to U+FFFF, except surrogates */
 685                         if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80)
 686                                 goto invalid_utf8;
 687                         u = (str[0] & 0x0f) << 12;
 688                         u += ((str[1] & 0x3f) << 6);
 689                         u += (str[2] & 0x3f);
 690                         if (u < 0x800 || is_utf16_surrogate(u))
 691                                 goto invalid_utf8;
 692                         break;
 693                 case 4:         /* U+10000 to U+1FFFFF */
 694                         if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
 695                             || (str[3] & 0xc0) != 0x80)
 696                                 goto invalid_utf8;
 697                         u = (str[0] & 0x0f) << 18;
 698                         u += ((str[1] & 0x3f) << 12);
 699                         u += ((str[2] & 0x3f) << 6);
 700                         u += (str[3] & 0x3f);
 701                         if (u < 0x10000)
 702                                 goto invalid_utf8;
 703                         break;
 704                 case 5:         /* U+200000 to U+3FFFFFF */
 705                         if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
 706                             || (str[3] & 0xc0) != 0x80 || (str[4] & 0xc0) != 0x80)
 707                                 goto invalid_utf8;
 708                         u = (str[0] & 0x0f) << 24;
 709                         u += ((str[1] & 0x3f) << 18);
 710                         u += ((str[2] & 0x3f) << 12);
 711                         u += ((str[3] & 0x3f) << 6);
 712                         u += (str[4] & 0x3f);
 713                         if (u < 0x200000)
 714                                 goto invalid_utf8;
 715                         break;
 716                 case 6:         /* U+4000000 to U+7FFFFFFF */
 717                         if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
 718                             || (str[3] & 0xc0) != 0x80 || (str[4] & 0xc0) != 0x80
 719                             || (str[5] & 0xc0) != 0x80)
 720                                 goto invalid_utf8;
 721                         u = (str[0] & 0x01) << 30;
 722                         u += ((str[1] & 0x3f) << 24);
 723                         u += ((str[2] & 0x3f) << 18);
 724                         u += ((str[3] & 0x3f) << 12);
 725                         u += ((str[4] & 0x3f) << 6);
 726                         u += (str[5] & 0x3f);
 727                         if (u < 0x4000000)
 728                                 goto invalid_utf8;
 729                         break;
 730                 default:
 731                         INTERNAL("utf8char_len_tab out of range");
 732                         goto invalid_utf8;
 733         }
 734         *string = str + length;
 735         return u;
 736 }
 737
 738 /* The common part of cp2u and cp2utf_8.  */
 739 static unicode_val_T
 740 cp2u_shared(const struct codepage_desc *from, unsigned char c)
 741 {
 742         unicode_val_T u = from->highhalf[c - 0x80];
 743
 744         if (u == 0xFFFF) u = UCS_REPLACEMENT_CHARACTER;
 745         return u;
 746 }
 747
 748 /* Used for converting input from the terminal.  */
 749 unicode_val_T
 750 cp2u(int from, unsigned char c)
 751 {
 752         from &= ~SYSTEM_CHARSET_FLAG;
 753
 754         /* UTF-8 is a multibyte codepage and cannot be handled with
 755          * this function.  */
 756         assert(!is_cp_ptr_utf8(&codepages[from]));
 757         if_assert_failed return UCS_REPLACEMENT_CHARACTER;
 758
 759         if (c < 0x80) return c;
 760         else return cp2u_shared(&codepages[from], c);
 761 }
 762
 763 /* This slow and ugly code is used by the terminal utf_8_io */
 764 const unsigned char *
 765 cp2utf8(int from, int c)
 766 {
 767         from &= ~SYSTEM_CHARSET_FLAG;
 768
 769         if (is_cp_ptr_utf8(&codepages[from]) || c < 128)
 770                 return strings[c];
 771
 772         return encode_utf8(cp2u_shared(&codepages[from], c));
 773 }
 774
 775 unicode_val_T
 776 cp_to_unicode(int codepage, unsigned char **string, const unsigned char *end)
 777 {
 778         unicode_val_T ret;
 779
 780         if (is_cp_utf8(codepage))
 781                 return utf8_to_unicode(string, end);
 782
 783         if (*string >= end)
 784                 return UCS_NO_CHAR;
 785
 786         ret = cp2u(codepage, **string);
 787         ++*string;
 788         return ret;
 789 }
 790
 791
 792 #ifdef CONFIG_COMBINE
/* The highest combined-character value handed out by get_combined()
 * so far.  It starts one below UCS_BEGIN_COMBINED, meaning that no
 * value has been allocated yet.  */
unicode_val_T last_combined = UCS_BEGIN_COMBINED - 1;
/* Array of the stored character sequences, indexed by
 * (combined value - UCS_BEGIN_COMBINED).  Grown by get_combined().  */
unicode_val_T **combined;
/* Hash from a character sequence to the combined value assigned to
 * it.  Created on first use by get_combined().  */
struct hash *combined_hash;
 796
 797 unicode_val_T
 798 get_combined(unicode_val_T *data, int length)
 799 {
 800         struct hash_item *item;
 801         unicode_val_T *key;
 802         int i, indeks;
 803
 804         assert(length >= 1 && length <= UCS_MAX_LENGTH_COMBINED);
 805         if_assert_failed return UCS_NO_CHAR;
 806
 807         if (!combined_hash) combined_hash = init_hash8();
 808         if (!combined_hash) return UCS_NO_CHAR;
 809         item = get_hash_item(combined_hash, (unsigned char *)data, length * sizeof(*data));
 810
 811         if (item) return (unicode_val_T)(long)item->value;
 812         if (last_combined >= UCS_END_COMBINED) return UCS_NO_CHAR;
 813
 814         key = mem_alloc((length + 1) * sizeof(*key));
 815         if (!key) return UCS_NO_CHAR;
 816         for (i = 0; i < length; i++)
 817                 key[i] = data[i];
 818         key[i] = UCS_END_COMBINED;
 819
 820         last_combined++;
 821         indeks = last_combined - UCS_BEGIN_COMBINED;
 822
 823         combined = mem_realloc(combined, sizeof(*combined) * (indeks + 1));
 824         if (!combined) {
 825                 mem_free(key);
 826                 last_combined--;
 827                 return UCS_NO_CHAR;
 828         }
 829         combined[indeks] = key;
 830         item = add_hash_item(combined_hash, (unsigned char *)key,
 831                              length * sizeof(*data), (void *)(long)(last_combined));
 832         if (!item) {
 833                 last_combined--;
 834                 mem_free(key);
 835                 return UCS_NO_CHAR;
 836         }
 837         return last_combined;
 838 }
 839
 840 void
 841 free_combined()
 842 {
 843         int i, end = last_combined - UCS_BEGIN_COMBINED + 1;
 844
 845         if (combined_hash)
 846                 free_hash(&combined_hash);
 847         for (i = 0; i < end; i++)
 848                 mem_free(combined[i]);
 849         mem_free_if(combined);
 850 }
 851 #endif /* CONFIG_COMBINE */
 852
 853
 854 static void
 855 add_utf8(struct conv_table *ct, unicode_val_T u, const unsigned char *str)
 856 {
 857         unsigned char *p = encode_utf8(u);
 858
 859         while (p[1]) {
 860                 if (ct[*p].t) ct = ct[*p].u.tbl;
 861                 else {
 862                         struct conv_table *nct;
 863
 864                         assertm(ct[*p].u.str == no_str, "bad utf encoding #1");
 865                         if_assert_failed return;
 866
 867                         nct = mem_calloc(256, sizeof(*nct));
 868                         if (!nct) return;
 869                         new_translation_table(nct);
 870                         ct[*p].t = 1;
 871                         ct[*p].u.tbl = nct;
 872                         ct = nct;
 873                 }
 874                 p++;
 875         }
 876
 877         assertm(!ct[*p].t, "bad utf encoding #2");
 878         if_assert_failed return;
 879
 880         if (ct[*p].u.str == no_str)
 881                 ct[*p].u.str = str;
 882 }
 883
/* A conversion table from some charset to UTF-8.
 * If it is from UTF-8 to UTF-8, it converts each byte separately.
 * Unlike in other translation tables, the strings in elements 0x80 to
 * 0xFF are allocated dynamically (see get_translation_table_to_utf8())
 * and must be released with free_utf_table().  */
struct conv_table utf_table[256];
/* Nonzero while utf_table has never been filled in.
 * get_translation_table_to_utf8() zeroes the table and clears this
 * flag on its first build; later builds free the old strings
 * instead.  */
int utf_table_init = 1;
 890
 891 static void
 892 free_utf_table(void)
 893 {
 894         int i;
 895
 896         /* Cast away const.  */
 897         for (i = 128; i < 256; i++)
 898                 mem_free((unsigned char *) utf_table[i].u.str);
 899 }
 900
 901 static struct conv_table *
 902 get_translation_table_to_utf8(int from)
 903 {
 904         int i;
 905         static int lfr = -1;
 906
 907         if (from == -1) return NULL;
 908         from &= ~SYSTEM_CHARSET_FLAG;
 909         if (from == lfr) return utf_table;
 910         lfr = from;
 911         if (utf_table_init) {
 912                 memset(utf_table, 0, sizeof(utf_table));
 913                 utf_table_init = 0;
 914         } else
 915                 free_utf_table();
 916
 917         for (i = 0; i < 128; i++)
 918                 utf_table[i].u.str = strings[i];
 919
 920         if (is_cp_ptr_utf8(&codepages[from])) {
 921                 for (i = 128; i < 256; i++)
 922                         utf_table[i].u.str = stracpy(strings[i]);
 923                 return utf_table;
 924         }
 925
 926         for (i = 128; i < 256; i++) {
 927                 unicode_val_T u = codepages[from].highhalf[i - 0x80];
 928
 929                 if (u == 0xFFFF)
 930                         utf_table[i].u.str = NULL;
 931                 else
 932                         utf_table[i].u.str = stracpy(encode_utf8(u));
 933         }
 934
 935         for (i = 0; codepages[from].table[i].c; i++) {
 936                 unicode_val_T u = codepages[from].table[i].u;
 937
 938                 if (!utf_table[codepages[from].table[i].c].u.str)
 939                         utf_table[codepages[from].table[i].c].u.str =
 940                                 stracpy(encode_utf8(u));
 941         }
 942
 943         for (i = 128; i < 256; i++)
 944                 if (!utf_table[i].u.str)
 945                         utf_table[i].u.str = stracpy(no_str);
 946
 947         return utf_table;
 948 }
 949
/* A conversion table between two charsets, where the target is not UTF-8.
 * It is filled in by get_translation_table() and reset by
 * free_conv_table().  */
static struct conv_table table[256];
/* Nonzero until @table has been zeroed for the first time; both
 * get_translation_table() and free_conv_table() perform that one-time
 * memset before calling new_translation_table().  */
static int first = 1;
 953
 954 void
 955 free_conv_table(void)
 956 {
 957         if (!utf_table_init) free_utf_table();
 958         if (first) {
 959                 memset(table, 0, sizeof(table));
 960                 first = 0;
 961         }
 962         new_translation_table(table);
 963 #ifdef HAVE_ICONV
 964         if (iconv_cd != (iconv_t)-1) {
 965                 iconv_close(iconv_cd);
 966                 iconv_cd = (iconv_t)-1;
 967         }
 968 #endif
 969 }
 970
 971
 972 struct conv_table *
 973 get_translation_table(int from, int to)
 974 {
 975         static int lfr = -1;
 976         static int lto = -1;
 977
 978         from &= ~SYSTEM_CHARSET_FLAG;
 979         to &= ~SYSTEM_CHARSET_FLAG;
 980         if (first) {
 981                 memset(table, 0, sizeof(table));
 982                 first = 0;
 983         }
 984
 985         if (codepages[from].iconv) {
 986                 struct conv_table *table2 = get_translation_table_to_utf8(34);
 987
 988                 if (table2) table2->iconv_cp = from;
 989                 return table2;
 990         }
 991
 992         if (/*from == to ||*/ from == -1 || to == -1)
 993                 return NULL;
 994         if (is_cp_ptr_utf8(&codepages[to])) {
 995                 struct conv_table *table2 = get_translation_table_to_utf8(from);
 996
 997                 if (table2) table2->iconv_cp = -1;
 998                 return table2;
 999         }
1000         if (from == lfr && to == lto)
1001                 return table;
1002         lfr = from;
1003         lto = to;
1004         new_translation_table(table);
1005
1006         if (is_cp_ptr_utf8(&codepages[from])) {
1007                 int i;
1008
1009                 /* Map U+00A0 and U+00AD the same way as u2cp() would.  */
1010                 add_utf8(table, UCS_NO_BREAK_SPACE, strings[NBSP_CHAR]);
1011                 add_utf8(table, UCS_SOFT_HYPHEN, "");
1012
1013                 for (i = 0x80; i <= 0xFF; i++)
1014                         if (codepages[to].highhalf[i - 0x80] != 0xFFFF)
1015                                 add_utf8(table,
1016                                          codepages[to].highhalf[i - 0x80],
1017                                          strings[i]);
1018
1019                 for (i = 0; codepages[to].table[i].c; i++)
1020                         add_utf8(table, codepages[to].table[i].u,
1021                                  strings[codepages[to].table[i].c]);
1022
1023                 for (i = 0; unicode_7b[i].x != -1; i++)
1024                         if (unicode_7b[i].x >= 0x80)
1025                                 add_utf8(table, unicode_7b[i].x,
1026                                          unicode_7b[i].s);
1027
1028         } else {
1029                 int i;
1030
1031                 for (i = 128; i < 256; i++) {
1032                         if (codepages[from].highhalf[i - 0x80] != 0xFFFF) {
1033                                 const unsigned char *u;
1034
1035                                 u = u2cp(codepages[from].highhalf[i - 0x80], to);
1036                                 if (u) table[i].u.str = u;
1037                         }
1038                 }
1039         }
1040
1041         return table;
1042 }
1043
/* Compare the first @l2 bytes of @s1 against the NUL-terminated string
 * @s2.  Returns 1 or -1 at the first differing byte (byte values are
 * compared as unsigned).  If all @l2 bytes are equal, returns 0 when
 * @s2 ends there too and -1 when @s2 is longer.  */
static inline int
xxstrcmp(unsigned char *s1, unsigned char *s2, int l2)
{
        for (; l2 != 0; l2--, s1++, s2++) {
                if (*s1 != *s2)
                        return (*s1 < *s2) ? -1 : 1;
        }

        return (*s2 == '\0') ? 0 : -1;
}
1057
/* Change the "#if 0" below to "#if 1" to trace entity cache activity
 * on stderr (for debugging only). */
1059 #if 0
1060 #define DEBUG_ENTITY_CACHE
1061 #else
1062 #undef DEBUG_ENTITY_CACHE
1063 #endif
1064
/* One slot of the entity cache used by get_entity_string().  */
struct entity_cache {
        /* Number of times this slot was returned from the cache
         * (starts at 1 when the slot is filled).  */
        unsigned int hits;
        /* Length of the entity name stored in @str.  */
        int strlen;
        /* Codepage the cached @result was computed for.  */
        int encoding;
        /* The cached translation; NULL if the entity was not
         * recognized.  */
        const unsigned char *result;
        /* NUL-terminated copy of the entity name.  */
        unsigned char str[20]; /* Suffice in any case. */
};
1072
1073 /* comparison function for qsort() */
1074 static int
1075 hits_cmp(const void *v1, const void *v2)
1076 {
1077         const struct entity_cache *a = v1, *b = v2;
1078
1079         if (a->hits == b->hits) return 0;
1080         if (a->hits > b->hits) return -1;
1081         else return 1;
1082 }
1083
1084 static int
1085 compare_entities(const void *key_, const void *element_)
1086 {
1087         struct string *key = (struct string *) key_;
1088         struct entity *element = (struct entity *) element_;
1089         int length = key->length;
1090         unsigned char *first = key->source;
1091         unsigned char *second = element->s;
1092
1093         return xxstrcmp(first, second, length);
1094 }
1095
1096 const unsigned char *
1097 get_entity_string(const unsigned char *str, const int strlen, int encoding)
1098 {
1099 #define ENTITY_CACHE_SIZE 10    /* 10 seems a good value. */
1100 #define ENTITY_CACHE_MAXLEN 9   /* entities with length >= ENTITY_CACHE_MAXLEN or == 1
1101                                    will go in [0] table */
1102         static struct entity_cache entity_cache[ENTITY_CACHE_MAXLEN][ENTITY_CACHE_SIZE];
1103         static unsigned int nb_entity_cache[ENTITY_CACHE_MAXLEN];
1104         unsigned int slen = 0;
1105         const unsigned char *result = NULL;
1106
1107         /* Note that an object of static storage duration is automatically
1108          * initialised to zero in C.  */
1109
1110         if (strlen <= 0) return NULL;
1111
1112 #ifdef CONFIG_UTF8
1113         /* TODO: caching UTF-8 */
1114         encoding &= ~SYSTEM_CHARSET_FLAG;
1115         if (is_cp_ptr_utf8(&codepages[encoding]))
1116                 goto skip;
1117 #endif /* CONFIG_UTF8 */
1118
1119         /* Check if cached. A test on many websites (freshmeat.net + whole ELinks website
1120          * + google + slashdot + websites that result from a search for test on google,
1121          * + various ones) show quite impressive improvment:
1122          * Top ten is:
1123          * 0: hits=2459 l=4 st='nbsp'
1124          * 1: hits=2152 l=6 st='eacute'
1125          * 2: hits=235 l=6 st='egrave'
1126          * 3: hits=136 l=6 st='agrave'
1127          * 4: hits=100 l=3 st='amp'
1128          * 5: hits=40 l=5 st='laquo'
1129          * 6: hits=8 l=4 st='copy'
1130          * 7: hits=5 l=2 st='gt'
1131          * 8: hits=2 l=2 st='lt'
1132          * 9: hits=1 l=6 st='middot'
1133          *
1134          * Most of the time cache hit ratio is near 95%.
1135          *
1136          * A long test shows: 15186 hits vs. 24 misses and mean iteration
1137          * count is kept < 2 (worst case 1.58). Not so bad ;)
1138          *
1139          * --Zas */
1140
1141         /* entities with length >= ENTITY_CACHE_MAXLEN or == 1 will go in [0] table */
1142         slen = (strlen > 1 && strlen < ENTITY_CACHE_MAXLEN) ? strlen : 0;
1143
1144         if (strlen < ENTITY_CACHE_MAXLEN && nb_entity_cache[slen] > 0) {
1145                 int i;
1146
1147                 for (i = 0; i < nb_entity_cache[slen]; i++) {
1148                         if (entity_cache[slen][i].encoding == encoding
1149                             && !memcmp(str, entity_cache[slen][i].str, strlen)) {
1150 #ifdef DEBUG_ENTITY_CACHE
1151                                 static double total_iter = 0;
1152                                 static unsigned long hit_count = 0;
1153
1154                                 total_iter += i + 1;
1155                                 hit_count++;
1156                                 fprintf(stderr, "hit after %d iter. (mean = %0.2f)\n", i + 1, total_iter / (double) hit_count);
1157 #endif
1158                                 if (entity_cache[slen][i].hits < (unsigned int) ~0)
1159                                         entity_cache[slen][i].hits++;
1160                                 return entity_cache[slen][i].result;
1161                         }
1162                 }
1163 #ifdef DEBUG_ENTITY_CACHE
1164                 fprintf(stderr, "miss\n");
1165 #endif
1166         }
1167 #ifdef CONFIG_UTF8
1168 skip:
1169 #endif /* CONFIG_UTF8 */
1170         if (*str == '#') { /* Numeric entity. */
1171                 int l = (int) strlen;
1172                 unsigned char *st = (unsigned char *) str;
1173                 unicode_val_T n = 0;
1174
1175                 if (l == 1) goto end; /* &#; ? */
1176                 st++, l--;
1177                 if ((*st | 32) == 'x') { /* Hexadecimal */
1178
1179                         if (l == 1 || l > 9) goto end; /* xFFFFFFFF max. */
1180                         st++, l--;
1181                         do {
1182                                 unsigned char c = (*(st++) | 32);
1183
1184                                 if (isdigit(c))
1185                                         n = (n << 4) | (c - '0');
1186                                 else if (isxdigit(c))
1187                                         n = (n << 4) | (c - 'a' + 10);
1188                                 else
1189                                         goto end; /* Bad char. */
1190                         } while (--l);
1191                 } else { /* Decimal */
1192                         if (l > 10) goto end; /* 4294967295 max. */
1193                         do {
1194                                 unsigned char c = *(st++);
1195
1196                                 if (isdigit(c))
1197                                         n = n * 10 + c - '0';
1198                                 else
1199                                         goto end; /* Bad char. */
1200                                 /* Limit to 0xFFFFFFFF. */
1201                                 if (n >= (unicode_val_T) 0xFFFFFFFFu)
1202                                         goto end;
1203                         } while (--l);
1204                 }
1205
1206                 result = u2cp(n, encoding);
1207
1208 #ifdef DEBUG_ENTITY_CACHE
1209                 fprintf(stderr, "%lu %016x %s\n", (unsigned long) n , n, result);
1210 #endif
1211         } else { /* Text entity. */
1212                 struct string key = INIT_STRING((unsigned char *) str, strlen);
1213                 struct entity *element = bsearch((void *) &key, entities,
1214                                                  N_ENTITIES,
1215                                                  sizeof(*element),
1216                                                  compare_entities);
1217
1218                 if (element) result = u2cp(element->c, encoding);
1219         }
1220
1221 #ifdef CONFIG_UTF8
1222         if (is_cp_ptr_utf8(&codepages[encoding])) {
1223                 return result;
1224         }
1225 #endif /* CONFIG_UTF8 */
1226 end:
1227         /* Take care of potential buffer overflow. */
1228         if (strlen < sizeof(entity_cache[slen][0].str)) {
1229                 struct entity_cache *ece;
1230
1231                 /* Sort entries by hit order. */
1232                 if (nb_entity_cache[slen] > 1)
1233                         qsort(&entity_cache[slen][0], nb_entity_cache[slen],
1234                               sizeof(entity_cache[slen][0]), hits_cmp);
1235
1236                 /* Increment number of cache entries if possible.
1237                  * Else, just replace the least used entry.  */
1238                 if (nb_entity_cache[slen] < ENTITY_CACHE_SIZE) nb_entity_cache[slen]++;
1239                 ece = &entity_cache[slen][nb_entity_cache[slen] - 1];
1240
1241                 /* Copy new entry to cache. */
1242                 ece->hits = 1;
1243                 ece->strlen = strlen;
1244                 ece->encoding = encoding;
1245                 ece->result = result;
1246                 memcpy(ece->str, str, strlen);
1247                 ece->str[strlen] = '\0';
1248
1249
1250 #ifdef DEBUG_ENTITY_CACHE
1251                 fprintf(stderr, "Added in [%u]: l=%d st='%s'\n", slen,
1252                                 entity_cache[slen][0].strlen, entity_cache[slen][0].str);
1253
1254         {
1255                 unsigned int i;
1256
1257                 fprintf(stderr, "- Cache entries [%u] -\n", slen);
1258                 for (i = 0; i < nb_entity_cache[slen] ; i++)
1259                         fprintf(stderr, "%d: hits=%u l=%d st='%s'\n", i,
1260                                 entity_cache[slen][i].hits, entity_cache[slen][i].strlen,
1261                                 entity_cache[slen][i].str);
1262                 fprintf(stderr, "-----------------\n");
1263         }
1264 #endif  /* DEBUG_ENTITY_CACHE */
1265         }
1266         return result;
1267 }
1268
1269 unsigned char *
1270 convert_string(struct conv_table *convert_table,
1271                unsigned char *chars2, int charslen2, int cp,
1272                enum convert_string_mode mode, int *length,
1273                void (*callback)(void *data, unsigned char *buf, int buflen),
1274                void *callback_data)
1275 {
1276         unsigned char *buffer;
1277         int bufferpos = 0;
1278         int charspos = 0;
1279         unsigned char *chars = chars2;
1280         int charslen = charslen2;
1281
1282 #ifdef HAVE_ICONV
1283         static char iconv_input[256];
1284         static char iconv_output[256 * 8];
1285         static size_t iconv_offset;
1286         static int iconv_cp;
1287         static size_t iconv_inleft;
1288         size_t iconv_outleft = 256 * 8;
1289         int loop = 0;
1290         int is_iconv = 0;
1291         int chars_offset = 0;
1292
1293         if (!convert_table && !memchr(chars, '&', charslen)) {
1294                 if (callback) {
1295                         if (charslen) callback(callback_data, chars, charslen);
1296                         return NULL;
1297                 } else {
1298                         return memacpy(chars, charslen);
1299                 }
1300         }
1301
1302         if (cp >= 0) {
1303                 if (convert_table && convert_table->iconv_cp > 0) {
1304                         is_iconv = 1;
1305                         cp = convert_table->iconv_cp;
1306                 } else {
1307                         is_iconv = codepages[cp & ~SYSTEM_CHARSET_FLAG].iconv;
1308                 }
1309         }
1310 #endif
1311
1312         /* Buffer allocation */
1313
1314         buffer = mem_alloc(ALLOC_GR + 1 /* trailing \0 */);
1315         if (!buffer) return NULL;
1316
1317 #ifdef HAVE_ICONV
1318         if (is_iconv) {
1319                 int v;
1320                 size_t before, to_copy;
1321                 char *outp, *inp;
1322
1323                 if (iconv_cd >= 0) {
1324                         if (cp != iconv_cp) {
1325                                 iconv_close(iconv_cd);
1326                                 iconv_cd = (iconv_t)-1;
1327                         }
1328                 }
1329                 if (iconv_cd == (iconv_t)-1) {
1330                         iconv_offset = 0;
1331                         iconv_cd = iconv_open("utf-8", get_cp_mime_name(cp));
1332                         if (iconv_cd == (iconv_t)-1) {
1333                                 mem_free(buffer);
1334                                 return NULL;
1335                         }
1336                         iconv_cp = cp;
1337                 }
1338 repeat:
1339                 to_copy = charslen2 - chars_offset;
1340                 if (to_copy > 256 - iconv_offset) to_copy = 256 - iconv_offset;
1341                 memcpy(iconv_input + iconv_offset, chars + chars_offset, to_copy);
1342                 iconv_outleft = 256 * 8;
1343                 iconv_inleft = iconv_offset + to_copy;
1344                 inp = iconv_input;
1345                 outp = iconv_output;
1346                 before = iconv_inleft;
1347
1348                 v = iconv(iconv_cd, &inp, &iconv_inleft, &outp, &iconv_outleft);
1349                 chars_offset += before - iconv_inleft;
1350                 charslen = 256 * 8 - iconv_outleft;
1351
1352                 chars = (unsigned char *)iconv_output;
1353                 charspos = 0;
1354
1355                 if (v == -1) {
1356                         switch (errno) {
1357                         case EINVAL:
1358                                 memcpy(iconv_input, inp, iconv_inleft);
1359                                 iconv_offset = iconv_inleft;
1360                                 break;
1361                         case EILSEQ:
1362                                 loop = 0;
1363                                 goto out;
1364                                 break;
1365                         default:
1366                                 iconv_offset = 0;
1367                         }
1368                 } else {
1369                         iconv_offset = 0;
1370                 }
1371
1372                 loop = chars_offset < charslen2;
1373         }
1374 #endif
1375         /* Iterate ;-) */
1376
1377 out:
1378         while (charspos < charslen) {
1379                 const unsigned char *translit;
1380
1381 #define PUTC do { \
1382                 buffer[bufferpos++] = chars[charspos++]; \
1383                 translit = ""; \
1384                 goto flush; \
1385         } while (0)
1386
1387                 if (chars[charspos] != '&') {
1388                         struct conv_table *t;
1389                         int i;
1390
1391                         if (chars[charspos] < 128 || !convert_table) PUTC;
1392
1393                         t = convert_table;
1394                         i = charspos;
1395
1396                         while (t[chars[i]].t) {
1397                                 t = t[chars[i++]].u.tbl;
1398                                 if (i >= charslen) PUTC;
1399                         }
1400
1401                         translit = t[chars[i]].u.str;
1402                         charspos = i + 1;
1403
1404                 } else if (mode == CSM_FORM || mode == CSM_NONE) {
1405                         PUTC;
1406
1407                 } else {
1408                         int start = charspos + 1;
1409                         int i = start;
1410
1411                         while (i < charslen
1412                                && (isasciialpha(chars[i])
1413                                    || isdigit(chars[i])
1414                                    || (chars[i] == '#')))
1415                                 i++;
1416
1417                         /* This prevents bug 213: we were expanding "entities"
1418                          * in URL query strings. */
1419                         /* XXX: But this disables &nbsp&nbsp usage, which
1420                          * appears to be relatively common! --pasky */
1421                         if ((mode == CSM_DEFAULT || (chars[i] != '&' && chars[i] != '='))
1422                             && i > start
1423                             && !isasciialpha(chars[i]) && !isdigit(chars[i])) {
1424                                 translit = get_entity_string(&chars[start], i - start,
1425                                                       cp);
1426                                 if (chars[i] != ';') {
1427                                         /* Eat &nbsp &nbsp<foo> happily, but
1428                                          * pull back from the character after
1429                                          * entity string if it is not the valid
1430                                          * terminator. */
1431                                         i--;
1432                                 }
1433
1434                                 if (!translit) PUTC;
1435                                 charspos = i + (i < charslen);
1436                         } else PUTC;
1437                 }
1438
1439                 if (!translit[0]) continue;
1440
1441                 if (!translit[1]) {
1442                         buffer[bufferpos++] = translit[0];
1443                         translit = "";
1444                         goto flush;
1445                 }
1446
1447                 while (*translit) {
1448                         unsigned char *new;
1449
1450                         buffer[bufferpos++] = *(translit++);
1451 flush:
1452                         if (bufferpos & (ALLOC_GR - 1)) continue;
1453
1454                         if (callback) {
1455                                 buffer[bufferpos] = 0;
1456                                 callback(callback_data, buffer, bufferpos);
1457                                 bufferpos = 0;
1458                         } else {
1459                                 new = mem_realloc(buffer, bufferpos + ALLOC_GR);
1460                                 if (!new) {
1461                                         mem_free(buffer);
1462                                         return NULL;
1463                                 }
1464                                 buffer = new;
1465                         }
1466                 }
1467 #undef PUTC
1468         }
1469
1470 #ifdef HAVE_ICONV
1471         if (loop) goto repeat;
1472 #endif
1473         /* Say bye */
1474
1475         buffer[bufferpos] = 0;
1476         if (length) *length = bufferpos;
1477
1478         if (callback) {
1479                 if (bufferpos) callback(callback_data, buffer, bufferpos);
1480                 mem_free(buffer);
1481                 return NULL;
1482         } else {
1483                 return buffer;
1484         }
1485 }
1486
1487
1488 #ifndef USE_FASTFIND
1489 int
1490 get_cp_index(const unsigned char *name)
1491 {
1492         int i, a;
1493         int syscp = 0;
1494
1495         if (!c_strcasecmp(name, "System")) {
1496 #if HAVE_LANGINFO_CODESET
1497                 name = nl_langinfo(CODESET);
1498                 syscp = SYSTEM_CHARSET_FLAG;
1499 #else
1500                 name = "us-ascii";
1501 #endif
1502         }
1503
1504         for (i = 0; codepages[i].name; i++) {
1505                 for (a = 0; codepages[i].aliases[a]; a++) {
1506                         /* In the past, we looked for the longest substring
1507                          * in all the names; it is way too expensive, though:
1508                          *
1509                          *   %   cumulative   self              self     total
1510                          *  time   seconds   seconds    calls  us/call  us/call  name
1511                          *  3.00      0.66     0.03     1325    22.64    22.64  get_cp_index
1512                          *
1513                          * Anything called from redraw_screen() is in fact
1514                          * relatively expensive, even if it's called just
1515                          * once. So we will do a simple strcasecmp() here.
1516                          */
1517
1518                         if (!c_strcasecmp(name, codepages[i].aliases[a]))
1519                                 return i | syscp;
1520                 }
1521         }
1522
1523         if (syscp) {
1524                 return get_cp_index("us-ascii") | syscp;
1525         } else {
1526                 return -1;
1527         }
1528 }
1529
1530 #else
1531
/* Position of the iterator used to feed the fastfind index:
 * @i_name is the current index into codepages[], and @i_alias the
 * current index into that codepage's aliases[].  */
static unsigned int i_name = 0;
static unsigned int i_alias = 0;
1534
1535 /* Reset internal list pointer */
1536 void
1537 charsets_list_reset(void)
1538 {
1539         i_name = 0;
1540         i_alias = 0;
1541 }
1542
1543 /* Returns a pointer to a struct that contains current key and data pointers
1544  * and increment internal pointer.  It returns NULL when key is NULL. */
1545 struct fastfind_key_value *
1546 charsets_list_next(void)
1547 {
1548         static struct fastfind_key_value kv;
1549
1550         if (!codepages[i_name].name) return NULL;
1551
1552         kv.key = codepages[i_name].aliases[i_alias];
1553         kv.data = (void *) &codepages[i_name]; /* cast away const */
1554
1555         if (codepages[i_name].aliases[i_alias + 1])
1556                 i_alias++;
1557         else {
1558                 i_name++;
1559                 i_alias = 0;
1560         }
1561
1562         return &kv;
1563 }
1564
/* The fastfind index over all charset aliases.  Its keys are supplied
 * by charsets_list_reset() and charsets_list_next(); it is built in
 * init_charsets_lookup() and released in free_charsets_lookup().  */
static struct fastfind_index ff_charsets_index
        = INIT_FASTFIND_INDEX("charsets_lookup", charsets_list_reset, charsets_list_next);
1567
1568 /* It searchs for a charset named @name or one of its aliases and
1569  * returns index for it or -1 if not found. */
1570 int
1571 get_cp_index(const unsigned char *name)
1572 {
1573         const struct codepage_desc *codepage;
1574         int syscp = 0;
1575
1576         if (!c_strcasecmp(name, "System")) {
1577 #if HAVE_LANGINFO_CODESET
1578                 name = nl_langinfo(CODESET);
1579                 syscp = SYSTEM_CHARSET_FLAG;
1580 #else
1581                 name = "us-ascii";
1582 #endif
1583         }
1584
1585         codepage = fastfind_search(&ff_charsets_index, name, strlen(name));
1586         if (codepage) {
1587                 assert(codepages <= codepage && codepage < codepages + N_CODEPAGES);
1588                 return (codepage - codepages) | syscp;
1589
1590         } else if (syscp) {
1591                 return get_cp_index("us-ascii") | syscp;
1592
1593         } else {
1594                 return -1;
1595         }
1596 }
1597
1598 #endif /* USE_FASTFIND */
1599
/* Build the fastfind index used by get_cp_index().  Does nothing when
 * ELinks is built without USE_FASTFIND.  */
void
init_charsets_lookup(void)
{
#ifdef USE_FASTFIND
        fastfind_index(&ff_charsets_index, FF_COMPRESS);
#endif
}
1607
/* Release the fastfind index built by init_charsets_lookup().  Does
 * nothing when ELinks is built without USE_FASTFIND.  */
void
free_charsets_lookup(void)
{
#ifdef USE_FASTFIND
        fastfind_done(&ff_charsets_index);
#endif
}
1615
1616 /* Get the codepage's name for displaying to the user, or NULL if
1617  * @cp_index is one past the end.  In the future, we might want to
1618  * localize these with gettext.  So it may be best not to use this
1619  * function if the name will have to be converted back to an
1620  * index.  */
1621 unsigned char *
1622 get_cp_name(int cp_index)
1623 {
1624         if (cp_index < 0) return "none";
1625         if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1626
1627         return codepages[cp_index].name;
1628 }
1629
1630 /* Get the codepage's name for saving to a configuration file.  These
1631  * names can be converted back to indexes, even in future versions of
1632  * ELinks.  */
1633 unsigned char *
1634 get_cp_config_name(int cp_index)
1635 {
1636         if (cp_index < 0) return "none";
1637         if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1638         if (!codepages[cp_index].aliases) return NULL;
1639
1640         return codepages[cp_index].aliases[0];
1641 }
1642
1643 /* Get the codepage's name for sending to a library or server that
1644  * understands MIME charset names.  This function irreversibly maps
1645  * the "System" codepage to the underlying charset.  */
1646 unsigned char *
1647 get_cp_mime_name(int cp_index)
1648 {
1649         if (cp_index < 0) return "none";
1650         cp_index &= ~SYSTEM_CHARSET_FLAG;
1651         if (!codepages[cp_index].aliases) return NULL;
1652
1653         return codepages[cp_index].aliases[0];
1654 }
1655
1656 int
1657 is_cp_utf8(int cp_index)
1658 {
1659         cp_index &= ~SYSTEM_CHARSET_FLAG;
1660         return is_cp_ptr_utf8(&codepages[cp_index]);
1661 }
1662
1663 /* This function will be used by the xhtml parser. */
1664 const uint16_t *
1665 get_cp_highhalf(const unsigned char *name)
1666 {
1667         int cp = get_cp_index(name);
1668
1669         if (cp < 0) return NULL;
1670         cp &= ~SYSTEM_CHARSET_FLAG;
1671         return codepages[cp].highhalf;
1672 }