workbench/libs/codesets/src/codesets.c

   1 /***************************************************************************
   2
   3  codesets.library - Amiga shared library for handling different codesets
   4  Copyright (C) 2001-2005 by Alfonso [alfie] Ranieri <alforan@tin.it>.
   5  Copyright (C) 2005-2014 codesets.library Open Source Team
   6
   7  This library is free software; you can redistribute it and/or
   8  modify it under the terms of the GNU Lesser General Public
   9  License as published by the Free Software Foundation; either
  10  version 2.1 of the License, or (at your option) any later version.
  11
  12  This library is distributed in the hope that it will be useful,
  13  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  Lesser General Public License for more details.
  16
  17  codesets.library project: http://sourceforge.net/projects/codesetslib/
  18
  19  Most of the code included in this file was relicensed from GPL to LGPL
  20  from the source code of SimpleMail (http://www.sf.net/projects/simplemail)
  21  with full permissions by its authors.
  22
  23  $Id$
  24
  25 ***************************************************************************/
  26
  27 #include "lib.h"
  28
  29 #include <clib/alib_protos.h>
  30
  31 #include <diskfont/glyph.h>
  32 #include <diskfont/diskfonttag.h>
  33 #include <proto/diskfont.h>
  34 #include <ctype.h>
  35 #include <limits.h>
  36
  37 #ifdef __MORPHOS__
  38 #include <proto/keymap.h>
  39 #include <proto/locale.h>
  40 #endif
  41
  42 #include "codesets_table.h"
  43 #include "convertUTF.h"
  44 #include "codepages.h"
  45
  46 #include "SDI_stdarg.h"
  47
  48 #include "debug.h"
  49
  50 #define __NOLIBBASE__
  51 #include <proto/codesets.h>
  52
  53 /**************************************************************************/
  54
  55 // a union used for various type casts while avoiding the annoying "dereferencing
  56 // type punned pointer is breaking strict alias rules" warnings of GCC4+
  57 union TypeAliases
  58 {
  59   void **voidptr;
  60   char **schar;
  61   unsigned char **uchar;
  62   STRPTR *strptr;
  63   UTF8 **utf8;
  64   const UTF8 **cutf8;
  65   UTF16 **utf16;
  66   const UTF16 **cutf16;
  67   UTF32 **utf32;
  68   const UTF32 **cutf32;
  69 };
  70
  71 /// BIN_SEARCH()
  72 // search a sorted array in O(log n) e.g.
  73 // BIN_SEARCH(strings,0,sizeof(strings)/sizeof(strings[0]),strcmp(key,array[mid]),res);
  74 #define BIN_SEARCH(array,low,high,compare,result) \
  75   {\
  76     int l = low;\
  77     int h = high;\
  78     int m = (low+high)/2;\
  79     result = NULL;\
  80     while (l<=h)\
  81     {\
  82       int d = compare;\
  83       if (!d){ result = &array[m]; break; }\
  84       if (d < 0) h = m - 1;\
  85       else l = m + 1;\
  86       m = (l + h)/2;\
  87     }\
  88   }
  89
  90 ///
  91 /// mystrdup()
  92 static STRPTR mystrdup(const char *str)
  93 {
  94   STRPTR newStr = NULL;
  95
  96   ENTER();
  97
  98   if(str != NULL)
  99   {
 100     int len;
 101
 102     if((len = strlen(str)) > 0)
 103     {
 104       if((newStr = allocArbitrateVecPooled(len+1)) != NULL)
 105         strlcpy(newStr, str, len+1);
 106     }
 107   }
 108
 109   RETURN(newStr);
 110   return newStr;
 111 }
 112
 113 ///
 114 /// mystrndup()
 115 static STRPTR mystrndup(const char *str1, int n)
 116 {
 117   STRPTR dest;
 118
 119   ENTER();
 120
 121   if((dest = allocArbitrateVecPooled(n+1)) != NULL)
 122   {
 123     if(str1 != NULL)
 124       strlcpy(dest, str1, n+1);
 125     else
 126       dest[0] = '\0';
 127   }
 128
 129   RETURN(dest);
 130   return dest;
 131 }
 132
 133 ///
 134 /// readLine()
 135 static BOOL readLine(BPTR fh, char *buf, ULONG size)
 136 {
 137   BOOL success = FALSE;
 138   char *c;
 139
 140   ENTER();
 141
 142   if((c = FGets(fh, buf, size)) != NULL)
 143   {
 144     // we succeeded in reading something
 145     success = TRUE;
 146
 147     // now find the end of the line and strip the LF/CR character
 148     for(; *c; c++)
 149     {
 150       if(*c == '\n' || *c == '\r')
 151       {
 152         *c = '\0';
 153         break;
 154       }
 155     }
 156   }
 157
 158   RETURN(success);
 159   return success;
 160 }
 161
 162 ///
 163 /// getConfigItem()
 164 static const char *getConfigItem(const char *buf, const char *item)
 165 {
 166   const char *configItem = NULL;
 167   int len;
 168
 169   ENTER();
 170
 171   len = strlen(item);
 172
 173   if(strnicmp(buf, item, len) == 0)
 174   {
 175     char c;
 176
 177     buf += len;
 178
 179     // skip spaces
 180     while((c = *buf) != '\0' && isspace(c))
 181       buf++;
 182
 183     if(*buf == '=')
 184     {
 185       buf++;
 186
 187       // skip spaces
 188       while((c = *buf) != '\0'  && isspace(c))
 189         buf++;
 190
 191       configItem = buf;
 192     }
 193   }
 194
 195   RETURN(configItem);
 196   return configItem;
 197 }
 198
 199 ///
 200 /// parseUtf8()
 201 static int parseUtf8(CONST_STRPTR *ps)
 202 {
 203   CONST_STRPTR s = *ps;
 204   int wc, n, i;
 205
 206   ENTER();
 207
 208   if(*s<0x80)
 209   {
 210     *ps = s+1;
 211
 212     RETURN(*s);
 213     return *s;
 214   }
 215
 216   if(*s<0xc2)
 217   {
 218     RETURN(-1);
 219     return -1;
 220   }
 221   else
 222   {
 223     if(*s<0xe0)
 224     {
 225       if((s[1] & 0xc0)!=0x80)
 226       {
 227         RETURN(-1);
 228         return -1;
 229       }
 230
 231       *ps = s+2;
 232
 233       RETURN(((s[0] & 0x1f)<<6) | (s[1] & 0x3f));
 234       return ((s[0] & 0x1f)<<6) | (s[1] & 0x3f);
 235     }
 236     else
 237     {
 238       if(*s<0xf0)
 239       {
 240         n = 3;
 241       }
 242       else
 243       {
 244         if(*s<0xf8)
 245         {
 246           n = 4;
 247         }
 248         else
 249         {
 250           if(*s<0xfc)
 251           {
 252             n = 5;
 253           }
 254           else
 255           {
 256             if(*s<0xfe)
 257             {
 258               n = 6;
 259             }
 260             else
 261             {
 262               RETURN(-1);
 263               return -1;
 264             }
 265           }
 266         }
 267       }
 268     }
 269   }
 270
 271   wc = *s++ & ((1<<(7-n))-1);
 272
 273   for(i = 1; i<n; i++)
 274   {
 275     if((*s & 0xc0) != 0x80)
 276     {
 277       RETURN(-1);
 278       return -1;
 279     }
 280
 281     wc = (wc << 6) | (*s++ & 0x3f);
 282   }
 283
 284   if(wc < (1 << (5 * n - 4)))
 285   {
 286     RETURN(-1);
 287     return -1;
 288   }
 289
 290   *ps = s;
 291
 292   RETURN(wc);
 293   return wc;
 294 }
 295
 296 ///
 297 /// countCodesets()
 298 static int countCodesets(struct codesetList *csList, BOOL allowMultibyte)
 299 {
 300   struct Node *node;
 301   int num = 0;
 302
 303   for(node = GetHead((struct List *)csList); node != NULL; node = GetSucc(node))
 304   {
 305     struct codeset *cs = (struct codeset *)node;
 306
 307     if(allowMultibyte == TRUE ||
 308        (cs != CodesetsBase->utf8Codeset && cs != CodesetsBase->utf16Codeset && cs != CodesetsBase->utf32Codeset))
 309     {
 310       num++;
 311     }
 312   }
 313
 314   return num;
 315 }
 316
 317 ///
 318 /// mapUTF8toASCII()
// In case some UTF8 sequences cannot be converted during CodesetsUTF8ToStrA(), this
// function is used to replace these unknown sequences with lookalike characters that
// still keep the text readable. For more replacements see
 322 // http://www.utf8-zeichentabelle.de/unicode-utf8-table.pl
 323 //
 324 // The conversion table in this function is partly borrowed from the awebcharset plugin
 325 // written by Frank Weber. See http://cvs.sunsite.dk/viewcvs.cgi/aweb/plugins/charset/awebcharset.c
 326 //
 327 struct UTF8Replacement
 328 {
 329   const char *utf8;     // the original UTF8 string we are going to replace
 330   const int utf8len;    // the length of the UTF8 string
 331   const char *rep;      // pointer to the replacement string
 332   const int replen;     // the length of the replacement string (minus for signalling an UTF8 string)
 333 };
 334
 335 static int compareUTF8Replacements(const void *p1, const void *p2)
 336 {
 337   struct UTF8Replacement *key = (struct UTF8Replacement *)p1;
 338   struct UTF8Replacement *rep = (struct UTF8Replacement *)p2;
 339   int cmp;
 340
 341   // compare the length first, after that compare the strings
 342   cmp = key->utf8len - rep->utf8len;
 343   if(cmp == 0)
 344     cmp = memcmp(key->utf8, rep->utf8, key->utf8len);
 345
 346   return cmp;
 347 }
 348
 349 static int mapUTF8toASCII(const char **dst, const unsigned char *src, const int utf8len)
 350 {
 351   int len = 0;
 352   struct UTF8Replacement key = { (char *)src, utf8len, NULL, 0 };
 353   struct UTF8Replacement *rep;
 354
 355   static struct UTF8Replacement const utf8map[] =
 356   {
 357     // U+0100 ... U+017F (Latin Extended-A)
 358     { "\xC4\x80", 2, "A",         1 }, // U+0100 -> A       (LATIN CAPITAL LETTER A WITH MACRON)
 359     { "\xC4\x81", 2, "a",         1 }, // U+0101 -> a       (LATIN SMALL LETTER A WITH MACRON)
 360     { "\xC4\x82", 2, "A",         1 }, // U+0102 -> A       (LATIN CAPITAL LETTER A WITH BREVE)
 361     { "\xC4\x83", 2, "a",         1 }, // U+0103 -> a       (LATIN SMALL LETTER A WITH BREVE)
 362     { "\xC4\x84", 2, "A",         1 }, // U+0104 -> A       (LATIN CAPITAL LETTER A WITH OGONEK)
 363     { "\xC4\x85", 2, "a",         1 }, // U+0105 -> a       (LATIN SMALL LETTER A WITH OGONEK)
 364     { "\xC4\x86", 2, "C",         1 }, // U+0106 -> C       (LATIN CAPITAL LETTER C WITH ACUTE)
 365     { "\xC4\x87", 2, "c",         1 }, // U+0107 -> c       (LATIN SMALL LETTER C WITH ACUTE)
 366     { "\xC4\x88", 2, "C",         1 }, // U+0108 -> C       (LATIN CAPITAL LETTER C WITH CIRCUMFLEX)
 367     { "\xC4\x89", 2, "c",         1 }, // U+0109 -> c       (LATIN SMALL LETTER C WITH CIRCUMFLEX)
 368     { "\xC4\x8A", 2, "C",         1 }, // U+010A -> C       (LATIN CAPITAL LETTER C WITH DOT ABOVE)
 369     { "\xC4\x8B", 2, "c",         1 }, // U+010B -> c       (LATIN SMALL LETTER C WITH DOT ABOVE)
 370     { "\xC4\x8C", 2, "C",         1 }, // U+010C -> C       (LATIN CAPITAL LETTER C WITH CARON)
 371     { "\xC4\x8D", 2, "c",         1 }, // U+010D -> c       (LATIN SMALL LETTER C WITH CARON)
 372     { "\xC4\x8E", 2, "D",         1 }, // U+010E -> D       (LATIN CAPITAL LETTER D WITH CARON)
 373     { "\xC4\x8F", 2, "d",         1 }, // U+010F -> d       (LATIN SMALL LETTER D WITH CARON)
 374     { "\xC4\x90", 2, "D",         1 }, // U+0110 -> D       (LATIN CAPITAL LETTER D WITH STROKE)
 375     { "\xC4\x91", 2, "d",         1 }, // U+0111 -> d       (LATIN SMALL LETTER D WITH STROKE)
 376     { "\xC4\x92", 2, "E",         1 }, // U+0112 -> E       (LATIN CAPITAL LETTER E WITH MACRON)
 377     { "\xC4\x93", 2, "e",         1 }, // U+0113 -> e       (LATIN SMALL LETTER E WITH MACRON)
 378     { "\xC4\x94", 2, "E",         1 }, // U+0114 -> E       (LATIN CAPITAL LETTER E WITH BREVE)
 379     { "\xC4\x95", 2, "e",         1 }, // U+0115 -> e       (LATIN SMALL LETTER E WITH BREVE)
 380     { "\xC4\x96", 2, "E",         1 }, // U+0116 -> E       (LATIN CAPITAL LETTER E WITH DOT ABOVE)
 381     { "\xC4\x97", 2, "e",         1 }, // U+0117 -> e       (LATIN SMALL LETTER E WITH DOT ABOVE)
 382     { "\xC4\x98", 2, "E",         1 }, // U+0118 -> E       (LATIN CAPITAL LETTER E WITH OGONEK)
 383     { "\xC4\x99", 2, "e",         1 }, // U+0119 -> e       (LATIN SMALL LETTER E WITH OGONEK)
 384     { "\xC4\x9A", 2, "E",         1 }, // U+011A -> E       (LATIN CAPITAL LETTER E WITH CARON)
 385     { "\xC4\x9B", 2, "e",         1 }, // U+011B -> e       (LATIN SMALL LETTER E WITH CARON)
 386     { "\xC4\x9C", 2, "G",         1 }, // U+011C -> G       (LATIN CAPITAL LETTER G WITH CIRCUMFLEX)
 387     { "\xC4\x9D", 2, "g",         1 }, // U+011D -> g       (LATIN SMALL LETTER G WITH CIRCUMFLEX)
 388     { "\xC4\x9E", 2, "G",         1 }, // U+011E -> G       (LATIN CAPITAL LETTER G WITH BREVE)
 389     { "\xC4\x9F", 2, "g",         1 }, // U+011F -> g       (LATIN SMALL LETTER G WITH BREVE)
 390     { "\xC4\xA0", 2, "G",         1 }, // U+0120 -> G       (LATIN CAPITAL LETTER G WITH DOT ABOVE)
 391     { "\xC4\xA1", 2, "g",         1 }, // U+0121 -> g       (LATIN SMALL LETTER G WITH DOT ABOVE)
 392     { "\xC4\xA2", 2, "G",         1 }, // U+0122 -> G       (LATIN CAPITAL LETTER G WITH CEDILLA)
 393     { "\xC4\xA3", 2, "g",         1 }, // U+0123 -> g       (LATIN SMALL LETTER G WITH CEDILLA)
 394     { "\xC4\xA4", 2, "H",         1 }, // U+0124 -> H       (LATIN CAPITAL LETTER H WITH CIRCUMFLEX)
 395     { "\xC4\xA5", 2, "h",         1 }, // U+0125 -> h       (LATIN SMALL LETTER H WITH CIRCUMFLEX)
 396     { "\xC4\xA6", 2, "H",         1 }, // U+0126 -> H       (LATIN CAPITAL LETTER H WITH STROKE)
 397     { "\xC4\xA7", 2, "h",         1 }, // U+0127 -> h       (LATIN SMALL LETTER H WITH STROKE)
 398     { "\xC4\xA8", 2, "I",         1 }, // U+0128 -> I       (LATIN CAPITAL LETTER I WITH TILDE)
 399     { "\xC4\xA9", 2, "i",         1 }, // U+0129 -> i       (LATIN SMALL LETTER I WITH TILDE)
 400     { "\xC4\xAA", 2, "I",         1 }, // U+012A -> I       (LATIN CAPITAL LETTER I WITH MACRON)
 401     { "\xC4\xAB", 2, "i",         1 }, // U+012B -> i       (LATIN SMALL LETTER I WITH MACRON)
 402     { "\xC4\xAC", 2, "I",         1 }, // U+012C -> I       (LATIN CAPITAL LETTER I WITH BREVE)
 403     { "\xC4\xAD", 2, "i",         1 }, // U+012D -> i       (LATIN SMALL LETTER I WITH BREVE)
 404     { "\xC4\xAE", 2, "I",         1 }, // U+012E -> I       (LATIN CAPITAL LETTER I WITH OGONEK)
 405     { "\xC4\xAF", 2, "i",         1 }, // U+012F -> i       (LATIN SMALL LETTER I WITH OGONEK)
 406     { "\xC4\xB0", 2, "I",         1 }, // U+0130 -> I       (LATIN CAPITAL LETTER I WITH DOT ABOVE)
 407     { "\xC4\xB1", 2, "i",         1 }, // U+0131 -> i       (LATIN SMALL LETTER DOTLESS I)
 408     { "\xC4\xB2", 2, "Ij",        2 }, // U+0132 -> Ij      (LATIN CAPITAL LIGATURE IJ)
 409     { "\xC4\xB3", 2, "ij",        2 }, // U+0133 -> ij      (LATIN SMALL LIGATURE IJ)
 410     { "\xC4\xB4", 2, "J",         1 }, // U+0134 -> J       (LATIN CAPITAL LETTER J WITH CIRCUMFLEX)
 411     { "\xC4\xB5", 2, "j",         1 }, // U+0135 -> j       (LATIN SMALL LETTER J WITH CIRCUMFLEX)
 412     { "\xC4\xB6", 2, "K",         1 }, // U+0136 -> K       (LATIN CAPITAL LETTER K WITH CEDILLA)
 413     { "\xC4\xB7", 2, "k",         1 }, // U+0137 -> k       (LATIN SMALL LETTER K WITH CEDILLA)
 414     { "\xC4\xB8", 2, "k",         1 }, // U+0138 -> k       (LATIN SMALL LETTER KRA)
 415     { "\xC4\xB9", 2, "L",         1 }, // U+0139 -> L       (LATIN CAPITAL LETTER L WITH ACUTE)
 416     { "\xC4\xBA", 2, "l",         1 }, // U+013A -> l       (LATIN SMALL LETTER L WITH ACUTE)
 417     { "\xC4\xBB", 2, "L",         1 }, // U+013B -> L       (LATIN CAPITAL LETTER L WITH CEDILLA)
 418     { "\xC4\xBC", 2, "l",         1 }, // U+013C -> l       (LATIN SMALL LETTER L WITH CEDILLA)
 419     { "\xC4\xBD", 2, "L",         1 }, // U+013D -> L       (LATIN CAPITAL LETTER L WITH CARON)
 420     { "\xC4\xBE", 2, "l",         1 }, // U+013E -> l       (LATIN SMALL LETTER L WITH CARON)
 421     { "\xC4\xBF", 2, "L",         1 }, // U+013F -> L       (LATIN CAPITAL LETTER L WITH MIDDLE DOT)
 422     { "\xC5\x80", 2, "l",         1 }, // U+0140 -> l       (LATIN SMALL LETTER L WITH MIDDLE DOT)
 423     { "\xC5\x81", 2, "L",         1 }, // U+0141 -> L       (LATIN CAPITAL LETTER L WITH STROKE)
 424     { "\xC5\x82", 2, "l",         1 }, // U+0142 -> l       (LATIN SMALL LETTER L WITH STROKE)
 425     { "\xC5\x83", 2, "N",         1 }, // U+0143 -> N       (LATIN CAPITAL LETTER N WITH ACUTE)
 426     { "\xC5\x84", 2, "n",         1 }, // U+0144 -> n       (LATIN SMALL LETTER N WITH ACUTE)
 427     { "\xC5\x85", 2, "N",         1 }, // U+0145 -> N       (LATIN CAPITAL LETTER N WITH CEDILLA)
 428     { "\xC5\x86", 2, "n",         1 }, // U+0146 -> n       (LATIN SMALL LETTER N WITH CEDILLA)
 429     { "\xC5\x87", 2, "N",         1 }, // U+0147 -> N       (LATIN CAPITAL LETTER N WITH CARON)
 430     { "\xC5\x88", 2, "n",         1 }, // U+0148 -> n       (LATIN SMALL LETTER N WITH CARON)
 431     { "\xC5\x89", 2, "'n",        2 }, // U+0149 -> 'n      (LATIN SMALL LETTER N PRECEDED BY APOSTROPHE)
 432     { "\xC5\x8A", 2, "Ng",        2 }, // U+014A -> Ng      (LATIN CAPITAL LETTER ENG)
 433     { "\xC5\x8B", 2, "ng",        2 }, // U+014B -> ng      (LATIN SMALL LETTER ENG)
 434     { "\xC5\x8C", 2, "O",         1 }, // U+014C -> O       (LATIN CAPITAL LETTER O WITH MACRON)
 435     { "\xC5\x8D", 2, "o",         1 }, // U+014D -> o       (LATIN SMALL LETTER O WITH MACRON)
 436     { "\xC5\x8E", 2, "O",         1 }, // U+014E -> O       (LATIN CAPITAL LETTER O WITH BREVE)
 437     { "\xC5\x8F", 2, "o",         1 }, // U+014F -> o       (LATIN SMALL LETTER O WITH BREVE)
 438     { "\xC5\x90", 2, "O",         1 }, // U+0150 -> O       (LATIN CAPITAL LETTER O WITH DOUBLE ACUTE)
 439     { "\xC5\x91", 2, "o",         1 }, // U+0151 -> o       (LATIN SMALL LETTER O WITH DOUBLE ACUTE)
 440     { "\xC5\x92", 2, "Oe",        2 }, // U+0152 -> Oe      (LATIN CAPITAL LIGATURE OE)
 441     { "\xC5\x93", 2, "oe",        2 }, // U+0153 -> oe      (LATIN SMALL LIGATURE OE)
 442     { "\xC5\x94", 2, "R",         1 }, // U+0154 -> R       (LATIN CAPITAL LETTER R WITH ACUTE)
 443     { "\xC5\x95", 2, "r",         1 }, // U+0155 -> r       (LATIN SMALL LETTER R WITH ACUTE)
 444     { "\xC5\x96", 2, "R",         1 }, // U+0156 -> R       (LATIN CAPITAL LETTER R WITH CEDILLA)
 445     { "\xC5\x97", 2, "r",         1 }, // U+0157 -> r       (LATIN SMALL LETTER R WITH CEDILLA)
 446     { "\xC5\x98", 2, "R",         1 }, // U+0158 -> R       (LATIN CAPITAL LETTER R WITH CARON)
 447     { "\xC5\x99", 2, "r",         1 }, // U+0159 -> r       (LATIN SMALL LETTER R WITH CARON)
 448     { "\xC5\x9A", 2, "S",         1 }, // U+015A -> S       (LATIN CAPITAL LETTER S WITH ACUTE)
 449     { "\xC5\x9B", 2, "s",         1 }, // U+015B -> s       (LATIN SMALL LETTER S WITH ACUTE)
 450     { "\xC5\x9C", 2, "S",         1 }, // U+015C -> S       (LATIN CAPITAL LETTER S WITH CIRCUMFLEX)
 451     { "\xC5\x9D", 2, "s",         1 }, // U+015D -> s       (LATIN SMALL LETTER S WITH CIRCUMFLEX)
 452     { "\xC5\x9E", 2, "S",         1 }, // U+015E -> S       (LATIN CAPITAL LETTER S WITH CEDILLA)
 453     { "\xC5\x9F", 2, "s",         1 }, // U+015F -> s       (LATIN SMALL LETTER S WITH CEDILLA)
 454     { "\xC5\xA0", 2, "S",         1 }, // U+0160 -> S       (LATIN CAPITAL LETTER S WITH CARON)
 455     { "\xC5\xA1", 2, "s",         1 }, // U+0161 -> s       (LATIN SMALL LETTER S WITH CARON)
 456     { "\xC5\xA2", 2, "T",         1 }, // U+0162 -> T       (LATIN CAPITAL LETTER T WITH CEDILLA)
 457     { "\xC5\xA3", 2, "t",         1 }, // U+0163 -> t       (LATIN SMALL LETTER T WITH CEDILLA)
 458     { "\xC5\xA4", 2, "T",         1 }, // U+0164 -> T       (LATIN CAPITAL LETTER T WITH CARON)
 459     { "\xC5\xA5", 2, "t",         1 }, // U+0165 -> t       (LATIN SMALL LETTER T WITH CARON)
 460     { "\xC5\xA6", 2, "T",         1 }, // U+0166 -> T       (LATIN CAPITAL LETTER T WITH STROKE)
 461     { "\xC5\xA7", 2, "t",         1 }, // U+0167 -> t       (LATIN SMALL LETTER T WITH STROKE)
 462     { "\xC5\xA8", 2, "U",         1 }, // U+0168 -> U       (LATIN CAPITAL LETTER U WITH TILDE)
 463     { "\xC5\xA9", 2, "u",         1 }, // U+0169 -> u       (LATIN SMALL LETTER U WITH TILDE)
 464     { "\xC5\xAA", 2, "U",         1 }, // U+016A -> U       (LATIN CAPITAL LETTER U WITH MACRON)
 465     { "\xC5\xAB", 2, "u",         1 }, // U+016B -> u       (LATIN SMALL LETTER U WITH MACRON)
 466     { "\xC5\xAC", 2, "U",         1 }, // U+016C -> U       (LATIN CAPITAL LETTER U WITH BREVE)
 467     { "\xC5\xAD", 2, "u",         1 }, // U+016D -> u       (LATIN SMALL LETTER U WITH BREVE)
 468     { "\xC5\xAE", 2, "U",         1 }, // U+016E -> U       (LATIN CAPITAL LETTER U WITH RING ABOVE)
 469     { "\xC5\xAF", 2, "u",         1 }, // U+016F -> u       (LATIN SMALL LETTER U WITH RING ABOVE)
 470     { "\xC5\xB0", 2, "U",         1 }, // U+0170 -> U       (LATIN CAPITAL LETTER U WITH DOUBLE ACUTE)
 471     { "\xC5\xB1", 2, "u",         1 }, // U+0171 -> u       (LATIN SMALL LETTER U WITH DOUBLE ACUTE)
 472     { "\xC5\xB2", 2, "U",         1 }, // U+0172 -> U       (LATIN CAPITAL LETTER U WITH OGONEK)
 473     { "\xC5\xB3", 2, "u",         1 }, // U+0173 -> u       (LATIN SMALL LETTER U WITH OGONEK)
 474     { "\xC5\xB4", 2, "W",         1 }, // U+0174 -> W       (LATIN CAPITAL LETTER W WITH CIRCUMFLEX)
 475     { "\xC5\xB5", 2, "w",         1 }, // U+0175 -> w       (LATIN SMALL LETTER W WITH CIRCUMFLEX)
 476     { "\xC5\xB6", 2, "Y",         1 }, // U+0176 -> Y       (LATIN CAPITAL LETTER Y WITH CIRCUMFLEX)
 477     { "\xC5\xB7", 2, "y",         1 }, // U+0177 -> y       (LATIN SMALL LETTER Y WITH CIRCUMFLEX)
 478     { "\xC5\xB8", 2, "Y",         1 }, // U+0178 -> Y       (LATIN CAPITAL LETTER Y WITH DIAERESIS)
 479     { "\xC5\xB9", 2, "Z",         1 }, // U+0179 -> Z       (LATIN CAPITAL LETTER Z WITH ACUTE)
 480     { "\xC5\xBA", 2, "z",         1 }, // U+017A -> z       (LATIN SMALL LETTER Z WITH ACUTE)
 481     { "\xC5\xBB", 2, "Z",         1 }, // U+017B -> Z       (LATIN CAPITAL LETTER Z WITH DOT ABOVE)
 482     { "\xC5\xBC", 2, "z",         1 }, // U+017C -> z       (LATIN SMALL LETTER Z WITH DOT ABOVE)
 483     { "\xC5\xBD", 2, "Z",         1 }, // U+017D -> Z       (LATIN CAPITAL LETTER Z WITH CARON)
 484     { "\xC5\xBE", 2, "z",         1 }, // U+017E -> z       (LATIN SMALL LETTER Z WITH CARON)
 485     { "\xC5\xBF", 2, "s",         1 }, // U+017F -> s       (LATIN SMALL LETTER LONG S
 486
 487     // U+2000 ... U+206F (General Punctuation)
 488     { "\xE2\x80\x90", 3, "-",         1 }, // U+2010 -> -       (HYPHEN)
 489     { "\xE2\x80\x91", 3, "-",         1 }, // U+2011 -> -       (NON-BREAKING HYPHEN)
 490     { "\xE2\x80\x92", 3, "--",        2 }, // U+2012 -> --      (FIGURE DASH)
 491     { "\xE2\x80\x93", 3, "--",        2 }, // U+2013 -> --      (EN DASH)
 492     { "\xE2\x80\x94", 3, "---",       3 }, // U+2014 -> ---     (EM DASH)
 493     { "\xE2\x80\x95", 3, "---",       3 }, // U+2015 -> ---     (HORIZONTAL BAR)
 494     { "\xE2\x80\x96", 3, "||",        2 }, // U+2016 -> ||      (DOUBLE VERTICAL LINE)
 495     { "\xE2\x80\x97", 3, "_",         1 }, // U+2017 -> _       (DOUBLE LOW LINE)
 496     { "\xE2\x80\x98", 3, "`",         1 }, // U+2018 -> `       (LEFT SINGLE QUOTATION MARK)
 497     { "\xE2\x80\x99", 3, "'",         1 }, // U+2019 -> '       (RIGHT SINGLE QUOTATION MARK)
 498     { "\xE2\x80\x9A", 3, ",",         1 }, // U+201A -> ,       (SINGLE LOW-9 QUOTATION MARK)
 499     { "\xE2\x80\x9B", 3, "'",         1 }, // U+201B -> '       (SINGLE HIGH-REVERSED-9 QUOTATION MARK)
 500     { "\xE2\x80\x9C", 3, "\"",        1 }, // U+201C -> "       (LEFT DOUBLE QUOTATION MARK)
 501     { "\xE2\x80\x9D", 3, "\"",        1 }, // U+201D -> "       (RIGHT DOUBLE QUOTATION MARK)
 502     { "\xE2\x80\x9E", 3, ",,",        2 }, // U+201E -> ,,      (DOUBLE LOW-9 QUOTATION MARK)
 503     { "\xE2\x80\x9F", 3, "``",        2 }, // U+201F -> ``      (DOUBLE HIGH-REVERSED-9 QUOTATION MARK)
 504     { "\xE2\x80\xA0", 3, "+",         1 }, // U+2020 -> +       (DAGGER)
 505     { "\xE2\x80\xA1", 3, "+",         1 }, // U+2021 -> +       (DOUBLE DAGGER)
 506     { "\xE2\x80\xA2", 3, "\xC2\xB7", -2 }, // U+2022 -> U+00B7  (BULLET) -> (MIDDLE POINT)
 507     { "\xE2\x80\xA3", 3, ".",         1 }, // U+2023 -> .       (TRIANGULAR BULLET)
 508     { "\xE2\x80\xA4", 3, ".",         1 }, // U+2024 -> .       (ONE DOT LEADER)
 509     { "\xE2\x80\xA5", 3, "..",        2 }, // U+2025 -> ..      (TWO DOT LEADER)
 510     { "\xE2\x80\xA6", 3, "...",       3 }, // U+2026 -> ...     (HORIZONTAL ELLIPSIS)
 511     { "\xE2\x80\xA7", 3, "\xC2\xB7", -2 }, // U+2027 -> U+00B7  (HYPHENATION POINT) -> (MIDDLE POINT)
 512     { "\xE2\x80\xB0", 3, "%.",        2 }, // U+2030 -> %.      (PER MILLE SIGN)
 513     { "\xE2\x80\xB1", 3, "%..",       3 }, // U+2031 -> %..     (PER TEN THOUSAND SIGN)
 514     { "\xE2\x80\xB2", 3, "'",         1 }, // U+2032 -> `       (PRIME)
 515     { "\xE2\x80\xB3", 3, "''",        2 }, // U+2033 -> ''      (DOUBLE PRIME)
 516     { "\xE2\x80\xB4", 3, "'''",       3 }, // U+2034 -> '''     (TRIPLE PRIME)
 517     { "\xE2\x80\xB5", 3, "`",         1 }, // U+2035 -> `       (REVERSED PRIME)
 518     { "\xE2\x80\xB6", 3, "``",        2 }, // U+2036 -> ``      (REVERSED DOUBLE PRIME)
 519     { "\xE2\x80\xB7", 3, "```",       3 }, // U+2037 -> ```     (REVERSED TRIPLE PRIME)
 520     { "\xE2\x80\xB8", 3, "^",         1 }, // U+2038 -> ^       (CARET)
 521     { "\xE2\x80\xB9", 3, "<",         1 }, // U+2039 -> <       (SINGLE LEFT-POINTING ANGLE QUOTATION MARK)
 522     { "\xE2\x80\xBA", 3, ">",         1 }, // U+203A -> >       (SINGLE RIGHT-POINTING ANGLE QUOTATION MARK)
 523     { "\xE2\x80\xBB", 3, "\xC3\x97", -2 }, // U+203B -> U+00D7  (REFERENCE MARK) -> (MULTIPLICATION SIGN)
 524     { "\xE2\x80\xBC", 3, "!!",        2 }, // U+203C -> !!      (DOUBLE EXCLAMATION MARK)
 525     { "\xE2\x80\xBD", 3, "?",         1 }, // U+203D -> ?       (INTERROBANG)
 526     { "\xE2\x81\x82", 3, "*",         1 }, // U+2042 -> *       (ASTERISM)
 527     { "\xE2\x81\x83", 3, ".",         1 }, // U+2043 -> .       (HYPHEN BULLET)
 528     { "\xE2\x81\x84", 3, "/",         1 }, // U+2044 -> /       (FRACTION SLASH)
 529     { "\xE2\x81\x87", 3, "??",        2 }, // U+2047 -> ??      (DOUBLE QUESTION MARK)
 530     { "\xE2\x81\x88", 3, "?!",        2 }, // U+2048 -> ?!      (QUESTION EXCLAMATION MARK)
 531     { "\xE2\x81\x89", 3, "!?",        2 }, // U+2049 -> !?      (EXCLAMATION QUESTION MARK)
 532     { "\xE2\x81\x8E", 3, "*",         1 }, // U+204E -> *       (LOW ASTERISK)
 533     { "\xE2\x81\x8F", 3, ";",         1 }, // U+204F -> ;       (REVERSED SEMICOLON)
 534     { "\xE2\x81\x91", 3, "*",         1 }, // U+2051 -> *       (TWO ASTERISKS ALIGNED VERTICALLY)
 535     { "\xE2\x81\x92", 3, "-",         1 }, // U+2052 -> -       (COMMERCIAL MINUS SIGN)
 536     { "\xE2\x81\x93", 3, "~",         1 }, // U+2053 -> ~       (SWUNG DASH)
 537     { "\xE2\x81\x95", 3, "*",         1 }, // U+2055 -> *       (FLOWER PUNCTUATION MARK)
 538     { "\xE2\x81\x97", 3, "''''",      4 }, // U+2057 -> ''''    (QUADRUPLE PRIME)
 539     { "\xE2\x81\x9A", 3, ":",         1 }, // U+205A -> :       (TWO DOT PUNCTUATION)
 540     { "\xE2\x81\x9C", 3, "+",         1 }, // U+205C -> +       (DOTTED CROSS)
 541
 542     // U+20A0 ... U+20CF (Currency Symbols)
 543     { "\xE2\x82\xA0", 3, "ECU",       3 }, // U+20A0 -> ECU     (EURO-CURRENCY SIGN)
 544     { "\xE2\x82\xA1", 3, "CRC",       3 }, // U+20A1 -> CRC     (COLON SIGN)
 545     { "\xE2\x82\xA2", 3, "BRC",       3 }, // U+20A2 -> BRC     (CRUZEIRO SIGN)
 546     { "\xE2\x82\xA3", 3, "BEF",       3 }, // U+20A3 -> BEF     (FRENCH FRANC SIGN)
 547     { "\xE2\x82\xA4", 3, "ITL",       3 }, // U+20A4 -> ITL     (LIRA SIGN)
 548     { "\xE2\x82\xA6", 3, "NGN",       3 }, // U+20A6 -> NGN     (NEIRA SIGN)
 549     { "\xE2\x82\xA7", 3, "ESP",       3 }, // U+20A7 -> ESP     (PESETA SIGN)
 550     { "\xE2\x82\xA8", 3, "MVQ",       3 }, // U+20A8 -> MVQ     (RUPEE SIGN)
 551     { "\xE2\x82\xA9", 3, "KPW",       3 }, // U+20A9 -> KPW     (WON SIGN)
 552     { "\xE2\x82\xAA", 3, "ILS",       3 }, // U+20AA -> ILS     (NEW SHEQEL SIGN)
 553     { "\xE2\x82\xAB", 3, "VNC",       3 }, // U+20AB -> VNC     (DONG SIGN)
 554     { "\xE2\x82\xAC", 3, "EUR",       3 }, // U+20AC -> EUR     (EURO SIGN)
 555     { "\xE2\x82\xAD", 3, "LAK",       3 }, // U+20AD -> LAK     (KIP SIGN)
 556     { "\xE2\x82\xAE", 3, "MNT",       3 }, // U+20AE -> MNT     (TUGRIK SIGN)
 557     { "\xE2\x82\xAF", 3, "GRD",       3 }, // U+20AF -> GRD     (DRACHMA SIGN)
 558     { "\xE2\x82\xB0", 3, "Pf",        2 }, // U+20B0 -> Pf      (GERMAN PENNY SIGN)
 559     { "\xE2\x82\xB1", 3, "P",         1 }, // U+20B1 -> P       (PESO SIGN)
 560     { "\xE2\x82\xB2", 3, "PYG",       3 }, // U+20B2 -> PYG     (GUARANI SIGN)
 561     { "\xE2\x82\xB3", 3, "ARA",       3 }, // U+20B3 -> ARA     (AUSTRAL SIGN)
 562     { "\xE2\x82\xB4", 3, "UAH",       3 }, // U+20B4 -> UAH     (HRYVNIA SIGN)
 563     { "\xE2\x82\xB5", 3, "GHS",       3 }, // U+20B5 -> GHS     (CEDI SIGN)
 564
 565     // U+2190 ... U+21FF (Arrows)
 566     { "\xE2\x86\x90", 3, "<-",        2 }, // U+2190 -> <-      (LEFTWARDS ARROW)
 567     { "\xE2\x86\x92", 3, "->",        2 }, // U+2192 -> ->      (RIGHTWARDS ARROW)
 568   };
 569
 570   ENTER();
 571
 572   // start with no replacement string
 573   *dst = NULL;
 574
 575   // perform a binary search in the lookup table
 576   if((rep = bsearch(&key, utf8map, sizeof(utf8map) / sizeof(utf8map[0]), sizeof(utf8map[0]), compareUTF8Replacements)) != NULL)
 577   {
 578     // if we found something, then copy this over to the result variables
 579     *dst = rep->rep;
 580     len = rep->replen;
 581   }
 582
 583   RETURN(len);
 584   return len;
 585 }
 586
 587 ///
 588 /// matchCodesetAlias()
 589 //
 590 struct CodesetAliases
 591 {
 592   const char *MIMEname;   // The official and correct MIME name for a codeset
 593   const char *Aliases;    // A space separated array with well-known aliases
 594 };
 595
 596 const struct CodesetAliases codesetAliases[] =
 597 {
 598   // MIME name       Aliases
 599   { "Amiga-1251",   "Ami1251 Amiga1251"  },
 600   { "AmigaPL",      "AmiPL Amiga-PL"     },
 601   { "ISO-8859-1",   "ISO8859-1 8859-1" },
 602   { "ISO-8859-2",   "ISO8859-2 8859-2" },
 603   { "ISO-8859-3",   "ISO8859-3 8859-3" },
 604   { "ISO-8859-4",   "ISO8859-4 8859-4" },
 605   { "ISO-8859-5",   "ISO8859-5 8859-5" },
 606   { "ISO-8859-6",   "ISO8859-6 8859-6" },
 607   { "ISO-8859-7",   "ISO8859-7 8859-7" },
 608   { "ISO-8859-8",   "ISO8859-8 8859-8" },
 609   { "ISO-8859-9",   "ISO8859-9 8859-9" },
 610   { "ISO-8859-10",  "ISO8859-10 8859-10" },
 611   { "ISO-8859-11",  "ISO8859-11 8859-11" },
 612   { "ISO-8859-12",  "ISO8859-12 8859-12" },
 613   { "ISO-8859-13",  "ISO8859-13 8859-13" },
 614   { "ISO-8859-14",  "ISO8859-14 8859-14" },
 615   { "ISO-8859-15",  "ISO8859-15 8859-15" },
 616   { "ISO-8859-16",  "ISO8859-16 8859-16" },
 617   { "ISO-8859-10",  "ISO8859-10 8859-10" },
 618   { "KOI8-R",       "KOI8R" },
 619   { "US-ASCII",     "ASCII" },
 620   { "UTF-8",        "UTF8 UTF" },
 621   { "UTF-16",       "UTF16" },
 622   { "UTF-32",       "UTF32" },
 623   { "windows-1250", "cp1250 windows1250" },
 624   { "windows-1251", "cp1251 windows1251" },
 625   { "windows-1252", "cp1252 windows1252" },
 626   { "windows-1253", "cp1253 windows1253" },
 627   { "windows-1254", "cp1254 windows1254" },
 628   { "windows-1255", "cp1255 windows1255" },
 629   { "windows-1256", "cp1256 windows1256" },
 630   { "windows-1257", "cp1257 windows1257" },
 631   { NULL,           NULL,                }
 632 };
 633
 634 static const char *matchCodesetAlias(const char *search)
 635 {
 636   const char *result = NULL;
 637   size_t len = strlen(search);
 638   int i;
 639
 640   ENTER();
 641
 642   for(i=0; codesetAliases[i].MIMEname != NULL; i++)
 643   {
 644     BOOL found = FALSE;
 645
 646     // search the MIMEname first
 647     if(stricmp(search, codesetAliases[i].MIMEname) == 0)
 648       found = TRUE;
 649     else
 650     {
 651       const char *s = codesetAliases[i].Aliases;
 652
 653       // loop through space separated list of aliases
 654       while(s != NULL && *s != '\0')
 655       {
 656         if(strnicmp(search, s, len) == 0)
 657         {
 658           found = TRUE;
 659           break;
 660         }
 661
 662         if((s = strpbrk(s, " ")) != NULL)
 663           s++;
 664       }
 665     }
 666
 667     if(found == TRUE)
 668     {
 669       result = codesetAliases[i].MIMEname;
 670
 671       break;
 672     }
 673   }
 674
 675   RETURN(result);
 676   return result;
 677 }
 678
 679 ///
 680
 681 /**************************************************************************/
 682
 683 /// defaultCodeset()
 684 static struct codeset *defaultCodeset(BOOL useSemaphore)
 685 {
 686   char buf[256];
 687   struct codeset *codeset;
 688
 689   ENTER();
 690
 691   if(useSemaphore == TRUE)
 692     ObtainSemaphoreShared(&CodesetsBase->libSem);
 693
 694   buf[0] = '\0';
 695   GetVar("codeset_default" ,buf, sizeof(buf), GVF_GLOBAL_ONLY);
 696
 697   if(buf[0] == '\0' || (codeset = codesetsFind(&CodesetsBase->codesets, buf)) == NULL)
 698     codeset = CodesetsBase->systemCodeset;
 699
 700   if(useSemaphore == TRUE)
 701     ReleaseSemaphore(&CodesetsBase->libSem);
 702
 703   RETURN(codeset);
 704   return codeset;
 705 }
 706
 707 ///
 708 /// codesetsCmpUnicode()
 709 // The compare function
 710 static int codesetsCmpUnicode(const void *a1, const void *a2)
 711 {
 712   struct single_convert *arg1 = (struct single_convert *)a1;
 713   struct single_convert *arg2 = (struct single_convert *)a2;
 714
 715   return strcmp((char*)&arg1->utf8[1], (char*)&arg2->utf8[1]);
 716 }
 717
 718 ///
 719 /// codesetsReadTable()
 720
 721 #define ITEM_STANDARD           "Standard"
 722 #define ITEM_ALTSTANDARD        "AltStandard"
 723 #define ITEM_READONLY           "ReadOnly"
 724 #define ITEM_CHARACTERIZATION   "Characterization"
 725
 726 // Reads a coding table and adds it
 727 static BOOL codesetsReadTable(struct codesetList *csList, STRPTR name)
 728 {
 729   BPTR fh;
 730   BOOL res = FALSE;
 731
 732   ENTER();
 733
 734   D(DBF_STARTUP, "trying to read charset file '%s'...", name);
 735
 736   if((fh = Open(name, MODE_OLDFILE)) != (BPTR)NULL)
 737   {
 738     struct codeset *codeset;
 739
 740     if((codeset = (struct codeset *)allocArbitrateVecPooled(sizeof(*codeset))) != NULL)
 741     {
 742       int i;
 743       char buf[512];
 744
 745       memset(codeset, 0, sizeof(*codeset));
 746
 747       for(i = 0; i<256; i++)
 748       {
 749         codeset->table[i].code = i;
 750         codeset->table[i].ucs4 = i;
 751       }
 752
 753       while(readLine(fh, buf, sizeof(buf)) == TRUE)
 754       {
 755         const char *result;
 756
 757         if(buf[0] != '#')
 758         {
 759           if((result = getConfigItem(buf, ITEM_STANDARD)) != NULL)
 760             codeset->name = mystrdup(result);
 761           else if(codeset->name == NULL) // a valid file starts with "Standard" and nothing else!!
 762             break;
 763           else if((result = getConfigItem(buf, ITEM_ALTSTANDARD)) != NULL)
 764             codeset->alt_name = mystrdup(result);
 765           else if((result = getConfigItem(buf, ITEM_READONLY)) != NULL)
 766             codeset->read_only = (atoi(result) == 0) ? 0 : 1;
 767           else if((result = getConfigItem(buf, ITEM_CHARACTERIZATION)) != NULL)
 768           {
 769             if(result[0] == '_' && result[1] == '(' && result[2] == '"')
 770             {
 771               char *end = strchr(result + 3, '"');
 772
 773               if(end != NULL)
 774                 codeset->characterization = mystrndup(result+3, end-(result+3));
 775             }
 776             else
 777               codeset->characterization = mystrdup(result);
 778           }
 779           else
 780           {
 781             char *p = buf;
 782             int fmt2 = 0;
 783
 784             if(*p == '=' || (fmt2 = ((*p=='0') || (*(p+1)=='x'))))
 785             {
 786               p++;
 787               p += fmt2;
 788
 789               i = strtol(p, &p, 16);
 790               if(i>0 && i<256)
 791               {
 792                 while(isspace(*p))
 793                   p++;
 794
 795                 if(strnicmp(p, "U+", 2) == 0)
 796                 {
 797                   p += 2;
 798                   codeset->table[i].ucs4 = strtol(p, &p, 16);
 799                 }
 800                 else if(*p != '#')
 801                 {
 802                   codeset->table[i].ucs4 = strtol(p, &p, 0);
 803                 }
 804               }
 805             }
 806           }
 807         }
 808       }
 809
 810       // check if there is not already codeset with the same name in here
 811       if(codeset->name != NULL && codesetsFind(csList, codeset->name) == NULL)
 812       {
 813         for(i=0; i<256; i++)
 814         {
 815           UTF32 src = codeset->table[i].ucs4;
 816           UTF32 *src_ptr = &src;
 817           UTF8 *dest_ptr = &codeset->table[i].utf8[1];
 818
 819           CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr, src_ptr+1, &dest_ptr, dest_ptr+6, CSF_StrictConversion);
 820           *dest_ptr = 0;
 821           codeset->table[i].utf8[0] = (IPTR)dest_ptr-(IPTR)(&codeset->table[i].utf8[1]);
 822         }
 823
 824         memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
 825         qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
 826         D(DBF_STARTUP, "adding external codeset '%s'", codeset->name);
 827         AddTail((struct List *)csList, (struct Node *)&codeset->node);
 828
 829         res = TRUE;
 830       }
 831       else
 832       {
 833         // cleanup
 834         if(codeset->name != NULL)
 835           freeArbitrateVecPooled(codeset->name);
 836         if(codeset->alt_name != NULL)
 837           freeArbitrateVecPooled(codeset->alt_name);
 838         if(codeset->characterization != NULL)
 839           freeArbitrateVecPooled(codeset->characterization);
 840         freeArbitrateVecPooled(codeset);
 841       }
 842     }
 843
 844     Close(fh);
 845   }
 846
 847   RETURN(res);
 848   return res;
 849 }
 850 ///
 851 /// codesetsScanDir()
 852 static void codesetsScanDir(struct codesetList *csList, const char *dirPath)
 853 {
 854   ENTER();
 855
 856   if(dirPath != NULL && dirPath[0] != '\0')
 857   {
 858     #if defined(__amigaos4__)
 859     APTR dirContext;
 860
 861     if((dirContext = ObtainDirContextTags(EX_StringNameInput, dirPath,
 862                                           EX_DataFields,      EXF_NAME|EXF_TYPE,
 863                                           TAG_END)) != NULL)
 864     {
 865       struct ExamineData *exd;
 866
 867       D(DBF_STARTUP, "scanning directory '%s' for codesets tables", dirPath);
 868
 869       while((exd = ExamineDir(dirContext)) != NULL)
 870       {
 871         if(EXD_IS_FILE(exd))
 872         {
 873           char filePath[620];
 874
 875           strlcpy(filePath, dirPath, sizeof(filePath));
 876           AddPart(filePath, exd->Name, sizeof(filePath));
 877
 878           D(DBF_STARTUP, "about to read codeset table '%s'", filePath);
 879
 880           codesetsReadTable(csList, filePath);
 881         }
 882       }
 883
 884       ReleaseDirContext(dirContext);
 885     }
 886     #else
 887     BPTR dirLock;
 888
 889     if((dirLock = Lock(dirPath, ACCESS_READ)))
 890     {
 891       struct ExAllControl *eac;
 892
 893       D(DBF_STARTUP, "scanning directory '%s' for codesets tables", dirPath);
 894
 895       if((eac = AllocDosObject(DOS_EXALLCONTROL, NULL)) != NULL)
 896       {
 897         struct ExAllData *ead;
 898         struct ExAllData *eabuffer;
 899         LONG more;
 900
 901         eac->eac_LastKey = 0;
 902         eac->eac_MatchString = NULL;
 903         eac->eac_MatchFunc = NULL;
 904
 905         if((eabuffer = allocVecPooled(CodesetsBase->pool, 10*sizeof(struct ExAllData))) != NULL)
 906         {
 907           char filePath[620];
 908
 909           do
 910           {
 911             more = ExAll(dirLock, eabuffer, 10*sizeof(struct ExAllData), ED_TYPE, eac);
 912             if(!more && IoErr() != ERROR_NO_MORE_ENTRIES)
 913               break;
 914
 915             if(eac->eac_Entries == 0)
 916               continue;
 917
 918             ead = (struct ExAllData *)eabuffer;
 919             do
 920             {
 921               // we only take that ead if it is a file (ed_Type < 0)
 922               if(ead->ed_Type < 0)
 923               {
 924                 strlcpy(filePath, dirPath, sizeof(filePath));
 925                 AddPart(filePath, (char *)ead->ed_Name, sizeof(filePath));
 926
 927                 D(DBF_STARTUP, "about to read codeset table '%s'", filePath);
 928
 929                 codesetsReadTable(csList, filePath);
 930               }
 931               ead = ead->ed_Next;
 932             }
 933             while(ead != NULL);
 934           }
 935           while(more);
 936
 937           freeVecPooled(CodesetsBase->pool, eabuffer);
 938         }
 939
 940         FreeDosObject(DOS_EXALLCONTROL, eac);
 941       }
 942
 943       UnLock(dirLock);
 944     }
 945     #endif
 946   }
 947
 948   LEAVE();
 949 }
 950
 951 ///
 952 /// codesetsInit()
 953 // Initialized and loads the codesets
 954 BOOL codesetsInit(struct codesetList *csList)
 955 {
 956   BOOL success = FALSE;
 957   struct codeset *codeset;
 958   UTF32 src;
 959   int i;
 960   #if defined(__amigaos4__)
 961   ULONG nextMIB = 3;
 962   #endif
 963
 964   ENTER();
 965
 966   NewList((struct List *)csList);
 967
 968   // to make the list of the supported codesets complete we also add fake
 969   // 'UTF-8', 'UTF-16' and 'UTF-32' only so that our users can query for those codesets as well.
 970   if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
 971     goto end;
 972
 973   memset(codeset, 0, sizeof(*codeset));
 974   codeset->name             = mystrdup("UTF-8");
 975   codeset->alt_name         = mystrdup("UTF8");
 976   codeset->characterization = mystrdup("Unicode");
 977   codeset->read_only        = 0;
 978   D(DBF_STARTUP, "adding internal codeset 'UTF-8'");
 979   AddTail((struct List *)csList, (struct Node *)&codeset->node);
 980   CodesetsBase->utf8Codeset = codeset;
 981
 982   if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
 983     goto end;
 984
 985   memset(codeset, 0, sizeof(*codeset));
 986   codeset->name             = mystrdup("UTF-16");
 987   codeset->alt_name         = mystrdup("UTF16");
 988   codeset->characterization = mystrdup("16-bit Unicode");
 989   codeset->read_only        = 0;
 990   D(DBF_STARTUP, "adding internal codeset 'UTF-16'");
 991   AddTail((struct List *)csList, (struct Node *)&codeset->node);
 992   CodesetsBase->utf16Codeset = codeset;
 993
 994   if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
 995     goto end;
 996
 997   memset(codeset, 0, sizeof(*codeset));
 998   codeset->name             = mystrdup("UTF-32");
 999   codeset->alt_name         = mystrdup("UTF32");
1000   codeset->characterization = mystrdup("32-bit Unicode");
1001   codeset->read_only        = 0;
1002   D(DBF_STARTUP, "adding internal codeset 'UTF-32'");
1003   AddTail((struct List *)csList, (struct Node *)&codeset->node);
1004   CodesetsBase->utf32Codeset = codeset;
1005
1006   // on AmigaOS4 we can use diskfont.library to inquire charset information as
1007   // it comes with a quite rich implementation of different charsets.
1008   #if defined(__amigaos4__)
1009   D(DBF_STARTUP, "OS4, asking diskfont.library for codesets");
1010   do
1011   {
1012     char *mimename;
1013     char *ianaName;
1014     ULONG *mapTable;
1015     ULONG curMIB = nextMIB;
1016
1017     nextMIB = ObtainCharsetInfo(DFCS_NUMBER, curMIB, DFCS_NEXTNUMBER);
1018     if(nextMIB == 0)
1019       break;
1020
1021     mapTable = (ULONG *)ObtainCharsetInfo(DFCS_NUMBER, curMIB, DFCS_MAPTABLE);
1022     mimename = (char *)ObtainCharsetInfo(DFCS_NUMBER, curMIB, DFCS_MIMENAME);
1023     ianaName = (char *)ObtainCharsetInfo(DFCS_NUMBER, curMIB, DFCS_NAME);
1024     if(mapTable != NULL && mimename != NULL && codesetsFind(csList, mimename) == NULL)
1025     {
1026       D(DBF_STARTUP, "loading charset '%s' from diskfont.library...", mimename);
1027
1028       if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
1029         goto end;
1030
1031       codeset->name             = mystrdup(mimename);
1032       codeset->alt_name         = NULL;
1033       codeset->characterization = mystrdup(ianaName);
1034       codeset->read_only        = 0;
1035
1036       for(i=0; i<256; i++)
1037       {
1038         UTF32 *src_ptr = &src;
1039         UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1040
1041         src = mapTable[i];
1042
1043         codeset->table[i].code = i;
1044         codeset->table[i].ucs4 = src;
1045         CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr, src_ptr+1, &dest_ptr, dest_ptr+6, CSF_StrictConversion);
1046         *dest_ptr = 0;
1047         codeset->table[i].utf8[0] = (IPTR)dest_ptr-(IPTR)&codeset->table[i].utf8[1];
1048       }
1049
1050       memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
1051       qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
1052
1053       D(DBF_STARTUP, "adding diskfont.library codeset '%s'", codeset->name);
1054       AddTail((struct List *)csList, (struct Node *)&codeset->node);
1055     }
1056   }
1057   while(TRUE);
1058   #endif
1059
1060   #if defined(__MORPHOS__)
1061   {
1062     struct Library *KeymapBase;
1063     struct Library *LocaleBase;
1064     // assume success at first
1065     BOOL success = TRUE;
1066
1067     D(DBF_STARTUP, "MorphOS, asking keymap.library for codesets");
1068     if((KeymapBase = OpenLibrary("keymap.library", 51)) != NULL)
1069     {
1070       if((LocaleBase = OpenLibrary("locale.library", 51)) != NULL)
1071       {
1072         struct KeyMap *keymap = AskKeyMapDefault();
1073         // it doesn't matter if this call fails, as we don't depend on the system codesets
1074         CONST_STRPTR name = GetKeyMapCodepage(keymap);
1075
1076         // legacy keymaps dont have codepage or Unicode mappings
1077         if(name != NULL && keymap != NULL)
1078         {
1079           D(DBF_STARTUP, "loading charset '%s' from keymap.library...", name);
1080
1081           if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) != NULL)
1082           {
1083              codeset->name             = mystrdup(name);
1084              codeset->alt_name         = NULL;
1085              codeset->characterization = mystrdup(name);  // No further information available
1086              codeset->read_only        = 0;
1087
1088              for(i=0; i<256; i++)
1089              {
1090                UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1091                LONG rc;
1092
1093                codeset->table[i].code = i;
1094                codeset->table[i].ucs4 = src = ToUCS4(i, keymap);
1095
1096                // here we use UTF8_Encode() instead of ConvertUCS4ToUTF8() because
1097                // of an internal bug in MorphOS 2.2.
1098                rc = UTF8_Encode(src, dest_ptr);
1099                rc = rc > 0 ? rc : 1;
1100
1101                dest_ptr[rc] = '\0';
1102                codeset->table[i].utf8[0] = rc;
1103              }
1104
1105              memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
1106              qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
1107
1108              D(DBF_STARTUP, "adding keymap.library codeset '%s'", codeset->name);
1109              AddTail((struct List *)csList, (struct Node *)&codeset->node);
1110           }
1111           else
1112           {
1113             // only failed memory allocations are treated as error
1114             success = FALSE;
1115           }
1116         }
1117
1118         CloseLibrary(LocaleBase);
1119       }
1120
1121       CloseLibrary(KeymapBase);
1122     }
1123
1124     if(success == FALSE)
1125       goto end;
1126   }
1127   #endif
1128
1129   D(DBF_STARTUP, "loading charsets from LIBS:Charsets...");
1130
1131   // we try to walk to the LIBS:Charsets directory on our own and readin our
1132   // own charset tables
1133   codesetsScanDir(csList, "LIBS:Charsets");
1134
1135   //
1136   // now we go and initialize our internally supported codesets but only if
1137   // we have not already loaded a charset with the same name
1138   //
1139   D(DBF_STARTUP, "initializing internal charsets...");
1140
1141   // ISO-8859-1 + EURO
1142   if(codesetsFind(csList, "ISO-8859-1 + Euro") == NULL)
1143   {
1144     if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
1145       goto end;
1146
1147     codeset->name             = mystrdup("ISO-8859-1 + Euro");
1148     codeset->alt_name         = NULL;
1149     codeset->characterization = mystrdup("West European (with EURO)");
1150     codeset->read_only        = 1;
1151
1152     for(i = 0; i<256; i++)
1153     {
1154       UTF32 *src_ptr = &src;
1155       UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1156
1157       if(i==164)
1158         src = 0x20AC; // the EURO sign
1159       else
1160         src = i;
1161
1162       codeset->table[i].code = i;
1163       codeset->table[i].ucs4 = src;
1164       CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr, src_ptr+1, &dest_ptr, dest_ptr+6, CSF_StrictConversion);
1165       *dest_ptr = 0;
1166       codeset->table[i].utf8[0] = (IPTR)dest_ptr-(IPTR)&codeset->table[i].utf8[1];
1167     }
1168     memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
1169     qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
1170
1171     D(DBF_STARTUP, "adding internal codeset '%s'", codeset->name);
1172     AddTail((struct List *)csList, (struct Node *)&codeset->node);
1173   }
1174
1175   // ISO-8859-1
1176   if(codesetsFind(csList, "ISO-8859-1") == NULL)
1177   {
1178     if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
1179       goto end;
1180
1181     codeset->name             = mystrdup("ISO-8859-1");
1182     codeset->alt_name         = mystrdup("ISO8859-1");
1183     codeset->characterization = mystrdup("West European");
1184     codeset->read_only        = 0;
1185
1186     for(i = 0; i<256; i++)
1187     {
1188       UTF32 *src_ptr = &src;
1189       UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1190
1191       src = i;
1192
1193       codeset->table[i].code = i;
1194       codeset->table[i].ucs4 = src;
1195       CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr, src_ptr+1, &dest_ptr, dest_ptr+6, CSF_StrictConversion);
1196       *dest_ptr = 0;
1197       codeset->table[i].utf8[0] = (IPTR)dest_ptr-(IPTR)&codeset->table[i].utf8[1];
1198     }
1199     memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
1200     qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
1201
1202     D(DBF_STARTUP, "adding internal codeset '%s'", codeset->name);
1203     AddTail((struct List *)csList, (struct Node *)&codeset->node);
1204   }
1205
1206   // ISO-8859-2
1207   if(codesetsFind(csList, "ISO-8859-2") == NULL)
1208   {
1209     if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
1210       goto end;
1211
1212     codeset->name             = mystrdup("ISO-8859-2");
1213     codeset->alt_name         = mystrdup("ISO8859-2");
1214     codeset->characterization = mystrdup("Central/East European");
1215     codeset->read_only        = 0;
1216
1217     for(i = 0; i<256; i++)
1218     {
1219       UTF32 *src_ptr = &src;
1220       UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1221
1222       if(i<0xa0)
1223         src = i;
1224       else
1225         src = iso_8859_2_to_ucs4[i-0xa0];
1226
1227       codeset->table[i].code = i;
1228       codeset->table[i].ucs4 = src;
1229       CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr, src_ptr+1, &dest_ptr,dest_ptr+6, CSF_StrictConversion);
1230       *dest_ptr = 0;
1231       codeset->table[i].utf8[0] = (IPTR)dest_ptr-(IPTR)&codeset->table[i].utf8[1];
1232     }
1233     memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
1234     qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
1235
1236     D(DBF_STARTUP, "adding internal codeset '%s'", codeset->name);
1237     AddTail((struct List *)csList, (struct Node *)&codeset->node);
1238   }
1239
1240   // ISO-8859-3
1241   if(codesetsFind(csList, "ISO-8859-3") == NULL)
1242   {
1243     if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
1244       goto end;
1245
1246     codeset->name             = mystrdup("ISO-8859-3");
1247     codeset->alt_name         = mystrdup("ISO8859-3");
1248     codeset->characterization = mystrdup("South European");
1249     codeset->read_only        = 0;
1250
1251     for(i = 0; i<256; i++)
1252     {
1253       UTF32 *src_ptr = &src;
1254       UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1255
1256       if(i<0xa0)
1257         src = i;
1258       else
1259         src = iso_8859_3_to_ucs4[i-0xa0];
1260
1261       codeset->table[i].code = i;
1262       codeset->table[i].ucs4 = src;
1263       CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr,src_ptr+1,&dest_ptr,dest_ptr+6,CSF_StrictConversion);
1264       *dest_ptr = 0;
1265       codeset->table[i].utf8[0] = (IPTR)dest_ptr-(IPTR)&codeset->table[i].utf8[1];
1266     }
1267     memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
1268     qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
1269
1270     D(DBF_STARTUP, "adding internal codeset '%s'", codeset->name);
1271     AddTail((struct List *)csList, (struct Node *)&codeset->node);
1272   }
1273
1274   // ISO-8859-4
1275   if(codesetsFind(csList, "ISO-8859-4") == NULL)
1276   {
1277     if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
1278       goto end;
1279
1280     codeset->name             = mystrdup("ISO-8859-4");
1281     codeset->alt_name         = mystrdup("ISO8859-4");
1282     codeset->characterization = mystrdup("North European");
1283     codeset->read_only        = 0;
1284
1285     for(i = 0; i<256; i++)
1286     {
1287       UTF32 *src_ptr = &src;
1288       UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1289
1290       if(i<0xa0)
1291         src = i;
1292       else
1293         src = iso_8859_4_to_ucs4[i-0xa0];
1294
1295       codeset->table[i].code = i;
1296       codeset->table[i].ucs4 = src;
1297       CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr,src_ptr+1,&dest_ptr,dest_ptr+6,CSF_StrictConversion);
1298       *dest_ptr = 0;
1299       codeset->table[i].utf8[0] = (IPTR)dest_ptr-(IPTR)&codeset->table[i].utf8[1];
1300     }
1301     memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
1302     qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
1303
1304     D(DBF_STARTUP, "adding internal codeset '%s'", codeset->name);
1305     AddTail((struct List *)csList, (struct Node *)&codeset->node);
1306   }
1307
1308   // ISO-8859-5
1309   if(codesetsFind(csList, "ISO-8859-5") == NULL)
1310   {
1311     if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
1312       goto end;
1313
1314     codeset->name             = mystrdup("ISO-8859-5");
1315     codeset->alt_name         = mystrdup("ISO8859-5");
1316     codeset->characterization = mystrdup("Slavic languages");
1317     codeset->read_only        = 0;
1318
1319     for(i = 0; i<256; i++)
1320     {
1321       UTF32 *src_ptr = &src;
1322       UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1323
1324       if(i<0xa0)
1325         src = i;
1326       else
1327         src = iso_8859_5_to_ucs4[i-0xa0];
1328
1329       codeset->table[i].code = i;
1330       codeset->table[i].ucs4 = src;
1331       CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr,src_ptr+1,&dest_ptr,dest_ptr+6,CSF_StrictConversion);
1332       *dest_ptr = 0;
1333       codeset->table[i].utf8[0] = (IPTR)dest_ptr-(IPTR)&codeset->table[i].utf8[1];
1334     }
1335     memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
1336     qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
1337
1338     D(DBF_STARTUP, "adding internal codeset '%s'", codeset->name);
1339     AddTail((struct List *)csList, (struct Node *)&codeset->node);
1340   }
1341
1342   // ISO-8859-9
1343   if(codesetsFind(csList, "ISO-8859-9") == NULL)
1344   {
1345     if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
1346       goto end;
1347
1348     codeset->name             = mystrdup("ISO-8859-9");
1349     codeset->alt_name         = mystrdup("ISO8859-9");
1350     codeset->characterization = mystrdup("Turkish");
1351     codeset->read_only        = 0;
1352
1353     for(i = 0; i<256; i++)
1354     {
1355       UTF32 *src_ptr = &src;
1356       UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1357
1358       if(i<0xa0)
1359         src = i;
1360       else
1361         src = iso_8859_9_to_ucs4[i-0xa0];
1362
1363       codeset->table[i].code = i;
1364       codeset->table[i].ucs4 = src;
1365       CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr,src_ptr+1,&dest_ptr,dest_ptr+6,CSF_StrictConversion);
1366       *dest_ptr = 0;
1367       codeset->table[i].utf8[0] = (IPTR)dest_ptr-(IPTR)&codeset->table[i].utf8[1];
1368     }
1369     memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
1370     qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
1371
1372     D(DBF_STARTUP, "adding internal codeset '%s'", codeset->name);
1373     AddTail((struct List *)csList, (struct Node *)&codeset->node);
1374   }
1375
1376   // ISO-8859-15
1377   if(codesetsFind(csList, "ISO-8859-15") == NULL)
1378   {
1379     if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
1380       goto end;
1381
1382     codeset->name             = mystrdup("ISO-8859-15");
1383     codeset->alt_name         = mystrdup("ISO8859-15");
1384     codeset->characterization = mystrdup("West European II");
1385     codeset->read_only        = 0;
1386
1387     for(i = 0; i<256; i++)
1388     {
1389       UTF32 *src_ptr = &src;
1390       UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1391
1392       if(i<0xa0)
1393         src = i;
1394       else
1395         src = iso_8859_15_to_ucs4[i-0xa0];
1396
1397       codeset->table[i].code = i;
1398       codeset->table[i].ucs4 = src;
1399       CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr,src_ptr+1,&dest_ptr,dest_ptr+6,CSF_StrictConversion);
1400       *dest_ptr = 0;
1401       codeset->table[i].utf8[0] = (IPTR)dest_ptr-(IPTR)&codeset->table[i].utf8[1];
1402     }
1403     memcpy(codeset->table_sorted,codeset->table,sizeof (codeset->table));
1404     qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
1405
1406     D(DBF_STARTUP, "adding internal codeset '%s'", codeset->name);
1407     AddTail((struct List *)csList, (struct Node *)&codeset->node);
1408   }
1409
1410   // ISO-8859-16
1411   if(codesetsFind(csList, "ISO-8859-16") == NULL)
1412   {
1413     if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
1414       goto end;
1415
1416     codeset->name             = mystrdup("ISO-8859-16");
1417     codeset->alt_name         = mystrdup("ISO8869-16");
1418     codeset->characterization = mystrdup("South-Eastern European");
1419     codeset->read_only        = 0;
1420
1421     for(i=0;i<256;i++)
1422     {
1423       UTF32 *src_ptr = &src;
1424       UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1425
1426       if(i < 0xa0)
1427         src = i;
1428       else
1429         src = iso_8859_16_to_ucs4[i-0xa0];
1430
1431       codeset->table[i].code = i;
1432       codeset->table[i].ucs4 = src;
1433       CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr, src_ptr+1, &dest_ptr, dest_ptr+6, CSF_StrictConversion);
1434       *dest_ptr = 0;
1435       codeset->table[i].utf8[0] = (IPTR)dest_ptr - (IPTR)&codeset->table[i].utf8[1];
1436     }
1437     memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
1438     qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
1439
1440     D(DBF_STARTUP, "adding internal codeset '%s'", codeset->name);
1441     AddTail((struct List *)csList, (struct Node *)&codeset->node);
1442   }
1443
1444   // KOI8-R
1445   if(codesetsFind(csList, "KOI8-R") == NULL)
1446   {
1447     if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
1448       goto end;
1449
1450     codeset->name               = mystrdup("KOI8-R");
1451     codeset->alt_name           = mystrdup("KOI8R");
1452     codeset->characterization   = mystrdup("Russian");
1453     codeset->read_only          = 0;
1454
1455     for(i = 0; i<256; i++)
1456     {
1457       UTF32 *src_ptr = &src;
1458       UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1459
1460       if(i<0x80)
1461         src = i;
1462       else
1463         src = koi8r_to_ucs4[i-0x80];
1464
1465       codeset->table[i].code = i;
1466       codeset->table[i].ucs4 = src;
1467       CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr,src_ptr+1,&dest_ptr,dest_ptr+6,CSF_StrictConversion);
1468       *dest_ptr = 0;
1469       codeset->table[i].utf8[0] = (IPTR)dest_ptr-(IPTR)&codeset->table[i].utf8[1];
1470     }
1471     memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
1472     qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
1473
1474     D(DBF_STARTUP, "adding internal codeset '%s'", codeset->name);
1475     AddTail((struct List *)csList, (struct Node *)&codeset->node);
1476   }
1477
1478   // AmigaPL
1479   if(codesetsFind(csList, "AmigaPL") == NULL)
1480   {
1481     if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
1482       goto end;
1483
1484     codeset->name             = mystrdup("AmigaPL");
1485     codeset->alt_name         = mystrdup("AmiPL");
1486     codeset->characterization = mystrdup("Polish (Amiga)");
1487     codeset->read_only        = 1;
1488
1489     for(i=0; i<256; i++)
1490     {
1491       UTF32 *src_ptr = &src;
1492       UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1493
1494       if(i<0xa0)
1495         src = i;
1496       else
1497         src = amigapl_to_ucs4[i-0xa0];
1498
1499       codeset->table[i].code = i;
1500       codeset->table[i].ucs4 = src;
1501       CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr,src_ptr+1,&dest_ptr,dest_ptr+6,CSF_StrictConversion);
1502       *dest_ptr = 0;
1503       codeset->table[i].utf8[0] = (IPTR)dest_ptr-(IPTR)&codeset->table[i].utf8[1];
1504     }
1505     memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
1506     qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
1507
1508     D(DBF_STARTUP, "adding internal codeset '%s'", codeset->name);
1509     AddTail((struct List *)csList, (struct Node *)&codeset->node);
1510   }
1511
1512   // Amiga-1251
1513   if(codesetsFind(csList, "Amiga-1251") == NULL)
1514   {
1515     if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
1516       goto end;
1517
1518     codeset->name             = mystrdup("Amiga-1251");
1519     codeset->alt_name         = mystrdup("Ami1251");
1520     codeset->characterization = mystrdup("Cyrillic (Amiga)");
1521     codeset->read_only        = 1;
1522
1523     for(i=0; i<256; i++)
1524     {
1525       UTF32 *src_ptr = &src;
1526       UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1527
1528       if(i < 0xa0)
1529         src = i;
1530       else
1531         src = amiga1251_to_ucs4[i-0xa0];
1532
1533       codeset->table[i].code = i;
1534       codeset->table[i].ucs4 = src;
1535       CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr, src_ptr+1, &dest_ptr, dest_ptr+6, CSF_StrictConversion);
1536       *dest_ptr = 0;
1537       codeset->table[i].utf8[0] = (char*)dest_ptr - (char*)&codeset->table[i].utf8[1];
1538     }
1539     memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
1540     qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
1541
1542     D(DBF_STARTUP, "adding internal codeset '%s'", codeset->name);
1543     AddTail((struct List *)csList, (struct Node *)&codeset->node);
1544   }
1545
1546   success = TRUE;
1547
1548 end:
1549   RETURN(success);
1550   return success;
1551 }
1552
1553 ///
1554 /// codesetsCleanup()
1555 // Cleanup the memory for the codeset
1556 void codesetsCleanup(struct codesetList *csList)
1557 {
1558   struct codeset *code;
1559
1560   ENTER();
1561
1562   while((code = (struct codeset *)RemHead((struct List *)csList)) != NULL)
1563   {
1564     if(code->name != NULL)
1565       freeArbitrateVecPooled(code->name);
1566     if(code->alt_name != NULL)
1567       freeArbitrateVecPooled(code->alt_name);
1568     if(code->characterization != NULL)
1569       freeArbitrateVecPooled(code->characterization);
1570
1571     freeArbitrateVecPooled(code);
1572   }
1573
1574   LEAVE();
1575 }
1576
1577 ///
1578 /// codesetsFind()
1579 // Returns the given codeset.
1580 struct codeset *codesetsFind(struct codesetList *csList, const char *name)
1581 {
1582   struct codeset *res = NULL;
1583
1584   ENTER();
1585
1586   if(name != NULL && name[0] != '\0')
1587   {
1588     struct Node *node;
1589     const char *matchedName;
1590
1591     if((matchedName = matchCodesetAlias(name)) != NULL)
1592       name = matchedName;
1593
1594     for(node = GetHead((struct List *)csList); node != NULL; node = GetSucc(node))
1595     {
1596       struct codeset *mstate = (struct codeset *)node;
1597
1598       if(stricmp(name, mstate->name) == 0 ||
1599         (mstate->alt_name != NULL && stricmp(name, mstate->alt_name) == 0))
1600       {
1601         // break out
1602         res = mstate;
1603         break;
1604       }
1605     }
1606   }
1607
1608   RETURN(res);
1609   return res;
1610 }
1611
1612 ///
1613 /// checkTextAgainstSingleCodeset
1614 // check how good a text can be represented by a specific codeset
1615 static int checkTextAgainstSingleCodeset(CONST_STRPTR text, ULONG textLen, struct codeset *codeset)
1616 {
1617   int errors = textLen;
1618
1619   ENTER();
1620
1621   if(codeset->read_only == 0 &&
1622      codeset != CodesetsBase->utf8Codeset &&
1623      codeset != CodesetsBase->utf16Codeset &&
1624      codeset != CodesetsBase->utf32Codeset)
1625   {
1626     CONST_STRPTR text_ptr = text;
1627     ULONG i;
1628
1629     errors = 0;
1630
1631     // the following identification/detection routine is NOT really smart.
1632     // we just see how each UTF8 string is the representation of each char
1633     // in our source text and then check if they are valid or not. As said,
1634     // not very smart, but we don't have anything better right now :(
1635     for(i=0; i < textLen; i++)
1636     {
1637       unsigned char c = *text_ptr++;
1638
1639       if(c != '\0')
1640       {
1641         struct single_convert *f = &codeset->table[c];
1642
1643         if(f->utf8[0] == 0x00 || f->utf8[1] == 0x00)
1644           errors++;
1645       }
1646       else
1647         break;
1648     }
1649   }
1650   else
1651     W(DBF_STARTUP, "codeset '%s' is either read-only (%ld) or UTF8/16/32 (%ld)", codeset->name, codeset->read_only, codeset == CodesetsBase->utf8Codeset || codeset == CodesetsBase->utf16Codeset || codeset == CodesetsBase->utf32Codeset);
1652
1653   D(DBF_STARTUP, "tried to identify text as '%s' text with %ld of %ld errors", codeset->name, errors, textLen);
1654
1655   RETURN(errors);
1656   return errors;
1657 }
1658
1659 ///
1660 /// checkTextAgainstCodesetList
1661 static int checkTextAgainstCodesetList(CONST_STRPTR text, ULONG textLen, struct codesetList *csList, struct codeset **bestCodeset)
1662 {
1663   struct Node *node;
1664   int bestErrors = textLen;
1665
1666   ENTER();
1667
1668   *bestCodeset = NULL;
1669
1670   for(node = GetHead((struct List *)csList); node != NULL; node = GetSucc(node))
1671   {
1672     struct codeset *codeset = (struct codeset *)node;
1673     int errors;
1674
1675     errors = checkTextAgainstSingleCodeset(text, textLen, codeset);
1676     if(errors < bestErrors)
1677     {
1678       *bestCodeset = codeset;
1679       bestErrors = errors;
1680
1681       if(bestErrors == 0)
1682         break;
1683     }
1684   }
1685
1686   RETURN(bestErrors);
1687   return bestErrors;
1688 }
1689
1690 ///
1691 /// codesetsFindBest()
1692 // Returns the best codeset for the given text
1693 static struct codeset *codesetsFindBest(struct TagItem *attrs, ULONG csFamily, CONST_STRPTR text, ULONG textLen, int *errorPtr)
1694 {
1695   struct codeset *bestCodeset = NULL;
1696   int bestErrors = textLen;
1697   BOOL found = FALSE;
1698
1699   ENTER();
1700
1701   ObtainSemaphoreShared(&CodesetsBase->libSem);
1702
1703   // in case the user specified the codeset family as a
1704   // cyrillic one we go and do our cyrillic specific analysis first
1705   if(csFamily == CSV_CodesetFamily_Cyrillic)
1706   {
1707     #define NUM_CYRILLIC 3
1708
1709     struct CodesetSearch
1710     {
1711       const char *name;
1712       const char *data;
1713     };
1714
1715     struct CodesetSearch search[NUM_CYRILLIC];
1716     unsigned char *p;
1717     unsigned char *tp;
1718     int ctr[NUM_CYRILLIC];
1719     int Nmax;
1720     int NGlob = 1;
1721     int max;
1722     int gr = 0;
1723     int lr = 0;
1724
1725     D(DBF_STARTUP, "performing cyrillic analysis");
1726
1727     search[0].name = "windows-1251";
1728     search[0].data = cp1251_data;
1729     search[1].name = "IBM866";
1730     search[1].data = cp866_data;
1731     search[2].name = "KOI8-R";
1732     search[2].data = koi8r_data;
1733
1734     memset(&ctr, 0, sizeof(ctr));
1735
1736     tp = (unsigned char *)text;
1737
1738     do
1739     {
1740       int n;
1741       int mid = max = -466725766; // TODO: what's the magic behind this constant?
1742       Nmax = 0;
1743
1744       for(n=0; n < NUM_CYRILLIC; n++)
1745       {
1746         unsigned char la = 0;
1747         unsigned char *tptr = (unsigned char *)search[n].data;
1748
1749         p = tp;
1750
1751         do
1752         {
1753           unsigned char lb = (*p++) ^ 128;
1754
1755           if(!((la | lb) & 128))
1756             ctr[n] += (signed char)tptr[(la << 7) + lb];
1757
1758           la = lb;
1759         }
1760         while(*p);
1761
1762         if(max < ctr[n])
1763         {
1764           mid = max;
1765           max = ctr[n];
1766           Nmax = n+1;
1767         }
1768       }
1769
1770       tp = p;
1771       if((max >= 500) && ((max-mid) >= 1000))
1772       {
1773         lr = gr = 1;
1774         NGlob = Nmax;
1775       }
1776     }
1777     while((*p) && (!gr));
1778
1779     if(gr || ((!(*p)) && lr))
1780       Nmax = NGlob;
1781
1782     // if our analysis found something, we go and try
1783     // to find the corresponding codeset in out codeset list
1784     if(max != 0)
1785     {
1786       struct TagItem *tstate = attrs;
1787       struct TagItem *tag;
1788
1789       D(DBF_STARTUP, "identified text as '%s", search[Nmax-1].name);
1790
1791       // now we walk through our taglist and check if the user
1792       // supplied
1793       while((tag = NextTagItem((APTR)&tstate)) != NULL)
1794       {
1795         if(tag->ti_Tag == CSA_CodesetList && tag->ti_Data != 0)
1796         {
1797           struct codesetList *csList = (struct codesetList *)tag->ti_Data;
1798
1799           if((bestCodeset = codesetsFind(csList, search[Nmax-1].name)) != NULL)
1800             break;
1801         }
1802       }
1803
1804       // if we still haven't found the matching codeset
1805       // we search the internal list
1806       if(bestCodeset == NULL)
1807         bestCodeset = codesetsFind(&CodesetsBase->codesets, search[Nmax-1].name);
1808
1809       bestErrors = 0;
1810
1811       found = TRUE;
1812     }
1813   }
1814
1815   // if we haven't found the best codeset (through the cyrillic analysis)
1816   // we go and do the dumb latin search in our codesetlist
1817   if(found == FALSE)
1818   {
1819     struct TagItem *tstate = attrs;
1820     struct TagItem *tag;
1821
1822     // check text against all codesets in all supplied lists of codesets
1823     while((tag = NextTagItem((APTR)&tstate)) != NULL)
1824     {
1825       switch(tag->ti_Tag)
1826       {
1827         case CSA_CodesetList:
1828         {
1829           struct codesetList *csList = (struct codesetList *)tag->ti_Data;
1830           struct codeset *bestCodesetInList;
1831           int bestErrorsInList;
1832
1833           D(DBF_STARTUP, "checking against external codeset list");
1834           bestErrorsInList = checkTextAgainstCodesetList(text, textLen, csList, &bestCodesetInList);
1835           if(bestErrorsInList < bestErrors && bestCodesetInList != NULL)
1836           {
1837             bestCodeset = bestCodesetInList;
1838             bestErrors = bestErrorsInList;
1839
1840             if(bestErrors == 0)
1841               break;
1842           }
1843         }
1844         break;
1845       }
1846     }
1847
1848     // we didn't find a "best" codeset in the supplied codesets lists so far,
1849     // so now we check against our internal list
1850     if(bestErrors != 0)
1851     {
1852       struct codeset *bestCodesetInList;
1853       int bestErrorsInList;
1854
1855       D(DBF_STARTUP, "checking against internal codeset list");
1856       bestErrorsInList = checkTextAgainstCodesetList(text, textLen, &CodesetsBase->codesets, &bestCodesetInList);
1857       if(bestErrorsInList < bestErrors && bestCodesetInList != NULL)
1858       {
1859         bestCodeset = bestCodesetInList;
1860         bestErrors = bestErrorsInList;
1861       }
1862     }
1863   }
1864
1865   ReleaseSemaphore(&CodesetsBase->libSem);
1866
1867   if(errorPtr != NULL)
1868     *errorPtr = bestErrors;
1869
1870   RETURN(bestCodeset);
1871   return bestCodeset;
1872 }
1873
1874 ///
1875
1876 /**************************************************************************/
1877
1878 /// CodesetsSupportedA()
1879 LIBPROTO(CodesetsSupportedA, STRPTR *, REG(a6, UNUSED __BASE_OR_IFACE), REG(a0, struct TagItem *attrs))
1880 {
1881   STRPTR *array = NULL;
1882   struct TagItem *tstate = attrs;
1883   struct TagItem *tag;
1884   BOOL allowMultibyte;
1885   int numCodesets;
1886
1887   ENTER();
1888
1889   allowMultibyte = GetTagData(CSA_AllowMultibyteCodesets, TRUE, attrs);
1890
1891   ObtainSemaphoreShared(&CodesetsBase->libSem);
1892
1893   // first we need to check how many codesets our supplied
1894   // lists carry.
1895   numCodesets = countCodesets(&CodesetsBase->codesets, allowMultibyte);
1896   while((tag = NextTagItem((APTR)&tstate)) != NULL)
1897   {
1898     switch(tag->ti_Tag)
1899     {
1900       case CSA_CodesetList:
1901       {
1902         numCodesets += countCodesets((struct codesetList *)tag->ti_Data, allowMultibyte);
1903       }
1904       break;
1905     }
1906   }
1907
1908   // now that we know how many codesets we have in our lists we
1909   // can put their names into our string arrays
1910   if(numCodesets > 0)
1911   {
1912     if((array = allocArbitrateVecPooled((numCodesets+1)*sizeof(STRPTR))) != NULL)
1913     {
1914       struct Node *node;
1915       int i=0;
1916
1917       // first we walk through the internal codesets list and
1918       // add the names
1919       for(node = GetHead((struct List *)&CodesetsBase->codesets); node != NULL; node = GetSucc(node))
1920       {
1921         struct codeset *code = (struct codeset *)node;
1922
1923         if(allowMultibyte == TRUE ||
1924            (code != CodesetsBase->utf8Codeset && code != CodesetsBase->utf16Codeset && code != CodesetsBase->utf32Codeset))
1925         {
1926           array[i] = code->name;
1927           i++;
1928         }
1929       }
1930
1931       // reset the tstate
1932       tstate = attrs;
1933
1934       // then we also iterate through our private codesets list
1935       while((tag = NextTagItem((APTR)&tstate)) != NULL)
1936       {
1937         switch(tag->ti_Tag)
1938         {
1939           case CSA_CodesetList:
1940           {
1941             for(node = GetHead((struct List *)tag->ti_Data); node != NULL; node = GetSucc(node))
1942             {
1943               struct codeset *code = (struct codeset *)node;
1944
1945               if(allowMultibyte == TRUE ||
1946                  (code != CodesetsBase->utf8Codeset && code != CodesetsBase->utf16Codeset && code != CodesetsBase->utf32Codeset))
1947               {
1948                 array[i] = code->name;
1949                 i++;
1950               }
1951             }
1952           }
1953           break;
1954         }
1955       }
1956
1957       array[i] = NULL;
1958     }
1959   }
1960
1961   ReleaseSemaphore(&CodesetsBase->libSem);
1962
1963   RETURN(array);
1964   return array;
1965 }
1966
#if defined(__amigaos4__)
// Varargs variant of CodesetsSupportedA() (AmigaOS4 builds only): the
// struct TagItem pointer obtained from the variable arguments is passed
// on and the result is returned unchanged.
LIBPROTOVA(CodesetsSupported, STRPTR *, REG(a6, UNUSED __BASE_OR_IFACE), ...)
{
  VA_LIST tags;
  STRPTR *names;

  VA_START(tags, ICodesets);
  names = CodesetsSupportedA(VA_ARG(tags, struct TagItem *));
  VA_END(tags);

  return names;
}
#endif
1980
1981 ///
1982 /// CodesetsFreeA()
1983 LIBPROTO(CodesetsFreeA, void, REG(a6, UNUSED __BASE_OR_IFACE), REG(a0, APTR obj), REG(a1, UNUSED struct TagItem *attrs))
1984 {
1985   ENTER();
1986
1987   if(obj != NULL)
1988     freeArbitrateVecPooled(obj);
1989
1990   LEAVE();
1991 }
1992
#if defined(__amigaos4__)
// Varargs variant of CodesetsFreeA() (AmigaOS4 builds only): the
// struct TagItem pointer obtained from the variable arguments is passed
// on together with the object.
LIBPROTOVA(CodesetsFree, void,  REG(a6, UNUSED __BASE_OR_IFACE), REG(a0, APTR obj), ...)
{
  VA_LIST tags;

  VA_START(tags, obj);
  CodesetsFreeA(obj, VA_ARG(tags, struct TagItem *));
  VA_END(tags);
}
#endif
2003
2004 ///
2005 /// CodesetsSetDefaultA()
2006 LIBPROTO(CodesetsSetDefaultA, struct codeset *, REG(a6, UNUSED __BASE_OR_IFACE), REG(a0, STRPTR name), REG(a1, struct TagItem *attrs))
2007 {
2008   struct codeset *codeset;
2009
2010   ENTER();
2011
2012   ObtainSemaphoreShared(&CodesetsBase->libSem);
2013
2014   if((codeset = codesetsFind(&CodesetsBase->codesets, name)) != NULL)
2015   {
2016     ULONG flags;
2017
2018     flags = GVF_SAVE_VAR;
2019     if(GetTagData(CSA_Save, FALSE, attrs))
2020       SET_FLAG(flags, GVF_GLOBAL_ONLY);
2021
2022     SetVar("codeset_default", codeset->name, strlen(codeset->name), flags);
2023   }
2024
2025   ReleaseSemaphore(&CodesetsBase->libSem);
2026
2027   RETURN(codeset);
2028   return codeset;
2029 }
2030
#if defined(__amigaos4__)
// Varargs variant of CodesetsSetDefaultA() (AmigaOS4 builds only): the
// struct TagItem pointer obtained from the variable arguments is passed
// on together with the name and the result is returned unchanged.
LIBPROTOVA(CodesetsSetDefault, struct codeset *, REG(a6, UNUSED __BASE_OR_IFACE), REG(a0, STRPTR name), ...)
{
  VA_LIST tags;
  struct codeset *result;

  VA_START(tags, name);
  result = CodesetsSetDefaultA(name, VA_ARG(tags, struct TagItem *));
  VA_END(tags);

  return result;
}
#endif
2044
2045 ///
2046 /// CodesetsFindA()
2047 LIBPROTO(CodesetsFindA, struct codeset *, REG(a6, UNUSED __BASE_OR_IFACE), REG(a0, STRPTR name), REG(a1, struct TagItem *attrs))
2048 {
2049   struct codeset *codeset = NULL;
2050
2051   ENTER();
2052
2053   ObtainSemaphoreShared(&CodesetsBase->libSem);
2054
2055   // if no name pointer was supplied we have to return
2056   // the default codeset only.
2057   if(name != NULL)
2058   {
2059     // we first walk through our internal list and check if we
2060     // can find the requested codeset
2061     codeset = codesetsFind(&CodesetsBase->codesets, name);
2062
2063     if(codeset == NULL)
2064     {
2065       struct TagItem *tstate = attrs;
2066       struct TagItem *tag;
2067
2068       // now we walk through our taglist and check if the user
2069       // supplied
2070       while((tag = NextTagItem((APTR)&tstate)) != NULL)
2071       {
2072         if(tag->ti_Tag == CSA_CodesetList && tag->ti_Data != 0)
2073         {
2074           struct codesetList *csList = (struct codesetList *)tag->ti_Data;
2075
2076           if((codeset = codesetsFind(csList, name)) != NULL)
2077             break;
2078         }
2079       }
2080     }
2081   }
2082
2083   // check if we found something or not.
2084   if(codeset == NULL && GetTagData(CSA_FallbackToDefault, TRUE, attrs))
2085     codeset = defaultCodeset(FALSE);
2086
2087   ReleaseSemaphore(&CodesetsBase->libSem);
2088
2089   RETURN(codeset);
2090   return codeset;
2091 }
2092
#if defined(__amigaos4__)
// Varargs variant of CodesetsFindA() (AmigaOS4 builds only): the
// struct TagItem pointer obtained from the variable arguments is passed
// on together with the name and the result is returned unchanged.
LIBPROTOVA(CodesetsFind, struct codeset *, REG(a6, UNUSED __BASE_OR_IFACE), REG(a0, STRPTR name), ...)
{
  VA_LIST tags;
  struct codeset *result;

  VA_START(tags, name);
  result = CodesetsFindA(name, VA_ARG(tags, struct TagItem *));
  VA_END(tags);

  return result;
}
#endif
2106
2107 ///
2108 /// CodesetsFindBestA()
2109 LIBPROTO(CodesetsFindBestA, struct codeset *, REG(a6, UNUSED __BASE_OR_IFACE), REG(a0, struct TagItem *attrs))
2110 {
2111   struct codeset *codeset = NULL;
2112   char *text;
2113   ULONG textLen;
2114
2115   ENTER();
2116
2117   ObtainSemaphoreShared(&CodesetsBase->libSem);
2118
2119   text = (char *)GetTagData(CSA_Source, 0, attrs);
2120   textLen = GetTagData(CSA_SourceLen, text != NULL ? strlen(text) : 0, attrs);
2121
2122   if(text != NULL && textLen != 0)
2123   {
2124     int numErrors = 0;
2125     ULONG csFamily = GetTagData(CSA_CodesetFamily, CSV_CodesetFamily_Latin, attrs);
2126     int *errorPtr = (int *)GetTagData(CSA_ErrPtr, 0, attrs);
2127
2128     codeset = codesetsFindBest(attrs, csFamily, text, textLen, &numErrors);
2129
2130     if(errorPtr != NULL)
2131       *errorPtr = numErrors;
2132
2133     // if we still haven't got the codeset we fallback to the default
2134     if(codeset == NULL && GetTagData(CSA_FallbackToDefault, FALSE, attrs))
2135       codeset = defaultCodeset(FALSE);
2136   }
2137
2138   ReleaseSemaphore(&CodesetsBase->libSem);
2139
2140   RETURN(codeset);
2141   return codeset;
2142 }
2143
#if defined(__amigaos4__)
// Varargs variant of CodesetsFindBestA() (AmigaOS4 builds only): the
// struct TagItem pointer obtained from the variable arguments is passed
// on and the result is returned unchanged.
LIBPROTOVA(CodesetsFindBest,  struct codeset *, REG(a6, UNUSED __BASE_OR_IFACE), ...)
{
  VA_LIST tags;
  struct codeset *result;

  VA_START(tags, ICodesets);
  result = CodesetsFindBestA(VA_ARG(tags, struct TagItem *));
  VA_END(tags);

  return result;
}
#endif
2157
2158 ///
2159 /// CodesetsUTF8Len()
2160 // Returns the number of characters a utf8 string has. This is not
2161 // identically with the size of memory is required to hold the string.
2162 LIBPROTO(CodesetsUTF8Len, ULONG, REG(a6, UNUSED __BASE_OR_IFACE), REG(a0, const UTF8 *str))
2163 {
2164   int len = 0;
2165   unsigned char c;
2166
2167   ENTER();
2168
2169   if(str != NULL)
2170   {
2171     while((c = *str++))
2172     {
2173       len++;
2174       str += trailingBytesForUTF8[c];
2175     }
2176   }
2177
2178   RETURN((ULONG)len);
2179   return (ULONG)len;
2180 }
2181
2182 ///
2183 /// CodesetsStrLenA()
2184 LIBPROTO(CodesetsStrLenA, ULONG, REG(a6, UNUSED __BASE_OR_IFACE), REG(a0, STRPTR str), REG(a1, struct TagItem *attrs))
2185 {
2186   ULONG res = 0;
2187
2188   ENTER();
2189
2190   if(str != NULL)
2191   {
2192     struct codeset *codeset;
2193     int            len;
2194     STRPTR         src;
2195     int            utf;
2196
2197     if((codeset = (struct codeset *)GetTagData(CSA_SourceCodeset, 0, attrs)) == NULL)
2198       codeset = defaultCodeset(TRUE);
2199
2200     if(codeset == CodesetsBase->utf32Codeset)
2201     {
2202       utf = 32;
2203       len = utf32_strlen((UTF32 *)str);
2204     }
2205     else if(codeset == CodesetsBase->utf16Codeset)
2206     {
2207       utf = 16;
2208       len = utf16_strlen((UTF16 *)str);
2209     }
2210     else
2211     {
2212       utf = 0;
2213       len = strlen(str);
2214     }
2215
2216     len = GetTagData(CSA_SourceLen, len, attrs);
2217
2218     src = str;
2219
2220     if(utf != 0)
2221     {
2222       void *srcend = src + len;
2223       UTF8 *dstlen = NULL;
2224       union TypeAliases srcAlias;
2225       union TypeAliases dstAlias;
2226
2227       srcAlias.strptr = &src;
2228       dstAlias.utf8 = &dstlen;
2229
2230       switch(utf)
2231       {
2232         case 16:
2233           CodesetsConvertUTF16toUTF8(srcAlias.cutf16, srcend, dstAlias.utf8, NULL, 0);
2234         break;
2235
2236         case 32:
2237           CodesetsConvertUTF32toUTF8(srcAlias.cutf32, srcend, dstAlias.utf8, NULL, 0);
2238         break;
2239       }
2240       res = (IPTR)dstlen;
2241     }
2242     else
2243     {
2244       UBYTE c;
2245
2246       res = 0;
2247
2248       while((c = *src++) != '\0' && len != 0)
2249       {
2250         res += codeset->table[c].utf8[0];
2251         len--;
2252       }
2253     }
2254   }
2255
2256   RETURN(res);
2257   return res;
2258 }
2259
#if defined(__amigaos4__)
// Varargs variant of CodesetsStrLenA() (AmigaOS4 builds only): the
// struct TagItem pointer obtained from the variable arguments is passed
// on together with the string and the result is returned unchanged.
LIBPROTOVA(CodesetsStrLen, ULONG, REG(a6, UNUSED __BASE_OR_IFACE), REG(a0, STRPTR str), ...)
{
  VA_LIST tags;
  ULONG length;

  VA_START(tags, str);
  length = CodesetsStrLenA(str, VA_ARG(tags, struct TagItem *));
  VA_END(tags);

  return length;
}
#endif
2273
2274 ///
2275 /// CodesetsUTF8ToStrA()
2276 // Converts an UTF8 string to a given charset. Return the number of bytes
2277 // written to dest excluding the NULL byte (which is always ensured by this
2278 // function; it means a NULL str will produce "" as dest; anyway you should
2279 // check NULL str to not waste your time!).
2280 LIBPROTO(CodesetsUTF8ToStrA, STRPTR, REG(a6, UNUSED __BASE_OR_IFACE), REG(a0, struct TagItem *attrs))
2281 {
2282   UTF8 *src;
2283   ULONG srcLen;
2284   ULONG destLen = 0;
2285   ULONG *destLenPtr;
2286   ULONG n = 0;
2287   STRPTR dest = NULL;
2288
2289   ENTER();
2290
2291   if((src = (UTF8 *)GetTagData(CSA_Source, 0, attrs)) != NULL &&
2292      (srcLen = GetTagData(CSA_SourceLen, src != NULL ? strlen((char *)src) : 0, attrs)) > 0)
2293   {
2294     struct convertMsg msg;
2295     struct codeset *codeset;
2296     struct Hook *destHook;
2297     struct Hook *mapForeignCharsHook;
2298     char buf[256];
2299     STRPTR destIter = NULL;
2300     char *b = NULL;
2301     int i = 0;
2302     unsigned char *s = src;
2303     unsigned char *e = (src+srcLen);
2304     int numConvErrors = 0;
2305     int *numConvErrorsPtr;
2306     BOOL mapForeignChars;
2307     APTR pool = NULL;
2308     struct SignalSemaphore *sem = NULL;
2309     int utf;
2310     ULONG char_size;
2311
2312     // get some more optional attributes
2313     destHook = (struct Hook *)GetTagData(CSA_DestHook, 0, attrs);
2314     destLen = GetTagData(CSA_DestLen, 0, attrs);
2315     numConvErrorsPtr = (int *)GetTagData(CSA_ErrPtr, 0, attrs);
2316     mapForeignChars = (BOOL)GetTagData(CSA_MapForeignChars, FALSE, attrs);
2317     mapForeignCharsHook = (struct Hook *)GetTagData(CSA_MapForeignCharsHook, 0, attrs);
2318
2319     // get the destination codeset pointer
2320     if((codeset = (struct codeset *)GetTagData(CSA_DestCodeset, 0, attrs)) == NULL)
2321       codeset = defaultCodeset(TRUE);
2322     if(codeset == CodesetsBase->utf32Codeset)
2323     {
2324       utf = 32;
2325       char_size = 4;
2326     }
2327     else if(codeset == CodesetsBase->utf16Codeset)
2328     {
2329       utf = 16;
2330       char_size = 2;
2331     }
2332     else
2333     {
2334       utf = 0;
2335       char_size = 1;
2336     }
2337
2338     // first we make sure we allocate enough memory
2339     // for our destination buffer
2340     if(destHook != NULL)
2341     {
2342       if(destLen < 16 || destLen > sizeof(buf))
2343         destLen = sizeof(buf);
2344
2345       msg.state = CSV_Translating;
2346       b = buf;
2347       i = 0;
2348     }
2349     else
2350     {
2351       // in case the user wants us to dynamically generate the
2352       // destination buffer we do it right now
2353       if((dest = (STRPTR)GetTagData(CSA_Dest, 0, attrs)) == NULL ||
2354          GetTagData(CSA_AllocIfNeeded, TRUE, attrs) != FALSE)
2355       {
2356         ULONG len = 0;
2357
2358         // calculate the destLen
2359         if(utf)
2360         {
2361           void *dstlen = NULL;
2362           union TypeAliases srcAlias;
2363           union TypeAliases dstAlias;
2364
2365           srcAlias.uchar = &s;
2366           dstAlias.voidptr = &dstlen;
2367
2368           switch(utf)
2369           {
2370             case 16:
2371               CodesetsConvertUTF8toUTF16(srcAlias.cutf8, e, dstAlias.utf16, NULL, 0);
2372             break;
2373
2374             case 32:
2375               CodesetsConvertUTF8toUTF32(srcAlias.cutf8, e, dstAlias.utf32, NULL, 0);
2376             break;
2377           }
2378           len = (IPTR)dstlen;
2379         }
2380         else
2381         {
2382           while(s < e)
2383           {
2384             unsigned char c = *s++;
2385
2386             len++;
2387             s += trailingBytesForUTF8[c];
2388           }
2389         }
2390
2391         if(dest == NULL || (destLen < len+1))
2392         {
2393           if((pool = (APTR)GetTagData(CSA_Pool, 0, attrs)) != NULL)
2394           {
2395             if((sem = (struct SignalSemaphore *)GetTagData(CSA_PoolSem, 0, attrs)) != NULL)
2396               ObtainSemaphore(sem);
2397
2398             // allocate the destination buffer
2399             dest = allocVecPooled(pool, len+char_size);
2400
2401             if(sem != NULL)
2402               ReleaseSemaphore(sem);
2403           }
2404           else
2405             dest = allocArbitrateVecPooled(len+char_size);
2406
2407           destLen = len+char_size;
2408         }
2409
2410         if(dest == NULL)
2411         {
2412           RETURN(NULL);
2413           return NULL;
2414         }
2415       }
2416
2417       destIter = dest;
2418     }
2419
2420     // now we convert the src string to the
2421     // destination buffer.
2422     s = src;
2423     if(utf != 0)
2424     {
2425       void *dstend;
2426
2427       if(destHook != NULL)
2428       {
2429         ULONG r = CSR_TargetExhausted;
2430
2431         dstend = b + destLen - char_size;
2432         do
2433         {
2434           union TypeAliases srcAlias;
2435           union TypeAliases dstAlias;
2436
2437           srcAlias.uchar = &s;
2438           dstAlias.schar = &b;
2439
2440           switch(utf)
2441           {
2442             case 16:
2443               r = CodesetsConvertUTF8toUTF16(srcAlias.cutf8, e, dstAlias.utf16, dstend, 0);
2444             break;
2445
2446             case 32:
2447               r = CodesetsConvertUTF8toUTF32(srcAlias.cutf8, e, dstAlias.utf32, dstend, 0);
2448             break;
2449           }
2450           b[0] = 0;
2451           if(char_size > 1)
2452             b[1] = 0;
2453           if(r != CSR_TargetExhausted)
2454             msg.state = CSV_End;
2455           msg.len = b-buf;
2456           CallHookPkt(destHook,&msg,buf);
2457
2458           b  = buf;
2459           n += msg.len;
2460         }
2461         while(r == CSR_TargetExhausted);
2462       }
2463       else
2464       {
2465         union TypeAliases srcAlias;
2466         union TypeAliases dstAlias;
2467
2468         srcAlias.uchar = &s;
2469         dstAlias.strptr = &destIter;
2470         dstend = destIter + destLen - char_size;
2471         switch(utf)
2472         {
2473           case 16:
2474             CodesetsConvertUTF8toUTF16(srcAlias.cutf8, e, dstAlias.utf16, dstend, 0);
2475           break;
2476
2477           case 32:
2478             CodesetsConvertUTF8toUTF32(srcAlias.cutf8, e, dstAlias.utf32, dstend, 0);
2479           break;
2480         }
2481         n = destIter-dest;
2482       }
2483     }
2484     else
2485     {
2486       for(;;n++)
2487       {
2488         if(destHook == NULL && n >= destLen-1)
2489           break;
2490
2491         // convert until we reach the end of the
2492         // source buffer.
2493         if(s < e)
2494         {
2495           unsigned char c = *s;
2496           unsigned char d = '?';
2497           const char *repstr = NULL;
2498           int replen = 0;
2499
2500           // check if the char is a >7bit char
2501           if(c > 127)
2502           {
2503             struct single_convert *f;
2504             int lenAdd = trailingBytesForUTF8[c];
2505             int lenStr = lenAdd+1;
2506             unsigned char *src = s;
2507
2508             do
2509             {
2510               // start each iteration with "no replacement found yet"
2511               repstr = NULL;
2512               replen = 0;
2513
2514               // search in the UTF8 conversion table of the current charset if
2515               // we have a replacement character for the char sequence starting at s
2516               BIN_SEARCH(codeset->table_sorted, 0, 255, strncmp((char *)src, (char *)codeset->table_sorted[m].utf8+1, lenStr), f);
2517
2518               if(f != NULL)
2519               {
2520                 d = f->code;
2521                 replen = -1;
2522
2523                 break;
2524               }
2525               else
2526               {
2527                 // the analysed char sequence (s) is not convertable to a
2528                 // single visible char replacement, so we normally have to put
2529                 // a ? sign as a "unknown char" sign at the very position.
2530                 //
2531                 // For convienence we, however, allow users to replace these
2532                 // UTF8 characters with char sequences that "looklike" the
2533                 // original char.
2534                 if(mapForeignChars == TRUE)
2535                   replen = mapUTF8toASCII(&repstr, src, lenStr);
2536
2537                 // call the hook only, if the internal table yielded no suitable
2538                 // replacement
2539                 if(replen == 0 && mapForeignCharsHook != NULL)
2540                 {
2541                   struct replaceMsg rmsg;
2542
2543                   rmsg.dst = (char **)&repstr;
2544                   rmsg.src = src;
2545                   rmsg.srclen = lenStr;
2546                   replen = CallHookPkt(mapForeignCharsHook, &rmsg, NULL);
2547                 }
2548
2549                 if(replen < 0)
2550                 {
2551                   D(DBF_UTF, "got UTF8 replacement (%ld)", replen);
2552
2553                   // stay in the loop as long as one replacement function delivers
2554                   // further UTF8 replacement sequences
2555                   src = (unsigned char *)repstr;
2556                   // remember the length of the replaced string, as we might do another
2557                   // iteration in the loop which might result in a further replacement
2558                   lenStr = -replen;
2559                 }
2560                 else if(replen == 0)
2561                 {
2562                   D(DBF_UTF, "found no ASCII replacement for UTF8 string (%ld)", replen);
2563                   repstr = NULL;
2564                 }
2565                 else
2566                   D(DBF_UTF, "got replacement string '%s' (%ld)", repstr ? repstr : "<null>", replen);
2567               }
2568             }
2569             while(replen < 0);
2570
2571             if(repstr == NULL || replen == 0)
2572             {
2573               if(replen >= 0)
2574               {
2575                 d = '?';
2576                 numConvErrors++;
2577               }
2578             }
2579
2580             s += lenAdd;
2581           }
2582           else
2583             d = c;
2584
2585           if(destHook != NULL)
2586           {
2587             if(replen > 1)
2588             {
2589               while(replen > 0)
2590               {
2591                 *b++ = *repstr;
2592                 repstr++;
2593                 i++;
2594                 replen--;
2595
2596                 if(i%(destLen-1)==0)
2597                 {
2598                   *b = '\0';
2599                   msg.len = i;
2600                   CallHookPkt(destHook, &msg, buf);
2601
2602                   b  = buf;
2603                   *b = '\0';
2604                   i  = 0;
2605                 }
2606               }
2607             }
2608             else
2609             {
2610               *b++ = replen > 0 ? *repstr : d;
2611               i++;
2612             }
2613
2614             if(i%(destLen-1)==0)
2615             {
2616               *b = '\0';
2617               msg.len = i;
2618               CallHookPkt(destHook, &msg, buf);
2619
2620               b  = buf;
2621               *b = '\0';
2622               i  = 0;
2623             }
2624           }
2625           else
2626           {
2627             if(replen > 1)
2628             {
2629               ULONG destPos = destIter-dest;
2630
2631               if(pool != NULL)
2632               {
2633                 if(sem != NULL)
2634                   ObtainSemaphore(sem);
2635
2636                 // allocate the destination buffer
2637                 dest = reallocVecPooled(pool, dest, destLen, destLen+replen-1);
2638
2639                 if(sem != NULL)
2640                   ReleaseSemaphore(sem);
2641               }
2642               else
2643                 dest = reallocArbitrateVecPooled(dest, destLen, destLen+replen-1);
2644
2645               if(dest == NULL)
2646               {
2647                 RETURN(NULL);
2648                 return NULL;
2649               }
2650
2651               destIter = dest+destPos;
2652               memcpy(destIter, repstr, replen);
2653
2654               // adjust our loop pointer and destination length
2655               destIter += replen;
2656               destLen += replen-1;
2657             }
2658             else if(replen == 1)
2659               *destIter++ = *repstr;
2660             else
2661               *destIter++ = d;
2662           }
2663
2664           s++;
2665         }
2666         else
2667           break;
2668       }
2669
2670       if(destHook != NULL)
2671       {
2672         msg.state = CSV_End;
2673         msg.len   = i;
2674         *b        = '\0';
2675         CallHookPkt(destHook,&msg,buf);
2676       }
2677       else
2678         *destIter = '\0';
2679     }
2680
2681     // let us write the number of conversion errors
2682     // to the proper variable pointer, if wanted
2683     if(numConvErrorsPtr != NULL)
2684       *numConvErrorsPtr = numConvErrors;
2685   }
2686
2687   // put the final length of our destination buffer
2688   // into the destLenPtr
2689   if((destLenPtr = (ULONG *)GetTagData(CSA_DestLenPtr, 0, attrs)) != NULL)
2690   {
2691     if(destLen > 0)
2692       *destLenPtr = destLen-1;
2693     else
2694       *destLenPtr = 0;
2695   }
2696
2697   RETURN(dest);
2698   return dest;
2699 }
2700
#if defined(__amigaos4__)
// Varargs flavour of CodesetsUTF8ToStrA(), only compiled for AmigaOS4.
// The tag list pointer is picked up from the variable arguments and
// handed over to the tag list based function, whose result is returned.
LIBPROTOVA(CodesetsUTF8ToStr, STRPTR, REG(a6, UNUSED __BASE_OR_IFACE), ...)
{
  VA_LIST ap;
  struct TagItem *tagList;
  STRPTR converted;

  VA_START(ap, ICodesets);
  tagList = VA_ARG(ap, struct TagItem *);
  converted = CodesetsUTF8ToStrA(tagList);
  VA_END(ap);

  return converted;
}
#endif
2714
2715 ///
2716 /// CodesetsUTF8CreateA()
2717 // Converts a string and a charset to an UTF8. Returns the UTF8.
2718 // If a destination hook is supplied always return 0.
2719 // If from is NULL, it returns NULL and doesn't call the hook.
2720 LIBPROTO(CodesetsUTF8CreateA, UTF8 *, REG(a6, UNUSED __BASE_OR_IFACE), REG(a0, struct TagItem *attrs))
2721 {
2722   UTF8   *from;
2723   UTF8   *dest;
2724   struct codeset *codeset;
2725   ULONG  fromLen, *destLenPtr;
2726   ULONG  n;
2727   int    utf;
2728
2729   ENTER();
2730
2731   dest = NULL;
2732   n    = 0;
2733
2734   if((codeset = (struct codeset *)GetTagData(CSA_SourceCodeset, 0, attrs)) == NULL)
2735     codeset = defaultCodeset(TRUE);
2736   if(codeset == CodesetsBase->utf32Codeset)
2737     utf = 32;
2738   else if(codeset == CodesetsBase->utf16Codeset)
2739     utf = 16;
2740   else
2741     utf = 0;
2742
2743   from = (UTF8 *)GetTagData(CSA_Source, 0, attrs);
2744   if(from != NULL)
2745   {
2746     switch(utf)
2747     {
2748       case 32:
2749         fromLen = utf32_strlen((UTF32 *)from);
2750       break;
2751
2752       case 16:
2753         fromLen = utf16_strlen((UTF16 *)from);
2754       break;
2755
2756       default:
2757         fromLen = strlen((char *)from);
2758       break;
2759     }
2760   }
2761   else
2762     fromLen = 0;
2763   fromLen = GetTagData(CSA_SourceLen, fromLen, attrs);
2764
2765   if(from != NULL && fromLen != 0)
2766   {
2767     struct convertMsg       msg;
2768     struct Hook    *hook;
2769     ULONG          destLen;
2770     int            i = 0;
2771     TEXT           buf[256];
2772     STRPTR         src, destPtr = NULL, b = NULL;
2773     ULONG          c;
2774
2775     hook    = (struct Hook *)GetTagData(CSA_DestHook, 0, attrs);
2776     destLen = GetTagData(CSA_DestLen, 0, attrs);
2777
2778     if(hook != NULL)
2779     {
2780       if(destLen<16 || destLen>sizeof(buf))
2781         destLen = sizeof(buf);
2782
2783       msg.state = CSV_Translating;
2784       b = buf;
2785       i = 0;
2786     }
2787     else
2788     {
2789       if((dest = (UTF8 *)GetTagData(CSA_Dest, 0, attrs)) != NULL ||
2790          GetTagData(CSA_AllocIfNeeded, TRUE, attrs))
2791       {
2792         ULONG len;
2793
2794         src = (STRPTR)from;
2795
2796         if(utf != 0)
2797         {
2798           void *srcend = src + fromLen;
2799           UTF8 *dstlen = NULL;
2800           union TypeAliases srcAlias;
2801           union TypeAliases dstAlias;
2802
2803           srcAlias.strptr = &src;
2804           dstAlias.utf8 = &dstlen;
2805
2806           switch(utf)
2807           {
2808             case 16:
2809               CodesetsConvertUTF16toUTF8(srcAlias.cutf16, srcend, dstAlias.utf8, NULL, 0);
2810             break;
2811
2812             case 32:
2813               CodesetsConvertUTF32toUTF8(srcAlias.cutf32, srcend, dstAlias.utf8, NULL, 0);
2814             break;
2815           }
2816           len = (IPTR)dstlen;
2817         }
2818         else
2819         {
2820           ULONG flen = fromLen;
2821
2822           len = 0;
2823           while((c = *src++) != '\0' && flen != 0)
2824           {
2825             len += codeset->table[c].utf8[0];
2826             flen--;
2827           }
2828         }
2829         D(DBF_UTF, "Calculated output UTF-8 buffer length: %lu", len);
2830
2831         if(dest == NULL || (destLen<len+1))
2832         {
2833           APTR                   pool;
2834           struct SignalSemaphore *sem;
2835
2836           if((pool = (APTR)GetTagData(CSA_Pool, 0, attrs)) != NULL)
2837           {
2838             if((sem = (struct SignalSemaphore *)GetTagData(CSA_PoolSem, 0, attrs)) != NULL)
2839               ObtainSemaphore(sem);
2840
2841             // allocate the destination buffer
2842             dest = allocVecPooled(pool,len+1);
2843
2844             if(sem != NULL)
2845               ReleaseSemaphore(sem);
2846           }
2847           else
2848             dest = allocArbitrateVecPooled(len+1);
2849
2850           destLen  = len;
2851         }
2852
2853         if(dest == NULL)
2854         {
2855           RETURN(NULL);
2856           return NULL;
2857         }
2858       }
2859
2860       destPtr = (STRPTR)dest;
2861     }
2862
2863     src = (STRPTR)from;
2864     if(utf != 0)
2865     {
2866       void *srcend = src + fromLen;
2867       UTF8 *dstend;
2868
2869       if(hook != NULL)
2870       {
2871         ULONG r = CSR_TargetExhausted;
2872         union TypeAliases srcAlias;
2873         union TypeAliases dstAlias;
2874
2875         srcAlias.strptr = &src;
2876         dstAlias.strptr = &b;
2877         dstend = (UTF8 *)(b + destLen - 1);
2878         do
2879         {
2880           switch(utf)
2881           {
2882             case 16:
2883               r = CodesetsConvertUTF16toUTF8(srcAlias.cutf16, srcend, dstAlias.utf8, dstend, 0);
2884             break;
2885
2886             case 32:
2887               r = CodesetsConvertUTF32toUTF8(srcAlias.cutf32, srcend, dstAlias.utf8, dstend, 0);
2888             break;
2889           }
2890           *b = 0;
2891           if(r != CSR_TargetExhausted)
2892             msg.state = CSV_End;
2893           msg.len = b-buf;
2894           CallHookPkt(hook,&msg,buf);
2895
2896           b  = buf;
2897           n += msg.len;
2898         }
2899         while(r == CSR_TargetExhausted);
2900       }
2901       else
2902       {
2903         union TypeAliases srcAlias;
2904         union TypeAliases dstAlias;
2905
2906         srcAlias.strptr = &src;
2907         dstAlias.strptr = &destPtr;
2908         dstend = (UTF8 *)(destPtr + destLen);
2909         switch(utf)
2910         {
2911           case 16:
2912             CodesetsConvertUTF16toUTF8(srcAlias.cutf16, srcend, dstAlias.utf8, dstend, 0);
2913           break;
2914
2915           case 32:
2916             CodesetsConvertUTF32toUTF8(srcAlias.cutf32, srcend, dstAlias.utf8, dstend, 0);
2917           break;
2918         }
2919         n = destPtr-(STRPTR)dest;
2920       }
2921     }
2922     else
2923     {
2924       for(; fromLen && (c = *src); src++, fromLen--)
2925       {
2926         UTF8 *utf8_seq;
2927
2928         for(utf8_seq = &codeset->table[c].utf8[1]; (c = *utf8_seq); utf8_seq++)
2929         {
2930           if(hook != NULL)
2931           {
2932             *b++ = c;
2933             i++;
2934
2935             if(i%(destLen-1)==0)
2936             {
2937               *b = 0;
2938               msg.len = i;
2939               CallHookPkt(hook,&msg,buf);
2940
2941               b  = buf;
2942               *b = 0;
2943               i  = 0;
2944             }
2945           }
2946           else
2947           {
2948             if(n>=destLen)
2949               break;
2950
2951             *destPtr++ = c;
2952           }
2953
2954           n++;
2955         }
2956       }
2957
2958       if(hook != NULL)
2959       {
2960         msg.state = CSV_End;
2961         msg.len   = i;
2962         *b = 0;
2963         CallHookPkt(hook,&msg,buf);
2964       }
2965       else
2966       {
2967         *destPtr = 0;
2968       }
2969     }
2970   }
2971
2972   if((destLenPtr = (ULONG *)GetTagData(CSA_DestLenPtr, 0, attrs)) != NULL)
2973     *destLenPtr = n;
2974
2975   RETURN(dest);
2976   return dest;
2977 }
2978
#if defined(__amigaos4__)
// Varargs flavour of CodesetsUTF8CreateA(), only compiled for AmigaOS4.
// The tag list pointer is picked up from the variable arguments and
// handed over to the tag list based function, whose result is returned.
LIBPROTOVA(CodesetsUTF8Create, UTF8 *, REG(a6, UNUSED __BASE_OR_IFACE), ...)
{
  VA_LIST ap;
  struct TagItem *tagList;
  UTF8 *utf8;

  VA_START(ap, ICodesets);
  tagList = VA_ARG(ap, struct TagItem *);
  utf8 = CodesetsUTF8CreateA(tagList);
  VA_END(ap);

  return utf8;
}
#endif
2992
2993 ///
2994 /// CodesetsIsValidUTF8()
2995 #define GOOD_UCS(c) \
2996      ((c) >= 160 && ((c) & ~0x3ff) != 0xd800 && \
2997       (c) != 0xfeff && (c) != 0xfffe && (c) != 0xffff)
2998
2999 LIBPROTO(CodesetsIsValidUTF8, BOOL, REG(a6, UNUSED __BASE_OR_IFACE), REG(a0, CONST_STRPTR s))
3000 {
3001   CONST_STRPTR t = s;
3002   int n;
3003
3004   ENTER();
3005
3006   while((n = parseUtf8(&t)) != 0)
3007   {
3008     if(!GOOD_UCS(n))
3009     {
3010       RETURN(FALSE);
3011       return FALSE;
3012     }
3013   }
3014
3015   RETURN(TRUE);
3016   return TRUE;
3017 }
3018
3019 ///
3020 /// CodesetsConvertStrA()
3021 // Converts a given string from one source Codeset to a given destination
3022 // codeset and returns the convert string
3023 LIBPROTO(CodesetsConvertStrA, STRPTR, REG(a6, UNUSED __BASE_OR_IFACE), REG(a0, struct TagItem *attrs))
3024 {
3025   struct codeset *srcCodeset;
3026   STRPTR srcStr = NULL;
3027   STRPTR dstStr = NULL;
3028   ULONG srcLen = 0;
3029   ULONG dstLen = 0;
3030   ULONG charSize = 0;
3031
3032   ENTER();
3033
3034   // get the ptr to the src string we want to convert
3035   // from the source codeset to the dest codeset.
3036   srcStr = (STRPTR)GetTagData(CSA_Source, 0, attrs);
3037
3038   // get the pointer to the codeset in which the src string is encoded
3039   if((srcCodeset = (struct codeset *)GetTagData(CSA_SourceCodeset, 0, attrs)) == NULL)
3040     srcCodeset = defaultCodeset(TRUE);
3041
3042   if(srcStr != NULL)
3043   {
3044     if(srcCodeset == CodesetsBase->utf32Codeset)
3045     {
3046       srcLen = utf32_strlen((UTF32 *)srcStr);
3047       charSize = sizeof(UTF32);
3048     }
3049     else if(srcCodeset == CodesetsBase->utf16Codeset)
3050     {
3051       srcLen = utf16_strlen((UTF16 *)srcStr);
3052       charSize = sizeof(UTF16);
3053     }
3054     else
3055     {
3056       srcLen = strlen(srcStr);
3057       charSize = sizeof(char);
3058     }
3059   }
3060   else
3061     srcLen = 0;
3062   srcLen = GetTagData(CSA_SourceLen, srcLen, attrs);
3063
3064   if(srcStr != NULL && srcLen > 0)
3065   {
3066     struct codeset *dstCodeset;
3067
3068     // get the pointer to the codeset in which the dst string should be encoded
3069     if((dstCodeset = (struct codeset *)GetTagData(CSA_DestCodeset, 0, attrs)) == NULL)
3070       dstCodeset = defaultCodeset(TRUE);
3071
3072     D(DBF_UTF, "srcCodeset: '%s' dstCodeset: '%s'", srcCodeset->name, dstCodeset->name);
3073
3074     if(srcCodeset != NULL && dstCodeset != NULL)
3075     {
3076       // check that the user didn't supplied the very same codeset
3077       // or otherwise a conversion is not required.
3078       if(srcCodeset != dstCodeset)
3079       {
3080         BOOL utf8Create = FALSE;
3081         BOOL strCreate = FALSE;
3082         UTF8 *utf8str;
3083         ULONG utf8strLen = 0;
3084         ULONG *destLenPtr = NULL;
3085         BOOL mapForeignChars;
3086         struct Hook *mapForeignCharsHook;
3087
3088         mapForeignChars = (BOOL)GetTagData(CSA_MapForeignChars, FALSE, attrs);
3089         mapForeignCharsHook = (struct Hook *)GetTagData(CSA_MapForeignCharsHook, 0, attrs);
3090
3091         // if the source codeset is UTF-8 we don't have to use the UTF8Create()
3092         // function and can directly call the UTF8ToStr() function
3093         if(srcCodeset != CodesetsBase->utf8Codeset)
3094         {
3095           struct TagItem tags[] = { { CSA_SourceCodeset,  (IPTR)srcCodeset   },
3096                                     { CSA_Source,         (IPTR)srcStr       },
3097                                     { CSA_SourceLen,      srcLen             },
3098                                     { CSA_DestLenPtr,     (IPTR)&utf8strLen  },
3099                                     { TAG_DONE,           0                  } };
3100
3101           utf8str = CodesetsUTF8CreateA((struct TagItem *)&tags[0]);
3102
3103           utf8Create = TRUE;
3104         }
3105         else
3106         {
3107           utf8str = (UTF8 *)srcStr;
3108           utf8strLen = srcLen;
3109         }
3110
3111         // in case the destination codeset is UTF-8 we don't have to actually
3112         // use the UTF8ToStr() function and can immediately return our
3113         // UTF8 string
3114         if(utf8str != NULL && utf8strLen > 0 && dstCodeset != CodesetsBase->utf8Codeset)
3115         {
3116           struct TagItem tags[] = { { CSA_DestCodeset,          (IPTR)dstCodeset           },
3117                                     { CSA_Source,               (IPTR)utf8str              },
3118                                     { CSA_SourceLen,            utf8strLen                 },
3119                                     { CSA_DestLenPtr,           (IPTR)&dstLen              },
3120                                     { CSA_MapForeignChars,      mapForeignChars            },
3121                                     { CSA_MapForeignCharsHook,  (IPTR)mapForeignCharsHook  },
3122                                     { TAG_DONE,                 0                          } };
3123
3124           dstStr = CodesetsUTF8ToStrA((struct TagItem *)&tags[0]);
3125
3126           strCreate = TRUE;
3127         }
3128         else
3129         {
3130           dstStr = (STRPTR)utf8str;
3131           dstLen = utf8strLen;
3132         }
3133
3134         D(DBF_UTF, "srcStr: %lx srcLen: %ld dstStr: %lx dstLen: %ld utf8create: %ld strCreate: %ld", srcStr, srcLen,
3135                                                                                                      dstStr, dstLen,
3136                                                                                                      utf8Create,
3137                                                                                                      strCreate);
3138
3139         // if everything was successfull we can go and finalize everything
3140         if(dstStr != NULL && utf8str != NULL)
3141         {
3142           // as the conversion was a two way pass we have to either free the
3143           // memory of the utf8 string or not
3144           if(utf8Create == TRUE && strCreate == TRUE)
3145             CodesetsFreeA(utf8str, NULL);
3146
3147           // if the user wants to be informed abour the length
3148           // of our destination string we store the length now in the supplied ptr.
3149           if((destLenPtr = (ULONG *)GetTagData(CSA_DestLenPtr, 0, attrs)) != NULL)
3150             *destLenPtr = dstLen;
3151
3152           D(DBF_UTF, "successfully converted string with len %ld", dstLen);
3153         }
3154         else
3155         {
3156           W(DBF_ALWAYS, "an error occurred while trying to convert a string");
3157
3158           // free all memory in case the conversion didn't work out
3159           if(utf8Create == TRUE && utf8str != NULL)
3160             CodesetsFreeA(utf8str, NULL);
3161
3162           if(strCreate == TRUE && dstStr != NULL)
3163             CodesetsFreeA(dstStr, NULL);
3164
3165           dstStr = NULL;
3166         }
3167       }
3168       else
3169       {
3170         // we got the same source and destination codesets passed in
3171         // instead of failing silently we just create a copy of the source string
3172         ULONG *destLenPtr = NULL;
3173
3174         // allocate memory for the destination string, including a trailing NUL byte
3175         if((dstStr = allocArbitrateVecPooled(srcLen + charSize)) != NULL)
3176         {
3177           // just copy the source string without any further modification
3178           // we must use memcpy() as the source string could be UTF16/32 encoded and
3179           // thus strcpy() would not do what we want.
3180           memcpy(dstStr, srcStr, srcLen + charSize);
3181           dstLen = srcLen;
3182           D(DBF_UTF, "successfully copied string with len %ld", dstLen);
3183         }
3184         else
3185           W(DBF_ALWAYS, "no memory for dest string");
3186
3187         // if the user wants to be informed abour the length
3188         // of our destination string we store the length now in the supplied ptr.
3189         if((destLenPtr = (ULONG *)GetTagData(CSA_DestLenPtr, 0, attrs)) != NULL)
3190           *destLenPtr = dstLen;
3191       }
3192     }
3193   }
3194
3195   RETURN(dstStr);
3196   return dstStr;
3197 }
3198
#if defined(__amigaos4__)
// Varargs flavour of CodesetsConvertStrA(), only compiled for AmigaOS4.
// The tag list pointer is picked up from the variable arguments and
// handed over to the tag list based function, whose result is returned.
LIBPROTOVA(CodesetsConvertStr, STRPTR, REG(a6, UNUSED __BASE_OR_IFACE), ...)
{
  VA_LIST ap;
  struct TagItem *tagList;
  STRPTR converted;

  VA_START(ap, ICodesets);
  tagList = VA_ARG(ap, struct TagItem *);
  converted = CodesetsConvertStrA(tagList);
  VA_END(ap);

  return converted;
}
#endif
3212
3213 ///
3214 /// CodesetsFreeVecPooledA()
3215 LIBPROTO(CodesetsFreeVecPooledA, void, REG(a6, UNUSED __BASE_OR_IFACE), REG(a0, APTR pool), REG(a1, APTR mem), REG(a2, struct TagItem *attrs))
3216 {
3217   ENTER();
3218
3219   if(pool != NULL && mem != NULL)
3220   {
3221     struct SignalSemaphore *sem;
3222
3223     if((sem = (struct SignalSemaphore *)GetTagData(CSA_PoolSem, 0, attrs)) != NULL)
3224       ObtainSemaphore(sem);
3225
3226     freeVecPooled(pool,mem);
3227
3228     if(sem != NULL)
3229       ReleaseSemaphore(sem);
3230   }
3231
3232   LEAVE();
3233 }
3234
#if defined(__amigaos4__)
// Varargs flavour of CodesetsFreeVecPooledA(), only compiled for AmigaOS4.
// The tag list pointer is picked up from the variable arguments following
// the memory pointer and handed over to the tag list based function.
LIBPROTOVA(CodesetsFreeVecPooled, void, REG(a6, UNUSED __BASE_OR_IFACE), REG(a0, APTR pool), REG(a1, APTR mem), ...)
{
  VA_LIST ap;
  struct TagItem *tagList;

  VA_START(ap, mem);
  tagList = VA_ARG(ap, struct TagItem *);
  CodesetsFreeVecPooledA(pool, mem, tagList);
  VA_END(ap);
}
#endif
3245
3246 ///
3247 /// CodesetsListCreateA()
3248 LIBPROTO(CodesetsListCreateA, struct codesetList *, REG(a6, UNUSED __BASE_OR_IFACE), REG(a0, struct TagItem *attrs))
3249 {
3250   struct codesetList *csList = NULL;
3251
3252   ENTER();
3253
3254   // no matter what, we create a codesets list we will return to the user
3255   if((csList = allocArbitrateVecPooled(sizeof(struct codesetList))) != NULL)
3256   {
3257     BOOL scanProgDir = TRUE;
3258     struct TagItem *tstate = attrs;
3259     struct TagItem *tag;
3260
3261     // initialize the new private codeset list and put it into a separate list
3262     NewList((struct List *)csList);
3263
3264     // first we get the path of the directory from which we go
3265     // and scan for charset tables from
3266     while((tag = NextTagItem((APTR)&tstate)) != NULL)
3267     {
3268       switch(tag->ti_Tag)
3269       {
3270         case CSA_CodesetDir:
3271         {
3272           codesetsScanDir(csList, (STRPTR)tag->ti_Data);
3273
3274           scanProgDir = FALSE;
3275         }
3276         break;
3277
3278         case CSA_CodesetFile:
3279         {
3280           codesetsReadTable(csList, (STRPTR)tag->ti_Data);
3281
3282           scanProgDir = FALSE;
3283         }
3284         break;
3285
3286         case CSA_SourceCodeset:
3287         {
3288           struct codeset *cs = (struct codeset *)tag->ti_Data;
3289
3290           AddTail((struct List *)csList, (struct Node *)&cs->node);
3291
3292           scanProgDir = FALSE;
3293         }
3294         break;
3295       }
3296     }
3297
3298     // in case the user also wants us to scan PROGDIR:
3299     // we do so
3300     if(scanProgDir == TRUE)
3301       codesetsScanDir(csList, "PROGDIR:Charsets");
3302   }
3303
3304   RETURN(csList);
3305   return csList;
3306 }
3307
#if defined(__amigaos4__)
// Varargs flavour of CodesetsListCreateA(), only compiled for AmigaOS4.
// The tag list pointer is picked up from the variable arguments and
// handed over to the tag list based function, whose result is returned.
LIBPROTOVA(CodesetsListCreate, struct codesetList *, REG(a6, UNUSED __BASE_OR_IFACE), ...)
{
  VA_LIST ap;
  struct TagItem *tagList;
  struct codesetList *list;

  VA_START(ap, ICodesets);
  tagList = VA_ARG(ap, struct TagItem *);
  list = CodesetsListCreateA(tagList);
  VA_END(ap);

  return list;
}
#endif
3321
3322 ///
3323 /// CodesetsListDeleteA()
3324 LIBPROTO(CodesetsListDeleteA, BOOL, REG(a6, UNUSED __BASE_OR_IFACE), REG(a0, struct TagItem *attrs))
3325 {
3326   BOOL result = FALSE;
3327   struct TagItem *tstate = attrs;
3328   struct TagItem *tag;
3329   BOOL freeCodesets;
3330
3331   ENTER();
3332
3333   // check if the caller wants us also to free the codesets
3334   freeCodesets = (BOOL)GetTagData(CSA_FreeCodesets, TRUE, attrs);
3335
3336   // now we iterate through or tagItems and see what the
3337   // user wants to remove from the list
3338   while((tag = NextTagItem((APTR)&tstate)) != NULL)
3339   {
3340     switch(tag->ti_Tag)
3341     {
3342       case CSA_CodesetList:
3343       {
3344         struct codesetList *csList = (struct codesetList *)tag->ti_Data;
3345
3346         if(csList != NULL)
3347         {
3348           // cleanup the codesets within the list
3349           if(freeCodesets == TRUE)
3350             codesetsCleanup(csList);
3351
3352           // then free the list itICodesets
3353           freeArbitrateVecPooled(csList);
3354
3355           result = TRUE;
3356         }
3357       }
3358       break;
3359     }
3360   }
3361
3362   RETURN(result);
3363   return result;
3364 }
3365
#if defined(__amigaos4__)
// Varargs flavour of CodesetsListDeleteA(), only compiled for AmigaOS4.
// The tag list pointer is picked up from the variable arguments and
// handed over to the tag list based function, whose result is returned.
LIBPROTOVA(CodesetsListDelete, BOOL, REG(a6, UNUSED __BASE_OR_IFACE), ...)
{
  VA_LIST ap;
  struct TagItem *tagList;
  BOOL deleted;

  VA_START(ap, ICodesets);
  tagList = VA_ARG(ap, struct TagItem *);
  deleted = CodesetsListDeleteA(tagList);
  VA_END(ap);

  return deleted;
}
#endif
3379
3380 ///
3381 /// CodesetsListAddA()
3382 LIBPROTO(CodesetsListAddA, BOOL, REG(a6, UNUSED __BASE_OR_IFACE), REG(a0, struct codesetList *csList), REG(a1, struct TagItem *attrs))
3383 {
3384   BOOL result = FALSE;
3385
3386   ENTER();
3387
3388   if(csList != NULL)
3389   {
3390     struct TagItem *tstate = attrs;
3391     struct TagItem *tag;
3392
3393     // now we iterate through or tagItems and see if the user
3394     // wants to scan a whole directory or just adds a file.
3395     while((tag = NextTagItem((APTR)&tstate)) != NULL)
3396     {
3397       switch(tag->ti_Tag)
3398       {
3399         case CSA_CodesetDir:
3400         {
3401           codesetsScanDir(csList, (STRPTR)tag->ti_Data);
3402           result = TRUE;
3403         }
3404         break;
3405
3406         case CSA_CodesetFile:
3407         {
3408           codesetsReadTable(csList, (STRPTR)tag->ti_Data);
3409           result = TRUE;
3410         }
3411         break;
3412
3413         case CSA_SourceCodeset:
3414         {
3415           struct codeset *cs = (struct codeset *)tag->ti_Data;
3416
3417           AddTail((struct List *)csList, (struct Node *)&cs->node);
3418           result = TRUE;
3419         }
3420         break;
3421       }
3422     }
3423   }
3424
3425   RETURN(result);
3426   return result;
3427 }
3428
#if defined(__amigaos4__)
// Varargs flavour of CodesetsListAddA(), only compiled for AmigaOS4.
// The tag list pointer is picked up from the variable arguments following
// the list pointer and handed over to the tag list based function.
LIBPROTOVA(CodesetsListAdd, BOOL, REG(a6, UNUSED __BASE_OR_IFACE), REG(a0, struct codesetList *csList), ...)
{
  VA_LIST ap;
  struct TagItem *tagList;
  BOOL added;

  VA_START(ap, csList);
  tagList = VA_ARG(ap, struct TagItem *);
  added = CodesetsListAddA(csList, tagList);
  VA_END(ap);

  return added;
}
#endif
3442
3443 ///
3444 /// CodesetsListRemoveA()
3445 LIBPROTO(CodesetsListRemoveA, BOOL, REG(a6, UNUSED __BASE_OR_IFACE), REG(a0, struct TagItem *attrs))
3446 {
3447   BOOL result = FALSE;
3448   struct TagItem *tstate = attrs;
3449   struct TagItem *tag;
3450   BOOL freeCodesets;
3451
3452   ENTER();
3453
3454   // check if the caller wants us also to free the codesets
3455   freeCodesets = (BOOL)GetTagData(CSA_FreeCodesets, TRUE, attrs);
3456
3457   // now we iterate through or tagItems and see what the
3458   // user wants to remove from the list
3459   while((tag = NextTagItem((APTR)&tstate)) != NULL)
3460   {
3461     switch(tag->ti_Tag)
3462     {
3463       case CSA_SourceCodeset:
3464       {
3465         struct codeset *removeCS = (struct codeset *)tag->ti_Data;
3466
3467         if(removeCS != NULL)
3468         {
3469           struct Node *node;
3470           BOOL isExternalNode = TRUE;
3471
3472           ObtainSemaphore(&CodesetsBase->libSem);
3473
3474           // iterate over our internal list an check whether the given
3475           // node is part of that list
3476           for(node = GetHead((struct List *)&CodesetsBase->codesets); node != NULL; node = GetSucc(node))
3477           {
3478             if((struct codeset *)node == removeCS)
3479             {
3480               isExternalNode = FALSE;
3481               break;
3482             }
3483           }
3484
3485           ReleaseSemaphore(&CodesetsBase->libSem);
3486
3487           if(isExternalNode == TRUE)
3488           {
3489             Remove((struct Node *)removeCS);
3490
3491             // free all codesets data if requested
3492             if(freeCodesets == TRUE)
3493             {
3494               if(removeCS->name != NULL)
3495                 freeArbitrateVecPooled(removeCS->name);
3496               if(removeCS->alt_name != NULL)
3497                 freeArbitrateVecPooled(removeCS->alt_name);
3498               if(removeCS->characterization != NULL)
3499                 freeArbitrateVecPooled(removeCS->characterization);
3500
3501               freeArbitrateVecPooled(removeCS);
3502             }
3503
3504             result = TRUE;
3505           }
3506           else
3507             W(DBF_ALWAYS, "user tried to remove an internal codeset!");
3508         }
3509       }
3510       break;
3511     }
3512   }
3513
3514   RETURN(result);
3515   return result;
3516 }
3517
#if defined(__amigaos4__)
// Varargs flavour of CodesetsListRemoveA(), only compiled for AmigaOS4.
// The tag list pointer is picked up from the variable arguments and
// handed over to the tag list based function, whose result is returned.
LIBPROTOVA(CodesetsListRemove, BOOL, REG(a6, UNUSED __BASE_OR_IFACE), ...)
{
  VA_LIST ap;
  struct TagItem *tagList;
  BOOL removed;

  VA_START(ap, ICodesets);
  tagList = VA_ARG(ap, struct TagItem *);
  removed = CodesetsListRemoveA(tagList);
  VA_END(ap);

  return removed;
}
#endif
3531
3532 ///
3533
3534 /**************************************************************************/