Fixed binary search: no more infinite loops when vendor is unknown.
[tangerine.git] / workbench / libs / codesetslib / src / codesets.c
blob0e63e4acee329e9090fea7479f5a7f20e62ff723
1 /***************************************************************************
3 codesets.library - Amiga shared library for handling different codesets
4 Copyright (C) 2001-2005 by Alfonso [alfie] Ranieri <alforan@tin.it>.
5 Copyright (C) 2005-2009 by codesets.library Open Source Team
7 This library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU Lesser General Public
9 License as published by the Free Software Foundation; either
10 version 2.1 of the License, or (at your option) any later version.
12 This library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
17 codesets.library project: http://sourceforge.net/projects/codesetslib/
19 Most of the code included in this file was relicensed from GPL to LGPL
20 from the source code of SimpleMail (http://www.sf.net/projects/simplemail)
21 with full permissions by its authors.
23 $Id$
25 ***************************************************************************/
27 #include "lib.h"
29 #include <clib/alib_protos.h>
31 #include <diskfont/glyph.h>
32 #include <diskfont/diskfonttag.h>
33 #include <proto/diskfont.h>
34 #include <ctype.h>
35 #include <limits.h>
37 #ifdef __MORPHOS__
38 #include <proto/keymap.h>
39 #include <proto/locale.h>
40 #endif
42 #include "codesets_table.h"
43 #include "convertUTF.h"
44 #include "codepages.h"
46 #include "SDI_stdarg.h"
48 #include "debug.h"
50 /**************************************************************************/
52 /// BIN_SEARCH()
// Binary search over a sorted array in O(log n).
//
//   array   - the sorted array to search
//   low     - index of the first element to look at
//   high    - index of the LAST element to look at (inclusive), i.e.
//             "number of elements - 1" when searching a whole array
//   compare - expression yielding <0, 0 or >0 when the key sorts before,
//             equal to or after array[m]; it is evaluated inside the macro
//             and has to refer to the probed index by its local name 'm'
//   result  - lvalue receiving &array[m] on a hit and NULL otherwise
//
// e.g.
// BIN_SEARCH(strings,0,sizeof(strings)/sizeof(strings[0])-1,strcmp(key,strings[m]),res);
//
// The local names l, h, m and d are visible to the 'compare' expression and
// therefore must not be renamed.
#define BIN_SEARCH(array,low,high,compare,result) \
  {\
    int l = (low);\
    int h = (high);\
    int m = l + (h - l)/2;\
    (result) = NULL;\
    while (l<=h)\
    {\
      int d = (compare);\
      if (!d){ (result) = &(array)[m]; break; }\
      if (d < 0) h = m - 1;\
      else l = m + 1;\
      m = l + (h - l)/2;\
    }\
  }
71 ///
72 /// mystrdup()
73 static STRPTR
74 mystrdup(const char *str)
76 STRPTR newStr = NULL;
78 ENTER();
80 if(str != NULL)
82 int len;
84 if((len = strlen(str)) > 0)
86 if((newStr = allocArbitrateVecPooled(len+1)) != NULL)
87 strlcpy(newStr, str, len+1);
91 RETURN(newStr);
92 return newStr;
94 ///
95 /// mystrndup()
96 static STRPTR
97 mystrndup(const char *str1, int n)
99 STRPTR dest;
101 ENTER();
103 if((dest = allocArbitrateVecPooled(n+1)) != NULL)
105 if(str1 != NULL)
106 strlcpy(dest, str1, n+1);
107 else
108 dest[0] = '\0';
110 dest[n] = '\0';
113 RETURN(dest);
114 return dest;
117 /// readLine()
118 static ULONG
119 readLine(BPTR fh, char *buf, ULONG size)
121 char *c;
123 ENTER();
125 if((c = FGets(fh, buf, size)) == NULL)
127 RETURN(FALSE);
128 return FALSE;
131 for(; *c; c++)
133 if(*c == '\n' || *c == '\r')
135 *c = '\0';
136 break;
140 RETURN(TRUE);
141 return TRUE;
144 /// getConfigItem()
145 static const char * getConfigItem(const char *buf, const char *item, int len)
147 ENTER();
149 if(strnicmp(buf, item, len) == 0)
151 UBYTE c;
153 buf += len;
155 /* skip spaces */
156 while((c = *buf) != '\0' && isspace(c))
157 buf++;
159 if(*buf != '=')
161 RETURN(NULL);
162 return NULL;
165 buf++;
167 /* skip spaces */
168 while((c = *buf) != '\0' && isspace(c))
169 buf++;
171 RETURN(buf);
172 return buf;
175 RETURN(NULL);
176 return NULL;
179 /// parseUtf8()
180 static int
181 parseUtf8(STRPTR *ps)
183 STRPTR s = *ps;
184 int wc, n, i;
186 ENTER();
188 if(*s<0x80)
190 *ps = s+1;
192 RETURN(*s);
193 return *s;
196 if(*s<0xc2)
198 RETURN(-1);
199 return -1;
201 else
203 if(*s<0xe0)
205 if((s[1] & 0xc0)!=0x80)
207 RETURN(-1);
208 return -1;
211 *ps = s+2;
213 RETURN(((s[0] & 0x1f)<<6) | (s[1] & 0x3f));
214 return ((s[0] & 0x1f)<<6) | (s[1] & 0x3f);
216 else
218 if(*s<0xf0)
220 n = 3;
222 else
224 if(*s<0xf8)
226 n = 4;
228 else
230 if(*s<0xfc)
232 n = 5;
234 else
236 if(*s<0xfe)
238 n = 6;
240 else
242 RETURN(-1);
243 return -1;
251 wc = *s++ & ((1<<(7-n))-1);
253 for(i = 1; i<n; i++)
255 if((*s & 0xc0) != 0x80)
257 RETURN(-1);
258 return -1;
261 wc = (wc << 6) | (*s++ & 0x3f);
264 if(wc < (1 << (5 * n - 4)))
266 RETURN(-1);
267 return -1;
270 *ps = s;
272 RETURN(wc);
273 return wc;
277 /// countCodesets()
278 static int
279 countCodesets(struct codesetList *csList)
281 struct MinNode *node, *succ;
282 int num;
284 for(node = csList->list.mlh_Head, num = 0; (succ = node->mln_Succ); node = succ)
285 ++num;
287 return num;
291 /// mapUTF8toASCII()
292 // in case some UTF8 sequences can not be converted during CodesetsUTF8ToStrA(), this
293 // function is used to replace these unknown sequences with lookalike characters that
294 // still make the text more readable. For more replacement see
295 // http://www.utf8-zeichentabelle.de/unicode-utf8-table.pl
297 // The conversion table in this function is partly borrowed from the awebcharset plugin
298 // written by Frank Weber. See http://cvs.sunsite.dk/viewcvs.cgi/aweb/plugins/charset/awebcharset.c
// One entry of the UTF8 -> lookalike replacement table
struct UTF8Replacement
{
  const char *utf8;   // UTF8 byte sequence that is going to be replaced
  const int utf8len;  // number of bytes in 'utf8'
  const char *rep;    // the replacement string
  const int replen;   // length of 'rep'; negative if 'rep' is itself an UTF8 string
};
308 static int compareUTF8Replacements(const void *p1, const void *p2)
310 struct UTF8Replacement *key = (struct UTF8Replacement *)p1;
311 struct UTF8Replacement *rep = (struct UTF8Replacement *)p2;
312 int cmp;
314 // compare the length first, after that compare the strings
315 cmp = key->utf8len - rep->utf8len;
316 if(cmp == 0)
317 cmp = memcmp(key->utf8, rep->utf8, key->utf8len);
319 return cmp;
322 static int mapUTF8toASCII(const char **dst, const unsigned char *src, const int utf8len)
324 int len = 0;
325 struct UTF8Replacement key = { (char *)src, utf8len, NULL, 0 };
326 struct UTF8Replacement *rep;
328 static struct UTF8Replacement const utf8map[] =
330 // U+0100 ... U+017F (Latin Extended-A)
331 { "\xC4\x80", 2, "A", 1 }, // U+0100 -> A (LATIN CAPITAL LETTER A WITH MACRON)
332 { "\xC4\x81", 2, "a", 1 }, // U+0101 -> a (LATIN SMALL LETTER A WITH MACRON)
333 { "\xC4\x82", 2, "A", 1 }, // U+0102 -> A (LATIN CAPITAL LETTER A WITH BREVE)
334 { "\xC4\x83", 2, "a", 1 }, // U+0103 -> a (LATIN SMALL LETTER A WITH BREVE)
335 { "\xC4\x84", 2, "A", 1 }, // U+0104 -> A (LATIN CAPITAL LETTER A WITH OGONEK)
336 { "\xC4\x85", 2, "a", 1 }, // U+0105 -> a (LATIN SMALL LETTER A WITH OGONEK)
337 { "\xC4\x86", 2, "C", 1 }, // U+0106 -> C (LATIN CAPITAL LETTER C WITH ACUTE)
338 { "\xC4\x87", 2, "c", 1 }, // U+0107 -> c (LATIN SMALL LETTER C WITH ACUTE)
339 { "\xC4\x88", 2, "C", 1 }, // U+0108 -> C (LATIN CAPITAL LETTER C WITH CIRCUMFLEX)
340 { "\xC4\x89", 2, "c", 1 }, // U+0109 -> c (LATIN SMALL LETTER C WITH CIRCUMFLEX)
341 { "\xC4\x8A", 2, "C", 1 }, // U+010A -> C (LATIN CAPITAL LETTER C WITH DOT ABOVE)
342 { "\xC4\x8B", 2, "c", 1 }, // U+010B -> c (LATIN SMALL LETTER C WITH DOT ABOVE)
343 { "\xC4\x8C", 2, "C", 1 }, // U+010C -> C (LATIN CAPITAL LETTER C WITH CARON)
344 { "\xC4\x8D", 2, "c", 1 }, // U+010D -> c (LATIN SMALL LETTER C WITH CARON)
345 { "\xC4\x8E", 2, "D", 1 }, // U+010E -> D (LATIN CAPITAL LETTER D WITH CARON)
346 { "\xC4\x8F", 2, "d", 1 }, // U+010F -> d (LATIN SMALL LETTER D WITH CARON)
347 { "\xC4\x90", 2, "D", 1 }, // U+0110 -> D (LATIN CAPITAL LETTER D WITH STROKE)
348 { "\xC4\x91", 2, "d", 1 }, // U+0111 -> d (LATIN SMALL LETTER D WITH STROKE)
349 { "\xC4\x92", 2, "E", 1 }, // U+0112 -> E (LATIN CAPITAL LETTER E WITH MACRON)
350 { "\xC4\x93", 2, "e", 1 }, // U+0113 -> e (LATIN SMALL LETTER E WITH MACRON)
351 { "\xC4\x94", 2, "E", 1 }, // U+0114 -> E (LATIN CAPITAL LETTER E WITH BREVE)
352 { "\xC4\x95", 2, "e", 1 }, // U+0115 -> e (LATIN SMALL LETTER E WITH BREVE)
353 { "\xC4\x96", 2, "E", 1 }, // U+0116 -> E (LATIN CAPITAL LETTER E WITH DOT ABOVE)
354 { "\xC4\x97", 2, "e", 1 }, // U+0117 -> e (LATIN SMALL LETTER E WITH DOT ABOVE)
355 { "\xC4\x98", 2, "E", 1 }, // U+0118 -> E (LATIN CAPITAL LETTER E WITH OGONEK)
356 { "\xC4\x99", 2, "e", 1 }, // U+0119 -> e (LATIN SMALL LETTER E WITH OGONEK)
357 { "\xC4\x9A", 2, "E", 1 }, // U+011A -> E (LATIN CAPITAL LETTER E WITH CARON)
358 { "\xC4\x9B", 2, "e", 1 }, // U+011B -> e (LATIN SMALL LETTER E WITH CARON)
359 { "\xC4\x9C", 2, "G", 1 }, // U+011C -> G (LATIN CAPITAL LETTER G WITH CIRCUMFLEX)
360 { "\xC4\x9D", 2, "g", 1 }, // U+011D -> g (LATIN SMALL LETTER G WITH CIRCUMFLEX)
361 { "\xC4\x9E", 2, "G", 1 }, // U+011E -> G (LATIN CAPITAL LETTER G WITH BREVE)
362 { "\xC4\x9F", 2, "g", 1 }, // U+011F -> g (LATIN SMALL LETTER G WITH BREVE)
363 { "\xC4\xA0", 2, "G", 1 }, // U+0120 -> G (LATIN CAPITAL LETTER G WITH DOT ABOVE)
364 { "\xC4\xA1", 2, "g", 1 }, // U+0121 -> g (LATIN SMALL LETTER G WITH DOT ABOVE)
365 { "\xC4\xA2", 2, "G", 1 }, // U+0122 -> G (LATIN CAPITAL LETTER G WITH CEDILLA)
366 { "\xC4\xA3", 2, "g", 1 }, // U+0123 -> g (LATIN SMALL LETTER G WITH CEDILLA)
367 { "\xC4\xA4", 2, "H", 1 }, // U+0124 -> H (LATIN CAPITAL LETTER H WITH CIRCUMFLEX)
368 { "\xC4\xA5", 2, "h", 1 }, // U+0125 -> h (LATIN SMALL LETTER H WITH CIRCUMFLEX)
369 { "\xC4\xA6", 2, "H", 1 }, // U+0126 -> H (LATIN CAPITAL LETTER H WITH STROKE)
370 { "\xC4\xA7", 2, "h", 1 }, // U+0127 -> h (LATIN SMALL LETTER H WITH STROKE)
371 { "\xC4\xA8", 2, "I", 1 }, // U+0128 -> I (LATIN CAPITAL LETTER I WITH TILDE)
372 { "\xC4\xA9", 2, "i", 1 }, // U+0129 -> i (LATIN SMALL LETTER I WITH TILDE)
373 { "\xC4\xAA", 2, "I", 1 }, // U+012A -> I (LATIN CAPITAL LETTER I WITH MACRON)
374 { "\xC4\xAB", 2, "i", 1 }, // U+012B -> i (LATIN SMALL LETTER I WITH MACRON)
375 { "\xC4\xAC", 2, "I", 1 }, // U+012C -> I (LATIN CAPITAL LETTER I WITH BREVE)
376 { "\xC4\xAD", 2, "i", 1 }, // U+012D -> i (LATIN SMALL LETTER I WITH BREVE)
377 { "\xC4\xAE", 2, "I", 1 }, // U+012E -> I (LATIN CAPITAL LETTER I WITH OGONEK)
378 { "\xC4\xAF", 2, "i", 1 }, // U+012F -> i (LATIN SMALL LETTER I WITH OGONEK)
379 { "\xC4\xB0", 2, "I", 1 }, // U+0130 -> I (LATIN CAPITAL LETTER I WITH DOT ABOVE)
380 { "\xC4\xB1", 2, "i", 1 }, // U+0131 -> i (LATIN SMALL LETTER DOTLESS I)
381 { "\xC4\xB2", 2, "Ij", 2 }, // U+0132 -> Ij (LATIN CAPITAL LIGATURE IJ)
382 { "\xC4\xB3", 2, "ij", 2 }, // U+0133 -> ij (LATIN SMALL LIGATURE IJ)
383 { "\xC4\xB4", 2, "J", 1 }, // U+0134 -> J (LATIN CAPITAL LETTER J WITH CIRCUMFLEX)
384 { "\xC4\xB5", 2, "j", 1 }, // U+0135 -> j (LATIN SMALL LETTER J WITH CIRCUMFLEX)
385 { "\xC4\xB6", 2, "K", 1 }, // U+0136 -> K (LATIN CAPITAL LETTER K WITH CEDILLA)
386 { "\xC4\xB7", 2, "k", 1 }, // U+0137 -> k (LATIN SMALL LETTER K WITH CEDILLA)
387 { "\xC4\xB8", 2, "k", 1 }, // U+0138 -> k (LATIN SMALL LETTER KRA)
388 { "\xC4\xB9", 2, "L", 1 }, // U+0139 -> L (LATIN CAPITAL LETTER L WITH ACUTE)
389 { "\xC4\xBA", 2, "l", 1 }, // U+013A -> l (LATIN SMALL LETTER L WITH ACUTE)
390 { "\xC4\xBB", 2, "L", 1 }, // U+013B -> L (LATIN CAPITAL LETTER L WITH CEDILLA)
391 { "\xC4\xBC", 2, "l", 1 }, // U+013C -> l (LATIN SMALL LETTER L WITH CEDILLA)
392 { "\xC4\xBD", 2, "L", 1 }, // U+013D -> L (LATIN CAPITAL LETTER L WITH CARON)
393 { "\xC4\xBE", 2, "l", 1 }, // U+013E -> l (LATIN SMALL LETTER L WITH CARON)
394 { "\xC4\xBF", 2, "L", 1 }, // U+013F -> L (LATIN CAPITAL LETTER L WITH MIDDLE DOT)
395 { "\xC5\x80", 2, "l", 1 }, // U+0140 -> l (LATIN SMALL LETTER L WITH MIDDLE DOT)
396 { "\xC5\x81", 2, "L", 1 }, // U+0141 -> L (LATIN CAPITAL LETTER L WITH STROKE)
397 { "\xC5\x82", 2, "l", 1 }, // U+0142 -> l (LATIN SMALL LETTER L WITH STROKE)
398 { "\xC5\x83", 2, "N", 1 }, // U+0143 -> N (LATIN CAPITAL LETTER N WITH ACUTE)
399 { "\xC5\x84", 2, "n", 1 }, // U+0144 -> n (LATIN SMALL LETTER N WITH ACUTE)
400 { "\xC5\x85", 2, "N", 1 }, // U+0145 -> N (LATIN CAPITAL LETTER N WITH CEDILLA)
401 { "\xC5\x86", 2, "n", 1 }, // U+0146 -> n (LATIN SMALL LETTER N WITH CEDILLA)
402 { "\xC5\x87", 2, "N", 1 }, // U+0147 -> N (LATIN CAPITAL LETTER N WITH CARON)
403 { "\xC5\x88", 2, "n", 1 }, // U+0148 -> n (LATIN SMALL LETTER N WITH CARON)
404 { "\xC5\x89", 2, "'n", 2 }, // U+0149 -> 'n (LATIN SMALL LETTER N PRECEDED BY APOSTROPHE)
405 { "\xC5\x8A", 2, "Ng", 2 }, // U+014A -> Ng (LATIN CAPITAL LETTER ENG)
406 { "\xC5\x8B", 2, "ng", 2 }, // U+014B -> ng (LATIN SMALL LETTER ENG)
407 { "\xC5\x8C", 2, "O", 1 }, // U+014C -> O (LATIN CAPITAL LETTER O WITH MACRON)
408 { "\xC5\x8D", 2, "o", 1 }, // U+014D -> o (LATIN SMALL LETTER O WITH MACRON)
409 { "\xC5\x8E", 2, "O", 1 }, // U+014E -> O (LATIN CAPITAL LETTER O WITH BREVE)
410 { "\xC5\x8F", 2, "o", 1 }, // U+014F -> o (LATIN SMALL LETTER O WITH BREVE)
411 { "\xC5\x90", 2, "O", 1 }, // U+0150 -> O (LATIN CAPITAL LETTER O WITH DOUBLE ACUTE)
412 { "\xC5\x91", 2, "o", 1 }, // U+0151 -> o (LATIN SMALL LETTER O WITH DOUBLE ACUTE)
413 { "\xC5\x92", 2, "Oe", 2 }, // U+0152 -> Oe (LATIN CAPITAL LIGATURE OE)
414 { "\xC5\x93", 2, "oe", 2 }, // U+0153 -> oe (LATIN SMALL LIGATURE OE)
415 { "\xC5\x94", 2, "R", 1 }, // U+0154 -> R (LATIN CAPITAL LETTER R WITH ACUTE)
416 { "\xC5\x95", 2, "r", 1 }, // U+0155 -> r (LATIN SMALL LETTER R WITH ACUTE)
417 { "\xC5\x96", 2, "R", 1 }, // U+0156 -> R (LATIN CAPITAL LETTER R WITH CEDILLA)
418 { "\xC5\x97", 2, "r", 1 }, // U+0157 -> r (LATIN SMALL LETTER R WITH CEDILLA)
419 { "\xC5\x98", 2, "R", 1 }, // U+0158 -> R (LATIN CAPITAL LETTER R WITH CARON)
420 { "\xC5\x99", 2, "r", 1 }, // U+0159 -> r (LATIN SMALL LETTER R WITH CARON)
421 { "\xC5\x9A", 2, "S", 1 }, // U+015A -> S (LATIN CAPITAL LETTER S WITH ACUTE)
422 { "\xC5\x9B", 2, "s", 1 }, // U+015B -> s (LATIN SMALL LETTER S WITH ACUTE)
423 { "\xC5\x9C", 2, "S", 1 }, // U+015C -> S (LATIN CAPITAL LETTER S WITH CIRCUMFLEX)
424 { "\xC5\x9D", 2, "s", 1 }, // U+015D -> s (LATIN SMALL LETTER S WITH CIRCUMFLEX)
425 { "\xC5\x9E", 2, "S", 1 }, // U+015E -> S (LATIN CAPITAL LETTER S WITH CEDILLA)
426 { "\xC5\x9F", 2, "s", 1 }, // U+015F -> s (LATIN SMALL LETTER S WITH CEDILLA)
427 { "\xC5\xA0", 2, "S", 1 }, // U+0160 -> S (LATIN CAPITAL LETTER S WITH CARON)
428 { "\xC5\xA1", 2, "s", 1 }, // U+0161 -> s (LATIN SMALL LETTER S WITH CARON)
429 { "\xC5\xA2", 2, "T", 1 }, // U+0162 -> T (LATIN CAPITAL LETTER T WITH CEDILLA)
430 { "\xC5\xA3", 2, "t", 1 }, // U+0163 -> t (LATIN SMALL LETTER T WITH CEDILLA)
431 { "\xC5\xA4", 2, "T", 1 }, // U+0164 -> T (LATIN CAPITAL LETTER T WITH CARON)
432 { "\xC5\xA5", 2, "t", 1 }, // U+0165 -> t (LATIN SMALL LETTER T WITH CARON)
433 { "\xC5\xA6", 2, "T", 1 }, // U+0166 -> T (LATIN CAPITAL LETTER T WITH STROKE)
434 { "\xC5\xA7", 2, "t", 1 }, // U+0167 -> t (LATIN SMALL LETTER T WITH STROKE)
435 { "\xC5\xA8", 2, "U", 1 }, // U+0168 -> U (LATIN CAPITAL LETTER U WITH TILDE)
436 { "\xC5\xA9", 2, "u", 1 }, // U+0169 -> u (LATIN SMALL LETTER U WITH TILDE)
437 { "\xC5\xAA", 2, "U", 1 }, // U+016A -> U (LATIN CAPITAL LETTER U WITH MACRON)
438 { "\xC5\xAB", 2, "u", 1 }, // U+016B -> u (LATIN SMALL LETTER U WITH MACRON)
439 { "\xC5\xAC", 2, "U", 1 }, // U+016C -> U (LATIN CAPITAL LETTER U WITH BREVE)
440 { "\xC5\xAD", 2, "u", 1 }, // U+016D -> u (LATIN SMALL LETTER U WITH BREVE)
441 { "\xC5\xAE", 2, "U", 1 }, // U+016E -> U (LATIN CAPITAL LETTER U WITH RING ABOVE)
442 { "\xC5\xAF", 2, "u", 1 }, // U+016F -> u (LATIN SMALL LETTER U WITH RING ABOVE)
443 { "\xC5\xB0", 2, "U", 1 }, // U+0170 -> U (LATIN CAPITAL LETTER U WITH DOUBLE ACUTE)
444 { "\xC5\xB1", 2, "u", 1 }, // U+0171 -> u (LATIN SMALL LETTER U WITH DOUBLE ACUTE)
445 { "\xC5\xB2", 2, "U", 1 }, // U+0172 -> U (LATIN CAPITAL LETTER U WITH OGONEK)
446 { "\xC5\xB3", 2, "u", 1 }, // U+0173 -> u (LATIN SMALL LETTER U WITH OGONEK)
447 { "\xC5\xB4", 2, "W", 1 }, // U+0174 -> W (LATIN CAPITAL LETTER W WITH CIRCUMFLEX)
448 { "\xC5\xB5", 2, "w", 1 }, // U+0175 -> w (LATIN SMALL LETTER W WITH CIRCUMFLEX)
449 { "\xC5\xB6", 2, "Y", 1 }, // U+0176 -> Y (LATIN CAPITAL LETTER Y WITH CIRCUMFLEX)
450 { "\xC5\xB7", 2, "y", 1 }, // U+0177 -> y (LATIN SMALL LETTER Y WITH CIRCUMFLEX)
451 { "\xC5\xB8", 2, "Y", 1 }, // U+0178 -> Y (LATIN CAPITAL LETTER Y WITH DIAERESIS)
452 { "\xC5\xB9", 2, "Z", 1 }, // U+0179 -> Z (LATIN CAPITAL LETTER Z WITH ACUTE)
453 { "\xC5\xBA", 2, "z", 1 }, // U+017A -> z (LATIN SMALL LETTER Z WITH ACUTE)
454 { "\xC5\xBB", 2, "Z", 1 }, // U+017B -> Z (LATIN CAPITAL LETTER Z WITH DOT ABOVE)
455 { "\xC5\xBC", 2, "z", 1 }, // U+017C -> z (LATIN SMALL LETTER Z WITH DOT ABOVE)
456 { "\xC5\xBD", 2, "Z", 1 }, // U+017D -> Z (LATIN CAPITAL LETTER Z WITH CARON)
457 { "\xC5\xBE", 2, "z", 1 }, // U+017E -> z (LATIN SMALL LETTER Z WITH CARON)
458 { "\xC5\xBF", 2, "s", 1 }, // U+017F -> s (LATIN SMALL LETTER LONG S
460 // U+2000 ... U+206F (General Punctuation)
461 { "\xE2\x80\x90", 3, "-", 1 }, // U+2010 -> - (HYPHEN)
462 { "\xE2\x80\x91", 3, "-", 1 }, // U+2011 -> - (NON-BREAKING HYPHEN)
463 { "\xE2\x80\x92", 3, "--", 2 }, // U+2012 -> -- (FIGURE DASH)
464 { "\xE2\x80\x93", 3, "--", 2 }, // U+2013 -> -- (EN DASH)
465 { "\xE2\x80\x94", 3, "---", 3 }, // U+2014 -> --- (EM DASH)
466 { "\xE2\x80\x95", 3, "---", 3 }, // U+2015 -> --- (HORIZONTAL BAR)
467 { "\xE2\x80\x96", 3, "||", 2 }, // U+2016 -> || (DOUBLE VERTICAL LINE)
468 { "\xE2\x80\x97", 3, "_", 1 }, // U+2017 -> _ (DOUBLE LOW LINE)
469 { "\xE2\x80\x98", 3, "`", 1 }, // U+2018 -> ` (LEFT SINGLE QUOTATION MARK)
470 { "\xE2\x80\x99", 3, "'", 1 }, // U+2019 -> ' (RIGHT SINGLE QUOTATION MARK)
471 { "\xE2\x80\x9A", 3, ",", 1 }, // U+201A -> , (SINGLE LOW-9 QUOTATION MARK)
472 { "\xE2\x80\x9B", 3, "'", 1 }, // U+201B -> ' (SINGLE HIGH-REVERSED-9 QUOTATION MARK)
473 { "\xE2\x80\x9C", 3, "\"", 1 }, // U+201C -> " (LEFT DOUBLE QUOTATION MARK)
474 { "\xE2\x80\x9D", 3, "\"", 1 }, // U+201D -> " (RIGHT DOUBLE QUOTATION MARK)
475 { "\xE2\x80\x9E", 3, ",,", 2 }, // U+201E -> ,, (DOUBLE LOW-9 QUOTATION MARK)
476 { "\xE2\x80\x9F", 3, "``", 2 }, // U+201F -> `` (DOUBLE HIGH-REVERSED-9 QUOTATION MARK)
477 { "\xE2\x80\xA0", 3, "+", 1 }, // U+2020 -> + (DAGGER)
478 { "\xE2\x80\xA1", 3, "+", 1 }, // U+2021 -> + (DOUBLE DAGGER)
479 { "\xE2\x80\xA2", 3, "\xC2\xB7", -2 }, // U+2022 -> U+00B7 (BULLET) -> (MIDDLE POINT)
480 { "\xE2\x80\xA3", 3, ".", 1 }, // U+2023 -> . (TRIANGULAR BULLET)
481 { "\xE2\x80\xA4", 3, ".", 1 }, // U+2024 -> . (ONE DOT LEADER)
482 { "\xE2\x80\xA5", 3, "..", 2 }, // U+2025 -> .. (TWO DOT LEADER)
483 { "\xE2\x80\xA6", 3, "...", 3 }, // U+2026 -> ... (HORIZONTAL ELLIPSIS)
484 { "\xE2\x80\xA7", 3, "\xC2\xB7", -2 }, // U+2027 -> U+00B7 (HYPHENATION POINT) -> (MIDDLE POINT)
485 { "\xE2\x80\xB0", 3, "%.", 2 }, // U+2030 -> %. (PER MILLE SIGN)
486 { "\xE2\x80\xB1", 3, "%..", 3 }, // U+2031 -> %.. (PER TEN THOUSAND SIGN)
487 { "\xE2\x80\xB2", 3, "'", 1 }, // U+2032 -> ` (PRIME)
488 { "\xE2\x80\xB3", 3, "''", 2 }, // U+2033 -> '' (DOUBLE PRIME)
489 { "\xE2\x80\xB4", 3, "'''", 3 }, // U+2034 -> ''' (TRIPLE PRIME)
490 { "\xE2\x80\xB5", 3, "`", 1 }, // U+2035 -> ` (REVERSED PRIME)
491 { "\xE2\x80\xB6", 3, "``", 2 }, // U+2036 -> `` (REVERSED DOUBLE PRIME)
492 { "\xE2\x80\xB7", 3, "```", 3 }, // U+2037 -> ``` (REVERSED TRIPLE PRIME)
493 { "\xE2\x80\xB8", 3, "^", 1 }, // U+2038 -> ^ (CARET)
494 { "\xE2\x80\xB9", 3, "<", 1 }, // U+2039 -> < (SINGLE LEFT-POINTING ANGLE QUOTATION MARK)
495 { "\xE2\x80\xBA", 3, ">", 1 }, // U+203A -> > (SINGLE RIGHT-POINTING ANGLE QUOTATION MARK)
496 { "\xE2\x80\xBB", 3, "\xC3\x97", -2 }, // U+203B -> U+00D7 (REFERENCE MARK) -> (MULTIPLICATION SIGN)
497 { "\xE2\x80\xBC", 3, "!!", 2 }, // U+203C -> !! (DOUBLE EXCLAMATION MARK)
498 { "\xE2\x80\xBD", 3, "?", 1 }, // U+203D -> ? (INTERROBANG)
499 { "\xE2\x81\x82", 3, "*", 1 }, // U+2042 -> * (ASTERISM)
500 { "\xE2\x81\x83", 3, ".", 1 }, // U+2043 -> . (HYPHEN BULLET)
501 { "\xE2\x81\x84", 3, "/", 1 }, // U+2044 -> / (FRACTION SLASH)
502 { "\xE2\x81\x87", 3, "??", 2 }, // U+2047 -> ?? (DOUBLE QUESTION MARK)
503 { "\xE2\x81\x88", 3, "?!", 2 }, // U+2048 -> ?! (QUESTION EXCLAMATION MARK)
504 { "\xE2\x81\x89", 3, "!?", 2 }, // U+2049 -> !? (EXCLAMATION QUESTION MARK)
505 { "\xE2\x81\x8E", 3, "*", 1 }, // U+204E -> * (LOW ASTERISK)
506 { "\xE2\x81\x8F", 3, ";", 1 }, // U+204F -> ; (REVERSED SEMICOLON)
507 { "\xE2\x81\x91", 3, "*", 1 }, // U+2051 -> * (TWO ASTERISKS ALIGNED VERTICALLY)
508 { "\xE2\x81\x92", 3, "-", 1 }, // U+2052 -> - (COMMERCIAL MINUS SIGN)
509 { "\xE2\x81\x93", 3, "~", 1 }, // U+2053 -> ~ (SWUNG DASH)
510 { "\xE2\x81\x95", 3, "*", 1 }, // U+2055 -> * (FLOWER PUNCTUATION MARK)
511 { "\xE2\x81\x97", 3, "''''", 4 }, // U+2057 -> '''' (QUADRUPLE PRIME)
512 { "\xE2\x81\x9A", 3, ":", 1 }, // U+205A -> : (TWO DOT PUNCTUATION)
513 { "\xE2\x81\x9C", 3, "+", 1 }, // U+205C -> + (DOTTED CROSS)
515 // U+20A0 ... U+20CF (Currency Symbols)
516 { "\xE2\x82\xA0", 3, "ECU", 3 }, // U+20A0 -> ECU (EURO-CURRENCY SIGN)
517 { "\xE2\x82\xA1", 3, "CRC", 3 }, // U+20A1 -> CRC (COLON SIGN)
518 { "\xE2\x82\xA2", 3, "BRC", 3 }, // U+20A2 -> BRC (CRUZEIRO SIGN)
519 { "\xE2\x82\xA3", 3, "BEF", 3 }, // U+20A3 -> BEF (FRENCH FRANC SIGN)
520 { "\xE2\x82\xA4", 3, "ITL", 3 }, // U+20A4 -> ITL (LIRA SIGN)
521 { "\xE2\x82\xA6", 3, "NGN", 3 }, // U+20A6 -> NGN (NEIRA SIGN)
522 { "\xE2\x82\xA7", 3, "ESP", 3 }, // U+20A7 -> ESP (PESETA SIGN)
523 { "\xE2\x82\xA8", 3, "MVQ", 3 }, // U+20A8 -> MVQ (RUPEE SIGN)
524 { "\xE2\x82\xA9", 3, "KPW", 3 }, // U+20A9 -> KPW (WON SIGN)
525 { "\xE2\x82\xAA", 3, "ILS", 3 }, // U+20AA -> ILS (NEW SHEQEL SIGN)
526 { "\xE2\x82\xAB", 3, "VNC", 3 }, // U+20AB -> VNC (DONG SIGN)
527 { "\xE2\x82\xAC", 3, "EUR", 3 }, // U+20AC -> EUR (EURO SIGN)
528 { "\xE2\x82\xAD", 3, "LAK", 3 }, // U+20AD -> LAK (KIP SIGN)
529 { "\xE2\x82\xAE", 3, "MNT", 3 }, // U+20AE -> MNT (TUGRIK SIGN)
530 { "\xE2\x82\xAF", 3, "GRD", 3 }, // U+20AF -> GRD (DRACHMA SIGN)
531 { "\xE2\x82\xB0", 3, "Pf", 2 }, // U+20B0 -> Pf (GERMAN PENNY SIGN)
532 { "\xE2\x82\xB1", 3, "P", 1 }, // U+20B1 -> P (PESO SIGN)
533 { "\xE2\x82\xB2", 3, "PYG", 3 }, // U+20B2 -> PYG (GUARANI SIGN)
534 { "\xE2\x82\xB3", 3, "ARA", 3 }, // U+20B3 -> ARA (AUSTRAL SIGN)
535 { "\xE2\x82\xB4", 3, "UAH", 3 }, // U+20B4 -> UAH (HRYVNIA SIGN)
536 { "\xE2\x82\xB5", 3, "GHS", 3 }, // U+20B5 -> GHS (CEDI SIGN)
538 // U+2190 ... U+21FF (Arrows)
539 { "\xE2\x86\x90", 3, "<-", 2 }, // U+2190 -> <- (LEFTWARDS ARROW)
540 { "\xE2\x86\x92", 3, "->", 2 }, // U+2192 -> -> (RIGHTWARDS ARROW)
543 ENTER();
545 // start with no replacement string
546 *dst = NULL;
548 // perform a binary search in the lookup table
549 if((rep = bsearch(&key, utf8map, sizeof(utf8map) / sizeof(utf8map[0]), sizeof(utf8map[0]), compareUTF8Replacements)) != NULL)
551 // if we found something, then copy this over to the result variables
552 *dst = rep->rep;
553 len = rep->replen;
556 RETURN(len);
557 return len;
561 /// matchCodesetAlias()
// Maps the official MIME name of a codeset to its well-known aliases
struct CodesetAliases
{
  const char *MIMEname; // The official and correct MIME name for a codeset
  const char *Aliases;  // A space separated array with well-known aliases
};

// Table of known codeset names, terminated by an entry whose MIMEname is
// NULL. Every MIME name is listed exactly once.
const struct CodesetAliases codesetAliases[] =
{
  // MIME name       Aliases
  { "Amiga-1251",   "Ami1251 Amiga1251" },
  { "AmigaPL",      "AmiPL Amiga-PL" },
  { "ISO-8859-1",   "ISO8859-1 8859-1" },
  { "ISO-8859-2",   "ISO8859-2 8859-2" },
  { "ISO-8859-3",   "ISO8859-3 8859-3" },
  { "ISO-8859-4",   "ISO8859-4 8859-4" },
  { "ISO-8859-5",   "ISO8859-5 8859-5" },
  { "ISO-8859-6",   "ISO8859-6 8859-6" },
  { "ISO-8859-7",   "ISO8859-7 8859-7" },
  { "ISO-8859-8",   "ISO8859-8 8859-8" },
  { "ISO-8859-9",   "ISO8859-9 8859-9" },
  { "ISO-8859-10",  "ISO8859-10 8859-10" },
  { "ISO-8859-11",  "ISO8859-11 8859-11" },
  { "ISO-8859-12",  "ISO8859-12 8859-12" },
  { "ISO-8859-13",  "ISO8859-13 8859-13" },
  { "ISO-8859-14",  "ISO8859-14 8859-14" },
  { "ISO-8859-15",  "ISO8859-15 8859-15" },
  { "ISO-8859-16",  "ISO8859-16 8859-16" },
  { "KOI8-R",       "KOI8R" },
  { "US-ASCII",     "ASCII" },
  { "UTF-8",        "UTF8 UTF" },
  { "UTF-16",       "UTF16" },
  { "UTF-32",       "UTF32" },
  { "windows-1250", "cp1250 windows1250" },
  { "windows-1251", "cp1251 windows1251" },
  { "windows-1252", "cp1252 windows1252" },
  { "windows-1253", "cp1253 windows1253" },
  { "windows-1254", "cp1254 windows1254" },
  { "windows-1255", "cp1255 windows1255" },
  { "windows-1256", "cp1256 windows1256" },
  { "windows-1257", "cp1257 windows1257" },
  { NULL,           NULL, }
};
607 static char *matchCodesetAlias(const char *search)
609 char *result = NULL;
610 size_t len = strlen(search);
611 int i;
613 ENTER();
615 for(i=0; codesetAliases[i].MIMEname != NULL; i++)
617 BOOL found = FALSE;
619 // search the MIMEname first
620 if(stricmp(search, codesetAliases[i].MIMEname) == 0)
621 found = TRUE;
622 else
624 const char *s = codesetAliases[i].Aliases;
626 // loop through space separated list of aliases
627 while(s != NULL && *s != '\0')
629 if(strnicmp(search, s, len) == 0)
631 found = TRUE;
632 break;
635 if((s = strpbrk(s, " ")) != NULL)
636 s++;
640 if(found == TRUE)
642 result = (char *)codesetAliases[i].MIMEname;
644 break;
648 RETURN(result);
649 return result;
654 /**************************************************************************/
656 /// defaultCodeset()
657 static struct codeset *
658 defaultCodeset(BOOL useSemaphore)
660 char buf[256];
661 struct codeset *codeset;
663 ENTER();
665 if(useSemaphore == TRUE)
666 ObtainSemaphoreShared(&CodesetsBase->libSem);
668 buf[0] = '\0';
669 GetVar("codeset_default",buf,sizeof(buf),GVF_GLOBAL_ONLY);
671 if(buf[0] == '\0' || (codeset = codesetsFind(&CodesetsBase->codesets,buf)) == NULL)
672 codeset = CodesetsBase->systemCodeset;
674 if(useSemaphore == TRUE)
675 ReleaseSemaphore(&CodesetsBase->libSem);
677 RETURN(codeset);
678 return codeset;
681 /// codesetsCmpUnicode()
682 // The compare function
683 static int
684 codesetsCmpUnicode(struct single_convert *arg1,struct single_convert *arg2)
686 return strcmp((char*)&arg1->utf8[1], (char*)&arg2->utf8[1]);
689 /// codesetsReadTable()
691 #define ITEM_STANDARD "Standard"
692 #define ITEM_ALTSTANDARD "AltStandard"
693 #define ITEM_READONLY "ReadOnly"
694 #define ITEM_CHARACTERIZATION "Characterization"
696 // Reads a coding table and adds it
697 static BOOL
698 codesetsReadTable(struct codesetList *csList, STRPTR name)
700 BPTR fh;
701 BOOL res = FALSE;
703 ENTER();
705 D(DBF_STARTUP, "trying to fetch charset file '%s'...", name);
707 if((fh = Open(name, MODE_OLDFILE)))
709 struct codeset *codeset;
711 if((codeset = (struct codeset *)allocVecPooled(CodesetsBase->pool, sizeof(struct codeset))) != NULL)
713 int i;
714 char buf[512];
716 memset(codeset,0,sizeof(struct codeset));
718 for(i = 0; i<256; i++)
719 codeset->table[i].code = codeset->table[i].ucs4 = i;
721 while(readLine(fh, buf, 512*sizeof(char)))
723 const char *result;
725 if(buf[0]=='#')
726 continue;
728 if((result = getConfigItem(buf, ITEM_STANDARD, strlen(ITEM_STANDARD))))
729 codeset->name = mystrdup(result);
730 else if(codeset->name == NULL) // a valid file starts with standard and nothing else!!
731 break;
732 else if((result = getConfigItem(buf,ITEM_ALTSTANDARD,strlen(ITEM_ALTSTANDARD))))
733 codeset->alt_name = mystrdup(result);
734 else if((result = getConfigItem(buf,ITEM_READONLY,strlen(ITEM_READONLY))))
735 codeset->read_only = !!atoi(result);
736 else if((result = getConfigItem(buf,ITEM_CHARACTERIZATION,strlen(ITEM_CHARACTERIZATION))))
738 if((result[0]=='_') && (result[1]=='(') && (result[2]=='"'))
740 char *end = strchr(result + 3, '"');
742 if(end)
743 codeset->characterization = mystrndup(result+3,end-(result+3));
745 else
746 codeset->characterization = mystrdup(result);
748 else
750 char *p = buf;
751 int fmt2 = 0;
753 if((*p=='=') || (fmt2 = ((*p=='0') || (*(p+1)=='x'))))
755 p++;
756 p += fmt2;
758 i = strtol((const char *)p,(char **)&p,16);
759 if(i>0 && i<256)
761 while(isspace(*p)) p++;
763 if(!strnicmp(p, "U+", 2))
765 p += 2;
766 codeset->table[i].ucs4 = strtol((const char *)p,(char **)&p,16);
768 else
770 if(*p!='#')
771 codeset->table[i].ucs4 = strtol((const char *)p,(char **)&p,0);
778 // check if there is not already codeset with the same name in here
779 if(codeset->name != NULL && !(codesetsFind(csList, codeset->name)))
781 for(i=0; i<256; i++)
783 UTF32 src = codeset->table[i].ucs4, *src_ptr = &src;
784 UTF8 *dest_ptr = &codeset->table[i].utf8[1];
786 CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr,src_ptr+1,&dest_ptr,dest_ptr+6,CSF_StrictConversion);
787 *dest_ptr = 0;
788 codeset->table[i].utf8[0] = (ULONG)dest_ptr-(ULONG)(&codeset->table[i].utf8[1]);
791 memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
792 qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), (int (*)(const void *arg1,const void *arg2))codesetsCmpUnicode);
793 AddTail((struct List *)csList, (struct Node *)&codeset->node);
795 res = TRUE;
797 else
799 // cleanup
800 if(codeset->name) freeArbitrateVecPooled(codeset->name);
801 if(codeset->alt_name) freeArbitrateVecPooled(codeset->alt_name);
802 if(codeset->characterization) freeArbitrateVecPooled(codeset->characterization);
803 freeArbitrateVecPooled(codeset);
807 Close(fh);
810 RETURN(res);
811 return res;
814 /// codesetsScanDir()
815 static void
816 codesetsScanDir(struct codesetList *csList, const char *dirPath)
818 ENTER();
820 if(dirPath != NULL && dirPath[0] != '\0')
822 #if defined(__amigaos4__)
823 APTR dirContext;
825 if((dirContext = ObtainDirContextTags(EX_StringNameInput, dirPath,
826 EX_DataFields, EXF_NAME|EXF_TYPE,
827 TAG_END)) != NULL)
829 struct ExamineData *exd;
831 D(DBF_STARTUP, "scanning directory '%s' for codesets tables", dirPath);
833 while((exd = ExamineDir(dirContext)) != NULL)
835 if(EXD_IS_FILE(exd))
837 char filePath[620];
839 strlcpy(filePath, dirPath, sizeof(filePath));
840 AddPart(filePath, exd->Name, sizeof(filePath));
842 D(DBF_STARTUP, "about to read codeset table '%s'", filePath);
844 codesetsReadTable(csList, filePath);
848 ReleaseDirContext(dirContext);
850 #else
851 BPTR dirLock;
853 if((dirLock = Lock(dirPath, ACCESS_READ)))
855 struct ExAllControl *eac;
857 D(DBF_STARTUP, "scanning directory '%s' for codesets tables", dirPath);
859 if((eac = AllocDosObject(DOS_EXALLCONTROL, NULL)) != NULL)
861 struct ExAllData *ead;
862 struct ExAllData *eabuffer;
863 LONG more;
865 eac->eac_LastKey = 0;
866 eac->eac_MatchString = NULL;
867 eac->eac_MatchFunc = NULL;
869 if((eabuffer = allocVecPooled(CodesetsBase->pool, 10*sizeof(struct ExAllData))) != NULL)
871 char filePath[620];
875 more = ExAll(dirLock, eabuffer, 10*sizeof(struct ExAllData), ED_TYPE, eac);
876 if(!more && IoErr() != ERROR_NO_MORE_ENTRIES)
877 break;
879 if(eac->eac_Entries == 0)
880 continue;
882 ead = (struct ExAllData *)eabuffer;
885 // we only take that ead if it is a file (ed_Type < 0)
886 if(ead->ed_Type < 0)
888 strlcpy(filePath, dirPath, sizeof(filePath));
889 AddPart(filePath, (char *)ead->ed_Name, sizeof(filePath));
891 D(DBF_STARTUP, "about to read codeset table '%s'", filePath);
893 codesetsReadTable(csList, filePath);
896 while((ead = ead->ed_Next));
898 while(more);
900 freeVecPooled(CodesetsBase->pool, eabuffer);
903 FreeDosObject(DOS_EXALLCONTROL, eac);
906 UnLock(dirLock);
908 #endif
911 LEAVE();
915 /// codesetsInit()
916 // Initializes and loads the codesets
917 BOOL
918 codesetsInit(struct codesetList *csList)
920 struct codeset *codeset = NULL;
921 UTF32 src;
922 int i;
923 #if defined(__amigaos4__)
924 ULONG nextMIB = 3;
925 #endif
927 ENTER();
929 ObtainSemaphore(&CodesetsBase->poolSem);
931 NewList((struct List *)&CodesetsBase->codesets);
933 // to make the list of the supported codesets complete we also add fake
934 // 'UTF-8' , 'UTF-16' and 'UTF-32' only so that our users can query for those codesets as well.
935 if((codeset = allocVecPooled(CodesetsBase->pool, sizeof(struct codeset))) == NULL)
936 goto end;
938 codeset->name = mystrdup("UTF-8");
939 codeset->alt_name = mystrdup("UTF8");
940 codeset->characterization = mystrdup("Unicode");
941 codeset->read_only = 0;
942 AddTail((struct List *)csList, (struct Node *)&codeset->node);
943 CodesetsBase->utf8Codeset = codeset;
945 if((codeset = allocVecPooled(CodesetsBase->pool, sizeof(struct codeset))) == NULL)
946 goto end;
948 codeset->name = mystrdup("UTF-16");
949 codeset->alt_name = mystrdup("UTF16");
950 codeset->characterization = mystrdup("16-bit Unicode");
951 codeset->read_only = 0;
952 AddTail((struct List *)csList, (struct Node *)&codeset->node);
953 CodesetsBase->utf16Codeset = codeset;
955 if((codeset = allocVecPooled(CodesetsBase->pool, sizeof(struct codeset))) == NULL)
956 goto end;
958 codeset->name = mystrdup("UTF-32");
959 codeset->alt_name = mystrdup("UTF32");
960 codeset->characterization = mystrdup("32-bit Unicode");
961 codeset->read_only = 0;
962 AddTail((struct List *)csList, (struct Node *)&codeset->node);
963 CodesetsBase->utf32Codeset = codeset;
965 // on AmigaOS4 we can use diskfont.library to inquire charset information as
966 // it comes with a quite rich implementation of different charsets.
967 #if defined(__amigaos4__)
970 char *mimename;
971 char *ianaName;
972 ULONG *mapTable;
973 ULONG curMIB = nextMIB;
975 nextMIB = ObtainCharsetInfo(DFCS_NUMBER, curMIB, DFCS_NEXTNUMBER);
976 if(nextMIB == 0)
977 break;
979 mapTable = (ULONG *)ObtainCharsetInfo(DFCS_NUMBER, curMIB, DFCS_MAPTABLE);
980 mimename = (char *)ObtainCharsetInfo(DFCS_NUMBER, curMIB, DFCS_MIMENAME);
981 ianaName = (char *)ObtainCharsetInfo(DFCS_NUMBER, curMIB, DFCS_NAME);
982 if(mapTable != NULL && mimename != NULL && codesetsFind(csList, mimename) == NULL)
984 D(DBF_STARTUP, "loading charset '%s' from diskfont.library...", mimename);
986 if((codeset = allocVecPooled(CodesetsBase->pool, sizeof(struct codeset))) == NULL)
987 goto end;
989 codeset->name = mystrdup(mimename);
990 codeset->alt_name = NULL;
991 codeset->characterization = mystrdup(ianaName);
992 codeset->read_only = 0;
994 for(i=0; i<256; i++)
996 UTF32 *src_ptr = &src;
997 UTF8 *dest_ptr = &codeset->table[i].utf8[1];
999 src = mapTable[i];
1001 codeset->table[i].code = i;
1002 codeset->table[i].ucs4 = src;
1003 CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr, src_ptr+1, &dest_ptr, dest_ptr+6, CSF_StrictConversion);
1004 *dest_ptr = 0;
1005 codeset->table[i].utf8[0] = (ULONG)dest_ptr-(ULONG)&codeset->table[i].utf8[1];
1008 memcpy(codeset->table_sorted,codeset->table,sizeof(codeset->table));
1009 qsort(codeset->table_sorted,256,sizeof(codeset->table[0]),(int (*)(const void *arg1, const void *arg2))codesetsCmpUnicode);
1011 AddTail((struct List *)csList, (struct Node *)&codeset->node);
1014 while(TRUE);
1015 #endif
1017 #if defined(__MORPHOS__)
1019 struct Library *KeymapBase;
1020 struct Library *LocaleBase;
1022 if((KeymapBase = OpenLibrary("keymap.library", 51)) != NULL)
1024 if((LocaleBase = OpenLibrary("locale.library", 51)) != NULL)
1026 struct KeyMap *keymap = AskKeyMapDefault();
1027 CONST_STRPTR name = GetKeyMapCodepage(keymap);
1029 if(name != NULL && keymap != NULL) // Legacy keymaps dont have codepage or Unicode mappings
1031 D(DBF_STARTUP, "loading charset '%s' from keymap.library...", name);
1033 if((codeset = allocVecPooled(CodesetsBase->pool, sizeof(struct codeset))) != NULL)
1035 codeset->name = mystrdup(name);
1036 codeset->alt_name = NULL;
1037 codeset->characterization = mystrdup(name); // No more information available
1038 codeset->read_only = 0;
1040 for(i=0; i<256; i++)
1042 UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1043 LONG rc;
1045 codeset->table[i].code = i;
1046 codeset->table[i].ucs4 = src = ToUCS4(i, keymap);
1047 rc = ConvertUCS4ToUTF8((CONST_WSTRPTR)&src, dest_ptr, 1);
1048 dest_ptr[rc] = 0;
1049 codeset->table[i].utf8[0] = rc;
1052 memcpy(codeset->table_sorted,codeset->table,sizeof(codeset->table));
1053 qsort(codeset->table_sorted,256,sizeof(codeset->table[0]),(int (*)(const void *arg1, const void *arg2))codesetsCmpUnicode);
1055 AddTail((struct List *)csList, (struct Node *)&codeset->node);
1057 else
1058 goto end;
1061 CloseLibrary(LocaleBase);
1064 CloseLibrary(KeymapBase);
1067 #endif
1069 D(DBF_STARTUP, "loading charsets from Libs:Charsets...");
1071 // we try to walk to the LIBS:Charsets directory on our own and readin our
1072 // own charset tables
1073 codesetsScanDir(csList, "LIBS:Charsets");
1076 // now we go and initialize our internally supported codesets but only if
1077 // we have not already loaded a charset with the same name
1079 D(DBF_STARTUP, "initializing internal charsets...");
1081 // ISO-8859-1 + EURO
1082 if(codesetsFind(csList, "ISO-8859-1 + Euro") == NULL)
1084 if((codeset = allocVecPooled(CodesetsBase->pool, sizeof(struct codeset))) == NULL)
1085 goto end;
1087 codeset->name = mystrdup("ISO-8859-1 + Euro");
1088 codeset->alt_name = NULL;
1089 codeset->characterization = mystrdup("West European (with EURO)");
1090 codeset->read_only = 1;
1091 for(i = 0; i<256; i++)
1093 UTF32 *src_ptr = &src;
1094 UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1096 if(i==164)
1097 src = 0x20AC; /* the EURO sign */
1098 else
1099 src = i;
1101 codeset->table[i].code = i;
1102 codeset->table[i].ucs4 = src;
1103 CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr, src_ptr+1, &dest_ptr, dest_ptr+6, CSF_StrictConversion);
1104 *dest_ptr = 0;
1105 codeset->table[i].utf8[0] = (ULONG)dest_ptr-(ULONG)&codeset->table[i].utf8[1];
1107 memcpy(codeset->table_sorted,codeset->table,sizeof(codeset->table));
1108 qsort(codeset->table_sorted,256,sizeof(codeset->table[0]),(int (*)(const void *arg1, const void *arg2))codesetsCmpUnicode);
1109 AddTail((struct List *)csList, (struct Node *)&codeset->node);
1112 // ISO-8859-1
1113 if(codesetsFind(csList, "ISO-8859-1") == NULL)
1115 if((codeset = allocVecPooled(CodesetsBase->pool, sizeof(struct codeset))) == NULL)
1116 goto end;
1118 codeset->name = mystrdup("ISO-8859-1");
1119 codeset->alt_name = mystrdup("ISO8859-1");
1120 codeset->characterization = mystrdup("West European");
1121 codeset->read_only = 0;
1122 for(i = 0; i<256; i++)
1124 UTF32 *src_ptr = &src;
1125 UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1127 src = i;
1129 codeset->table[i].code = i;
1130 codeset->table[i].ucs4 = src;
1131 CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr, src_ptr+1, &dest_ptr, dest_ptr+6, CSF_StrictConversion);
1132 *dest_ptr = 0;
1133 codeset->table[i].utf8[0] = (ULONG)dest_ptr-(ULONG)&codeset->table[i].utf8[1];
1135 memcpy(codeset->table_sorted,codeset->table,sizeof(codeset->table));
1136 qsort(codeset->table_sorted,256,sizeof(codeset->table[0]),(int (*)(const void *arg1,const void *arg2))codesetsCmpUnicode);
1137 AddTail((struct List *)csList, (struct Node *)&codeset->node);
1140 // ISO-8859-2
1141 if(codesetsFind(csList, "ISO-8859-2") == NULL)
1143 if((codeset = allocVecPooled(CodesetsBase->pool, sizeof(struct codeset))) == NULL)
1144 goto end;
1146 codeset->name = mystrdup("ISO-8859-2");
1147 codeset->alt_name = mystrdup("ISO8859-2");
1148 codeset->characterization = mystrdup("Central/East European");
1149 codeset->read_only = 0;
1150 for(i = 0; i<256; i++)
1152 UTF32 *src_ptr = &src;
1153 UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1155 if(i<0xa0)
1156 src = i;
1157 else
1158 src = iso_8859_2_to_ucs4[i-0xa0];
1160 codeset->table[i].code = i;
1161 codeset->table[i].ucs4 = src;
1162 CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr, src_ptr+1, &dest_ptr,dest_ptr+6, CSF_StrictConversion);
1163 *dest_ptr = 0;
1164 codeset->table[i].utf8[0] = (ULONG)dest_ptr-(ULONG)&codeset->table[i].utf8[1];
1166 memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
1167 qsort(codeset->table_sorted,256,sizeof(codeset->table[0]),(int (*)(const void *arg1,const void *arg2))codesetsCmpUnicode);
1168 AddTail((struct List *)csList, (struct Node *)&codeset->node);
1171 // ISO-8859-3
1172 if(codesetsFind(csList, "ISO-8859-3") == NULL)
1174 if((codeset = allocVecPooled(CodesetsBase->pool, sizeof(struct codeset))) == NULL)
1175 goto end;
1177 codeset->name = mystrdup("ISO-8859-3");
1178 codeset->alt_name = mystrdup("ISO8859-3");
1179 codeset->characterization = mystrdup("South European");
1180 codeset->read_only = 0;
1181 for(i = 0; i<256; i++)
1183 UTF32 *src_ptr = &src;
1184 UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1186 if(i<0xa0)
1187 src = i;
1188 else
1189 src = iso_8859_3_to_ucs4[i-0xa0];
1191 codeset->table[i].code = i;
1192 codeset->table[i].ucs4 = src;
1193 CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr,src_ptr+1,&dest_ptr,dest_ptr+6,CSF_StrictConversion);
1194 *dest_ptr = 0;
1195 codeset->table[i].utf8[0] = (ULONG)dest_ptr-(ULONG)&codeset->table[i].utf8[1];
1197 memcpy(codeset->table_sorted,codeset->table,sizeof(codeset->table));
1198 qsort(codeset->table_sorted,256,sizeof(codeset->table[0]),(int (*)(const void *arg1,const void *arg2))codesetsCmpUnicode);
1199 AddTail((struct List *)csList, (struct Node *)&codeset->node);
1202 // ISO-8859-4
1203 if(codesetsFind(csList, "ISO-8859-4") == NULL)
1205 if((codeset = allocVecPooled(CodesetsBase->pool, sizeof(struct codeset))) == NULL)
1206 goto end;
1208 codeset->name = mystrdup("ISO-8859-4");
1209 codeset->alt_name = mystrdup("ISO8859-4");
1210 codeset->characterization = mystrdup("North European");
1211 codeset->read_only = 0;
1212 for(i = 0; i<256; i++)
1214 UTF32 *src_ptr = &src;
1215 UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1217 if(i<0xa0)
1218 src = i;
1219 else
1220 src = iso_8859_4_to_ucs4[i-0xa0];
1222 codeset->table[i].code = i;
1223 codeset->table[i].ucs4 = src;
1224 CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr,src_ptr+1,&dest_ptr,dest_ptr+6,CSF_StrictConversion);
1225 *dest_ptr = 0;
1226 codeset->table[i].utf8[0] = (ULONG)dest_ptr-(ULONG)&codeset->table[i].utf8[1];
1228 memcpy(codeset->table_sorted,codeset->table,sizeof(codeset->table));
1229 qsort(codeset->table_sorted,256,sizeof(codeset->table[0]),(int (*)(const void *arg1, const void *arg2))codesetsCmpUnicode);
1230 AddTail((struct List *)csList, (struct Node *)&codeset->node);
1233 // ISO-8859-5
1234 if(codesetsFind(csList, "ISO-8859-5") == NULL)
1236 if((codeset = allocVecPooled(CodesetsBase->pool, sizeof(struct codeset))) == NULL)
1237 goto end;
1239 codeset->name = mystrdup("ISO-8859-5");
1240 codeset->alt_name = mystrdup("ISO8859-5");
1241 codeset->characterization = mystrdup("Slavic languages");
1242 codeset->read_only = 0;
1243 for(i = 0; i<256; i++)
1245 UTF32 *src_ptr = &src;
1246 UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1248 if(i<0xa0)
1249 src = i;
1250 else
1251 src = iso_8859_5_to_ucs4[i-0xa0];
1253 codeset->table[i].code = i;
1254 codeset->table[i].ucs4 = src;
1255 CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr,src_ptr+1,&dest_ptr,dest_ptr+6,CSF_StrictConversion);
1256 *dest_ptr = 0;
1257 codeset->table[i].utf8[0] = (ULONG)dest_ptr-(ULONG)&codeset->table[i].utf8[1];
1259 memcpy(codeset->table_sorted,codeset->table,sizeof(codeset->table));
1260 qsort(codeset->table_sorted,256,sizeof(codeset->table[0]),(int (*)(const void *arg1,const void *arg2))codesetsCmpUnicode);
1261 AddTail((struct List *)csList, (struct Node *)&codeset->node);
1264 // ISO-8859-9
1265 if(codesetsFind(csList, "ISO-8859-9") == NULL)
1267 if((codeset = allocVecPooled(CodesetsBase->pool, sizeof(struct codeset))) == NULL)
1268 goto end;
1270 codeset->name = mystrdup("ISO-8859-9");
1271 codeset->alt_name = mystrdup("ISO8859-9");
1272 codeset->characterization = mystrdup("Turkish");
1273 codeset->read_only = 0;
1274 for(i = 0; i<256; i++)
1276 UTF32 *src_ptr = &src;
1277 UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1279 if(i<0xa0)
1280 src = i;
1281 else
1282 src = iso_8859_9_to_ucs4[i-0xa0];
1284 codeset->table[i].code = i;
1285 codeset->table[i].ucs4 = src;
1286 CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr,src_ptr+1,&dest_ptr,dest_ptr+6,CSF_StrictConversion);
1287 *dest_ptr = 0;
1288 codeset->table[i].utf8[0] = (ULONG)dest_ptr-(ULONG)&codeset->table[i].utf8[1];
1290 memcpy(codeset->table_sorted,codeset->table,sizeof(codeset->table));
1291 qsort(codeset->table_sorted,256,sizeof(codeset->table[0]),(int (*)(const void *arg1,const void *arg2))codesetsCmpUnicode);
1292 AddTail((struct List *)csList, (struct Node *)&codeset->node);
1295 // ISO-8859-15
1296 if(codesetsFind(csList, "ISO-8859-15") == NULL)
1298 if((codeset = allocVecPooled(CodesetsBase->pool, sizeof(struct codeset))) == NULL)
1299 goto end;
1301 codeset->name = mystrdup("ISO-8859-15");
1302 codeset->alt_name = mystrdup("ISO8859-15");
1303 codeset->characterization = mystrdup("West European II");
1304 codeset->read_only = 0;
1305 for(i = 0; i<256; i++)
1307 UTF32 *src_ptr = &src;
1308 UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1310 if(i<0xa0)
1311 src = i;
1312 else
1313 src = iso_8859_15_to_ucs4[i-0xa0];
1315 codeset->table[i].code = i;
1316 codeset->table[i].ucs4 = src;
1317 CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr,src_ptr+1,&dest_ptr,dest_ptr+6,CSF_StrictConversion);
1318 *dest_ptr = 0;
1319 codeset->table[i].utf8[0] = (ULONG)dest_ptr-(ULONG)&codeset->table[i].utf8[1];
1321 memcpy(codeset->table_sorted,codeset->table,sizeof (codeset->table));
1322 qsort(codeset->table_sorted,256,sizeof(codeset->table[0]),(int (*)(const void *arg1,const void *arg2))codesetsCmpUnicode);
1323 AddTail((struct List *)csList, (struct Node *)&codeset->node);
1326 // ISO-8859-16
1327 if(codesetsFind(csList, "ISO-8859-16") == NULL)
1329 if((codeset = allocVecPooled(CodesetsBase->pool, sizeof(struct codeset))) == NULL)
1330 goto end;
1332 codeset->name = mystrdup("ISO-8859-16");
1333 codeset->alt_name = mystrdup("ISO8869-16");
1334 codeset->characterization = mystrdup("South-Eastern European");
1335 codeset->read_only = 0;
1336 for(i=0;i<256;i++)
1338 UTF32 *src_ptr = &src;
1339 UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1341 if(i < 0xa0)
1342 src = i;
1343 else
1344 src = iso_8859_16_to_ucs4[i-0xa0];
1346 codeset->table[i].code = i;
1347 codeset->table[i].ucs4 = src;
1348 CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr, src_ptr+1, &dest_ptr, dest_ptr+6, CSF_StrictConversion);
1349 *dest_ptr = 0;
1350 codeset->table[i].utf8[0] = (ULONG)dest_ptr - (ULONG)&codeset->table[i].utf8[1];
1352 memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
1353 qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), (int (*)(const void *arg1, const void *arg2))codesetsCmpUnicode);
1354 AddTail((struct List *)csList, (struct Node *)&codeset->node);
1357 // KOI8-R
1358 if(codesetsFind(csList, "KOI8-R") == NULL)
1360 if((codeset = allocVecPooled(CodesetsBase->pool, sizeof(struct codeset))) == NULL)
1361 goto end;
1363 codeset->name = mystrdup("KOI8-R");
1364 codeset->alt_name = mystrdup("KOI8R");
1365 codeset->characterization = mystrdup("Russian");
1366 codeset->read_only = 0;
1367 for(i = 0; i<256; i++)
1369 UTF32 *src_ptr = &src;
1370 UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1372 if(i<0x80)
1373 src = i;
1374 else
1375 src = koi8r_to_ucs4[i-0x80];
1377 codeset->table[i].code = i;
1378 codeset->table[i].ucs4 = src;
1379 CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr,src_ptr+1,&dest_ptr,dest_ptr+6,CSF_StrictConversion);
1380 *dest_ptr = 0;
1381 codeset->table[i].utf8[0] = (ULONG)dest_ptr-(ULONG)&codeset->table[i].utf8[1];
1383 memcpy(codeset->table_sorted,codeset->table,sizeof(codeset->table));
1384 qsort(codeset->table_sorted,256,sizeof(codeset->table[0]),(int (*)(const void *arg1,const void *arg2))codesetsCmpUnicode);
1385 AddTail((struct List *)csList, (struct Node *)&codeset->node);
1388 // AmigaPL
1389 if(codesetsFind(csList, "AmigaPL") == NULL)
1391 if((codeset = allocVecPooled(CodesetsBase->pool, sizeof(struct codeset))) == NULL)
1392 goto end;
1394 codeset->name = mystrdup("AmigaPL");
1395 codeset->alt_name = mystrdup("AmiPL");
1396 codeset->characterization = mystrdup("Polish (Amiga)");
1397 codeset->read_only = 1;
1398 for(i=0; i<256; i++)
1400 UTF32 *src_ptr = &src;
1401 UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1403 if(i<0xa0)
1404 src = i;
1405 else
1406 src = amigapl_to_ucs4[i-0xa0];
1408 codeset->table[i].code = i;
1409 codeset->table[i].ucs4 = src;
1410 CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr,src_ptr+1,&dest_ptr,dest_ptr+6,CSF_StrictConversion);
1411 *dest_ptr = 0;
1412 codeset->table[i].utf8[0] = (ULONG)dest_ptr-(ULONG)&codeset->table[i].utf8[1];
1414 memcpy(codeset->table_sorted,codeset->table,sizeof(codeset->table));
1415 qsort(codeset->table_sorted,256,sizeof(codeset->table[0]),(int (*)(const void *arg1,const void *arg2))codesetsCmpUnicode);
1416 AddTail((struct List *)csList, (struct Node *)&codeset->node);
1419 // Amiga-1251
1420 if(codesetsFind(csList, "Amiga-1251") == NULL)
1422 if((codeset = allocVecPooled(CodesetsBase->pool, sizeof(struct codeset))) == NULL)
1423 goto end;
1425 codeset->name = mystrdup("Amiga-1251");
1426 codeset->alt_name = mystrdup("Ami1251");
1427 codeset->characterization = mystrdup("Cyrillic (Amiga)");
1428 codeset->read_only = 1;
1429 for(i=0; i<256; i++)
1431 UTF32 *src_ptr = &src;
1432 UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1434 if(i < 0xa0)
1435 src = i;
1436 else
1437 src = amiga1251_to_ucs4[i-0xa0];
1439 codeset->table[i].code = i;
1440 codeset->table[i].ucs4 = src;
1441 CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr, src_ptr+1, &dest_ptr, dest_ptr+6, CSF_StrictConversion);
1442 *dest_ptr = 0;
1443 codeset->table[i].utf8[0] = (char*)dest_ptr - (char*)&codeset->table[i].utf8[1];
1445 memcpy(codeset->table_sorted,codeset->table,sizeof(codeset->table));
1446 qsort(codeset->table_sorted,256,sizeof(codeset->table[0]),(int (*)(const void *arg1, const void *arg2))codesetsCmpUnicode);
1447 AddTail((struct List *)csList, (struct Node *)&codeset->node);
1450 end:
1451 ReleaseSemaphore(&CodesetsBase->poolSem);
1453 RETURN(codeset != 0);
1454 return codeset != NULL;
1458 /// codesetsCleanup()
1459 // Cleanup the memory for the codeset
1460 void
1461 codesetsCleanup(struct codesetList *csList)
1463 struct codeset *code;
1465 ENTER();
1467 while((code = (struct codeset *)RemHead((struct List *)csList)))
1469 if(code->name) freeArbitrateVecPooled(code->name);
1470 if(code->alt_name) freeArbitrateVecPooled(code->alt_name);
1471 if(code->characterization) freeArbitrateVecPooled(code->characterization);
1473 freeArbitrateVecPooled(code);
1476 LEAVE();
1480 /// codesetsFind()
1481 // Returns the given codeset.
1482 struct codeset *
1483 codesetsFind(struct codesetList *csList, const char *name)
1485 struct codeset *res = NULL;
1487 ENTER();
1489 if(name && *name)
1491 struct codeset *mstate, *succ;
1492 char *matchedName = matchCodesetAlias(name);
1494 if(matchedName != NULL)
1495 name = matchedName;
1497 for(mstate = (struct codeset *)csList->list.mlh_Head; (succ = (struct codeset *)mstate->node.mln_Succ); mstate = succ)
1499 if(stricmp(name, mstate->name) == 0 ||
1500 (mstate->alt_name != NULL && stricmp(name, mstate->alt_name) == 0))
1502 // break out
1503 break;
1507 if(succ)
1508 res = mstate;
1511 RETURN(res);
1512 return res;
1515 /// codesetsFindBest()
1516 // Returns the best codeset for the given text
1517 static struct codeset *
1518 codesetsFindBest(struct TagItem *attrs, ULONG csFamily, STRPTR text, int text_len, int *error_ptr)
1520 struct codeset *best_codeset = NULL;
1521 int best_errors = text_len;
1522 BOOL found = FALSE;
1524 ENTER();
1526 // in case the user specified the codeset family as a
1527 // cyrillic one we go and do our cyrillic specific analysis first
1528 if(csFamily == CSV_CodesetFamily_Cyrillic)
1530 #define NUM_CYRILLIC 3
1532 struct CodesetSearch
1534 const char *name;
1535 const char *data;
1538 struct CodesetSearch search[NUM_CYRILLIC];
1539 unsigned char *p;
1540 unsigned char *tp;
1541 int ctr[NUM_CYRILLIC];
1542 int Nmax;
1543 int NGlob = 1;
1544 int max;
1545 int gr = 0;
1546 int lr = 0;
1548 search[0].name = "windows-1251";
1549 search[0].data = cp1251_data;
1550 search[1].name = "IBM866";
1551 search[1].data = cp866_data;
1552 search[2].name = "KOI8-R";
1553 search[2].data = koi8r_data;
1555 memset(&ctr, 0, sizeof(ctr));
1557 tp = (unsigned char *)text;
1561 int n;
1562 int mid = max = -466725766; // TODO: what's the magic behind this constant?
1563 Nmax = 0;
1565 for(n=0; n < NUM_CYRILLIC; n++)
1567 unsigned char la = 0;
1568 unsigned char *tptr = (unsigned char *)search[n].data;
1570 p = tp;
1574 unsigned char lb = (*p++) ^ 128;
1576 if(!((la | lb) & 128))
1577 ctr[n] += (signed char)tptr[(la << 7) + lb];
1579 la = lb;
1581 while(*p);
1583 if(max < ctr[n])
1585 mid = max;
1586 max = ctr[n];
1587 Nmax = n+1;
1591 tp = p;
1592 if((max >= 500) && ((max-mid) >= 1000))
1594 lr = gr = 1;
1595 NGlob = Nmax;
1598 while((*p) && (!gr));
1600 if(gr || ((!(*p)) && lr))
1601 Nmax = NGlob;
1603 // if our analysis found something, we go and try
1604 // to find the corresponding codeset in out codeset list
1605 if(max != 0)
1607 struct TagItem *tstate = attrs;
1608 struct TagItem *tag;
1610 D(DBF_STARTUP, "identified text as '%s", search[Nmax-1].name);
1612 // now we walk through our taglist and check if the user
1613 // supplied
1614 while((tag = NextTagItem(&tstate)))
1616 if(tag->ti_Tag == CSA_CodesetList && tag->ti_Data != 0)
1618 struct codesetList *csList = (struct codesetList *)tag->ti_Data;
1620 if((best_codeset = codesetsFind(csList, search[Nmax-1].name)) != NULL)
1621 break;
1625 // if we still haven't found the matching codeset
1626 // we search the internal list
1627 if(best_codeset == NULL)
1628 best_codeset = codesetsFind(&CodesetsBase->codesets, search[Nmax-1].name);
1630 best_errors = 0;
1632 found = TRUE;
1636 // if we haven't found the best codeset (through the cyrillic analysis
1637 // we go and do the dumb latin search in our codesetlist
1638 if(found == FALSE)
1640 struct TagItem *tstate = attrs;
1641 struct TagItem *tag;
1642 BOOL lastIteration = FALSE;
1644 while((tag = NextTagItem(&tstate)) || (lastIteration = TRUE))
1646 if(lastIteration == TRUE || (tag->ti_Tag == CSA_CodesetList && tag->ti_Data != 0))
1648 struct codesetList *csList = (lastIteration ? &CodesetsBase->codesets : (struct codesetList *)tag->ti_Data);
1649 struct codeset *codeset = (struct codeset *)csList->list.mlh_Head;
1651 // the following identification/detection routines is NOT really smart.
1652 // we just see how each UTF8 string is the representation of each char
1653 // in our source text and then check if they are valid or not. As said,
1654 // not very smart, but we don't have anything better right now :(
1656 while(codeset)
1658 if(!codeset->read_only && codeset != CodesetsBase->utf8Codeset)
1660 char *text_ptr = text;
1661 int i;
1662 int errors = 0;
1664 for(i=0; i < text_len; i++)
1666 unsigned char c = *text_ptr++;
1668 if(c)
1670 struct single_convert *f = &codeset->table[c];
1672 if(f->utf8[0] == 0 || f->utf8[1] == 0x00)
1673 errors++;
1675 else
1676 break;
1679 D(DBF_STARTUP, "tried to identify text as '%s' text with %ld of %ld errors", codeset->name, errors, text_len);
1681 if(errors < best_errors)
1683 best_codeset = codeset;
1684 best_errors = errors;
1687 if(best_errors == 0)
1688 break;
1691 codeset = (struct codeset *)codeset->node.mln_Succ;
1694 if(lastIteration)
1695 break;
1700 if(error_ptr)
1701 *error_ptr = best_errors;
1703 RETURN(best_codeset);
1704 return best_codeset;
1708 /**************************************************************************/
1710 /// CodesetsSupportedA()
1711 STRPTR *LIBFUNC
1712 CodesetsSupportedA(REG(a0, UNUSED struct TagItem * attrs))
1714 STRPTR *array = NULL;
1715 struct TagItem *tstate = attrs;
1716 struct TagItem *tag;
1717 int numCodesets;
1719 ENTER();
1721 // first we need to check how many codesets our supplied
1722 // lists carry.
1723 numCodesets = countCodesets(&CodesetsBase->codesets);
1724 while((tag = NextTagItem(&tstate)))
1726 if(tag->ti_Tag == CSA_CodesetList && tag->ti_Data != 0)
1727 numCodesets += countCodesets((struct codesetList *)tag->ti_Data);
1730 // now that we know how many codesets we have in our lists we
1731 // can put their names into our string arrays
1732 if(numCodesets > 0)
1734 if((array = allocArbitrateVecPooled((numCodesets+1)*sizeof(STRPTR))))
1736 struct codeset *code;
1737 struct codeset *succ;
1738 int i=0;
1740 // reset the tstate
1741 tstate = attrs;
1743 ObtainSemaphoreShared(&CodesetsBase->libSem);
1745 // first we walk through the internal codesets list and
1746 // add the names
1747 for(code = (struct codeset *)CodesetsBase->codesets.list.mlh_Head; (succ = (struct codeset *)code->node.mln_Succ); code = succ, i++)
1748 array[i] = code->name;
1750 // then we also iterate through our private codesets list
1751 while((tag = NextTagItem(&tstate)))
1753 if(tag->ti_Tag == CSA_CodesetList && tag->ti_Data != 0)
1755 struct codesetList *csList = (struct codesetList *)tag->ti_Data;
1757 for(code = (struct codeset *)csList->list.mlh_Head; (succ = (struct codeset *)code->node.mln_Succ); code = succ, i++)
1758 array[i] = code->name;
1762 array[i] = NULL;
1764 ReleaseSemaphore(&CodesetsBase->libSem);
1768 RETURN(array);
1769 return array;
1773 /// CodesetsFreeA()
1774 void LIBFUNC
1775 CodesetsFreeA(REG(a0, APTR obj),
1776 REG(a1, UNUSED struct TagItem *attrs))
1778 ENTER();
1780 if(obj)
1781 freeArbitrateVecPooled(obj);
1783 LEAVE();
1787 /// CodesetsSetDefaultA()
1788 struct codeset *LIBFUNC
1789 CodesetsSetDefaultA(REG(a0, STRPTR name),
1790 REG(a1, struct TagItem *attrs))
1792 struct codeset *codeset;
1794 ENTER();
1796 ObtainSemaphoreShared(&CodesetsBase->libSem);
1798 if((codeset = codesetsFind(&CodesetsBase->codesets,name)))
1800 ULONG flags;
1802 flags = GVF_SAVE_VAR | (GetTagData(CSA_Save,FALSE,attrs) ? GVF_GLOBAL_ONLY : 0);
1804 SetVar("codeset_default",codeset->name,strlen(codeset->name),flags);
1807 ReleaseSemaphore(&CodesetsBase->libSem);
1809 RETURN(codeset);
1810 return codeset;
1814 /// CodesetsFindA()
1815 struct codeset *LIBFUNC
1816 CodesetsFindA(REG(a0, STRPTR name), REG(a1, struct TagItem *attrs))
1818 struct codeset *codeset = NULL;
1820 ENTER();
1822 ObtainSemaphoreShared(&CodesetsBase->libSem);
1824 // if no name pointer was supplied we have to return
1825 // the default codeset only.
1826 if(name != NULL)
1828 // we first walk through our internal list and check if we
1829 // can find the requested codeset
1830 codeset = codesetsFind(&CodesetsBase->codesets, name);
1832 if(codeset == NULL && attrs != NULL)
1834 struct TagItem *tstate = attrs;
1835 struct TagItem *tag;
1837 // now we walk through our taglist and check if the user
1838 // supplied
1839 while((tag = NextTagItem(&tstate)))
1841 if(tag->ti_Tag == CSA_CodesetList && tag->ti_Data != 0)
1843 struct codesetList *csList = (struct codesetList *)tag->ti_Data;
1845 if((codeset = codesetsFind(csList, name)) != NULL)
1846 break;
1852 // check if we found something or not.
1853 if(codeset == NULL && (attrs == NULL || GetTagData(CSA_FallbackToDefault, TRUE, attrs)))
1854 codeset = defaultCodeset(FALSE);
1856 ReleaseSemaphore(&CodesetsBase->libSem);
1858 RETURN(codeset);
1859 return codeset;
1863 /// CodesetsFindBestA()
1864 struct codeset *LIBFUNC
1865 CodesetsFindBestA(REG(a0, struct TagItem *attrs))
1867 struct codeset *codeset = NULL;
1869 ENTER();
1871 ObtainSemaphoreShared(&CodesetsBase->libSem);
1873 if(attrs)
1875 char *text = (char *)GetTagData(CSA_Source, 0, attrs);
1876 ULONG text_len = GetTagData(CSA_SourceLen, text != NULL ? strlen(text) : 0, attrs);
1878 if(text != NULL && text_len > 0)
1880 int numErrors = 0;
1881 ULONG csFamily = GetTagData(CSA_CodesetFamily, CSV_CodesetFamily_Latin, attrs);
1882 int *error_ptr = (int *)GetTagData(CSA_ErrPtr, 0, attrs);
1883 BOOL defaultFallBack = GetTagData(CSA_FallbackToDefault, FALSE, attrs);
1885 codeset = codesetsFindBest(attrs, csFamily, text, text_len, &numErrors);
1887 if(error_ptr != NULL)
1888 *error_ptr = numErrors;
1890 // if we still haven't got the codeset we fallback to the default
1891 if(codeset == NULL && defaultFallBack == TRUE)
1892 codeset = defaultCodeset(FALSE);
1896 ReleaseSemaphore(&CodesetsBase->libSem);
1898 RETURN(codeset);
1899 return codeset;
1903 /// CodesetsUTF8Len()
1904 // Returns the number of characters a utf8 string has. This is not
1905 // identically with the size of memory is required to hold the string.
1906 ULONG LIBFUNC
1907 CodesetsUTF8Len(REG(a0, UTF8 *str))
1909 int len;
1910 unsigned char c;
1912 ENTER();
1914 if(!str)
1915 return 0;
1917 len = 0;
1919 while((c = *str++))
1921 len++;
1922 str += trailingBytesForUTF8[c];
1925 RETURN((ULONG)len);
1926 return (ULONG)len;
1930 /// CodesetsStrLenA()
1931 ULONG LIBFUNC
1932 CodesetsStrLenA(REG(a0, STRPTR str),
1933 REG(a1, struct TagItem *attrs))
1935 ULONG res = 0;
1937 ENTER();
1939 if(str != NULL)
1941 struct codeset *codeset;
1942 int len;
1943 STRPTR src;
1944 int utf;
1946 if((codeset = (struct codeset *)GetTagData(CSA_SourceCodeset, 0, attrs)) == NULL)
1947 codeset = defaultCodeset(TRUE);
1948 if(codeset == CodesetsBase->utf32Codeset)
1950 utf = 32;
1951 len = utf32_strlen((UTF32 *)str);
1953 else if(codeset == CodesetsBase->utf16Codeset)
1955 utf = 16;
1956 len = utf16_strlen((UTF16 *)str);
1958 else
1960 utf = 0;
1961 len = strlen(str);
1964 len = GetTagData(CSA_SourceLen, len, attrs);
1966 src = str;
1968 if(utf != 0)
1970 void *srcend = src + len;
1971 UTF8 *dstlen = NULL;
1973 switch(utf)
1975 case 32:
1976 CodesetsConvertUTF32toUTF8((const UTF32 **)&src, srcend, &dstlen, NULL, 0);
1977 break;
1978 case 16:
1979 CodesetsConvertUTF16toUTF8((const UTF16 **)&src, srcend, &dstlen, NULL, 0);
1980 break;
1982 res = (ULONG)dstlen;
1984 else
1986 UBYTE c;
1988 res = 0;
1990 while((c = *src++) != '\0' && len != 0)
1992 res += codeset->table[c].utf8[0];
1993 len--;
1998 RETURN(res);
1999 return res;
2003 /// CodesetsUTF8ToStrA()
2004 // Converts an UTF8 string to a given charset. Return the number of bytes
2005 // written to dest excluding the NULL byte (which is always ensured by this
2006 // function; it means a NULL str will produce "" as dest; anyway you should
2007 // check NULL str to not waste your time!).
2008 STRPTR LIBFUNC
2009 CodesetsUTF8ToStrA(REG(a0, struct TagItem *attrs))
2011 UTF8 *src;
2012 ULONG srcLen;
2013 ULONG *destLenPtr;
2014 ULONG n = 0;
2015 STRPTR dest = NULL;
2017 ENTER();
2019 if((src = (UTF8 *)GetTagData(CSA_Source, (ULONG)NULL, attrs)) != NULL &&
2020 (srcLen = GetTagData(CSA_SourceLen, src != NULL ? strlen((char *)src) : 0, attrs)) > 0)
2022 struct convertMsg msg;
2023 struct codeset *codeset;
2024 struct Hook *destHook;
2025 struct Hook *mapForeignCharsHook;
2026 char buf[256];
2027 STRPTR destIter = NULL;
2028 char *b = NULL;
2029 ULONG destLen = 0;
2030 int i = 0;
2031 unsigned char *s = src;
2032 unsigned char *e = (src+srcLen);
2033 int numConvErrors = 0;
2034 int *numConvErrorsPtr;
2035 BOOL mapForeignChars;
2036 APTR pool = NULL;
2037 struct SignalSemaphore *sem = NULL;
2038 int utf;
2039 ULONG char_size;
2041 // get some more optional attributes
2042 destHook = (struct Hook *)GetTagData(CSA_DestHook, (ULONG)NULL, attrs);
2043 destLen = GetTagData(CSA_DestLen, 0, attrs);
2044 numConvErrorsPtr = (int *)GetTagData(CSA_ErrPtr, (ULONG)NULL, attrs);
2045 mapForeignChars = (BOOL)GetTagData(CSA_MapForeignChars, FALSE, attrs);
2046 mapForeignCharsHook = (struct Hook *)GetTagData(CSA_MapForeignCharsHook, (ULONG)NULL, attrs);
2048 // get the destination codeset pointer
2049 if((codeset = (struct codeset *)GetTagData(CSA_DestCodeset, (ULONG)NULL, attrs)) == NULL)
2050 codeset = defaultCodeset(TRUE);
2051 if(codeset == CodesetsBase->utf32Codeset)
2053 utf = 32;
2054 char_size = 4;
2056 else if(codeset == CodesetsBase->utf16Codeset)
2058 utf = 16;
2059 char_size = 2;
2061 else
2063 utf = 0;
2064 char_size = 1;
2067 // first we make sure we allocate enough memory
2068 // for our destination buffer
2069 if(destHook != NULL)
2071 if(destLen < 16 || destLen > sizeof(buf))
2072 destLen = sizeof(buf);
2074 msg.state = CSV_Translating;
2075 b = buf;
2076 i = 0;
2078 else
2080 // in case the user wants us to dynamically generate the
2081 // destination buffer we do it right now
2082 if((dest = (STRPTR)GetTagData(CSA_Dest, (ULONG)NULL, attrs)) == NULL ||
2083 GetTagData(CSA_AllocIfNeeded, TRUE, attrs) != FALSE)
2085 ULONG len = 0;
2087 // calculate the destLen
2088 if(utf)
2090 void *dstlen = NULL;
2092 switch(utf)
2094 case 32:
2095 CodesetsConvertUTF8toUTF32((const UTF8 **)&s, e, (UTF32 **)&dstlen, NULL, 0);
2096 break;
2097 case 16:
2098 CodesetsConvertUTF8toUTF16((const UTF8 **)&s, e, (UTF16 **)&dstlen, NULL, 0);
2099 break;
2101 len = (ULONG)dstlen;
2103 else
2105 while(s < e)
2107 unsigned char c = *s++;
2109 len++;
2110 s += trailingBytesForUTF8[c];
2114 if(dest == NULL || (destLen < len+1))
2116 if((pool = (APTR)GetTagData(CSA_Pool, (ULONG)NULL, attrs)) != NULL)
2118 if((sem = (struct SignalSemaphore *)GetTagData(CSA_PoolSem, (ULONG)NULL, attrs)) != NULL)
2119 ObtainSemaphore(sem);
2121 // allocate the destination buffer
2122 dest = allocVecPooled(pool, len+char_size);
2124 if(sem != NULL)
2125 ReleaseSemaphore(sem);
2127 else
2128 dest = allocArbitrateVecPooled(len+char_size);
2130 destLen = len+char_size;
2133 if(dest == NULL)
2135 RETURN(NULL);
2136 return NULL;
2140 destIter = dest;
2143 // now we convert the src string to the
2144 // destination buffer.
2145 s = src;
2146 if (utf)
2148 void *dstend;
2150 if(destHook != NULL)
2152 ULONG r;
2154 dstend = b + destLen - char_size;
2157 switch(utf)
2159 case 32:
2160 r = CodesetsConvertUTF8toUTF32((const UTF8 **)&s, e, (UTF32 **)&b, dstend, 0);
2161 break;
2162 case 16:
2163 r = CodesetsConvertUTF8toUTF16((const UTF8 **)&s, e, (UTF16 **)&b, dstend, 0);
2164 break;
2166 b[0] = 0;
2167 if(char_size > 1)
2168 b[1] = 0;
2169 if(r != CSR_TargetExhausted)
2170 msg.state = CSV_End;
2171 msg.len = b-buf;
2172 CallHookPkt(destHook,&msg,buf);
2174 b = buf;
2175 n += msg.len;
2177 while(r == CSR_TargetExhausted);
2179 else
2181 dstend = destIter + destLen - char_size;
2182 switch(utf)
2184 case 32:
2185 CodesetsConvertUTF8toUTF32((const UTF8 **)&s, e, (UTF32 **)&destIter, dstend, 0);
2186 break;
2187 case 16:
2188 CodesetsConvertUTF8toUTF16((const UTF8 **)&s, e, (UTF16 **)&destIter, dstend, 0);
2189 break;
2191 n = destIter-dest;
2194 else
2196 for(;;n++)
2198 if(destHook == NULL && n >= destLen-1)
2199 break;
2201 // convert until we reach the end of the
2202 // source buffer.
2203 if(s < e)
2205 unsigned char c = *s;
2206 unsigned char d = '?';
2207 const char *repstr = NULL;
2208 int replen = 0;
2210 // check if the char is a >7bit char
2211 if(c > 127)
2213 struct single_convert *f;
2214 int lenAdd = trailingBytesForUTF8[c];
2215 int lenStr = lenAdd+1;
2216 unsigned char *src = s;
2220 // start each iteration with "no replacement found yet"
2221 repstr = NULL;
2222 replen = 0;
2224 // search in the UTF8 conversion table of the current charset if
2225 // we have a replacement character for the char sequence starting at s
2226 BIN_SEARCH(codeset->table_sorted, 0, 255, strncmp((char *)src, (char *)codeset->table_sorted[m].utf8+1, lenStr), f);
2228 if(f != NULL)
2230 d = f->code;
2231 replen = -1;
2233 break;
2235 else
2237 // the analysed char sequence (s) is not convertable to a
2238 // single visible char replacement, so we normally have to put
2239 // a ? sign as a "unknown char" sign at the very position.
2241 // For convienence we, however, allow users to replace these
2242 // UTF8 characters with char sequences that "looklike" the
2243 // original char.
2244 if(mapForeignChars == TRUE)
2245 replen = mapUTF8toASCII(&repstr, src, lenStr);
2247 // call the hook only, if the internal table yielded no suitable
2248 // replacement
2249 if(replen == 0 && mapForeignCharsHook != NULL)
2251 struct replaceMsg rmsg;
2253 rmsg.dst = (char **)&repstr;
2254 rmsg.src = src;
2255 rmsg.srclen = lenStr;
2256 replen = CallHookPkt(mapForeignCharsHook, &rmsg, NULL);
2259 if(replen < 0)
2261 D(DBF_UTF, "got UTF8 replacement (%ld)", replen);
2263 // stay in the loop as long as one replacement function delivers
2264 // further UTF8 replacement sequences
2265 src = (unsigned char *)repstr;
2267 else if(replen == 0)
2269 D(DBF_UTF, "found no ASCII replacement for UTF8 string (%ld)", replen);
2270 repstr = NULL;
2272 else
2273 D(DBF_UTF, "got replacement string '%s' (%ld)", repstr ? repstr : "<null>", replen);
2276 while(replen < 0);
2278 if(repstr == NULL || replen == 0)
2280 if(replen >= 0)
2282 d = '?';
2283 numConvErrors++;
2287 s += lenAdd;
2289 else
2290 d = c;
2292 if(destHook != NULL)
2294 if(replen > 1)
2296 while(replen > 0)
2298 *b++ = *repstr;
2299 repstr++;
2300 i++;
2301 replen--;
2303 if(i%(destLen-1)==0)
2305 *b = '\0';
2306 msg.len = i;
2307 CallHookPkt(destHook, &msg, buf);
2309 b = buf;
2310 *b = '\0';
2311 i = 0;
2315 else
2317 *b++ = replen > 0 ? *repstr : d;
2318 i++;
2321 if(i%(destLen-1)==0)
2323 *b = '\0';
2324 msg.len = i;
2325 CallHookPkt(destHook, &msg, buf);
2327 b = buf;
2328 *b = '\0';
2329 i = 0;
2332 else
2334 if(replen > 1)
2336 ULONG destPos = destIter-dest;
2338 if(pool != NULL)
2340 if(sem != NULL)
2341 ObtainSemaphore(sem);
2343 // allocate the destination buffer
2344 dest = reallocVecPooled(pool, dest, destLen, destLen+replen-1);
2346 if(sem != NULL)
2347 ReleaseSemaphore(sem);
2349 else
2350 dest = reallocArbitrateVecPooled(dest, destLen, destLen+replen-1);
2352 if(dest == NULL)
2354 RETURN(NULL);
2355 return NULL;
2358 destIter = dest+destPos;
2359 memcpy(destIter, repstr, replen);
2361 // adjust our loop pointer and destination length
2362 destIter += replen;
2363 destLen += replen-1;
2365 else if(replen == 1)
2366 *destIter++ = *repstr;
2367 else
2368 *destIter++ = d;
2371 s++;
2373 else
2374 break;
2377 if(destHook != NULL)
2379 msg.state = CSV_End;
2380 msg.len = i;
2381 *b = '\0';
2382 CallHookPkt(destHook,&msg,buf);
2384 else
2385 *destIter = '\0';
2388 // let us write the number of conversion errors
2389 // to the proper variable pointer, if wanted
2390 if(numConvErrorsPtr != NULL)
2391 *numConvErrorsPtr = numConvErrors;
2394 // put the final length of our destination buffer
2395 // into the destLenPtr
2396 if((destLenPtr = (ULONG *)GetTagData(CSA_DestLenPtr, (ULONG)NULL, attrs)) != NULL)
2397 *destLenPtr = n;
2399 RETURN(dest);
2400 return dest;
2404 /// CodesetsUTF8CreateA()
2405 // Converts a string and a charset to an UTF8. Returns the UTF8.
2406 // If a destination hook is supplied always return 0.
2407 // If from is NULL, it returns NULL and doesn't call the hook.
2408 UTF8 *LIBFUNC
2409 CodesetsUTF8CreateA(REG(a0, struct TagItem *attrs))
2411 UTF8 *from;
2412 UTF8 *dest;
2413 struct codeset *codeset;
2414 ULONG fromLen, *destLenPtr;
2415 ULONG n;
2416 int utf;
2418 ENTER();
2420 dest = NULL;
2421 n = 0;
2423 if((codeset = (struct codeset *)GetTagData(CSA_SourceCodeset, 0, attrs)) == NULL)
2424 codeset = defaultCodeset(TRUE);
2425 if(codeset == CodesetsBase->utf32Codeset)
2426 utf = 32;
2427 else if(codeset == CodesetsBase->utf16Codeset)
2428 utf = 16;
2429 else
2430 utf = 0;
2432 from = (UTF8*)GetTagData(CSA_Source, 0, attrs);
2433 if(from)
2435 switch(utf)
2437 case 32:
2438 fromLen = utf32_strlen((UTF32 *)from);
2439 break;
2441 case 16:
2442 fromLen = utf16_strlen((UTF16 *)from);
2443 break;
2445 default:
2446 fromLen = strlen((char *)from);
2447 break;
2450 else
2451 fromLen = 0;
2452 fromLen = GetTagData(CSA_SourceLen, fromLen, attrs);
2454 if(from != NULL && fromLen != 0)
2456 struct convertMsg msg;
2457 struct Hook *hook;
2458 ULONG destLen;
2459 int i = 0;
2460 UBYTE buf[256];
2461 UBYTE *src, *destPtr = NULL, *b = NULL, c;
2463 hook = (struct Hook *)GetTagData(CSA_DestHook, 0, attrs);
2464 destLen = GetTagData(CSA_DestLen,0,attrs);
2466 if(hook != NULL)
2468 if(destLen<16 || destLen>sizeof(buf))
2469 destLen = sizeof(buf);
2471 msg.state = CSV_Translating;
2472 b = buf;
2473 i = 0;
2475 else
2477 if((dest = (UTF8*)GetTagData(CSA_Dest, 0, attrs)) != NULL ||
2478 GetTagData(CSA_AllocIfNeeded,TRUE,attrs))
2480 ULONG len;
2482 src = from;
2484 if(utf != 0)
2486 void *srcend = src + fromLen;
2487 UTF8 *dstlen = NULL;
2489 switch(utf)
2491 case 32:
2492 CodesetsConvertUTF32toUTF8((const UTF32 **)&src, srcend, &dstlen, NULL, 0);
2493 break;
2494 case 16:
2495 CodesetsConvertUTF16toUTF8((const UTF16 **)&src, srcend, &dstlen, NULL, 0);
2496 break;
2498 len = (ULONG)dstlen;
2500 else
2502 ULONG flen = fromLen;
2504 len = 0;
2505 while((c = *src++) != '\0' && flen != 0)
2507 len += codeset->table[c].utf8[0];
2508 flen--;
2511 D(DBF_UTF, "Calculated output UTF-8 buffer length: %lu\n", len);
2513 if(dest == NULL || (destLen<len+1))
2515 APTR pool;
2516 struct SignalSemaphore *sem;
2518 if((pool = (APTR)GetTagData(CSA_Pool, 0, attrs)) != NULL)
2520 if((sem = (struct SignalSemaphore *)GetTagData(CSA_PoolSem, 0, attrs)) != NULL)
2521 ObtainSemaphore(sem);
2523 // allocate the destination buffer
2524 dest = allocVecPooled(pool,len+1);
2526 if(sem != NULL)
2527 ReleaseSemaphore(sem);
2529 else
2530 dest = allocArbitrateVecPooled(len+1);
2532 destLen = len;
2535 if(dest == NULL)
2537 RETURN(NULL);
2538 return NULL;
2542 destPtr = (UBYTE*)dest;
2545 src = from;
2546 if(utf)
2548 void *srcend = src + fromLen;
2549 UTF8 *dstend;
2551 if(hook != NULL)
2553 ULONG r;
2555 dstend = b + destLen - 1;
2558 switch(utf)
2560 case 32:
2561 r = CodesetsConvertUTF32toUTF8((const UTF32 **)&src, srcend, &b, dstend, 0);
2562 break;
2563 case 16:
2564 r = CodesetsConvertUTF16toUTF8((const UTF16 **)&src, srcend, &b, dstend, 0);
2565 break;
2567 *b = 0;
2568 if(r != CSR_TargetExhausted)
2569 msg.state = CSV_End;
2570 msg.len = b-buf;
2571 CallHookPkt(hook,&msg,buf);
2573 b = buf;
2574 n += msg.len;
2576 while(r == CSR_TargetExhausted);
2578 else
2580 dstend = destPtr + destLen;
2581 switch(utf)
2583 case 32:
2584 CodesetsConvertUTF32toUTF8((const UTF32 **)&src, srcend, &destPtr, dstend, 0);
2585 break;
2586 case 16:
2587 CodesetsConvertUTF16toUTF8((const UTF16 **)&src, srcend, &destPtr, dstend, 0);
2588 break;
2590 n = destPtr-dest;
2593 else
2595 for(; fromLen && (c = *src); src++, fromLen--)
2597 UTF8* utf8_seq;
2599 for(utf8_seq = &codeset->table[c].utf8[1]; (c = *utf8_seq); utf8_seq++)
2601 if(hook != NULL)
2603 *b++ = c;
2604 i++;
2606 if(i%(destLen-1)==0)
2608 *b = 0;
2609 msg.len = i;
2610 CallHookPkt(hook,&msg,buf);
2612 b = buf;
2613 *b = 0;
2614 i = 0;
2617 else
2619 if(n>=destLen)
2620 break;
2622 *destPtr++ = c;
2625 n++;
2629 if(hook != NULL)
2631 msg.state = CSV_End;
2632 msg.len = i;
2633 *b = 0;
2634 CallHookPkt(hook,&msg,buf);
2636 else
2638 *destPtr = 0;
2643 if((destLenPtr = (ULONG *)GetTagData(CSA_DestLenPtr, 0, attrs)))
2644 *destLenPtr = n;
2646 RETURN(dest);
2647 return dest;
2651 /// CodesetsIsValidUTF8()
2652 #define GOOD_UCS(c) \
2653 ((c) >= 160 && ((c) & ~0x3ff) != 0xd800 && \
2654 (c) != 0xfeff && (c) != 0xfffe && (c) != 0xffff)
2656 BOOL LIBFUNC
2657 CodesetsIsValidUTF8(REG(a0, STRPTR s))
2659 STRPTR t = s;
2660 int n;
2662 ENTER();
2664 while((n = parseUtf8(&t)))
2666 if(!GOOD_UCS(n))
2668 RETURN(FALSE);
2669 return FALSE;
2673 RETURN(TRUE);
2674 return TRUE;
2678 /// CodesetsConvertStrA()
2679 // Converts a given string from one source Codeset to a given destination
2680 // codeset and returns the convert string
2681 STRPTR LIBFUNC
2682 CodesetsConvertStrA(REG(a0, struct TagItem *attrs))
2684 struct codeset *srcCodeset;
2685 STRPTR srcStr = NULL;
2686 STRPTR dstStr = NULL;
2687 ULONG srcLen = 0;
2688 ULONG dstLen = 0;
2690 ENTER();
2692 // get the ptr to the src string we want to convert
2693 // from the source codeset to the dest codeset.
2694 srcStr = (STRPTR)GetTagData(CSA_Source, (ULONG)NULL, attrs);
2696 // get the pointer to the codeset in which the src string is encoded
2697 if((srcCodeset = (struct codeset *)GetTagData(CSA_SourceCodeset, (ULONG)NULL, attrs)) == NULL)
2698 srcCodeset = defaultCodeset(TRUE);
2700 if (srcStr != NULL)
2702 if (srcCodeset == CodesetsBase->utf32Codeset)
2703 srcLen = utf32_strlen((UTF32 *)srcStr);
2704 else if (srcCodeset == CodesetsBase->utf16Codeset)
2705 srcLen = utf16_strlen((UTF16 *)srcStr);
2706 else
2707 srcLen = strlen(srcStr);
2709 else
2710 srcLen = 0;
2711 srcLen = GetTagData(CSA_SourceLen, srcLen, attrs);
2713 if(srcStr != NULL && srcLen > 0)
2715 struct codeset *dstCodeset;
2717 // get the pointer to the codeset in which the dst string should be encoded
2718 if((dstCodeset = (struct codeset *)GetTagData(CSA_DestCodeset, (ULONG)NULL, attrs)) == NULL)
2719 dstCodeset = defaultCodeset(TRUE);
2721 D(DBF_UTF, "srcCodeset: '%s' dstCodeset: '%s'", srcCodeset->name, dstCodeset->name);
2723 // check that the user didn't supplied the very same codeset
2724 // or otherwise a conversion is not required.
2725 if(srcCodeset != NULL && dstCodeset != NULL && srcCodeset != dstCodeset)
2727 BOOL utf8Create = FALSE;
2728 BOOL strCreate = FALSE;
2729 UTF8 *utf8str;
2730 ULONG utf8strLen = 0;
2731 ULONG *destLenPtr = NULL;
2732 BOOL mapForeignChars;
2733 struct Hook *mapForeignCharsHook;
2735 mapForeignChars = (BOOL)GetTagData(CSA_MapForeignChars, FALSE, attrs);
2736 mapForeignCharsHook = (struct Hook *)GetTagData(CSA_MapForeignCharsHook, (ULONG)NULL, attrs);
2738 // if the source codeset is UTF-8 we don't have to use the UTF8Create()
2739 // function and can directly call the UTF8ToStr() function
2740 if(srcCodeset != CodesetsBase->utf8Codeset)
2742 struct TagItem tags[] = { { CSA_SourceCodeset, (ULONG)srcCodeset },
2743 { CSA_Source, (ULONG)srcStr },
2744 { CSA_SourceLen, srcLen },
2745 { CSA_DestLenPtr, (ULONG)&utf8strLen },
2746 { TAG_DONE, 0 } };
2748 utf8str = CodesetsUTF8CreateA((struct TagItem *)&tags[0]);
2750 utf8Create = TRUE;
2752 else
2754 utf8str = (UTF8 *)srcStr;
2755 utf8strLen = srcLen;
2758 // in case the destination codeset is UTF-8 we don't have to actually
2759 // use the UTF8ToStr() function and can immediately return our
2760 // UTF8 string
2761 if(utf8str != NULL && utf8strLen > 0 && dstCodeset != CodesetsBase->utf8Codeset)
2763 struct TagItem tags[] = { { CSA_DestCodeset, (ULONG)dstCodeset },
2764 { CSA_Source, (ULONG)utf8str },
2765 { CSA_SourceLen, utf8strLen },
2766 { CSA_DestLenPtr, (ULONG)&dstLen },
2767 { CSA_MapForeignChars, mapForeignChars },
2768 { CSA_MapForeignCharsHook, (ULONG)mapForeignCharsHook },
2769 { TAG_DONE, 0 } };
2771 dstStr = CodesetsUTF8ToStrA((struct TagItem *)&tags[0]);
2773 strCreate = TRUE;
2775 else
2777 dstStr = (STRPTR)utf8str;
2778 dstLen = utf8strLen;
2781 D(DBF_UTF, "srcStr: %lx srcLen: %ld dstStr: %lx dstLen: %ld utf8create: %ld strCreate: %ld", srcStr, srcLen,
2782 dstStr, dstLen,
2783 utf8Create,
2784 strCreate);
2786 // if everything was successfull we can go and finalize everything
2787 if(dstStr != NULL && utf8str != NULL)
2789 // as the conversion was a two way pass we have to either free the
2790 // memory of the utf8 string or not
2791 if(utf8Create == TRUE && strCreate == TRUE)
2792 CodesetsFreeA(utf8str, NULL);
2794 // if the user wants to be informed abour the length
2795 // of our destination string we store the length now in the supplied ptr.
2796 if((destLenPtr = (ULONG *)GetTagData(CSA_DestLenPtr, (ULONG)NULL, attrs)) != NULL)
2797 *destLenPtr = dstLen;
2799 D(DBF_UTF, "successfully converted string with len %ld", dstLen);
2801 else
2803 W(DBF_ALWAYS, "an error occurred while trying to convert a string");
2805 // free all memory in case the conversion didn't work out
2806 if(utf8Create == TRUE && utf8str != NULL)
2807 CodesetsFreeA(utf8str, NULL);
2809 if(strCreate == TRUE && dstStr != NULL)
2810 CodesetsFreeA(dstStr, NULL);
2812 dstStr = NULL;
2817 RETURN(dstStr);
2818 return dstStr;
2822 /// CodesetsFreeVecPooledA()
2823 void LIBFUNC
2824 CodesetsFreeVecPooledA(REG(a0, APTR pool),
2825 REG(a1, APTR mem),
2826 REG(a2, struct TagItem *attrs))
2828 ENTER();
2830 if(pool && mem)
2832 struct SignalSemaphore *sem;
2834 if((sem = (struct SignalSemaphore *)GetTagData(CSA_PoolSem, 0, attrs)))
2835 ObtainSemaphore(sem);
2837 freeVecPooled(pool,mem);
2839 if(sem)
2840 ReleaseSemaphore(sem);
2843 LEAVE();
2847 /// CodesetsListCreateA()
2848 struct codesetList *LIBFUNC
2849 CodesetsListCreateA(REG(a0, struct TagItem *attrs))
2851 struct codesetList *csList = NULL;
2853 ENTER();
2855 ObtainSemaphore(&CodesetsBase->poolSem);
2857 // no matter what, we create a codesets list we will return to the user
2858 if((csList = allocVecPooled(CodesetsBase->pool, sizeof(struct codesetList))))
2860 BOOL scanProgDir = TRUE;
2861 struct TagItem *tstate = attrs;
2862 struct TagItem *tag;
2864 // initialize the new private codeset list and put it into a separate list
2865 NewList((struct List *)csList);
2867 // first we get the path of the directory from which we go
2868 // and scan for charset tables from
2869 while((tag = NextTagItem(&tstate)))
2871 switch(tag->ti_Tag)
2873 case CSA_CodesetDir:
2875 codesetsScanDir(csList, (STRPTR)tag->ti_Data);
2877 scanProgDir = FALSE;
2879 break;
2881 case CSA_CodesetFile:
2883 codesetsReadTable(csList, (STRPTR)tag->ti_Data);
2885 scanProgDir = FALSE;
2887 break;
2889 case CSA_SourceCodeset:
2891 struct codeset *cs = (struct codeset *)tag->ti_Data;
2893 AddTail((struct List *)csList, (struct Node *)&cs->node);
2895 scanProgDir = FALSE;
2897 break;
2901 // in case the user also wants us to scan PROGDIR:
2902 // we do so
2903 if(scanProgDir == TRUE)
2904 codesetsScanDir(csList, "PROGDIR:Charsets");
2907 ReleaseSemaphore(&CodesetsBase->poolSem);
2909 RETURN(csList);
2910 return csList;
2914 /// CodesetsListDeleteA()
2915 BOOL LIBFUNC
2916 CodesetsListDeleteA(REG(a0, struct TagItem *attrs))
2918 BOOL result = FALSE;
2919 ENTER();
2921 ObtainSemaphore(&CodesetsBase->poolSem);
2923 if(attrs != NULL)
2925 BOOL freeCodesets;
2926 struct TagItem *tstate = attrs;
2927 struct TagItem *tag;
2929 // check if the caller wants us also to free the codesets
2930 freeCodesets = (BOOL)GetTagData(CSA_FreeCodesets, TRUE, attrs);
2932 // now we iterate through or tagItems and see what the
2933 // user wants to remove from the list
2934 while((tag = NextTagItem(&tstate)))
2936 switch(tag->ti_Tag)
2938 case CSA_CodesetList:
2940 struct codesetList *csList = (struct codesetList *)tag->ti_Data;
2942 if(csList)
2944 // cleanup the codesets within the list
2945 if(freeCodesets)
2946 codesetsCleanup(csList);
2948 // then free the list itself
2949 freeArbitrateVecPooled(csList);
2951 result = TRUE;
2958 ReleaseSemaphore(&CodesetsBase->poolSem);
2960 RETURN(result);
2961 return result;
2965 /// CodesetsListAddA()
2966 BOOL LIBFUNC
2967 CodesetsListAddA(REG(a0, struct codesetList *csList),
2968 REG(a1, struct TagItem *attrs))
2970 BOOL result = FALSE;
2971 ENTER();
2973 ObtainSemaphore(&CodesetsBase->poolSem);
2975 if(csList != NULL && attrs != NULL)
2977 struct TagItem *tstate = attrs;
2978 struct TagItem *tag;
2980 // now we iterate through or tagItems and see if the user
2981 // wants to scan a whole directory or just adds a file.
2982 while((tag = NextTagItem(&tstate)))
2984 switch(tag->ti_Tag)
2986 case CSA_CodesetDir:
2988 codesetsScanDir(csList, (STRPTR)tag->ti_Data);
2989 result = TRUE;
2991 break;
2993 case CSA_CodesetFile:
2995 codesetsReadTable(csList, (STRPTR)tag->ti_Data);
2996 result = TRUE;
2998 break;
3000 case CSA_SourceCodeset:
3002 struct codeset *cs = (struct codeset *)tag->ti_Data;
3004 AddTail((struct List *)csList, (struct Node *)&cs->node);
3005 result = TRUE;
3007 break;
3012 ReleaseSemaphore(&CodesetsBase->poolSem);
3014 RETURN(result);
3015 return result;
3019 /// CodesetsListRemoveA()
3020 BOOL LIBFUNC
3021 CodesetsListRemoveA(REG(a0, struct TagItem *attrs))
3023 BOOL result = FALSE;
3024 ENTER();
3026 ObtainSemaphore(&CodesetsBase->poolSem);
3028 if(attrs != NULL)
3030 BOOL freeCodesets;
3031 struct TagItem *tstate = attrs;
3032 struct TagItem *tag;
3034 // check if the caller wants us also to free the codesets
3035 freeCodesets = (BOOL)GetTagData(CSA_FreeCodesets, TRUE, attrs);
3037 // now we iterate through or tagItems and see what the
3038 // user wants to remove from the list
3039 while((tag = NextTagItem(&tstate)))
3041 switch(tag->ti_Tag)
3043 case CSA_SourceCodeset:
3045 struct codeset *cs = (struct codeset *)tag->ti_Data;
3047 if(cs)
3049 struct MinNode *mstate = &cs->node;
3051 // before we actually remove the node from its list, we
3052 // have to make sure it isn't part of our internal codesets list
3053 while(mstate->mln_Succ)
3054 mstate = mstate->mln_Succ;
3056 if(mstate != CodesetsBase->codesets.list.mlh_Tail)
3058 Remove((struct Node *)&cs->node);
3060 // free all codesets data if requested.
3061 if(freeCodesets == TRUE)
3063 if(cs->name) freeArbitrateVecPooled(cs->name);
3064 if(cs->alt_name) freeArbitrateVecPooled(cs->alt_name);
3065 if(cs->characterization) freeArbitrateVecPooled(cs->characterization);
3067 freeArbitrateVecPooled(cs);
3070 result = TRUE;
3072 else
3073 W(DBF_ALWAYS, "user tried to remove an internal codesets!");
3076 break;
3081 ReleaseSemaphore(&CodesetsBase->poolSem);
3083 RETURN(result);
3084 return result;
3089 /**************************************************************************/