1 /***************************************************************************
3 codesets.library - Amiga shared library for handling different codesets
4 Copyright (C) 2001-2005 by Alfonso [alfie] Ranieri <alforan@tin.it>.
5 Copyright (C) 2005-2009 by codesets.library Open Source Team
7 This library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU Lesser General Public
9 License as published by the Free Software Foundation; either
10 version 2.1 of the License, or (at your option) any later version.
12 This library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
17 codesets.library project: http://sourceforge.net/projects/codesetslib/
19 Most of the code included in this file was relicensed from GPL to LGPL
20 from the source code of SimpleMail (http://www.sf.net/projects/simplemail)
21 with full permissions by its authors.
25 ***************************************************************************/
29 #include <clib/alib_protos.h>
31 #include <diskfont/glyph.h>
32 #include <diskfont/diskfonttag.h>
33 #include <proto/diskfont.h>
38 #include <proto/keymap.h>
39 #include <proto/locale.h>
42 #include "codesets_table.h"
43 #include "convertUTF.h"
44 #include "codepages.h"
46 #include "SDI_stdarg.h"
50 /**************************************************************************/
53 // search a sorted array in O(log n). On a match `result` receives the address
// of the matching element (&array[m]). The `compare` expression is expanded
// inside the macro, so it has to refer to the macro's own midpoint index `m`;
// a negative result continues the search below m (h = m - 1). E.g.
54 // BIN_SEARCH(strings,0,sizeof(strings)/sizeof(strings[0]),strcmp(key,strings[m]),res);
// NOTE(review): `low` and `high` are used unparenthesized in (low+high)/2, so
// only simple expressions should be passed for them.
55 #define BIN_SEARCH(array,low,high,compare,result) \
59 int m = (low+high)/2;\
64 if (!d){ result = &array[m]; break; }\
65 if (d < 0) h = m - 1;\
// mystrdup(): duplicate the C string str into memory obtained from
// allocArbitrateVecPooled(). Only strings with a length greater than zero are
// copied; len+1 bytes are allocated and filled by strlcpy(), so the copy
// includes the terminating NUL.
// NOTE(review): the return statement is not visible in this view - presumably
// NULL is returned for an empty string or a failed allocation; verify.
74 mystrdup(const char *str
)
84 if((len
= strlen(str
)) > 0)
86 if((newStr
= allocArbitrateVecPooled(len
+1)) != NULL
)
87 strlcpy(newStr
, str
, len
+1);
// mystrndup(): copy at most n characters of str1 into a newly allocated buffer
// of n+1 bytes (allocArbitrateVecPooled()); strlcpy() is given the full buffer
// size n+1 and therefore always NUL-terminates the copy.
// NOTE(review): no range check of n is visible here - a negative n would lead
// to a non-positive allocation size; confirm that callers never pass one.
97 mystrndup(const char *str1
, int n
)
103 if((dest
= allocArbitrateVecPooled(n
+1)) != NULL
)
106 strlcpy(dest
, str1
, n
+1);
// readLine(): read one line from the file handle fh into buf using FGets(),
// which is told that buf holds size bytes.
// A NULL result of FGets() is handled as a separate case (presumably end of
// file or a read error - the branch body is not visible in this view).
119 readLine(BPTR fh
, char *buf
, ULONG size
)
125 if((c
= FGets(fh
, buf
, size
)) == NULL
)
// look for a line ending character ('\n' or '\r'); presumably the line is
// terminated at that position - TODO confirm against the full source
133 if(*c
== '\n' || *c
== '\r')
// getConfigItem(): check whether the line in buf starts with the keyword item.
// The first len characters are compared case-insensitively with strnicmp().
// On a match, whitespace in buf is skipped in two separate loops; presumably
// the separator between keyword and value is consumed in between and a pointer
// to the value is returned - those lines are not visible here, so verify.
// NOTE(review): if c is a plain char, a cast to unsigned char is needed before
// calling isspace() to stay defined for bytes >= 0x80 - check c's declaration.
145 static const char * getConfigItem(const char *buf
, const char *item
, int len
)
149 if(strnicmp(buf
, item
, len
) == 0)
// first run of whitespace
156 while((c
= *buf
) != '\0' && isspace(c
))
// second run of whitespace
168 while((c
= *buf
) != '\0' && isspace(c
))
// parseUtf8(): decode one UTF-8 sequence from the string referenced by *ps.
// NOTE(review): how *ps is advanced and which value signals an invalid
// sequence is not visible in this view.
181 parseUtf8(STRPTR
*ps
)
// two byte sequence: the second byte must be a continuation byte (10xxxxxx)
205 if((s
[1] & 0xc0)!=0x80)
// 5 payload bits of the lead byte followed by 6 bits of the continuation byte
213 RETURN(((s
[0] & 0x1f)<<6) | (s
[1] & 0x3f));
214 return ((s
[0] & 0x1f)<<6) | (s
[1] & 0x3f);
// sequence of n bytes: start with the payload bits of the lead byte,
// selected by the mask (1 << (7-n)) - 1
251 wc
= *s
++ & ((1<<(7-n
))-1);
// every following byte has to be a continuation byte (10xxxxxx)
255 if((*s
& 0xc0) != 0x80)
// append the 6 payload bits of the continuation byte
261 wc
= (wc
<< 6) | (*s
++ & 0x3f);
// a value below 1 << (5*n - 4) would have fit into a shorter sequence
// (overlong encoding); presumably treated as invalid - branch body not visible
264 if(wc
< (1 << (5 * n
- 4)))
// countCodesets(): walk the list embedded in csList, starting at mlh_Head, with
// num initialized to 0 (presumably incremented once per node in the loop body,
// which is not visible here, and returned as the number of codesets).
279 countCodesets(struct codesetList
*csList
)
281 struct MinNode
*node
, *succ
;
// the successor is fetched before the body runs; the loop ends at the node
// whose mln_Succ is NULL, so that last node itself is not visited
284 for(node
= csList
->list
.mlh_Head
, num
= 0; (succ
= node
->mln_Succ
); node
= succ
)
292 // in case some UTF8 sequences cannot be converted during CodesetsUTF8ToStrA(), this
293 // function is used to replace these unknown sequences with lookalike characters that
294 // still make the text more readable. For more replacements see
295 // http://www.utf8-zeichentabelle.de/unicode-utf8-table.pl
297 // The conversion table in this function is partly borrowed from the awebcharset plugin
298 // written by Frank Weber. See http://cvs.sunsite.dk/viewcvs.cgi/aweb/plugins/charset/awebcharset.c
// One entry of the replacement table: a UTF8 byte sequence together with the
// text that is substituted for it.
300 struct UTF8Replacement
302 const char *utf8
; // the original UTF8 string we are going to replace
303 const int utf8len
; // the length of the UTF8 string in bytes
304 const char *rep
; // pointer to the replacement string
305 const int replen
; // the length of the replacement string (negative to signal that the replacement is itself a UTF8 string)
// compareUTF8Replacements(): comparison callback handed to bsearch() by
// mapUTF8toASCII(). p1 is the search key, p2 an entry of the replacement table.
// Entries are ordered by the length of their UTF8 sequence first and by the
// bytes of the sequence second.
308 static int compareUTF8Replacements(const void *p1
, const void *p2
)
310 struct UTF8Replacement
*key
= (struct UTF8Replacement
*)p1
;
311 struct UTF8Replacement
*rep
= (struct UTF8Replacement
*)p2
;
314 // compare the length first, after that compare the strings
315 cmp
= key
->utf8len
- rep
->utf8len
;
// byte-wise comparison over the key's length (presumably only reached when
// both lengths are equal - the guarding condition is not visible here)
317 cmp
= memcmp(key
->utf8
, rep
->utf8
, key
->utf8len
);
// mapUTF8toASCII(): look up the UTF8 sequence src (utf8len bytes) in a static
// replacement table via bsearch() and hand the replacement out through *dst.
// NOTE(review): the lines that fill *dst and compute the return value are not
// visible in this view - presumably the entry's replen is returned; verify.
322 static int mapUTF8toASCII(const char **dst
, const unsigned char *src
, const int utf8len
)
// search key: only the utf8/utf8len members are read by the compare function
325 struct UTF8Replacement key
= { (char *)src
, utf8len
, NULL
, 0 };
326 struct UTF8Replacement
*rep
;
// NOTE: bsearch() requires this table to stay sorted the way
// compareUTF8Replacements() orders entries: by sequence length first, then by
// the byte values of the sequence. Keep that order when adding entries.
328 static struct UTF8Replacement
const utf8map
[] =
330 // U+0100 ... U+017F (Latin Extended-A)
331 { "\xC4\x80", 2, "A", 1 }, // U+0100 -> A (LATIN CAPITAL LETTER A WITH MACRON)
332 { "\xC4\x81", 2, "a", 1 }, // U+0101 -> a (LATIN SMALL LETTER A WITH MACRON)
333 { "\xC4\x82", 2, "A", 1 }, // U+0102 -> A (LATIN CAPITAL LETTER A WITH BREVE)
334 { "\xC4\x83", 2, "a", 1 }, // U+0103 -> a (LATIN SMALL LETTER A WITH BREVE)
335 { "\xC4\x84", 2, "A", 1 }, // U+0104 -> A (LATIN CAPITAL LETTER A WITH OGONEK)
336 { "\xC4\x85", 2, "a", 1 }, // U+0105 -> a (LATIN SMALL LETTER A WITH OGONEK)
337 { "\xC4\x86", 2, "C", 1 }, // U+0106 -> C (LATIN CAPITAL LETTER C WITH ACUTE)
338 { "\xC4\x87", 2, "c", 1 }, // U+0107 -> c (LATIN SMALL LETTER C WITH ACUTE)
339 { "\xC4\x88", 2, "C", 1 }, // U+0108 -> C (LATIN CAPITAL LETTER C WITH CIRCUMFLEX)
340 { "\xC4\x89", 2, "c", 1 }, // U+0109 -> c (LATIN SMALL LETTER C WITH CIRCUMFLEX)
341 { "\xC4\x8A", 2, "C", 1 }, // U+010A -> C (LATIN CAPITAL LETTER C WITH DOT ABOVE)
342 { "\xC4\x8B", 2, "c", 1 }, // U+010B -> c (LATIN SMALL LETTER C WITH DOT ABOVE)
343 { "\xC4\x8C", 2, "C", 1 }, // U+010C -> C (LATIN CAPITAL LETTER C WITH CARON)
344 { "\xC4\x8D", 2, "c", 1 }, // U+010D -> c (LATIN SMALL LETTER C WITH CARON)
345 { "\xC4\x8E", 2, "D", 1 }, // U+010E -> D (LATIN CAPITAL LETTER D WITH CARON)
346 { "\xC4\x8F", 2, "d", 1 }, // U+010F -> d (LATIN SMALL LETTER D WITH CARON)
347 { "\xC4\x90", 2, "D", 1 }, // U+0110 -> D (LATIN CAPITAL LETTER D WITH STROKE)
348 { "\xC4\x91", 2, "d", 1 }, // U+0111 -> d (LATIN SMALL LETTER D WITH STROKE)
349 { "\xC4\x92", 2, "E", 1 }, // U+0112 -> E (LATIN CAPITAL LETTER E WITH MACRON)
350 { "\xC4\x93", 2, "e", 1 }, // U+0113 -> e (LATIN SMALL LETTER E WITH MACRON)
351 { "\xC4\x94", 2, "E", 1 }, // U+0114 -> E (LATIN CAPITAL LETTER E WITH BREVE)
352 { "\xC4\x95", 2, "e", 1 }, // U+0115 -> e (LATIN SMALL LETTER E WITH BREVE)
353 { "\xC4\x96", 2, "E", 1 }, // U+0116 -> E (LATIN CAPITAL LETTER E WITH DOT ABOVE)
354 { "\xC4\x97", 2, "e", 1 }, // U+0117 -> e (LATIN SMALL LETTER E WITH DOT ABOVE)
355 { "\xC4\x98", 2, "E", 1 }, // U+0118 -> E (LATIN CAPITAL LETTER E WITH OGONEK)
356 { "\xC4\x99", 2, "e", 1 }, // U+0119 -> e (LATIN SMALL LETTER E WITH OGONEK)
357 { "\xC4\x9A", 2, "E", 1 }, // U+011A -> E (LATIN CAPITAL LETTER E WITH CARON)
358 { "\xC4\x9B", 2, "e", 1 }, // U+011B -> e (LATIN SMALL LETTER E WITH CARON)
359 { "\xC4\x9C", 2, "G", 1 }, // U+011C -> G (LATIN CAPITAL LETTER G WITH CIRCUMFLEX)
360 { "\xC4\x9D", 2, "g", 1 }, // U+011D -> g (LATIN SMALL LETTER G WITH CIRCUMFLEX)
361 { "\xC4\x9E", 2, "G", 1 }, // U+011E -> G (LATIN CAPITAL LETTER G WITH BREVE)
362 { "\xC4\x9F", 2, "g", 1 }, // U+011F -> g (LATIN SMALL LETTER G WITH BREVE)
363 { "\xC4\xA0", 2, "G", 1 }, // U+0120 -> G (LATIN CAPITAL LETTER G WITH DOT ABOVE)
364 { "\xC4\xA1", 2, "g", 1 }, // U+0121 -> g (LATIN SMALL LETTER G WITH DOT ABOVE)
365 { "\xC4\xA2", 2, "G", 1 }, // U+0122 -> G (LATIN CAPITAL LETTER G WITH CEDILLA)
366 { "\xC4\xA3", 2, "g", 1 }, // U+0123 -> g (LATIN SMALL LETTER G WITH CEDILLA)
367 { "\xC4\xA4", 2, "H", 1 }, // U+0124 -> H (LATIN CAPITAL LETTER H WITH CIRCUMFLEX)
368 { "\xC4\xA5", 2, "h", 1 }, // U+0125 -> h (LATIN SMALL LETTER H WITH CIRCUMFLEX)
369 { "\xC4\xA6", 2, "H", 1 }, // U+0126 -> H (LATIN CAPITAL LETTER H WITH STROKE)
370 { "\xC4\xA7", 2, "h", 1 }, // U+0127 -> h (LATIN SMALL LETTER H WITH STROKE)
371 { "\xC4\xA8", 2, "I", 1 }, // U+0128 -> I (LATIN CAPITAL LETTER I WITH TILDE)
372 { "\xC4\xA9", 2, "i", 1 }, // U+0129 -> i (LATIN SMALL LETTER I WITH TILDE)
373 { "\xC4\xAA", 2, "I", 1 }, // U+012A -> I (LATIN CAPITAL LETTER I WITH MACRON)
374 { "\xC4\xAB", 2, "i", 1 }, // U+012B -> i (LATIN SMALL LETTER I WITH MACRON)
375 { "\xC4\xAC", 2, "I", 1 }, // U+012C -> I (LATIN CAPITAL LETTER I WITH BREVE)
376 { "\xC4\xAD", 2, "i", 1 }, // U+012D -> i (LATIN SMALL LETTER I WITH BREVE)
377 { "\xC4\xAE", 2, "I", 1 }, // U+012E -> I (LATIN CAPITAL LETTER I WITH OGONEK)
378 { "\xC4\xAF", 2, "i", 1 }, // U+012F -> i (LATIN SMALL LETTER I WITH OGONEK)
379 { "\xC4\xB0", 2, "I", 1 }, // U+0130 -> I (LATIN CAPITAL LETTER I WITH DOT ABOVE)
380 { "\xC4\xB1", 2, "i", 1 }, // U+0131 -> i (LATIN SMALL LETTER DOTLESS I)
381 { "\xC4\xB2", 2, "Ij", 2 }, // U+0132 -> Ij (LATIN CAPITAL LIGATURE IJ)
382 { "\xC4\xB3", 2, "ij", 2 }, // U+0133 -> ij (LATIN SMALL LIGATURE IJ)
383 { "\xC4\xB4", 2, "J", 1 }, // U+0134 -> J (LATIN CAPITAL LETTER J WITH CIRCUMFLEX)
384 { "\xC4\xB5", 2, "j", 1 }, // U+0135 -> j (LATIN SMALL LETTER J WITH CIRCUMFLEX)
385 { "\xC4\xB6", 2, "K", 1 }, // U+0136 -> K (LATIN CAPITAL LETTER K WITH CEDILLA)
386 { "\xC4\xB7", 2, "k", 1 }, // U+0137 -> k (LATIN SMALL LETTER K WITH CEDILLA)
387 { "\xC4\xB8", 2, "k", 1 }, // U+0138 -> k (LATIN SMALL LETTER KRA)
388 { "\xC4\xB9", 2, "L", 1 }, // U+0139 -> L (LATIN CAPITAL LETTER L WITH ACUTE)
389 { "\xC4\xBA", 2, "l", 1 }, // U+013A -> l (LATIN SMALL LETTER L WITH ACUTE)
390 { "\xC4\xBB", 2, "L", 1 }, // U+013B -> L (LATIN CAPITAL LETTER L WITH CEDILLA)
391 { "\xC4\xBC", 2, "l", 1 }, // U+013C -> l (LATIN SMALL LETTER L WITH CEDILLA)
392 { "\xC4\xBD", 2, "L", 1 }, // U+013D -> L (LATIN CAPITAL LETTER L WITH CARON)
393 { "\xC4\xBE", 2, "l", 1 }, // U+013E -> l (LATIN SMALL LETTER L WITH CARON)
394 { "\xC4\xBF", 2, "L", 1 }, // U+013F -> L (LATIN CAPITAL LETTER L WITH MIDDLE DOT)
395 { "\xC5\x80", 2, "l", 1 }, // U+0140 -> l (LATIN SMALL LETTER L WITH MIDDLE DOT)
396 { "\xC5\x81", 2, "L", 1 }, // U+0141 -> L (LATIN CAPITAL LETTER L WITH STROKE)
397 { "\xC5\x82", 2, "l", 1 }, // U+0142 -> l (LATIN SMALL LETTER L WITH STROKE)
398 { "\xC5\x83", 2, "N", 1 }, // U+0143 -> N (LATIN CAPITAL LETTER N WITH ACUTE)
399 { "\xC5\x84", 2, "n", 1 }, // U+0144 -> n (LATIN SMALL LETTER N WITH ACUTE)
400 { "\xC5\x85", 2, "N", 1 }, // U+0145 -> N (LATIN CAPITAL LETTER N WITH CEDILLA)
401 { "\xC5\x86", 2, "n", 1 }, // U+0146 -> n (LATIN SMALL LETTER N WITH CEDILLA)
402 { "\xC5\x87", 2, "N", 1 }, // U+0147 -> N (LATIN CAPITAL LETTER N WITH CARON)
403 { "\xC5\x88", 2, "n", 1 }, // U+0148 -> n (LATIN SMALL LETTER N WITH CARON)
404 { "\xC5\x89", 2, "'n", 2 }, // U+0149 -> 'n (LATIN SMALL LETTER N PRECEDED BY APOSTROPHE)
405 { "\xC5\x8A", 2, "Ng", 2 }, // U+014A -> Ng (LATIN CAPITAL LETTER ENG)
406 { "\xC5\x8B", 2, "ng", 2 }, // U+014B -> ng (LATIN SMALL LETTER ENG)
407 { "\xC5\x8C", 2, "O", 1 }, // U+014C -> O (LATIN CAPITAL LETTER O WITH MACRON)
408 { "\xC5\x8D", 2, "o", 1 }, // U+014D -> o (LATIN SMALL LETTER O WITH MACRON)
409 { "\xC5\x8E", 2, "O", 1 }, // U+014E -> O (LATIN CAPITAL LETTER O WITH BREVE)
410 { "\xC5\x8F", 2, "o", 1 }, // U+014F -> o (LATIN SMALL LETTER O WITH BREVE)
411 { "\xC5\x90", 2, "O", 1 }, // U+0150 -> O (LATIN CAPITAL LETTER O WITH DOUBLE ACUTE)
412 { "\xC5\x91", 2, "o", 1 }, // U+0151 -> o (LATIN SMALL LETTER O WITH DOUBLE ACUTE)
413 { "\xC5\x92", 2, "Oe", 2 }, // U+0152 -> Oe (LATIN CAPITAL LIGATURE OE)
414 { "\xC5\x93", 2, "oe", 2 }, // U+0153 -> oe (LATIN SMALL LIGATURE OE)
415 { "\xC5\x94", 2, "R", 1 }, // U+0154 -> R (LATIN CAPITAL LETTER R WITH ACUTE)
416 { "\xC5\x95", 2, "r", 1 }, // U+0155 -> r (LATIN SMALL LETTER R WITH ACUTE)
417 { "\xC5\x96", 2, "R", 1 }, // U+0156 -> R (LATIN CAPITAL LETTER R WITH CEDILLA)
418 { "\xC5\x97", 2, "r", 1 }, // U+0157 -> r (LATIN SMALL LETTER R WITH CEDILLA)
419 { "\xC5\x98", 2, "R", 1 }, // U+0158 -> R (LATIN CAPITAL LETTER R WITH CARON)
420 { "\xC5\x99", 2, "r", 1 }, // U+0159 -> r (LATIN SMALL LETTER R WITH CARON)
421 { "\xC5\x9A", 2, "S", 1 }, // U+015A -> S (LATIN CAPITAL LETTER S WITH ACUTE)
422 { "\xC5\x9B", 2, "s", 1 }, // U+015B -> s (LATIN SMALL LETTER S WITH ACUTE)
423 { "\xC5\x9C", 2, "S", 1 }, // U+015C -> S (LATIN CAPITAL LETTER S WITH CIRCUMFLEX)
424 { "\xC5\x9D", 2, "s", 1 }, // U+015D -> s (LATIN SMALL LETTER S WITH CIRCUMFLEX)
425 { "\xC5\x9E", 2, "S", 1 }, // U+015E -> S (LATIN CAPITAL LETTER S WITH CEDILLA)
426 { "\xC5\x9F", 2, "s", 1 }, // U+015F -> s (LATIN SMALL LETTER S WITH CEDILLA)
427 { "\xC5\xA0", 2, "S", 1 }, // U+0160 -> S (LATIN CAPITAL LETTER S WITH CARON)
428 { "\xC5\xA1", 2, "s", 1 }, // U+0161 -> s (LATIN SMALL LETTER S WITH CARON)
429 { "\xC5\xA2", 2, "T", 1 }, // U+0162 -> T (LATIN CAPITAL LETTER T WITH CEDILLA)
430 { "\xC5\xA3", 2, "t", 1 }, // U+0163 -> t (LATIN SMALL LETTER T WITH CEDILLA)
431 { "\xC5\xA4", 2, "T", 1 }, // U+0164 -> T (LATIN CAPITAL LETTER T WITH CARON)
432 { "\xC5\xA5", 2, "t", 1 }, // U+0165 -> t (LATIN SMALL LETTER T WITH CARON)
433 { "\xC5\xA6", 2, "T", 1 }, // U+0166 -> T (LATIN CAPITAL LETTER T WITH STROKE)
434 { "\xC5\xA7", 2, "t", 1 }, // U+0167 -> t (LATIN SMALL LETTER T WITH STROKE)
435 { "\xC5\xA8", 2, "U", 1 }, // U+0168 -> U (LATIN CAPITAL LETTER U WITH TILDE)
436 { "\xC5\xA9", 2, "u", 1 }, // U+0169 -> u (LATIN SMALL LETTER U WITH TILDE)
437 { "\xC5\xAA", 2, "U", 1 }, // U+016A -> U (LATIN CAPITAL LETTER U WITH MACRON)
438 { "\xC5\xAB", 2, "u", 1 }, // U+016B -> u (LATIN SMALL LETTER U WITH MACRON)
439 { "\xC5\xAC", 2, "U", 1 }, // U+016C -> U (LATIN CAPITAL LETTER U WITH BREVE)
440 { "\xC5\xAD", 2, "u", 1 }, // U+016D -> u (LATIN SMALL LETTER U WITH BREVE)
441 { "\xC5\xAE", 2, "U", 1 }, // U+016E -> U (LATIN CAPITAL LETTER U WITH RING ABOVE)
442 { "\xC5\xAF", 2, "u", 1 }, // U+016F -> u (LATIN SMALL LETTER U WITH RING ABOVE)
443 { "\xC5\xB0", 2, "U", 1 }, // U+0170 -> U (LATIN CAPITAL LETTER U WITH DOUBLE ACUTE)
444 { "\xC5\xB1", 2, "u", 1 }, // U+0171 -> u (LATIN SMALL LETTER U WITH DOUBLE ACUTE)
445 { "\xC5\xB2", 2, "U", 1 }, // U+0172 -> U (LATIN CAPITAL LETTER U WITH OGONEK)
446 { "\xC5\xB3", 2, "u", 1 }, // U+0173 -> u (LATIN SMALL LETTER U WITH OGONEK)
447 { "\xC5\xB4", 2, "W", 1 }, // U+0174 -> W (LATIN CAPITAL LETTER W WITH CIRCUMFLEX)
448 { "\xC5\xB5", 2, "w", 1 }, // U+0175 -> w (LATIN SMALL LETTER W WITH CIRCUMFLEX)
449 { "\xC5\xB6", 2, "Y", 1 }, // U+0176 -> Y (LATIN CAPITAL LETTER Y WITH CIRCUMFLEX)
450 { "\xC5\xB7", 2, "y", 1 }, // U+0177 -> y (LATIN SMALL LETTER Y WITH CIRCUMFLEX)
451 { "\xC5\xB8", 2, "Y", 1 }, // U+0178 -> Y (LATIN CAPITAL LETTER Y WITH DIAERESIS)
452 { "\xC5\xB9", 2, "Z", 1 }, // U+0179 -> Z (LATIN CAPITAL LETTER Z WITH ACUTE)
453 { "\xC5\xBA", 2, "z", 1 }, // U+017A -> z (LATIN SMALL LETTER Z WITH ACUTE)
454 { "\xC5\xBB", 2, "Z", 1 }, // U+017B -> Z (LATIN CAPITAL LETTER Z WITH DOT ABOVE)
455 { "\xC5\xBC", 2, "z", 1 }, // U+017C -> z (LATIN SMALL LETTER Z WITH DOT ABOVE)
456 { "\xC5\xBD", 2, "Z", 1 }, // U+017D -> Z (LATIN CAPITAL LETTER Z WITH CARON)
457 { "\xC5\xBE", 2, "z", 1 }, // U+017E -> z (LATIN SMALL LETTER Z WITH CARON)
458 { "\xC5\xBF", 2, "s", 1 }, // U+017F -> s (LATIN SMALL LETTER LONG S)
460 // U+2000 ... U+206F (General Punctuation)
461 { "\xE2\x80\x90", 3, "-", 1 }, // U+2010 -> - (HYPHEN)
462 { "\xE2\x80\x91", 3, "-", 1 }, // U+2011 -> - (NON-BREAKING HYPHEN)
463 { "\xE2\x80\x92", 3, "--", 2 }, // U+2012 -> -- (FIGURE DASH)
464 { "\xE2\x80\x93", 3, "--", 2 }, // U+2013 -> -- (EN DASH)
465 { "\xE2\x80\x94", 3, "---", 3 }, // U+2014 -> --- (EM DASH)
466 { "\xE2\x80\x95", 3, "---", 3 }, // U+2015 -> --- (HORIZONTAL BAR)
467 { "\xE2\x80\x96", 3, "||", 2 }, // U+2016 -> || (DOUBLE VERTICAL LINE)
468 { "\xE2\x80\x97", 3, "_", 1 }, // U+2017 -> _ (DOUBLE LOW LINE)
469 { "\xE2\x80\x98", 3, "`", 1 }, // U+2018 -> ` (LEFT SINGLE QUOTATION MARK)
470 { "\xE2\x80\x99", 3, "'", 1 }, // U+2019 -> ' (RIGHT SINGLE QUOTATION MARK)
471 { "\xE2\x80\x9A", 3, ",", 1 }, // U+201A -> , (SINGLE LOW-9 QUOTATION MARK)
472 { "\xE2\x80\x9B", 3, "'", 1 }, // U+201B -> ' (SINGLE HIGH-REVERSED-9 QUOTATION MARK)
473 { "\xE2\x80\x9C", 3, "\"", 1 }, // U+201C -> " (LEFT DOUBLE QUOTATION MARK)
474 { "\xE2\x80\x9D", 3, "\"", 1 }, // U+201D -> " (RIGHT DOUBLE QUOTATION MARK)
475 { "\xE2\x80\x9E", 3, ",,", 2 }, // U+201E -> ,, (DOUBLE LOW-9 QUOTATION MARK)
476 { "\xE2\x80\x9F", 3, "``", 2 }, // U+201F -> `` (DOUBLE HIGH-REVERSED-9 QUOTATION MARK)
477 { "\xE2\x80\xA0", 3, "+", 1 }, // U+2020 -> + (DAGGER)
478 { "\xE2\x80\xA1", 3, "+", 1 }, // U+2021 -> + (DOUBLE DAGGER)
479 { "\xE2\x80\xA2", 3, "\xC2\xB7", -2 }, // U+2022 -> U+00B7 (BULLET) -> (MIDDLE POINT)
480 { "\xE2\x80\xA3", 3, ".", 1 }, // U+2023 -> . (TRIANGULAR BULLET)
481 { "\xE2\x80\xA4", 3, ".", 1 }, // U+2024 -> . (ONE DOT LEADER)
482 { "\xE2\x80\xA5", 3, "..", 2 }, // U+2025 -> .. (TWO DOT LEADER)
483 { "\xE2\x80\xA6", 3, "...", 3 }, // U+2026 -> ... (HORIZONTAL ELLIPSIS)
484 { "\xE2\x80\xA7", 3, "\xC2\xB7", -2 }, // U+2027 -> U+00B7 (HYPHENATION POINT) -> (MIDDLE POINT)
485 { "\xE2\x80\xB0", 3, "%.", 2 }, // U+2030 -> %. (PER MILLE SIGN)
486 { "\xE2\x80\xB1", 3, "%..", 3 }, // U+2031 -> %.. (PER TEN THOUSAND SIGN)
487 { "\xE2\x80\xB2", 3, "'", 1 }, // U+2032 -> ' (PRIME)
488 { "\xE2\x80\xB3", 3, "''", 2 }, // U+2033 -> '' (DOUBLE PRIME)
489 { "\xE2\x80\xB4", 3, "'''", 3 }, // U+2034 -> ''' (TRIPLE PRIME)
490 { "\xE2\x80\xB5", 3, "`", 1 }, // U+2035 -> ` (REVERSED PRIME)
491 { "\xE2\x80\xB6", 3, "``", 2 }, // U+2036 -> `` (REVERSED DOUBLE PRIME)
492 { "\xE2\x80\xB7", 3, "```", 3 }, // U+2037 -> ``` (REVERSED TRIPLE PRIME)
493 { "\xE2\x80\xB8", 3, "^", 1 }, // U+2038 -> ^ (CARET)
494 { "\xE2\x80\xB9", 3, "<", 1 }, // U+2039 -> < (SINGLE LEFT-POINTING ANGLE QUOTATION MARK)
495 { "\xE2\x80\xBA", 3, ">", 1 }, // U+203A -> > (SINGLE RIGHT-POINTING ANGLE QUOTATION MARK)
496 { "\xE2\x80\xBB", 3, "\xC3\x97", -2 }, // U+203B -> U+00D7 (REFERENCE MARK) -> (MULTIPLICATION SIGN)
497 { "\xE2\x80\xBC", 3, "!!", 2 }, // U+203C -> !! (DOUBLE EXCLAMATION MARK)
498 { "\xE2\x80\xBD", 3, "?", 1 }, // U+203D -> ? (INTERROBANG)
499 { "\xE2\x81\x82", 3, "*", 1 }, // U+2042 -> * (ASTERISM)
500 { "\xE2\x81\x83", 3, ".", 1 }, // U+2043 -> . (HYPHEN BULLET)
501 { "\xE2\x81\x84", 3, "/", 1 }, // U+2044 -> / (FRACTION SLASH)
502 { "\xE2\x81\x87", 3, "??", 2 }, // U+2047 -> ?? (DOUBLE QUESTION MARK)
503 { "\xE2\x81\x88", 3, "?!", 2 }, // U+2048 -> ?! (QUESTION EXCLAMATION MARK)
504 { "\xE2\x81\x89", 3, "!?", 2 }, // U+2049 -> !? (EXCLAMATION QUESTION MARK)
505 { "\xE2\x81\x8E", 3, "*", 1 }, // U+204E -> * (LOW ASTERISK)
506 { "\xE2\x81\x8F", 3, ";", 1 }, // U+204F -> ; (REVERSED SEMICOLON)
507 { "\xE2\x81\x91", 3, "*", 1 }, // U+2051 -> * (TWO ASTERISKS ALIGNED VERTICALLY)
508 { "\xE2\x81\x92", 3, "-", 1 }, // U+2052 -> - (COMMERCIAL MINUS SIGN)
509 { "\xE2\x81\x93", 3, "~", 1 }, // U+2053 -> ~ (SWUNG DASH)
510 { "\xE2\x81\x95", 3, "*", 1 }, // U+2055 -> * (FLOWER PUNCTUATION MARK)
511 { "\xE2\x81\x97", 3, "''''", 4 }, // U+2057 -> '''' (QUADRUPLE PRIME)
512 { "\xE2\x81\x9A", 3, ":", 1 }, // U+205A -> : (TWO DOT PUNCTUATION)
513 { "\xE2\x81\x9C", 3, "+", 1 }, // U+205C -> + (DOTTED CROSS)
515 // U+20A0 ... U+20CF (Currency Symbols)
516 { "\xE2\x82\xA0", 3, "ECU", 3 }, // U+20A0 -> ECU (EURO-CURRENCY SIGN)
517 { "\xE2\x82\xA1", 3, "CRC", 3 }, // U+20A1 -> CRC (COLON SIGN)
518 { "\xE2\x82\xA2", 3, "BRC", 3 }, // U+20A2 -> BRC (CRUZEIRO SIGN)
// NOTE(review): "BEF" looks like the currency code of the Belgian franc; "FRF"
// may have been intended for the French franc sign - confirm before changing.
519 { "\xE2\x82\xA3", 3, "BEF", 3 }, // U+20A3 -> BEF (FRENCH FRANC SIGN)
520 { "\xE2\x82\xA4", 3, "ITL", 3 }, // U+20A4 -> ITL (LIRA SIGN)
521 { "\xE2\x82\xA6", 3, "NGN", 3 }, // U+20A6 -> NGN (NAIRA SIGN)
522 { "\xE2\x82\xA7", 3, "ESP", 3 }, // U+20A7 -> ESP (PESETA SIGN)
523 { "\xE2\x82\xA8", 3, "MVQ", 3 }, // U+20A8 -> MVQ (RUPEE SIGN)
524 { "\xE2\x82\xA9", 3, "KPW", 3 }, // U+20A9 -> KPW (WON SIGN)
525 { "\xE2\x82\xAA", 3, "ILS", 3 }, // U+20AA -> ILS (NEW SHEQEL SIGN)
526 { "\xE2\x82\xAB", 3, "VNC", 3 }, // U+20AB -> VNC (DONG SIGN)
527 { "\xE2\x82\xAC", 3, "EUR", 3 }, // U+20AC -> EUR (EURO SIGN)
528 { "\xE2\x82\xAD", 3, "LAK", 3 }, // U+20AD -> LAK (KIP SIGN)
529 { "\xE2\x82\xAE", 3, "MNT", 3 }, // U+20AE -> MNT (TUGRIK SIGN)
530 { "\xE2\x82\xAF", 3, "GRD", 3 }, // U+20AF -> GRD (DRACHMA SIGN)
531 { "\xE2\x82\xB0", 3, "Pf", 2 }, // U+20B0 -> Pf (GERMAN PENNY SIGN)
532 { "\xE2\x82\xB1", 3, "P", 1 }, // U+20B1 -> P (PESO SIGN)
533 { "\xE2\x82\xB2", 3, "PYG", 3 }, // U+20B2 -> PYG (GUARANI SIGN)
534 { "\xE2\x82\xB3", 3, "ARA", 3 }, // U+20B3 -> ARA (AUSTRAL SIGN)
535 { "\xE2\x82\xB4", 3, "UAH", 3 }, // U+20B4 -> UAH (HRYVNIA SIGN)
536 { "\xE2\x82\xB5", 3, "GHS", 3 }, // U+20B5 -> GHS (CEDI SIGN)
538 // U+2190 ... U+21FF (Arrows)
539 { "\xE2\x86\x90", 3, "<-", 2 }, // U+2190 -> <- (LEFTWARDS ARROW)
540 { "\xE2\x86\x92", 3, "->", 2 }, // U+2192 -> -> (RIGHTWARDS ARROW)
545 // start with no replacement string
548 // perform a binary search in the lookup table
549 if((rep
= bsearch(&key
, utf8map
, sizeof(utf8map
) / sizeof(utf8map
[0]), sizeof(utf8map
[0]), compareUTF8Replacements
)) != NULL
)
551 // if we found something, then copy this over to the result variables
561 /// matchCodesetAlias()
// Maps an official MIME codeset name to the alternative spellings that
// matchCodesetAlias() accepts for it.
563 struct CodesetAliases
565 const char *MIMEname
; // The official and correct MIME name for a codeset
566 const char *Aliases
; // A space separated list of well-known aliases
// Alias table scanned front to back by matchCodesetAlias(), which stops at the
// first entry whose MIMEname is NULL.
// NOTE(review): that terminating entry is not visible in this view - make sure
// the table really ends with a { NULL, NULL } element.
569 const struct CodesetAliases codesetAliases
[] =
572 { "Amiga-1251", "Ami1251 Amiga1251" },
573 { "AmigaPL", "AmiPL Amiga-PL" },
574 { "ISO-8859-1", "ISO8859-1 8859-1" },
575 { "ISO-8859-2", "ISO8859-2 8859-2" },
576 { "ISO-8859-3", "ISO8859-3 8859-3" },
577 { "ISO-8859-4", "ISO8859-4 8859-4" },
578 { "ISO-8859-5", "ISO8859-5 8859-5" },
579 { "ISO-8859-6", "ISO8859-6 8859-6" },
580 { "ISO-8859-7", "ISO8859-7 8859-7" },
581 { "ISO-8859-8", "ISO8859-8 8859-8" },
582 { "ISO-8859-9", "ISO8859-9 8859-9" },
583 { "ISO-8859-10", "ISO8859-10 8859-10" },
584 { "ISO-8859-11", "ISO8859-11 8859-11" },
585 { "ISO-8859-12", "ISO8859-12 8859-12" },
586 { "ISO-8859-13", "ISO8859-13 8859-13" },
587 { "ISO-8859-14", "ISO8859-14 8859-14" },
588 { "ISO-8859-15", "ISO8859-15 8859-15" },
589 { "ISO-8859-16", "ISO8859-16 8859-16" },
// NOTE(review): the next entry repeats the "ISO-8859-10" entry from above; as
// the table is scanned in order the earlier entry always matches first, so
// this one appears to be redundant.
590 { "ISO-8859-10", "ISO8859-10 8859-10" },
591 { "KOI8-R", "KOI8R" },
592 { "US-ASCII", "ASCII" },
593 { "UTF-8", "UTF8 UTF" },
594 { "UTF-16", "UTF16" },
595 { "UTF-32", "UTF32" },
596 { "windows-1250", "cp1250 windows1250" },
597 { "windows-1251", "cp1251 windows1251" },
598 { "windows-1252", "cp1252 windows1252" },
599 { "windows-1253", "cp1253 windows1253" },
600 { "windows-1254", "cp1254 windows1254" },
601 { "windows-1255", "cp1255 windows1255" },
602 { "windows-1256", "cp1256 windows1256" },
603 { "windows-1257", "cp1257 windows1257" },
// matchCodesetAlias(): translate a codeset name given by the caller into the
// official MIME name stored in codesetAliases[]. Each table entry is checked
// against its MIMEname first and against its space separated aliases second;
// all comparisons ignore case. On a match result is set to the entry's
// MIMEname (a pointer into the static table, not a copy).
// NOTE(review): the return statement is not visible here - presumably result
// is returned and is NULL when nothing matched; verify.
607 static char *matchCodesetAlias(const char *search
)
610 size_t len
= strlen(search
);
// the table ends at the first entry without a MIMEname
615 for(i
=0; codesetAliases
[i
].MIMEname
!= NULL
; i
++)
619 // search the MIMEname first
620 if(stricmp(search
, codesetAliases
[i
].MIMEname
) == 0)
624 const char *s
= codesetAliases
[i
].Aliases
;
626 // loop through space separated list of aliases
627 while(s
!= NULL
&& *s
!= '\0')
// NOTE(review): only the first len characters of the alias are compared, so a
// search string that is a proper prefix of an alias (e.g. "cp125" vs "cp1250")
// matches as well, unless the lines not shown here add a boundary check.
629 if(strnicmp(search
, s
, len
) == 0)
// advance to the next separator; strpbrk() yields NULL after the last alias
635 if((s
= strpbrk(s
, " ")) != NULL
)
642 result
= (char *)codesetAliases
[i
].MIMEname
;
654 /**************************************************************************/
// defaultCodeset(): determine the codeset to use as default. The name is taken
// from the global variable "codeset_default"; if that is empty or no codeset of
// that name is found in CodesetsBase->codesets, CodesetsBase->systemCodeset is
// used instead. With useSemaphore == TRUE the lookup is bracketed by a shared
// lock on CodesetsBase->libSem.
657 static struct codeset
*
658 defaultCodeset(BOOL useSemaphore
)
661 struct codeset
*codeset
;
665 if(useSemaphore
== TRUE
)
666 ObtainSemaphoreShared(&CodesetsBase
->libSem
);
// NOTE(review): the result of GetVar() is not checked in the visible code; the
// test of buf[0] below relies on buf being pre-initialized (presumably done on
// a line that is not visible here) in case the variable does not exist.
669 GetVar("codeset_default",buf
,sizeof(buf
),GVF_GLOBAL_ONLY
);
// fall back to the system codeset for an empty or unknown name
671 if(buf
[0] == '\0' || (codeset
= codesetsFind(&CodesetsBase
->codesets
,buf
)) == NULL
)
672 codeset
= CodesetsBase
->systemCodeset
;
674 if(useSemaphore
== TRUE
)
675 ReleaseSemaphore(&CodesetsBase
->libSem
);
681 /// codesetsCmpUnicode()
682 // The compare function used with qsort() to build a codeset's table_sorted.
// Two table entries are ordered by strcmp() on their UTF8 byte sequence, which
// starts at utf8[1]; utf8[0] is skipped (the table builders in this file store
// the length of the sequence there).
684 codesetsCmpUnicode(struct single_convert
*arg1
,struct single_convert
*arg2
)
686 return strcmp((char*)&arg1
->utf8
[1], (char*)&arg2
->utf8
[1]);
689 /// codesetsReadTable()
// keywords recognized at the start of a line of a charset file
691 #define ITEM_STANDARD "Standard"
692 #define ITEM_ALTSTANDARD "AltStandard"
693 #define ITEM_READONLY "ReadOnly"
694 #define ITEM_CHARACTERIZATION "Characterization"
696 // Reads a coding table from the file `name` and adds it to csList.
// The file is parsed line by line: keyword lines set the codeset's name,
// alternative name, read-only flag and characterization, mapping lines assign
// a Unicode value to one of the 256 table slots. Afterwards the UTF8 form of
// every slot is generated, a sorted copy of the table is built and the codeset
// is appended to csList - unless it has no name or a codeset of the same name
// is already part of csList.
// NOTE(review): the return value and the closing of fh are not visible in this
// view.
698 codesetsReadTable(struct codesetList
*csList
, STRPTR name
)
705 D(DBF_STARTUP
, "trying to fetch charset file '%s'...", name
);
707 if((fh
= Open(name
, MODE_OLDFILE
)))
709 struct codeset
*codeset
;
711 if((codeset
= (struct codeset
*)allocVecPooled(CodesetsBase
->pool
, sizeof(struct codeset
))) != NULL
)
// start with a zeroed structure and an identity mapping (code == ucs4 == i)
716 memset(codeset
,0,sizeof(struct codeset
));
718 for(i
= 0; i
<256; i
++)
719 codeset
->table
[i
].code
= codeset
->table
[i
].ucs4
= i
;
// buf is read with a limit of 512 bytes (its declaration is not visible here)
721 while(readLine(fh
, buf
, 512*sizeof(char)))
728 if((result
= getConfigItem(buf
, ITEM_STANDARD
, strlen(ITEM_STANDARD
))))
729 codeset
->name
= mystrdup(result
);
730 else if(codeset
->name
== NULL
) // a valid file starts with standard and nothing else!!
732 else if((result
= getConfigItem(buf
,ITEM_ALTSTANDARD
,strlen(ITEM_ALTSTANDARD
))))
733 codeset
->alt_name
= mystrdup(result
);
734 else if((result
= getConfigItem(buf
,ITEM_READONLY
,strlen(ITEM_READONLY
))))
// any non-zero number is normalized to 1
735 codeset
->read_only
= !!atoi(result
);
736 else if((result
= getConfigItem(buf
,ITEM_CHARACTERIZATION
,strlen(ITEM_CHARACTERIZATION
))))
// a characterization written as _("text") is reduced to the text between the
// quotes, anything else is taken over unchanged
738 if((result
[0]=='_') && (result
[1]=='(') && (result
[2]=='"'))
740 char *end
= strchr(result
+ 3, '"');
743 codeset
->characterization
= mystrndup(result
+3,end
-(result
+3));
746 codeset
->characterization
= mystrdup(result
);
// mapping line: starts either with '=' or with a hexadecimal number
// NOTE(review): the "0x" test combines its two character checks with ||, so a
// leading '0' alone or an 'x' in second position alone is sufficient; && may
// have been intended - confirm.
753 if((*p
=='=') || (fmt2
= ((*p
=='0') || (*(p
+1)=='x'))))
// the slot number is parsed as a hexadecimal value
// NOTE(review): no check that i lies within 0..255 is visible before it is
// used as index into codeset->table below; a malformed file could write
// outside the table - verify against the full source.
758 i
= strtol((const char *)p
,(char **)&p
,16);
761 while(isspace(*p
)) p
++;
// Unicode value given as "U+<hex>" ...
763 if(!strnicmp(p
, "U+", 2))
766 codeset
->table
[i
].ucs4
= strtol((const char *)p
,(char **)&p
,16);
// ... or as a number whose base strtol() derives from its prefix (base 0)
771 codeset
->table
[i
].ucs4
= strtol((const char *)p
,(char **)&p
,0);
778 // check if there is not already codeset with the same name in here
779 if(codeset
->name
!= NULL
&& !(codesetsFind(csList
, codeset
->name
)))
// convert the UCS4 value of slot i to UTF8, written to utf8[1] onwards
783 UTF32 src
= codeset
->table
[i
].ucs4
, *src_ptr
= &src
;
784 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
786 CodesetsConvertUTF32toUTF8((const UTF32
**)&src_ptr
,src_ptr
+1,&dest_ptr
,dest_ptr
+6,CSF_StrictConversion
);
// utf8[0] receives the number of bytes written: the distance between the
// advanced dest_ptr and the start of the sequence at utf8[1]
788 codeset
->table
[i
].utf8
[0] = (ULONG
)dest_ptr
-(ULONG
)(&codeset
->table
[i
].utf8
[1]);
// table_sorted is a copy of the table ordered by UTF8 sequence
791 memcpy(codeset
->table_sorted
, codeset
->table
, sizeof(codeset
->table
));
792 qsort(codeset
->table_sorted
, 256, sizeof(codeset
->table
[0]), (int (*)(const void *arg1
,const void *arg2
))codesetsCmpUnicode
);
793 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
// the codeset is not used: release the strings and the structure again
// NOTE(review): codeset itself was obtained with allocVecPooled() above but is
// released with freeArbitrateVecPooled() here - confirm that these two calls
// are compatible.
800 if(codeset
->name
) freeArbitrateVecPooled(codeset
->name
);
801 if(codeset
->alt_name
) freeArbitrateVecPooled(codeset
->alt_name
);
802 if(codeset
->characterization
) freeArbitrateVecPooled(codeset
->characterization
);
803 freeArbitrateVecPooled(codeset
);
814 /// codesetsScanDir()
// Scans the directory dirPath and passes the full path of the entries found
// there to codesetsReadTable(), which adds the codesets to csList. A NULL or
// empty dirPath is ignored. Two implementations exist: one based on
// ObtainDirContextTags()/ExamineDir() for AmigaOS4 and one based on
// Lock()/ExAll() (presumably the #else branch for all other systems - the
// #else/#endif lines are not visible in this view).
816 codesetsScanDir(struct codesetList
*csList
, const char *dirPath
)
820 if(dirPath
!= NULL
&& dirPath
[0] != '\0')
822 #if defined(__amigaos4__)
// request name and type of every directory entry
825 if((dirContext
= ObtainDirContextTags(EX_StringNameInput
, dirPath
,
826 EX_DataFields
, EXF_NAME
|EXF_TYPE
,
829 struct ExamineData
*exd
;
831 D(DBF_STARTUP
, "scanning directory '%s' for codesets tables", dirPath
);
833 while((exd
= ExamineDir(dirContext
)) != NULL
)
// build "<dirPath>/<entry name>" in filePath
839 strlcpy(filePath
, dirPath
, sizeof(filePath
));
840 AddPart(filePath
, exd
->Name
, sizeof(filePath
));
842 D(DBF_STARTUP
, "about to read codeset table '%s'", filePath
);
844 codesetsReadTable(csList
, filePath
);
848 ReleaseDirContext(dirContext
);
// NOTE(review): the matching UnLock(dirLock) is not visible in this view
853 if((dirLock
= Lock(dirPath
, ACCESS_READ
)))
855 struct ExAllControl
*eac
;
857 D(DBF_STARTUP
, "scanning directory '%s' for codesets tables", dirPath
);
859 if((eac
= AllocDosObject(DOS_EXALLCONTROL
, NULL
)) != NULL
)
861 struct ExAllData
*ead
;
862 struct ExAllData
*eabuffer
;
// start a fresh scan without any name filtering
865 eac
->eac_LastKey
= 0;
866 eac
->eac_MatchString
= NULL
;
867 eac
->eac_MatchFunc
= NULL
;
// buffer that ExAll() fills with a batch of entries per call
869 if((eabuffer
= allocVecPooled(CodesetsBase
->pool
, 10*sizeof(struct ExAllData
))) != NULL
)
// ED_TYPE: name and type are requested for every entry
875 more
= ExAll(dirLock
, eabuffer
, 10*sizeof(struct ExAllData
), ED_TYPE
, eac
);
// a failure other than "no more entries" is a real error
876 if(!more
&& IoErr() != ERROR_NO_MORE_ENTRIES
)
// nothing was returned by this call
879 if(eac
->eac_Entries
== 0)
882 ead
= (struct ExAllData
*)eabuffer
;
885 // we only take that ead if it is a file (ed_Type < 0)
888 strlcpy(filePath
, dirPath
, sizeof(filePath
));
889 AddPart(filePath
, (char *)ead
->ed_Name
, sizeof(filePath
));
891 D(DBF_STARTUP
, "about to read codeset table '%s'", filePath
);
893 codesetsReadTable(csList
, filePath
);
// continue with the next entry of the current batch
896 while((ead
= ead
->ed_Next
));
900 freeVecPooled(CodesetsBase
->pool
, eabuffer
);
903 FreeDosObject(DOS_EXALLCONTROL
, eac
);
916 // Initializes and loads the codesets
918 codesetsInit(struct codesetList
*csList
)
920 struct codeset
*codeset
= NULL
;
923 #if defined(__amigaos4__)
929 ObtainSemaphore(&CodesetsBase
->poolSem
);
931 NewList((struct List
*)&CodesetsBase
->codesets
);
933 // to make the list of the supported codesets complete we also add fake
934 // 'UTF-8' , 'UTF-16' and 'UTF-32' only so that our users can query for those codesets as well.
935 if((codeset
= allocVecPooled(CodesetsBase
->pool
, sizeof(struct codeset
))) == NULL
)
938 codeset
->name
= mystrdup("UTF-8");
939 codeset
->alt_name
= mystrdup("UTF8");
940 codeset
->characterization
= mystrdup("Unicode");
941 codeset
->read_only
= 0;
942 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
943 CodesetsBase
->utf8Codeset
= codeset
;
945 if((codeset
= allocVecPooled(CodesetsBase
->pool
, sizeof(struct codeset
))) == NULL
)
948 codeset
->name
= mystrdup("UTF-16");
949 codeset
->alt_name
= mystrdup("UTF16");
950 codeset
->characterization
= mystrdup("16-bit Unicode");
951 codeset
->read_only
= 0;
952 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
953 CodesetsBase
->utf16Codeset
= codeset
;
955 if((codeset
= allocVecPooled(CodesetsBase
->pool
, sizeof(struct codeset
))) == NULL
)
958 codeset
->name
= mystrdup("UTF-32");
959 codeset
->alt_name
= mystrdup("UTF32");
960 codeset
->characterization
= mystrdup("32-bit Unicode");
961 codeset
->read_only
= 0;
962 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
963 CodesetsBase
->utf32Codeset
= codeset
;
965 // on AmigaOS4 we can use diskfont.library to inquire charset information as
966 // it comes with a quite rich implementation of different charsets.
967 #if defined(__amigaos4__)
973 ULONG curMIB
= nextMIB
;
975 nextMIB
= ObtainCharsetInfo(DFCS_NUMBER
, curMIB
, DFCS_NEXTNUMBER
);
979 mapTable
= (ULONG
*)ObtainCharsetInfo(DFCS_NUMBER
, curMIB
, DFCS_MAPTABLE
);
980 mimename
= (char *)ObtainCharsetInfo(DFCS_NUMBER
, curMIB
, DFCS_MIMENAME
);
981 ianaName
= (char *)ObtainCharsetInfo(DFCS_NUMBER
, curMIB
, DFCS_NAME
);
982 if(mapTable
!= NULL
&& mimename
!= NULL
&& codesetsFind(csList
, mimename
) == NULL
)
984 D(DBF_STARTUP
, "loading charset '%s' from diskfont.library...", mimename
);
986 if((codeset
= allocVecPooled(CodesetsBase
->pool
, sizeof(struct codeset
))) == NULL
)
989 codeset
->name
= mystrdup(mimename
);
990 codeset
->alt_name
= NULL
;
991 codeset
->characterization
= mystrdup(ianaName
);
992 codeset
->read_only
= 0;
996 UTF32
*src_ptr
= &src
;
997 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
1001 codeset
->table
[i
].code
= i
;
1002 codeset
->table
[i
].ucs4
= src
;
1003 CodesetsConvertUTF32toUTF8((const UTF32
**)&src_ptr
, src_ptr
+1, &dest_ptr
, dest_ptr
+6, CSF_StrictConversion
);
1005 codeset
->table
[i
].utf8
[0] = (ULONG
)dest_ptr
-(ULONG
)&codeset
->table
[i
].utf8
[1];
1008 memcpy(codeset
->table_sorted
,codeset
->table
,sizeof(codeset
->table
));
1009 qsort(codeset
->table_sorted
,256,sizeof(codeset
->table
[0]),(int (*)(const void *arg1
, const void *arg2
))codesetsCmpUnicode
);
1011 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
1017 #if defined(__MORPHOS__)
1019 struct Library
*KeymapBase
;
1020 struct Library
*LocaleBase
;
1022 if((KeymapBase
= OpenLibrary("keymap.library", 51)) != NULL
)
1024 if((LocaleBase
= OpenLibrary("locale.library", 51)) != NULL
)
1026 struct KeyMap
*keymap
= AskKeyMapDefault();
1027 CONST_STRPTR name
= GetKeyMapCodepage(keymap
);
1029 if(name
!= NULL
&& keymap
!= NULL
) // Legacy keymaps dont have codepage or Unicode mappings
1031 D(DBF_STARTUP
, "loading charset '%s' from keymap.library...", name
);
1033 if((codeset
= allocVecPooled(CodesetsBase
->pool
, sizeof(struct codeset
))) != NULL
)
1035 codeset
->name
= mystrdup(name
);
1036 codeset
->alt_name
= NULL
;
1037 codeset
->characterization
= mystrdup(name
); // No more information available
1038 codeset
->read_only
= 0;
1040 for(i
=0; i
<256; i
++)
1042 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
1045 codeset
->table
[i
].code
= i
;
1046 codeset
->table
[i
].ucs4
= src
= ToUCS4(i
, keymap
);
1047 rc
= ConvertUCS4ToUTF8((CONST_WSTRPTR
)&src
, dest_ptr
, 1);
1049 codeset
->table
[i
].utf8
[0] = rc
;
1052 memcpy(codeset
->table_sorted
,codeset
->table
,sizeof(codeset
->table
));
1053 qsort(codeset
->table_sorted
,256,sizeof(codeset
->table
[0]),(int (*)(const void *arg1
, const void *arg2
))codesetsCmpUnicode
);
1055 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
1061 CloseLibrary(LocaleBase
);
1064 CloseLibrary(KeymapBase
);
1069 D(DBF_STARTUP
, "loading charsets from Libs:Charsets...");
1071 // we try to walk to the LIBS:Charsets directory on our own and readin our
1072 // own charset tables
1073 codesetsScanDir(csList
, "LIBS:Charsets");
1076 // now we go and initialize our internally supported codesets but only if
1077 // we have not already loaded a charset with the same name
1079 D(DBF_STARTUP
, "initializing internal charsets...");
1081 // ISO-8859-1 + EURO
1082 if(codesetsFind(csList
, "ISO-8859-1 + Euro") == NULL
)
1084 if((codeset
= allocVecPooled(CodesetsBase
->pool
, sizeof(struct codeset
))) == NULL
)
1087 codeset
->name
= mystrdup("ISO-8859-1 + Euro");
1088 codeset
->alt_name
= NULL
;
1089 codeset
->characterization
= mystrdup("West European (with EURO)");
1090 codeset
->read_only
= 1;
1091 for(i
= 0; i
<256; i
++)
1093 UTF32
*src_ptr
= &src
;
1094 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
1097 src
= 0x20AC; /* the EURO sign */
1101 codeset
->table
[i
].code
= i
;
1102 codeset
->table
[i
].ucs4
= src
;
1103 CodesetsConvertUTF32toUTF8((const UTF32
**)&src_ptr
, src_ptr
+1, &dest_ptr
, dest_ptr
+6, CSF_StrictConversion
);
1105 codeset
->table
[i
].utf8
[0] = (ULONG
)dest_ptr
-(ULONG
)&codeset
->table
[i
].utf8
[1];
1107 memcpy(codeset
->table_sorted
,codeset
->table
,sizeof(codeset
->table
));
1108 qsort(codeset
->table_sorted
,256,sizeof(codeset
->table
[0]),(int (*)(const void *arg1
, const void *arg2
))codesetsCmpUnicode
);
1109 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
1113 if(codesetsFind(csList
, "ISO-8859-1") == NULL
)
1115 if((codeset
= allocVecPooled(CodesetsBase
->pool
, sizeof(struct codeset
))) == NULL
)
1118 codeset
->name
= mystrdup("ISO-8859-1");
1119 codeset
->alt_name
= mystrdup("ISO8859-1");
1120 codeset
->characterization
= mystrdup("West European");
1121 codeset
->read_only
= 0;
1122 for(i
= 0; i
<256; i
++)
1124 UTF32
*src_ptr
= &src
;
1125 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
1129 codeset
->table
[i
].code
= i
;
1130 codeset
->table
[i
].ucs4
= src
;
1131 CodesetsConvertUTF32toUTF8((const UTF32
**)&src_ptr
, src_ptr
+1, &dest_ptr
, dest_ptr
+6, CSF_StrictConversion
);
1133 codeset
->table
[i
].utf8
[0] = (ULONG
)dest_ptr
-(ULONG
)&codeset
->table
[i
].utf8
[1];
1135 memcpy(codeset
->table_sorted
,codeset
->table
,sizeof(codeset
->table
));
1136 qsort(codeset
->table_sorted
,256,sizeof(codeset
->table
[0]),(int (*)(const void *arg1
,const void *arg2
))codesetsCmpUnicode
);
1137 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
1141 if(codesetsFind(csList
, "ISO-8859-2") == NULL
)
1143 if((codeset
= allocVecPooled(CodesetsBase
->pool
, sizeof(struct codeset
))) == NULL
)
1146 codeset
->name
= mystrdup("ISO-8859-2");
1147 codeset
->alt_name
= mystrdup("ISO8859-2");
1148 codeset
->characterization
= mystrdup("Central/East European");
1149 codeset
->read_only
= 0;
1150 for(i
= 0; i
<256; i
++)
1152 UTF32
*src_ptr
= &src
;
1153 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
1158 src
= iso_8859_2_to_ucs4
[i
-0xa0];
1160 codeset
->table
[i
].code
= i
;
1161 codeset
->table
[i
].ucs4
= src
;
1162 CodesetsConvertUTF32toUTF8((const UTF32
**)&src_ptr
, src_ptr
+1, &dest_ptr
,dest_ptr
+6, CSF_StrictConversion
);
1164 codeset
->table
[i
].utf8
[0] = (ULONG
)dest_ptr
-(ULONG
)&codeset
->table
[i
].utf8
[1];
1166 memcpy(codeset
->table_sorted
, codeset
->table
, sizeof(codeset
->table
));
1167 qsort(codeset
->table_sorted
,256,sizeof(codeset
->table
[0]),(int (*)(const void *arg1
,const void *arg2
))codesetsCmpUnicode
);
1168 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
1172 if(codesetsFind(csList
, "ISO-8859-3") == NULL
)
1174 if((codeset
= allocVecPooled(CodesetsBase
->pool
, sizeof(struct codeset
))) == NULL
)
1177 codeset
->name
= mystrdup("ISO-8859-3");
1178 codeset
->alt_name
= mystrdup("ISO8859-3");
1179 codeset
->characterization
= mystrdup("South European");
1180 codeset
->read_only
= 0;
1181 for(i
= 0; i
<256; i
++)
1183 UTF32
*src_ptr
= &src
;
1184 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
1189 src
= iso_8859_3_to_ucs4
[i
-0xa0];
1191 codeset
->table
[i
].code
= i
;
1192 codeset
->table
[i
].ucs4
= src
;
1193 CodesetsConvertUTF32toUTF8((const UTF32
**)&src_ptr
,src_ptr
+1,&dest_ptr
,dest_ptr
+6,CSF_StrictConversion
);
1195 codeset
->table
[i
].utf8
[0] = (ULONG
)dest_ptr
-(ULONG
)&codeset
->table
[i
].utf8
[1];
1197 memcpy(codeset
->table_sorted
,codeset
->table
,sizeof(codeset
->table
));
1198 qsort(codeset
->table_sorted
,256,sizeof(codeset
->table
[0]),(int (*)(const void *arg1
,const void *arg2
))codesetsCmpUnicode
);
1199 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
1203 if(codesetsFind(csList
, "ISO-8859-4") == NULL
)
1205 if((codeset
= allocVecPooled(CodesetsBase
->pool
, sizeof(struct codeset
))) == NULL
)
1208 codeset
->name
= mystrdup("ISO-8859-4");
1209 codeset
->alt_name
= mystrdup("ISO8859-4");
1210 codeset
->characterization
= mystrdup("North European");
1211 codeset
->read_only
= 0;
1212 for(i
= 0; i
<256; i
++)
1214 UTF32
*src_ptr
= &src
;
1215 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
1220 src
= iso_8859_4_to_ucs4
[i
-0xa0];
1222 codeset
->table
[i
].code
= i
;
1223 codeset
->table
[i
].ucs4
= src
;
1224 CodesetsConvertUTF32toUTF8((const UTF32
**)&src_ptr
,src_ptr
+1,&dest_ptr
,dest_ptr
+6,CSF_StrictConversion
);
1226 codeset
->table
[i
].utf8
[0] = (ULONG
)dest_ptr
-(ULONG
)&codeset
->table
[i
].utf8
[1];
1228 memcpy(codeset
->table_sorted
,codeset
->table
,sizeof(codeset
->table
));
1229 qsort(codeset
->table_sorted
,256,sizeof(codeset
->table
[0]),(int (*)(const void *arg1
, const void *arg2
))codesetsCmpUnicode
);
1230 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
1234 if(codesetsFind(csList
, "ISO-8859-5") == NULL
)
1236 if((codeset
= allocVecPooled(CodesetsBase
->pool
, sizeof(struct codeset
))) == NULL
)
1239 codeset
->name
= mystrdup("ISO-8859-5");
1240 codeset
->alt_name
= mystrdup("ISO8859-5");
1241 codeset
->characterization
= mystrdup("Slavic languages");
1242 codeset
->read_only
= 0;
1243 for(i
= 0; i
<256; i
++)
1245 UTF32
*src_ptr
= &src
;
1246 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
1251 src
= iso_8859_5_to_ucs4
[i
-0xa0];
1253 codeset
->table
[i
].code
= i
;
1254 codeset
->table
[i
].ucs4
= src
;
1255 CodesetsConvertUTF32toUTF8((const UTF32
**)&src_ptr
,src_ptr
+1,&dest_ptr
,dest_ptr
+6,CSF_StrictConversion
);
1257 codeset
->table
[i
].utf8
[0] = (ULONG
)dest_ptr
-(ULONG
)&codeset
->table
[i
].utf8
[1];
1259 memcpy(codeset
->table_sorted
,codeset
->table
,sizeof(codeset
->table
));
1260 qsort(codeset
->table_sorted
,256,sizeof(codeset
->table
[0]),(int (*)(const void *arg1
,const void *arg2
))codesetsCmpUnicode
);
1261 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
1265 if(codesetsFind(csList
, "ISO-8859-9") == NULL
)
1267 if((codeset
= allocVecPooled(CodesetsBase
->pool
, sizeof(struct codeset
))) == NULL
)
1270 codeset
->name
= mystrdup("ISO-8859-9");
1271 codeset
->alt_name
= mystrdup("ISO8859-9");
1272 codeset
->characterization
= mystrdup("Turkish");
1273 codeset
->read_only
= 0;
1274 for(i
= 0; i
<256; i
++)
1276 UTF32
*src_ptr
= &src
;
1277 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
1282 src
= iso_8859_9_to_ucs4
[i
-0xa0];
1284 codeset
->table
[i
].code
= i
;
1285 codeset
->table
[i
].ucs4
= src
;
1286 CodesetsConvertUTF32toUTF8((const UTF32
**)&src_ptr
,src_ptr
+1,&dest_ptr
,dest_ptr
+6,CSF_StrictConversion
);
1288 codeset
->table
[i
].utf8
[0] = (ULONG
)dest_ptr
-(ULONG
)&codeset
->table
[i
].utf8
[1];
1290 memcpy(codeset
->table_sorted
,codeset
->table
,sizeof(codeset
->table
));
1291 qsort(codeset
->table_sorted
,256,sizeof(codeset
->table
[0]),(int (*)(const void *arg1
,const void *arg2
))codesetsCmpUnicode
);
1292 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
1296 if(codesetsFind(csList
, "ISO-8859-15") == NULL
)
1298 if((codeset
= allocVecPooled(CodesetsBase
->pool
, sizeof(struct codeset
))) == NULL
)
1301 codeset
->name
= mystrdup("ISO-8859-15");
1302 codeset
->alt_name
= mystrdup("ISO8859-15");
1303 codeset
->characterization
= mystrdup("West European II");
1304 codeset
->read_only
= 0;
1305 for(i
= 0; i
<256; i
++)
1307 UTF32
*src_ptr
= &src
;
1308 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
1313 src
= iso_8859_15_to_ucs4
[i
-0xa0];
1315 codeset
->table
[i
].code
= i
;
1316 codeset
->table
[i
].ucs4
= src
;
1317 CodesetsConvertUTF32toUTF8((const UTF32
**)&src_ptr
,src_ptr
+1,&dest_ptr
,dest_ptr
+6,CSF_StrictConversion
);
1319 codeset
->table
[i
].utf8
[0] = (ULONG
)dest_ptr
-(ULONG
)&codeset
->table
[i
].utf8
[1];
1321 memcpy(codeset
->table_sorted
,codeset
->table
,sizeof (codeset
->table
));
1322 qsort(codeset
->table_sorted
,256,sizeof(codeset
->table
[0]),(int (*)(const void *arg1
,const void *arg2
))codesetsCmpUnicode
);
1323 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
1327 if(codesetsFind(csList
, "ISO-8859-16") == NULL
)
1329 if((codeset
= allocVecPooled(CodesetsBase
->pool
, sizeof(struct codeset
))) == NULL
)
1332 codeset
->name
= mystrdup("ISO-8859-16");
// the alternative name follows the "ISO8859-<n>" spelling used by the other
// ISO-8859 codesets, so that a lookup by this spelling can match alt_name
1333 codeset
->alt_name
= mystrdup("ISO8859-16");
1334 codeset
->characterization
= mystrdup("South-Eastern European");
1335 codeset
->read_only
= 0;
1338 UTF32
*src_ptr
= &src
;
1339 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
1344 src
= iso_8859_16_to_ucs4
[i
-0xa0];
1346 codeset
->table
[i
].code
= i
;
1347 codeset
->table
[i
].ucs4
= src
;
1348 CodesetsConvertUTF32toUTF8((const UTF32
**)&src_ptr
, src_ptr
+1, &dest_ptr
, dest_ptr
+6, CSF_StrictConversion
);
1350 codeset
->table
[i
].utf8
[0] = (ULONG
)dest_ptr
- (ULONG
)&codeset
->table
[i
].utf8
[1];
1352 memcpy(codeset
->table_sorted
, codeset
->table
, sizeof(codeset
->table
));
1353 qsort(codeset
->table_sorted
, 256, sizeof(codeset
->table
[0]), (int (*)(const void *arg1
, const void *arg2
))codesetsCmpUnicode
);
1354 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
1358 if(codesetsFind(csList
, "KOI8-R") == NULL
)
1360 if((codeset
= allocVecPooled(CodesetsBase
->pool
, sizeof(struct codeset
))) == NULL
)
1363 codeset
->name
= mystrdup("KOI8-R");
1364 codeset
->alt_name
= mystrdup("KOI8R");
1365 codeset
->characterization
= mystrdup("Russian");
1366 codeset
->read_only
= 0;
1367 for(i
= 0; i
<256; i
++)
1369 UTF32
*src_ptr
= &src
;
1370 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
1375 src
= koi8r_to_ucs4
[i
-0x80];
1377 codeset
->table
[i
].code
= i
;
1378 codeset
->table
[i
].ucs4
= src
;
1379 CodesetsConvertUTF32toUTF8((const UTF32
**)&src_ptr
,src_ptr
+1,&dest_ptr
,dest_ptr
+6,CSF_StrictConversion
);
1381 codeset
->table
[i
].utf8
[0] = (ULONG
)dest_ptr
-(ULONG
)&codeset
->table
[i
].utf8
[1];
1383 memcpy(codeset
->table_sorted
,codeset
->table
,sizeof(codeset
->table
));
1384 qsort(codeset
->table_sorted
,256,sizeof(codeset
->table
[0]),(int (*)(const void *arg1
,const void *arg2
))codesetsCmpUnicode
);
1385 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
1389 if(codesetsFind(csList
, "AmigaPL") == NULL
)
1391 if((codeset
= allocVecPooled(CodesetsBase
->pool
, sizeof(struct codeset
))) == NULL
)
1394 codeset
->name
= mystrdup("AmigaPL");
1395 codeset
->alt_name
= mystrdup("AmiPL");
1396 codeset
->characterization
= mystrdup("Polish (Amiga)");
1397 codeset
->read_only
= 1;
1398 for(i
=0; i
<256; i
++)
1400 UTF32
*src_ptr
= &src
;
1401 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
1406 src
= amigapl_to_ucs4
[i
-0xa0];
1408 codeset
->table
[i
].code
= i
;
1409 codeset
->table
[i
].ucs4
= src
;
1410 CodesetsConvertUTF32toUTF8((const UTF32
**)&src_ptr
,src_ptr
+1,&dest_ptr
,dest_ptr
+6,CSF_StrictConversion
);
1412 codeset
->table
[i
].utf8
[0] = (ULONG
)dest_ptr
-(ULONG
)&codeset
->table
[i
].utf8
[1];
1414 memcpy(codeset
->table_sorted
,codeset
->table
,sizeof(codeset
->table
));
1415 qsort(codeset
->table_sorted
,256,sizeof(codeset
->table
[0]),(int (*)(const void *arg1
,const void *arg2
))codesetsCmpUnicode
);
1416 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
1420 if(codesetsFind(csList
, "Amiga-1251") == NULL
)
1422 if((codeset
= allocVecPooled(CodesetsBase
->pool
, sizeof(struct codeset
))) == NULL
)
1425 codeset
->name
= mystrdup("Amiga-1251");
1426 codeset
->alt_name
= mystrdup("Ami1251");
1427 codeset
->characterization
= mystrdup("Cyrillic (Amiga)");
1428 codeset
->read_only
= 1;
1429 for(i
=0; i
<256; i
++)
1431 UTF32
*src_ptr
= &src
;
1432 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
1437 src
= amiga1251_to_ucs4
[i
-0xa0];
1439 codeset
->table
[i
].code
= i
;
1440 codeset
->table
[i
].ucs4
= src
;
1441 CodesetsConvertUTF32toUTF8((const UTF32
**)&src_ptr
, src_ptr
+1, &dest_ptr
, dest_ptr
+6, CSF_StrictConversion
);
1443 codeset
->table
[i
].utf8
[0] = (char*)dest_ptr
- (char*)&codeset
->table
[i
].utf8
[1];
1445 memcpy(codeset
->table_sorted
,codeset
->table
,sizeof(codeset
->table
));
1446 qsort(codeset
->table_sorted
,256,sizeof(codeset
->table
[0]),(int (*)(const void *arg1
, const void *arg2
))codesetsCmpUnicode
);
1447 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
1451 ReleaseSemaphore(&CodesetsBase
->poolSem
);
1453 RETURN(codeset
!= 0);
1454 return codeset
!= NULL
;
1458 /// codesetsCleanup()
1459 // Empties the given codeset list: every node is unlinked with RemHead()
// and its name, alt_name and characterization strings (when set) are freed
// before the codeset structure itself is released.
1461 codesetsCleanup(struct codesetList
*csList
)
1463 struct codeset
*code
;
// RemHead() unlinks the first node; the loop ends as soon as it returns NULL
1467 while((code
= (struct codeset
*)RemHead((struct List
*)csList
)))
// each of the three strings is only freed if it was actually set
1469 if(code
->name
) freeArbitrateVecPooled(code
->name
);
1470 if(code
->alt_name
) freeArbitrateVecPooled(code
->alt_name
);
1471 if(code
->characterization
) freeArbitrateVecPooled(code
->characterization
);
// finally release the codeset structure itself
1473 freeArbitrateVecPooled(code
);
1481 // Searches csList for a codeset called name and returns it.
// A list entry matches if name is equal (ignoring case) to the entry's
// name or, if the entry has one, to its alt_name.
1483 codesetsFind(struct codesetList
*csList
, const char *name
)
1485 struct codeset
*res
= NULL
;
1491 struct codeset
*mstate
, *succ
;
// let matchCodesetAlias() look at the requested name before the list is searched
1492 char *matchedName
= matchCodesetAlias(name
);
1494 if(matchedName
!= NULL
)
// walk the list; the loop stops at the node whose successor pointer is NULL
1497 for(mstate
= (struct codeset
*)csList
->list
.mlh_Head
; (succ
= (struct codeset
*)mstate
->node
.mln_Succ
); mstate
= succ
)
// compare case-insensitively against the name and the optional alternative name
1499 if(stricmp(name
, mstate
->name
) == 0 ||
1500 (mstate
->alt_name
!= NULL
&& stricmp(name
, mstate
->alt_name
) == 0))
1515 /// codesetsFindBest()
1516 // Returns the best codeset for the given text
//
// attrs     - tag list; every CSA_CodesetList tag with non-zero data names an
//             additional codeset list that is searched as well
// csFamily  - CSV_CodesetFamily_Cyrillic enables the cyrillic analysis which
//             is done before the generic search
// text      - the text to be analysed
// text_len  - number of bytes of text the generic search examines
// error_ptr - receives the error count of the best match
//
// The result is the best matching codeset or NULL if none was found.
1517 static struct codeset
*
1518 codesetsFindBest(struct TagItem
*attrs
, ULONG csFamily
, STRPTR text
, int text_len
, int *error_ptr
)
1520 struct codeset
*best_codeset
= NULL
;
// start with the worst possible result: every single byte is an error
1521 int best_errors
= text_len
;
1526 // in case the user specified the codeset family as a
1527 // cyrillic one we go and do our cyrillic specific analysis first
1528 if(csFamily
== CSV_CodesetFamily_Cyrillic
)
1530 #define NUM_CYRILLIC 3
1532 struct CodesetSearch
1538 struct CodesetSearch search
[NUM_CYRILLIC
];
// one score per candidate codeset
1541 int ctr
[NUM_CYRILLIC
];
// the candidates and the data table used to score each of them
1548 search
[0].name
= "windows-1251";
1549 search
[0].data
= cp1251_data
;
1550 search
[1].name
= "IBM866";
1551 search
[1].data
= cp866_data
;
1552 search
[2].name
= "KOI8-R";
1553 search
[2].data
= koi8r_data
;
1555 memset(&ctr
, 0, sizeof(ctr
));
1557 tp
= (unsigned char *)text
;
1562 int mid
= max
= -466725766; // TODO: what's the magic behind this constant?
// score the text against each candidate table
1565 for(n
=0; n
< NUM_CYRILLIC
; n
++)
1567 unsigned char la
= 0;
1568 unsigned char *tptr
= (unsigned char *)search
[n
].data
;
// lb is the current text byte with bit 7 flipped
1574 unsigned char lb
= (*p
++) ^ 128;
// only score when bit 7 is clear in both la and lb; the table is
// indexed by (la << 7) + lb and its entries are signed weights
1576 if(!((la
| lb
) & 128))
1577 ctr
[n
] += (signed char)tptr
[(la
<< 7) + lb
];
// thresholds on the best score (max) and its distance to mid
1592 if((max
>= 500) && ((max
-mid
) >= 1000))
// keep scanning until the text ends or gr gets set
1598 while((*p
) && (!gr
));
1600 if(gr
|| ((!(*p
)) && lr
))
1603 // if our analysis found something, we go and try
1604 // to find the corresponding codeset in our codeset list
1607 struct TagItem
*tstate
= attrs
;
1608 struct TagItem
*tag
;
1610 D(DBF_STARTUP
, "identified text as '%s'", search
[Nmax
-1].name
);
1612 // now we walk through our taglist and check if the user
1614 while((tag
= NextTagItem(&tstate
)))
1616 if(tag
->ti_Tag
== CSA_CodesetList
&& tag
->ti_Data
!= 0)
1618 struct codesetList
*csList
= (struct codesetList
*)tag
->ti_Data
;
1620 if((best_codeset
= codesetsFind(csList
, search
[Nmax
-1].name
)) != NULL
)
1625 // if we still haven't found the matching codeset
1626 // we search the internal list
1627 if(best_codeset
== NULL
)
1628 best_codeset
= codesetsFind(&CodesetsBase
->codesets
, search
[Nmax
-1].name
);
1636 // if we haven't found the best codeset (through the cyrillic analysis)
1637 // we go and do the dumb latin search in our codesetlist
1640 struct TagItem
*tstate
= attrs
;
1641 struct TagItem
*tag
;
1642 BOOL lastIteration
= FALSE
;
// once the tag list is exhausted the assignment in the condition makes the
// loop body run once more with lastIteration set, to cover the internal list
1644 while((tag
= NextTagItem(&tstate
)) || (lastIteration
= TRUE
))
1646 if(lastIteration
== TRUE
|| (tag
->ti_Tag
== CSA_CodesetList
&& tag
->ti_Data
!= 0))
// the last iteration searches the internal list, all others a user supplied one
1648 struct codesetList
*csList
= (lastIteration
? &CodesetsBase
->codesets
: (struct codesetList
*)tag
->ti_Data
);
1649 struct codeset
*codeset
= (struct codeset
*)csList
->list
.mlh_Head
;
1651 // the following identification/detection routine is NOT really smart.
1652 // we just see how each UTF8 string is the representation of each char
1653 // in our source text and then check if they are valid or not. As said,
1654 // not very smart, but we don't have anything better right now :(
// read-only codesets and the UTF8 codeset are never candidates
1658 if(!codeset
->read_only
&& codeset
!= CodesetsBase
->utf8Codeset
)
1660 char *text_ptr
= text
;
1664 for(i
=0; i
< text_len
; i
++)
1666 unsigned char c
= *text_ptr
++;
1670 struct single_convert
*f
= &codeset
->table
[c
];
// utf8[0] holds the length of the UTF8 sequence which starts at utf8[1]
1672 if(f
->utf8
[0] == 0 || f
->utf8
[1] == 0x00)
1679 D(DBF_STARTUP
, "tried to identify text as '%s' text with %ld of %ld errors", codeset
->name
, errors
, text_len
);
// remember this codeset if it produced fewer errors than the best one so far
1681 if(errors
< best_errors
)
1683 best_codeset
= codeset
;
1684 best_errors
= errors
;
// zero errors is the best achievable result
1687 if(best_errors
== 0)
1691 codeset
= (struct codeset
*)codeset
->node
.mln_Succ
;
// report the error count of the best match to the caller
1701 *error_ptr
= best_errors
;
1703 RETURN(best_codeset
);
1704 return best_codeset
;
1708 /**************************************************************************/
1710 /// CodesetsSupportedA()
// Builds an array with the names of all codesets of the internal list plus
// those of every list passed via a CSA_CodesetList tag. The entries point to
// the name strings of the codesets themselves, they are not copied.
1712 CodesetsSupportedA(REG(a0
, UNUSED
struct TagItem
* attrs
))
1714 STRPTR
*array
= NULL
;
1715 struct TagItem
*tstate
= attrs
;
1716 struct TagItem
*tag
;
1721 // first we need to check how many codesets our supplied
1723 numCodesets
= countCodesets(&CodesetsBase
->codesets
);
// add the number of codesets of each user supplied list
1724 while((tag
= NextTagItem(&tstate
)))
1726 if(tag
->ti_Tag
== CSA_CodesetList
&& tag
->ti_Data
!= 0)
1727 numCodesets
+= countCodesets((struct codesetList
*)tag
->ti_Data
);
1730 // now that we know how many codesets we have in our lists we
1731 // can put their names into our string arrays
// one slot more than counted is allocated (presumably for a terminating NULL entry)
1734 if((array
= allocArbitrateVecPooled((numCodesets
+1)*sizeof(STRPTR
))))
1736 struct codeset
*code
;
1737 struct codeset
*succ
;
// the lists are walked while holding the library semaphore in shared mode
1743 ObtainSemaphoreShared(&CodesetsBase
->libSem
);
1745 // first we walk through the internal codesets list and
1747 for(code
= (struct codeset
*)CodesetsBase
->codesets
.list
.mlh_Head
; (succ
= (struct codeset
*)code
->node
.mln_Succ
); code
= succ
, i
++)
1748 array
[i
] = code
->name
;
1750 // then we also iterate through our private codesets list
1751 while((tag
= NextTagItem(&tstate
)))
1753 if(tag
->ti_Tag
== CSA_CodesetList
&& tag
->ti_Data
!= 0)
1755 struct codesetList
*csList
= (struct codesetList
*)tag
->ti_Data
;
// i keeps counting, so these names are appended behind the internal ones
1757 for(code
= (struct codeset
*)csList
->list
.mlh_Head
; (succ
= (struct codeset
*)code
->node
.mln_Succ
); code
= succ
, i
++)
1758 array
[i
] = code
->name
;
1764 ReleaseSemaphore(&CodesetsBase
->libSem
);
// Releases obj by passing it to freeArbitrateVecPooled().
// The tag list is marked UNUSED.
1775 CodesetsFreeA(REG(a0
, APTR obj
),
1776 REG(a1
, UNUSED
struct TagItem
*attrs
))
1781 freeArbitrateVecPooled(obj
);
1787 /// CodesetsSetDefaultA()
// Looks up name in the internal codeset list and, if it is found, stores the
// name of that codeset in the variable "codeset_default" via SetVar().
1788 struct codeset
*LIBFUNC
1789 CodesetsSetDefaultA(REG(a0
, STRPTR name
),
1790 REG(a1
, struct TagItem
*attrs
))
1792 struct codeset
*codeset
;
// the internal list is searched while holding the library semaphore in shared mode
1796 ObtainSemaphoreShared(&CodesetsBase
->libSem
);
1798 if((codeset
= codesetsFind(&CodesetsBase
->codesets
,name
)))
// GVF_SAVE_VAR is always set, GVF_GLOBAL_ONLY only if CSA_Save is given
// and non-zero (it defaults to FALSE)
1802 flags
= GVF_SAVE_VAR
| (GetTagData(CSA_Save
,FALSE
,attrs
) ? GVF_GLOBAL_ONLY
: 0);
// store the codeset's own name, which may be spelled differently than the requested one
1804 SetVar("codeset_default",codeset
->name
,strlen(codeset
->name
),flags
);
1807 ReleaseSemaphore(&CodesetsBase
->libSem
);
// Searches a codeset by name: first in the internal list, then in every list
// supplied via a CSA_CodesetList tag. If nothing was found the default codeset
// is used instead, unless CSA_FallbackToDefault is set to FALSE in attrs.
1815 struct codeset
*LIBFUNC
1816 CodesetsFindA(REG(a0
, STRPTR name
), REG(a1
, struct TagItem
*attrs
))
1818 struct codeset
*codeset
= NULL
;
1822 ObtainSemaphoreShared(&CodesetsBase
->libSem
);
1824 // if no name pointer was supplied we have to return
1825 // the default codeset only.
1828 // we first walk through our internal list and check if we
1829 // can find the requested codeset
1830 codeset
= codesetsFind(&CodesetsBase
->codesets
, name
);
// the user supplied lists are only consulted if the internal list had no match
1832 if(codeset
== NULL
&& attrs
!= NULL
)
1834 struct TagItem
*tstate
= attrs
;
1835 struct TagItem
*tag
;
1837 // now we walk through our taglist and check if the user
1839 while((tag
= NextTagItem(&tstate
)))
1841 if(tag
->ti_Tag
== CSA_CodesetList
&& tag
->ti_Data
!= 0)
1843 struct codesetList
*csList
= (struct codesetList
*)tag
->ti_Data
;
1845 if((codeset
= codesetsFind(csList
, name
)) != NULL
)
1852 // check if we found something or not.
// the fallback is active without a tag list and also by default with one,
// because CSA_FallbackToDefault defaults to TRUE here
1853 if(codeset
== NULL
&& (attrs
== NULL
|| GetTagData(CSA_FallbackToDefault
, TRUE
, attrs
)))
1854 codeset
= defaultCodeset(FALSE
);
1856 ReleaseSemaphore(&CodesetsBase
->libSem
);
1863 /// CodesetsFindBestA()
// Public front end of codesetsFindBest(): reads the text (CSA_Source), its
// length (CSA_SourceLen), the codeset family (CSA_CodesetFamily), the optional
// error counter (CSA_ErrPtr) and the fallback switch (CSA_FallbackToDefault)
// from the tag list and runs the detection if there is any text at all.
1864 struct codeset
*LIBFUNC
1865 CodesetsFindBestA(REG(a0
, struct TagItem
*attrs
))
1867 struct codeset
*codeset
= NULL
;
1871 ObtainSemaphoreShared(&CodesetsBase
->libSem
);
1875 char *text
= (char *)GetTagData(CSA_Source
, 0, attrs
);
// without CSA_SourceLen the length defaults to strlen(text), or 0 for a NULL text
1876 ULONG text_len
= GetTagData(CSA_SourceLen
, text
!= NULL
? strlen(text
) : 0, attrs
);
// nothing to analyse for a missing or empty text
1878 if(text
!= NULL
&& text_len
> 0)
// the latin family is assumed if the caller did not specify one
1881 ULONG csFamily
= GetTagData(CSA_CodesetFamily
, CSV_CodesetFamily_Latin
, attrs
);
1882 int *error_ptr
= (int *)GetTagData(CSA_ErrPtr
, 0, attrs
);
// in contrast to the lookup by name the fallback defaults to FALSE here
1883 BOOL defaultFallBack
= GetTagData(CSA_FallbackToDefault
, FALSE
, attrs
);
1885 codeset
= codesetsFindBest(attrs
, csFamily
, text
, text_len
, &numErrors
);
// hand the number of errors to the caller only if a pointer was supplied
1887 if(error_ptr
!= NULL
)
1888 *error_ptr
= numErrors
;
1890 // if we still haven't got the codeset we fallback to the default
1891 if(codeset
== NULL
&& defaultFallBack
== TRUE
)
1892 codeset
= defaultCodeset(FALSE
);
1896 ReleaseSemaphore(&CodesetsBase
->libSem
);
1903 /// CodesetsUTF8Len()
1904 // Returns the number of characters a utf8 string has. This is not
1905 // identical to the amount of memory that is required to hold the string.
1907 CodesetsUTF8Len(REG(a0
, UTF8
*str
))
// advance by the value the trailingBytesForUTF8 table holds for the byte c
1922 str
+= trailingBytesForUTF8
[c
];
1930 /// CodesetsStrLenA()
// Calculates the length str has after a conversion to UTF8. The codeset of
// str is taken from CSA_SourceCodeset (the default codeset if the tag is
// missing), the number of source characters can be limited by CSA_SourceLen.
1932 CodesetsStrLenA(REG(a0
, STRPTR str
),
1933 REG(a1
, struct TagItem
*attrs
))
1941 struct codeset
*codeset
;
// fall back to the default codeset if no source codeset was specified
1946 if((codeset
= (struct codeset
*)GetTagData(CSA_SourceCodeset
, 0, attrs
)) == NULL
)
1947 codeset
= defaultCodeset(TRUE
);
// UTF32 and UTF16 strings need their own strlen() variants
1948 if(codeset
== CodesetsBase
->utf32Codeset
)
1951 len
= utf32_strlen((UTF32
*)str
);
1953 else if(codeset
== CodesetsBase
->utf16Codeset
)
1956 len
= utf16_strlen((UTF16
*)str
);
// an explicit CSA_SourceLen overrides the length calculated so far
1964 len
= GetTagData(CSA_SourceLen
, len
, attrs
);
1970 void *srcend
= src
+ len
;
// the conversion is called with a NULL target start and a NULL target end;
// presumably it then only advances dstlen, which makes the final pointer
// value the required number of bytes - TODO confirm in the converter
1971 UTF8
*dstlen
= NULL
;
1976 CodesetsConvertUTF32toUTF8((const UTF32
**)&src
, srcend
, &dstlen
, NULL
, 0);
1979 CodesetsConvertUTF16toUTF8((const UTF16
**)&src
, srcend
, &dstlen
, NULL
, 0);
1982 res
= (ULONG
)dstlen
;
// 8 bit codesets: stop at the NUL byte or when len is used up
1990 while((c
= *src
++) != '\0' && len
!= 0)
// utf8[0] is the length of the UTF8 sequence of the character c
1992 res
+= codeset
->table
[c
].utf8
[0];
2003 /// CodesetsUTF8ToStrA()
2004 // Converts a UTF8 string to a given charset. Returns the number of bytes
2005 // written to dest excluding the NULL byte (which is always ensured by this
2006 // function; it means a NULL str will produce "" as dest; anyway you should
2007 // check NULL str to not waste your time!).
//
// Tags read from attrs: CSA_Source, CSA_SourceLen, CSA_DestCodeset,
// CSA_Dest, CSA_DestLen, CSA_DestLenPtr, CSA_DestHook, CSA_AllocIfNeeded,
// CSA_Pool, CSA_PoolSem, CSA_ErrPtr, CSA_MapForeignChars and
// CSA_MapForeignCharsHook.
2009 CodesetsUTF8ToStrA(REG(a0
, struct TagItem
*attrs
))
// the conversion is only done for a non-NULL source with a length > 0;
// without CSA_SourceLen the length defaults to strlen(src)
2019 if((src
= (UTF8
*)GetTagData(CSA_Source
, (ULONG
)NULL
, attrs
)) != NULL
&&
2020 (srcLen
= GetTagData(CSA_SourceLen
, src
!= NULL
? strlen((char *)src
) : 0, attrs
)) > 0)
2022 struct convertMsg msg
;
2023 struct codeset
*codeset
;
2024 struct Hook
*destHook
;
2025 struct Hook
*mapForeignCharsHook
;
2027 STRPTR destIter
= NULL
;
// s is the read position within the source, e points behind its last byte
2031 unsigned char *s
= src
;
2032 unsigned char *e
= (src
+srcLen
);
2033 int numConvErrors
= 0;
2034 int *numConvErrorsPtr
;
2035 BOOL mapForeignChars
;
2037 struct SignalSemaphore
*sem
= NULL
;
2041 // get some more optional attributes
2042 destHook
= (struct Hook
*)GetTagData(CSA_DestHook
, (ULONG
)NULL
, attrs
);
2043 destLen
= GetTagData(CSA_DestLen
, 0, attrs
);
2044 numConvErrorsPtr
= (int *)GetTagData(CSA_ErrPtr
, (ULONG
)NULL
, attrs
);
2045 mapForeignChars
= (BOOL
)GetTagData(CSA_MapForeignChars
, FALSE
, attrs
);
2046 mapForeignCharsHook
= (struct Hook
*)GetTagData(CSA_MapForeignCharsHook
, (ULONG
)NULL
, attrs
);
2048 // get the destination codeset pointer
2049 if((codeset
= (struct codeset
*)GetTagData(CSA_DestCodeset
, (ULONG
)NULL
, attrs
)) == NULL
)
2050 codeset
= defaultCodeset(TRUE
);
// UTF32 and UTF16 destinations are handled separately from the 8 bit codesets
2051 if(codeset
== CodesetsBase
->utf32Codeset
)
2056 else if(codeset
== CodesetsBase
->utf16Codeset
)
2067 // first we make sure we allocate enough memory
2068 // for our destination buffer
2069 if(destHook
!= NULL
)
// with a hook the output goes through the local buffer buf in chunks, so
// the chunk size is forced to sizeof(buf) if it is below 16 or too large
2071 if(destLen
< 16 || destLen
> sizeof(buf
))
2072 destLen
= sizeof(buf
);
2074 msg
.state
= CSV_Translating
;
2080 // in case the user wants us to dynamically generate the
2081 // destination buffer we do it right now
2082 if((dest
= (STRPTR
)GetTagData(CSA_Dest
, (ULONG
)NULL
, attrs
)) == NULL
||
2083 GetTagData(CSA_AllocIfNeeded
, TRUE
, attrs
) != FALSE
)
2087 // calculate the destLen
// dry run with a NULL target: dstlen starts at NULL and its final value
// is taken as the required length
2090 void *dstlen
= NULL
;
2095 CodesetsConvertUTF8toUTF32((const UTF8
**)&s
, e
, (UTF32
**)&dstlen
, NULL
, 0);
2098 CodesetsConvertUTF8toUTF16((const UTF8
**)&s
, e
, (UTF16
**)&dstlen
, NULL
, 0);
2101 len
= (ULONG
)dstlen
;
2107 unsigned char c
= *s
++;
// jump over the remaining bytes of the UTF8 sequence started by c
2110 s
+= trailingBytesForUTF8
[c
];
// a new buffer is needed if there is none or the supplied one is too small
2114 if(dest
== NULL
|| (destLen
< len
+1))
// allocate from the caller's pool if one was given, protected by the
// caller's semaphore if that was given as well
2116 if((pool
= (APTR
)GetTagData(CSA_Pool
, (ULONG
)NULL
, attrs
)) != NULL
)
2118 if((sem
= (struct SignalSemaphore
*)GetTagData(CSA_PoolSem
, (ULONG
)NULL
, attrs
)) != NULL
)
2119 ObtainSemaphore(sem
);
2121 // allocate the destination buffer
2122 dest
= allocVecPooled(pool
, len
+char_size
);
2125 ReleaseSemaphore(sem
);
// no pool supplied, so the library's own allocator is used
2128 dest
= allocArbitrateVecPooled(len
+char_size
);
// the buffer holds len bytes plus one terminating character of char_size bytes
2130 destLen
= len
+char_size
;
2143 // now we convert the src string to the
2144 // destination buffer.
2150 if(destHook
!= NULL
)
// keep room for the terminating character at the end of the chunk
2154 dstend
= b
+ destLen
- char_size
;
2160 r
= CodesetsConvertUTF8toUTF32((const UTF8
**)&s
, e
, (UTF32
**)&b
, dstend
, 0);
2163 r
= CodesetsConvertUTF8toUTF16((const UTF8
**)&s
, e
, (UTF16
**)&b
, dstend
, 0);
// any result other than CSR_TargetExhausted means this was the last chunk
2169 if(r
!= CSR_TargetExhausted
)
2170 msg
.state
= CSV_End
;
2172 CallHookPkt(destHook
,&msg
,buf
);
// continue as long as the chunk buffer was too small for the remaining data
2177 while(r
== CSR_TargetExhausted
);
// no hook: convert straight into the destination buffer
2181 dstend
= destIter
+ destLen
- char_size
;
2185 CodesetsConvertUTF8toUTF32((const UTF8
**)&s
, e
, (UTF32
**)&destIter
, dstend
, 0);
2188 CodesetsConvertUTF8toUTF16((const UTF8
**)&s
, e
, (UTF16
**)&destIter
, dstend
, 0);
// without a hook the output is limited to destLen-1 characters
2198 if(destHook
== NULL
&& n
>= destLen
-1)
2201 // convert until we reach the end of the
2205 unsigned char c
= *s
;
// d is the output character, '?' unless something better is found
2206 unsigned char d
= '?';
2207 const char *repstr
= NULL
;
2210 // check if the char is a >7bit char
2213 struct single_convert
*f
;
// lenStr is the complete length of the UTF8 sequence starting at s
2214 int lenAdd
= trailingBytesForUTF8
[c
];
2215 int lenStr
= lenAdd
+1;
2216 unsigned char *src
= s
;
2220 // start each iteration with "no replacement found yet"
2224 // search in the UTF8 conversion table of the current charset if
2225 // we have a replacement character for the char sequence starting at s
2226 BIN_SEARCH(codeset
->table_sorted
, 0, 255, strncmp((char *)src
, (char *)codeset
->table_sorted
[m
].utf8
+1, lenStr
), f
);
2237 // the analysed char sequence (s) is not convertable to a
2238 // single visible char replacement, so we normally have to put
2239 // a ? sign as a "unknown char" sign at the very position.
2241 // For convenience we, however, allow users to replace these
2242 // UTF8 characters with char sequences that "look like" the
2244 if(mapForeignChars
== TRUE
)
2245 replen
= mapUTF8toASCII(&repstr
, src
, lenStr
);
2247 // call the hook only, if the internal table yielded no suitable
2249 if(replen
== 0 && mapForeignCharsHook
!= NULL
)
2251 struct replaceMsg rmsg
;
// the hook delivers its replacement through repstr
2253 rmsg
.dst
= (char **)&repstr
;
2255 rmsg
.srclen
= lenStr
;
2256 replen
= CallHookPkt(mapForeignCharsHook
, &rmsg
, NULL
);
2261 D(DBF_UTF
, "got UTF8 replacement (%ld)", replen
);
2263 // stay in the loop as long as one replacement function delivers
2264 // further UTF8 replacement sequences
2265 src
= (unsigned char *)repstr
;
2267 else if(replen
== 0)
2269 D(DBF_UTF
, "found no ASCII replacement for UTF8 string (%ld)", replen
);
2273 D(DBF_UTF
, "got replacement string '%s' (%ld)", repstr
? repstr
: "<null>", replen
);
// no usable replacement was delivered
2278 if(repstr
== NULL
|| replen
== 0)
2292 if(destHook
!= NULL
)
// hand the buffer to the hook each time i reaches a multiple of destLen-1
2303 if(i
%(destLen
-1)==0)
2307 CallHookPkt(destHook
, &msg
, buf
);
// emit the first replacement character if there is one, d otherwise
2317 *b
++ = replen
> 0 ? *repstr
: d
;
2321 if(i
%(destLen
-1)==0)
2325 CallHookPkt(destHook
, &msg
, buf
);
// remember the write position as an offset before dest is reallocated
2336 ULONG destPos
= destIter
-dest
;
2341 ObtainSemaphore(sem
);
2343 // allocate the destination buffer
// grow the buffer by replen-1 bytes
2344 dest
= reallocVecPooled(pool
, dest
, destLen
, destLen
+replen
-1);
2347 ReleaseSemaphore(sem
);
2350 dest
= reallocArbitrateVecPooled(dest
, destLen
, destLen
+replen
-1);
// restore the write position within the reallocated buffer and copy the
// complete replacement string to it
2358 destIter
= dest
+destPos
;
2359 memcpy(destIter
, repstr
, replen
);
2361 // adjust our loop pointer and destination length
2363 destLen
+= replen
-1;
// a single replacement character is written directly
2365 else if(replen
== 1)
2366 *destIter
++ = *repstr
;
2377 if(destHook
!= NULL
)
// tell the hook that the conversion has finished
2379 msg
.state
= CSV_End
;
2382 CallHookPkt(destHook
,&msg
,buf
);
2388 // let us write the number of conversion errors
2389 // to the proper variable pointer, if wanted
2390 if(numConvErrorsPtr
!= NULL
)
2391 *numConvErrorsPtr
= numConvErrors
;
2394 // put the final length of our destination buffer
2395 // into the destLenPtr
2396 if((destLenPtr
= (ULONG
*)GetTagData(CSA_DestLenPtr
, (ULONG
)NULL
, attrs
)) != NULL
)
2404 /// CodesetsUTF8CreateA()
2405 // Converts a string of a given charset to UTF8. Returns the UTF8 string.
2406 // If a destination hook is supplied it always returns 0.
2407 // If from is NULL, it returns NULL and doesn't call the hook.
//
// Tags read from attrs: CSA_Source, CSA_SourceLen, CSA_SourceCodeset,
// CSA_Dest, CSA_DestLen, CSA_DestLenPtr, CSA_DestHook, CSA_AllocIfNeeded,
// CSA_Pool and CSA_PoolSem.
2409 CodesetsUTF8CreateA(REG(a0
, struct TagItem
*attrs
))
2413 struct codeset
*codeset
;
2414 ULONG fromLen
, *destLenPtr
;
// fall back to the default codeset if no source codeset was specified
2423 if((codeset
= (struct codeset
*)GetTagData(CSA_SourceCodeset
, 0, attrs
)) == NULL
)
2424 codeset
= defaultCodeset(TRUE
);
// UTF32 and UTF16 sources are handled separately from the 8 bit codesets
2425 if(codeset
== CodesetsBase
->utf32Codeset
)
2427 else if(codeset
== CodesetsBase
->utf16Codeset
)
2432 from
= (UTF8
*)GetTagData(CSA_Source
, 0, attrs
);
// determine the source length with the strlen() variant matching the codeset
2438 fromLen
= utf32_strlen((UTF32
*)from
);
2442 fromLen
= utf16_strlen((UTF16
*)from
);
2446 fromLen
= strlen((char *)from
);
// an explicit CSA_SourceLen overrides the length calculated so far
2452 fromLen
= GetTagData(CSA_SourceLen
, fromLen
, attrs
);
// nothing is converted for a missing or empty source
2454 if(from
!= NULL
&& fromLen
!= 0)
2456 struct convertMsg msg
;
2461 UBYTE
*src
, *destPtr
= NULL
, *b
= NULL
, c
;
2463 hook
= (struct Hook
*)GetTagData(CSA_DestHook
, 0, attrs
);
2464 destLen
= GetTagData(CSA_DestLen
,0,attrs
);
// the chunk size is forced to sizeof(buf) if it is below 16 or too large
2468 if(destLen
<16 || destLen
>sizeof(buf
))
2469 destLen
= sizeof(buf
);
2471 msg
.state
= CSV_Translating
;
// continue if a destination buffer was supplied or allocation is allowed
// (CSA_AllocIfNeeded defaults to TRUE)
2477 if((dest
= (UTF8
*)GetTagData(CSA_Dest
, 0, attrs
)) != NULL
||
2478 GetTagData(CSA_AllocIfNeeded
,TRUE
,attrs
))
2486 void *srcend
= src
+ fromLen
;
// dry run with a NULL target: dstlen starts at NULL and its final value
// is taken as the required length
2487 UTF8
*dstlen
= NULL
;
2492 CodesetsConvertUTF32toUTF8((const UTF32
**)&src
, srcend
, &dstlen
, NULL
, 0);
2495 CodesetsConvertUTF16toUTF8((const UTF16
**)&src
, srcend
, &dstlen
, NULL
, 0);
2498 len
= (ULONG
)dstlen
;
// 8 bit codesets: add up the UTF8 sequence lengths of all source characters
2502 ULONG flen
= fromLen
;
2505 while((c
= *src
++) != '\0' && flen
!= 0)
// utf8[0] is the length of the UTF8 sequence of the character c
2507 len
+= codeset
->table
[c
].utf8
[0];
2511 D(DBF_UTF
, "Calculated output UTF-8 buffer length: %lu\n", len
);
// a new buffer is needed if there is none or the supplied one cannot hold
// len bytes plus the terminating NUL byte
2513 if(dest
== NULL
|| (destLen
<len
+1))
2516 struct SignalSemaphore
*sem
;
// allocate from the caller's pool if one was given, protected by the
// caller's semaphore if that was given as well
2518 if((pool
= (APTR
)GetTagData(CSA_Pool
, 0, attrs
)) != NULL
)
2520 if((sem
= (struct SignalSemaphore
*)GetTagData(CSA_PoolSem
, 0, attrs
)) != NULL
)
2521 ObtainSemaphore(sem
);
2523 // allocate the destination buffer
2524 dest
= allocVecPooled(pool
,len
+1);
2527 ReleaseSemaphore(sem
);
// no pool supplied, so the library's own allocator is used
2530 dest
= allocArbitrateVecPooled(len
+1);
// destPtr is the write position within the destination buffer
2542 destPtr
= (UBYTE
*)dest
;
2548 void *srcend
= src
+ fromLen
;
// keep one byte of the chunk free at its end
2555 dstend
= b
+ destLen
- 1;
2561 r
= CodesetsConvertUTF32toUTF8((const UTF32
**)&src
, srcend
, &b
, dstend
, 0);
2564 r
= CodesetsConvertUTF16toUTF8((const UTF16
**)&src
, srcend
, &b
, dstend
, 0);
// any result other than CSR_TargetExhausted means this was the last chunk
2568 if(r
!= CSR_TargetExhausted
)
2569 msg
.state
= CSV_End
;
2571 CallHookPkt(hook
,&msg
,buf
);
// continue as long as the chunk buffer was too small for the remaining data
2576 while(r
== CSR_TargetExhausted
);
// no hook: convert straight into the destination buffer
2580 dstend
= destPtr
+ destLen
;
2584 CodesetsConvertUTF32toUTF8((const UTF32
**)&src
, srcend
, &destPtr
, dstend
, 0);
2587 CodesetsConvertUTF16toUTF8((const UTF16
**)&src
, srcend
, &destPtr
, dstend
, 0);
// 8 bit codesets: stop at a NUL byte or when fromLen is used up
2595 for(; fromLen
&& (c
= *src
); src
++, fromLen
--)
// walk over the UTF8 sequence of the character until its NUL byte;
// note that c is reused for the bytes of that sequence
2599 for(utf8_seq
= &codeset
->table
[c
].utf8
[1]; (c
= *utf8_seq
); utf8_seq
++)
// hand the buffer to the hook each time i reaches a multiple of destLen-1
2606 if(i
%(destLen
-1)==0)
2610 CallHookPkt(hook
,&msg
,buf
);
// tell the hook that the conversion has finished
2631 msg
.state
= CSV_End
;
2634 CallHookPkt(hook
,&msg
,buf
);
// CSA_DestLenPtr is optional
2643 if((destLenPtr
= (ULONG
*)GetTagData(CSA_DestLenPtr
, 0, attrs
)))
2651 /// CodesetsIsValidUTF8()
// GOOD_UCS(c) is true for a code point c that is at least 160, does not lie
// in the range 0xD800-0xDBFF (tested by masking off the low 10 bits) and is
// none of 0xFEFF, 0xFFFE and 0xFFFF.
2652 #define GOOD_UCS(c) \
2653 ((c) >= 160 && ((c) & ~0x3ff) != 0xd800 && \
2654 (c) != 0xfeff && (c) != 0xfffe && (c) != 0xffff)
// Takes the string to check in s (register a0).
2657 CodesetsIsValidUTF8(REG(a0
, STRPTR s
))
// parseUtf8() is called on t repeatedly until it returns 0; presumably t
// walks over s and n is the parsed code point - TODO confirm
2664 while((n
= parseUtf8(&t
)))
2678 /// CodesetsConvertStrA()
2679 // Converts a given string from one source codeset to a given destination
2680 // codeset and returns the converted string.
//
// The conversion uses UTF-8 as the intermediate format: the source is first
// turned into UTF-8 with CodesetsUTF8CreateA() (skipped when the source
// already is UTF-8) and the UTF-8 string is then turned into the destination
// codeset with CodesetsUTF8ToStrA() (skipped when the destination is UTF-8).
//
// Tags read from attrs: CSA_Source, CSA_SourceCodeset, CSA_SourceLen,
// CSA_DestCodeset, CSA_MapForeignChars, CSA_MapForeignCharsHook and
// CSA_DestLenPtr.
2682 CodesetsConvertStrA(REG(a0
, struct TagItem
*attrs
))
2684 struct codeset
*srcCodeset
;
2685 STRPTR srcStr
= NULL
;
2686 STRPTR dstStr
= NULL
;
2692 // get the ptr to the src string we want to convert
2693 // from the source codeset to the dest codeset.
2694 srcStr
= (STRPTR
)GetTagData(CSA_Source
, (ULONG
)NULL
, attrs
);
2696 // get the pointer to the codeset in which the src string is encoded
2697 if((srcCodeset
= (struct codeset
*)GetTagData(CSA_SourceCodeset
, (ULONG
)NULL
, attrs
)) == NULL
)
2698 srcCodeset
= defaultCodeset(TRUE
);
// default source length: measured with the length function that matches
// the source encoding
2702 if (srcCodeset
== CodesetsBase
->utf32Codeset
)
2703 srcLen
= utf32_strlen((UTF32
*)srcStr
);
2704 else if (srcCodeset
== CodesetsBase
->utf16Codeset
)
2705 srcLen
= utf16_strlen((UTF16
*)srcStr
);
2707 srcLen
= strlen(srcStr
);
// an explicit CSA_SourceLen overrides the measured length
2711 srcLen
= GetTagData(CSA_SourceLen
, srcLen
, attrs
);
// only a non-NULL, non-empty source is converted
2713 if(srcStr
!= NULL
&& srcLen
> 0)
2715 struct codeset
*dstCodeset
;
2717 // get the pointer to the codeset in which the dst string should be encoded
2718 if((dstCodeset
= (struct codeset
*)GetTagData(CSA_DestCodeset
, (ULONG
)NULL
, attrs
)) == NULL
)
2719 dstCodeset
= defaultCodeset(TRUE
);
// NOTE(review): this debug output dereferences both codeset pointers before
// the NULL check that follows - confirm defaultCodeset(TRUE) cannot return NULL
2721 D(DBF_UTF
, "srcCodeset: '%s' dstCodeset: '%s'", srcCodeset
->name
, dstCodeset
->name
);
2723 // check that the user didn't supply the very same codeset twice,
2724 // because in that case a conversion is not required.
2725 if(srcCodeset
!= NULL
&& dstCodeset
!= NULL
&& srcCodeset
!= dstCodeset
)
// utf8Create/strCreate are tested during the cleanup at the end to decide
// which strings have to be freed; presumably they are set when the
// respective pass allocated its result - TODO confirm
2727 BOOL utf8Create
= FALSE
;
2728 BOOL strCreate
= FALSE
;
2730 ULONG utf8strLen
= 0;
2731 ULONG
*destLenPtr
= NULL
;
2732 BOOL mapForeignChars
;
2733 struct Hook
*mapForeignCharsHook
;
// the foreign character mapping options are passed on to the second pass
2735 mapForeignChars
= (BOOL
)GetTagData(CSA_MapForeignChars
, FALSE
, attrs
);
2736 mapForeignCharsHook
= (struct Hook
*)GetTagData(CSA_MapForeignCharsHook
, (ULONG
)NULL
, attrs
);
2738 // if the source codeset is UTF-8 we don't have to use the UTF8Create()
2739 // function and can directly call the UTF8ToStr() function
2740 if(srcCodeset
!= CodesetsBase
->utf8Codeset
)
// first pass: source codeset -> UTF-8; the length of the UTF-8 string is
// reported through CSA_DestLenPtr into utf8strLen
2742 struct TagItem tags
[] = { { CSA_SourceCodeset
, (ULONG
)srcCodeset
},
2743 { CSA_Source
, (ULONG
)srcStr
},
2744 { CSA_SourceLen
, srcLen
},
2745 { CSA_DestLenPtr
, (ULONG
)&utf8strLen
},
2748 utf8str
= CodesetsUTF8CreateA((struct TagItem
*)&tags
[0]);
// the source already is UTF-8: use it as the intermediate string as is
2754 utf8str
= (UTF8
*)srcStr
;
2755 utf8strLen
= srcLen
;
2758 // in case the destination codeset is UTF-8 we don't have to actually
2759 // use the UTF8ToStr() function and can immediately return our
2761 if(utf8str
!= NULL
&& utf8strLen
> 0 && dstCodeset
!= CodesetsBase
->utf8Codeset
)
// second pass: UTF-8 -> destination codeset; the length of the result is
// reported through CSA_DestLenPtr into dstLen
2763 struct TagItem tags
[] = { { CSA_DestCodeset
, (ULONG
)dstCodeset
},
2764 { CSA_Source
, (ULONG
)utf8str
},
2765 { CSA_SourceLen
, utf8strLen
},
2766 { CSA_DestLenPtr
, (ULONG
)&dstLen
},
2767 { CSA_MapForeignChars
, mapForeignChars
},
2768 { CSA_MapForeignCharsHook
, (ULONG
)mapForeignCharsHook
},
2771 dstStr
= CodesetsUTF8ToStrA((struct TagItem
*)&tags
[0]);
// no second pass: the intermediate UTF-8 string is the result
2777 dstStr
= (STRPTR
)utf8str
;
2778 dstLen
= utf8strLen
;
2781 D(DBF_UTF
, "srcStr: %lx srcLen: %ld dstStr: %lx dstLen: %ld utf8create: %ld strCreate: %ld", srcStr
, srcLen
,
2786 // if everything was successful we can go and finalize everything
2787 if(dstStr
!= NULL
&& utf8str
!= NULL
)
2789 // as the conversion was a two way pass we have to either free the
2790 // memory of the utf8 string or not
// (it is freed only if both passes created a string, i.e. if the
// intermediate string is not the result itself)
2791 if(utf8Create
== TRUE
&& strCreate
== TRUE
)
2792 CodesetsFreeA(utf8str
, NULL
);
2794 // if the user wants to be informed about the length
2795 // of our destination string we store the length now in the supplied ptr.
2796 if((destLenPtr
= (ULONG
*)GetTagData(CSA_DestLenPtr
, (ULONG
)NULL
, attrs
)) != NULL
)
2797 *destLenPtr
= dstLen
;
2799 D(DBF_UTF
, "successfully converted string with len %ld", dstLen
);
2803 W(DBF_ALWAYS
, "an error occurred while trying to convert a string");
2805 // free all memory in case the conversion didn't work out
2806 if(utf8Create
== TRUE
&& utf8str
!= NULL
)
2807 CodesetsFreeA(utf8str
, NULL
);
2809 if(strCreate
== TRUE
&& dstStr
!= NULL
)
2810 CodesetsFreeA(dstStr
, NULL
);
2822 /// CodesetsFreeVecPooledA()
// Returns the memory block mem to the given pool with freeVecPooled().
// If the CSA_PoolSem tag supplies a semaphore, it is obtained before the
// block is freed and released afterwards.
// pool arrives in register a0 and the tag list in a2; mem is presumably the
// parameter passed in a1 - TODO confirm
2824 CodesetsFreeVecPooledA(REG(a0
, APTR pool
),
2826 REG(a2
, struct TagItem
*attrs
))
2832 struct SignalSemaphore
*sem
;
// lock the pool if the caller told us which semaphore guards it
2834 if((sem
= (struct SignalSemaphore
*)GetTagData(CSA_PoolSem
, 0, attrs
)))
2835 ObtainSemaphore(sem
);
2837 freeVecPooled(pool
,mem
);
2840 ReleaseSemaphore(sem
);
2847 /// CodesetsListCreateA()
// Creates a new, private codesets list and fills it according to the tags:
//   CSA_CodesetDir     scan the given directory with codesetsScanDir()
//   CSA_CodesetFile    read the given file with codesetsReadTable()
//   CSA_SourceCodeset  append the given codeset to the list
// If none of these tags is found, "PROGDIR:Charsets" is scanned instead.
// The library's pool semaphore is held while the list is created and filled.
2848 struct codesetList
*LIBFUNC
2849 CodesetsListCreateA(REG(a0
, struct TagItem
*attrs
))
2851 struct codesetList
*csList
= NULL
;
2855 ObtainSemaphore(&CodesetsBase
->poolSem
);
2857 // no matter what, we create a codesets list we will return to the user
2858 if((csList
= allocVecPooled(CodesetsBase
->pool
, sizeof(struct codesetList
))))
// stays TRUE only as long as none of the tags handled below was found
2860 BOOL scanProgDir
= TRUE
;
2861 struct TagItem
*tstate
= attrs
;
2862 struct TagItem
*tag
;
2864 // initialize the new private codeset list and put it into a separate list
2865 NewList((struct List
*)csList
);
2867 // first we get the path of the directory from which we go
2868 // and scan for charset tables from
2869 while((tag
= NextTagItem(&tstate
)))
// scan a whole directory for charset tables
2873 case CSA_CodesetDir
:
2875 codesetsScanDir(csList
, (STRPTR
)tag
->ti_Data
);
2877 scanProgDir
= FALSE
;
// read a single charset table file
2881 case CSA_CodesetFile
:
2883 codesetsReadTable(csList
, (STRPTR
)tag
->ti_Data
);
2885 scanProgDir
= FALSE
;
// append an already existing codeset structure.
// NOTE(review): cs is used without a visible NULL check, and its node is
// added without being removed from a list it may already be in - confirm
2889 case CSA_SourceCodeset
:
2891 struct codeset
*cs
= (struct codeset
*)tag
->ti_Data
;
2893 AddTail((struct List
*)csList
, (struct Node
*)&cs
->node
);
2895 scanProgDir
= FALSE
;
2901 // in case the user also wants us to scan PROGDIR:
2903 if(scanProgDir
== TRUE
)
2904 codesetsScanDir(csList
, "PROGDIR:Charsets");
2907 ReleaseSemaphore(&CodesetsBase
->poolSem
);
2914 /// CodesetsListDeleteA()
// Deletes every codesets list passed with a CSA_CodesetList tag.
// CSA_FreeCodesets (default TRUE) is read into freeCodesets; presumably it
// decides whether codesetsCleanup() is called for a list - TODO confirm.
// The library's pool semaphore is held during the whole operation.
2916 CodesetsListDeleteA(REG(a0
, struct TagItem
*attrs
))
2918 BOOL result
= FALSE
;
2921 ObtainSemaphore(&CodesetsBase
->poolSem
);
2926 struct TagItem
*tstate
= attrs
;
2927 struct TagItem
*tag
;
2929 // check if the caller wants us also to free the codesets
2930 freeCodesets
= (BOOL
)GetTagData(CSA_FreeCodesets
, TRUE
, attrs
);
2932 // now we iterate through our tagItems and see what the
2933 // user wants to remove from the list
2934 while((tag
= NextTagItem(&tstate
)))
2938 case CSA_CodesetList
:
2940 struct codesetList
*csList
= (struct codesetList
*)tag
->ti_Data
;
2944 // cleanup the codesets within the list
2946 codesetsCleanup(csList
);
2948 // then free the list itself
// NOTE(review): CodesetsListCreateA() allocates the list header with
// allocVecPooled(CodesetsBase->pool, ...) while it is freed here with
// freeArbitrateVecPooled() - confirm both refer to the same pool
2949 freeArbitrateVecPooled(csList
);
2958 ReleaseSemaphore(&CodesetsBase
->poolSem
);
2965 /// CodesetsListAddA()
// Adds codesets to the existing list csList according to the tags:
//   CSA_CodesetDir     scan the given directory with codesetsScanDir()
//   CSA_CodesetFile    read the given file with codesetsReadTable()
//   CSA_SourceCodeset  append the given codeset to the list
// Nothing is added if csList or attrs is NULL.
// The library's pool semaphore is held during the whole operation.
2967 CodesetsListAddA(REG(a0
, struct codesetList
*csList
),
2968 REG(a1
, struct TagItem
*attrs
))
2970 BOOL result
= FALSE
;
2973 ObtainSemaphore(&CodesetsBase
->poolSem
);
2975 if(csList
!= NULL
&& attrs
!= NULL
)
2977 struct TagItem
*tstate
= attrs
;
2978 struct TagItem
*tag
;
2980 // now we iterate through our tagItems and see if the user
2981 // wants to scan a whole directory or just adds a file.
2982 while((tag
= NextTagItem(&tstate
)))
// scan a whole directory for charset tables
2986 case CSA_CodesetDir
:
2988 codesetsScanDir(csList
, (STRPTR
)tag
->ti_Data
);
// read a single charset table file
2993 case CSA_CodesetFile
:
2995 codesetsReadTable(csList
, (STRPTR
)tag
->ti_Data
);
// append an already existing codeset structure.
// NOTE(review): cs is used without a visible NULL check - confirm
3000 case CSA_SourceCodeset
:
3002 struct codeset
*cs
= (struct codeset
*)tag
->ti_Data
;
3004 AddTail((struct List
*)csList
, (struct Node
*)&cs
->node
);
3012 ReleaseSemaphore(&CodesetsBase
->poolSem
);
3019 /// CodesetsListRemoveA()
3021 CodesetsListRemoveA(REG(a0
, struct TagItem
*attrs
))
3023 BOOL result
= FALSE
;
3026 ObtainSemaphore(&CodesetsBase
->poolSem
);
3031 struct TagItem
*tstate
= attrs
;
3032 struct TagItem
*tag
;
3034 // check if the caller wants us also to free the codesets
3035 freeCodesets
= (BOOL
)GetTagData(CSA_FreeCodesets
, TRUE
, attrs
);
3037 // now we iterate through or tagItems and see what the
3038 // user wants to remove from the list
3039 while((tag
= NextTagItem(&tstate
)))
3043 case CSA_SourceCodeset
:
3045 struct codeset
*cs
= (struct codeset
*)tag
->ti_Data
;
3049 struct MinNode
*mstate
= &cs
->node
;
3051 // before we actually remove the node from its list, we
3052 // have to make sure it isn't part of our internal codesets list
3053 while(mstate
->mln_Succ
)
3054 mstate
= mstate
->mln_Succ
;
3056 if(mstate
!= CodesetsBase
->codesets
.list
.mlh_Tail
)
3058 Remove((struct Node
*)&cs
->node
);
3060 // free all codesets data if requested.
3061 if(freeCodesets
== TRUE
)
3063 if(cs
->name
) freeArbitrateVecPooled(cs
->name
);
3064 if(cs
->alt_name
) freeArbitrateVecPooled(cs
->alt_name
);
3065 if(cs
->characterization
) freeArbitrateVecPooled(cs
->characterization
);
3067 freeArbitrateVecPooled(cs
);
3073 W(DBF_ALWAYS
, "user tried to remove an internal codesets!");
3081 ReleaseSemaphore(&CodesetsBase
->poolSem
);
3089 /**************************************************************************/