1 /***************************************************************************
3 codesets.library - Amiga shared library for handling different codesets
4 Copyright (C) 2001-2005 by Alfonso [alfie] Ranieri <alforan@tin.it>.
5 Copyright (C) 2005-2014 codesets.library Open Source Team
7 This library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU Lesser General Public
9 License as published by the Free Software Foundation; either
10 version 2.1 of the License, or (at your option) any later version.
12 This library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
17 codesets.library project: http://sourceforge.net/projects/codesetslib/
19 Most of the code included in this file was relicensed from GPL to LGPL
20 from the source code of SimpleMail (http://www.sf.net/projects/simplemail)
21 with full permissions by its authors.
25 ***************************************************************************/
29 #include <clib/alib_protos.h>
31 #include <diskfont/glyph.h>
32 #include <diskfont/diskfonttag.h>
33 #include <proto/diskfont.h>
38 #include <proto/keymap.h>
39 #include <proto/locale.h>
42 #include "codesets_table.h"
43 #include "convertUTF.h"
44 #include "codepages.h"
46 #include "SDI_stdarg.h"
51 #include <proto/codesets.h>
53 /**************************************************************************/
55 // a union used for various type casts while avoiding the annoying "dereferencing
56 // type-punned pointer will break strict-aliasing rules" warnings of GCC4+
61 unsigned char **uchar
;
72 // search a sorted array in O(log n), e.g.
73 // BIN_SEARCH(strings,0,sizeof(strings)/sizeof(strings[0]),strcmp(key,array[mid]),res);
// On a hit (`d` is zero) `result` receives the address of the matching
// element, i.e. &array[m]; a negative `d` moves the upper bound below `m`.
// NOTE(review): the visible body names its midpoint index `m`, so the
// `array[mid]` in the usage example above looks stale - confirm which
// identifier the compare expression is expected to reference.
// NOTE(review): the arguments are substituted textually (see `&array[m]`),
// so only simple, side-effect free expressions should be passed.
74 #define BIN_SEARCH(array,low,high,compare,result) \
78 int m = (low+high)/2;\
83 if (!d){ result = &array[m]; break; }\
84 if (d < 0) h = m - 1;\
// mystrdup(): duplicate a C string into memory obtained from
// allocArbitrateVecPooled(). A copy is only attempted when strlen(str) is
// greater than zero; the buffer is len+1 bytes and is filled with strlcpy().
// NOTE(review): the return statement is not part of this excerpt - presumably
// the new string is returned, or NULL for an empty string / failed allocation.
92 static STRPTR
mystrdup(const char *str
)
102 if((len
= strlen(str
)) > 0)
104 if((newStr
= allocArbitrateVecPooled(len
+1)) != NULL
)
105 strlcpy(newStr
, str
, len
+1);
// mystrndup(): copy up to n characters of str1 into a buffer of n+1 bytes
// obtained from allocArbitrateVecPooled(); strlcpy() is called with a size of
// n+1 so that the copy is NUL-terminated.
// NOTE(review): no check for a negative n is visible here - confirm callers.
115 static STRPTR
mystrndup(const char *str1
, int n
)
121 if((dest
= allocArbitrateVecPooled(n
+1)) != NULL
)
124 strlcpy(dest
, str1
, n
+1);
// readLine(): read one line from the file handle fh into buf (capacity size)
// with FGets(). `success` starts out FALSE; once FGets() returned non-NULL the
// end of the line is searched and a trailing LF/CR character is stripped.
135 static BOOL
readLine(BPTR fh
, char *buf
, ULONG size
)
137 BOOL success
= FALSE
;
142 if((c
= FGets(fh
, buf
, size
)) != NULL
)
144 // we succeeded in reading something
147 // now find the end of the line and strip the LF/CR character
150 if(*c
== '\n' || *c
== '\r')
// getConfigItem(): test whether the line in buf starts with the keyword item,
// using a case-insensitive strnicmp() over `len` characters. The result starts
// out as NULL; after a keyword match whitespace is skipped in two isspace()
// loops.
// NOTE(review): presumably a pointer to the value text following the keyword
// is returned - the assignment and return are not part of this excerpt.
164 static const char *getConfigItem(const char *buf
, const char *item
)
166 const char *configItem
= NULL
;
173 if(strnicmp(buf
, item
, len
) == 0)
180 while((c
= *buf
) != '\0' && isspace(c
))
188 while((c
= *buf
) != '\0' && isspace(c
))
// parseUtf8(): decode one UTF-8 sequence starting at *ps.
// Two-byte case: the second byte must be a continuation byte (10xxxxxx) and
// the value is ((s[0] & 0x1f) << 6) | (s[1] & 0x3f).
// Longer sequences: the lead byte is masked with (1 << (7-n)) - 1, every
// continuation byte is validated and contributes its low 6 bits, and a result
// below 1 << (5*n - 4) is tested for (presumably to catch overlong encodings,
// with n being the sequence length - confirm).
// NOTE(review): the error return values and any update of *ps are not part of
// this excerpt.
201 static int parseUtf8(CONST_STRPTR
*ps
)
203 CONST_STRPTR s
= *ps
;
225 if((s
[1] & 0xc0)!=0x80)
233 RETURN(((s
[0] & 0x1f)<<6) | (s
[1] & 0x3f));
234 return ((s
[0] & 0x1f)<<6) | (s
[1] & 0x3f);
271 wc
= *s
++ & ((1<<(7-n
))-1);
275 if((*s
& 0xc0) != 0x80)
281 wc
= (wc
<< 6) | (*s
++ & 0x3f);
284 if(wc
< (1 << (5 * n
- 4)))
// countCodesets(): walk the codeset list csList with GetHead()/GetSucc().
// A node passes the visible test when allowMultibyte is TRUE, or when it is
// none of CodesetsBase's utf8Codeset, utf16Codeset and utf32Codeset.
// NOTE(review): the counter increment and the return statement are not part of
// this excerpt.
298 static int countCodesets(struct codesetList
*csList
, BOOL allowMultibyte
)
303 for(node
= GetHead((struct List
*)csList
); node
!= NULL
; node
= GetSucc(node
))
305 struct codeset
*cs
= (struct codeset
*)node
;
307 if(allowMultibyte
== TRUE
||
308 (cs
!= CodesetsBase
->utf8Codeset
&& cs
!= CodesetsBase
->utf16Codeset
&& cs
!= CodesetsBase
->utf32Codeset
))
319 // in case some UTF8 sequences cannot be converted during CodesetsUTF8ToStrA(), this
320 // function is used to replace these unknown sequences with lookalike characters that
321 // still make the text more readable. For more replacements see
322 // http://www.utf8-zeichentabelle.de/unicode-utf8-table.pl
324 // The conversion table in this function is partly borrowed from the awebcharset plugin
325 // written by Frank Weber. See http://cvs.sunsite.dk/viewcvs.cgi/aweb/plugins/charset/awebcharset.c
// One entry of the UTF-8 replacement table that mapUTF8toASCII() searches
// with bsearch().
327 struct UTF8Replacement
329 const char *utf8
; // the original UTF8 string we are going to replace
330 const int utf8len
; // the length of the UTF8 string
331 const char *rep
; // pointer to the replacement string
332 const int replen
; // the length of the replacement string (negative to signal that the replacement is itself a UTF8 string)
// compareUTF8Replacements(): comparison callback for bsearch(). p1 is the
// search key, p2 the table entry under test. Entries are ordered by utf8len
// first; the byte contents are compared with memcmp() over key->utf8len bytes.
// NOTE(review): presumably memcmp() is only reached when both lengths are
// equal - the guarding condition is not part of this excerpt.
335 static int compareUTF8Replacements(const void *p1
, const void *p2
)
337 struct UTF8Replacement
*key
= (struct UTF8Replacement
*)p1
;
338 struct UTF8Replacement
*rep
= (struct UTF8Replacement
*)p2
;
341 // compare the length first, after that compare the strings
342 cmp
= key
->utf8len
- rep
->utf8len
;
344 cmp
= memcmp(key
->utf8
, rep
->utf8
, key
->utf8len
);
// mapUTF8toASCII(): look up the UTF-8 sequence src (utf8len bytes) in the
// static table utf8map via bsearch() with compareUTF8Replacements().
// The table has to stay sorted by sequence length first and byte values
// second - the order compareUTF8Replacements() implements - otherwise the
// binary search will miss entries. A negative replen marks a replacement that
// is itself a UTF-8 sequence.
// NOTE(review): how *dst and the return value are filled in is not part of
// this excerpt.
349 static int mapUTF8toASCII(const char **dst
, const unsigned char *src
, const int utf8len
)
352 struct UTF8Replacement key
= { (char *)src
, utf8len
, NULL
, 0 };
353 struct UTF8Replacement
*rep
;
355 static struct UTF8Replacement
const utf8map
[] =
357 // U+0100 ... U+017F (Latin Extended-A)
358 { "\xC4\x80", 2, "A", 1 }, // U+0100 -> A (LATIN CAPITAL LETTER A WITH MACRON)
359 { "\xC4\x81", 2, "a", 1 }, // U+0101 -> a (LATIN SMALL LETTER A WITH MACRON)
360 { "\xC4\x82", 2, "A", 1 }, // U+0102 -> A (LATIN CAPITAL LETTER A WITH BREVE)
361 { "\xC4\x83", 2, "a", 1 }, // U+0103 -> a (LATIN SMALL LETTER A WITH BREVE)
362 { "\xC4\x84", 2, "A", 1 }, // U+0104 -> A (LATIN CAPITAL LETTER A WITH OGONEK)
363 { "\xC4\x85", 2, "a", 1 }, // U+0105 -> a (LATIN SMALL LETTER A WITH OGONEK)
364 { "\xC4\x86", 2, "C", 1 }, // U+0106 -> C (LATIN CAPITAL LETTER C WITH ACUTE)
365 { "\xC4\x87", 2, "c", 1 }, // U+0107 -> c (LATIN SMALL LETTER C WITH ACUTE)
366 { "\xC4\x88", 2, "C", 1 }, // U+0108 -> C (LATIN CAPITAL LETTER C WITH CIRCUMFLEX)
367 { "\xC4\x89", 2, "c", 1 }, // U+0109 -> c (LATIN SMALL LETTER C WITH CIRCUMFLEX)
368 { "\xC4\x8A", 2, "C", 1 }, // U+010A -> C (LATIN CAPITAL LETTER C WITH DOT ABOVE)
369 { "\xC4\x8B", 2, "c", 1 }, // U+010B -> c (LATIN SMALL LETTER C WITH DOT ABOVE)
370 { "\xC4\x8C", 2, "C", 1 }, // U+010C -> C (LATIN CAPITAL LETTER C WITH CARON)
371 { "\xC4\x8D", 2, "c", 1 }, // U+010D -> c (LATIN SMALL LETTER C WITH CARON)
372 { "\xC4\x8E", 2, "D", 1 }, // U+010E -> D (LATIN CAPITAL LETTER D WITH CARON)
373 { "\xC4\x8F", 2, "d", 1 }, // U+010F -> d (LATIN SMALL LETTER D WITH CARON)
374 { "\xC4\x90", 2, "D", 1 }, // U+0110 -> D (LATIN CAPITAL LETTER D WITH STROKE)
375 { "\xC4\x91", 2, "d", 1 }, // U+0111 -> d (LATIN SMALL LETTER D WITH STROKE)
376 { "\xC4\x92", 2, "E", 1 }, // U+0112 -> E (LATIN CAPITAL LETTER E WITH MACRON)
377 { "\xC4\x93", 2, "e", 1 }, // U+0113 -> e (LATIN SMALL LETTER E WITH MACRON)
378 { "\xC4\x94", 2, "E", 1 }, // U+0114 -> E (LATIN CAPITAL LETTER E WITH BREVE)
379 { "\xC4\x95", 2, "e", 1 }, // U+0115 -> e (LATIN SMALL LETTER E WITH BREVE)
380 { "\xC4\x96", 2, "E", 1 }, // U+0116 -> E (LATIN CAPITAL LETTER E WITH DOT ABOVE)
381 { "\xC4\x97", 2, "e", 1 }, // U+0117 -> e (LATIN SMALL LETTER E WITH DOT ABOVE)
382 { "\xC4\x98", 2, "E", 1 }, // U+0118 -> E (LATIN CAPITAL LETTER E WITH OGONEK)
383 { "\xC4\x99", 2, "e", 1 }, // U+0119 -> e (LATIN SMALL LETTER E WITH OGONEK)
384 { "\xC4\x9A", 2, "E", 1 }, // U+011A -> E (LATIN CAPITAL LETTER E WITH CARON)
385 { "\xC4\x9B", 2, "e", 1 }, // U+011B -> e (LATIN SMALL LETTER E WITH CARON)
386 { "\xC4\x9C", 2, "G", 1 }, // U+011C -> G (LATIN CAPITAL LETTER G WITH CIRCUMFLEX)
387 { "\xC4\x9D", 2, "g", 1 }, // U+011D -> g (LATIN SMALL LETTER G WITH CIRCUMFLEX)
388 { "\xC4\x9E", 2, "G", 1 }, // U+011E -> G (LATIN CAPITAL LETTER G WITH BREVE)
389 { "\xC4\x9F", 2, "g", 1 }, // U+011F -> g (LATIN SMALL LETTER G WITH BREVE)
390 { "\xC4\xA0", 2, "G", 1 }, // U+0120 -> G (LATIN CAPITAL LETTER G WITH DOT ABOVE)
391 { "\xC4\xA1", 2, "g", 1 }, // U+0121 -> g (LATIN SMALL LETTER G WITH DOT ABOVE)
392 { "\xC4\xA2", 2, "G", 1 }, // U+0122 -> G (LATIN CAPITAL LETTER G WITH CEDILLA)
393 { "\xC4\xA3", 2, "g", 1 }, // U+0123 -> g (LATIN SMALL LETTER G WITH CEDILLA)
394 { "\xC4\xA4", 2, "H", 1 }, // U+0124 -> H (LATIN CAPITAL LETTER H WITH CIRCUMFLEX)
395 { "\xC4\xA5", 2, "h", 1 }, // U+0125 -> h (LATIN SMALL LETTER H WITH CIRCUMFLEX)
396 { "\xC4\xA6", 2, "H", 1 }, // U+0126 -> H (LATIN CAPITAL LETTER H WITH STROKE)
397 { "\xC4\xA7", 2, "h", 1 }, // U+0127 -> h (LATIN SMALL LETTER H WITH STROKE)
398 { "\xC4\xA8", 2, "I", 1 }, // U+0128 -> I (LATIN CAPITAL LETTER I WITH TILDE)
399 { "\xC4\xA9", 2, "i", 1 }, // U+0129 -> i (LATIN SMALL LETTER I WITH TILDE)
400 { "\xC4\xAA", 2, "I", 1 }, // U+012A -> I (LATIN CAPITAL LETTER I WITH MACRON)
401 { "\xC4\xAB", 2, "i", 1 }, // U+012B -> i (LATIN SMALL LETTER I WITH MACRON)
402 { "\xC4\xAC", 2, "I", 1 }, // U+012C -> I (LATIN CAPITAL LETTER I WITH BREVE)
403 { "\xC4\xAD", 2, "i", 1 }, // U+012D -> i (LATIN SMALL LETTER I WITH BREVE)
404 { "\xC4\xAE", 2, "I", 1 }, // U+012E -> I (LATIN CAPITAL LETTER I WITH OGONEK)
405 { "\xC4\xAF", 2, "i", 1 }, // U+012F -> i (LATIN SMALL LETTER I WITH OGONEK)
406 { "\xC4\xB0", 2, "I", 1 }, // U+0130 -> I (LATIN CAPITAL LETTER I WITH DOT ABOVE)
407 { "\xC4\xB1", 2, "i", 1 }, // U+0131 -> i (LATIN SMALL LETTER DOTLESS I)
408 { "\xC4\xB2", 2, "Ij", 2 }, // U+0132 -> Ij (LATIN CAPITAL LIGATURE IJ)
409 { "\xC4\xB3", 2, "ij", 2 }, // U+0133 -> ij (LATIN SMALL LIGATURE IJ)
410 { "\xC4\xB4", 2, "J", 1 }, // U+0134 -> J (LATIN CAPITAL LETTER J WITH CIRCUMFLEX)
411 { "\xC4\xB5", 2, "j", 1 }, // U+0135 -> j (LATIN SMALL LETTER J WITH CIRCUMFLEX)
412 { "\xC4\xB6", 2, "K", 1 }, // U+0136 -> K (LATIN CAPITAL LETTER K WITH CEDILLA)
413 { "\xC4\xB7", 2, "k", 1 }, // U+0137 -> k (LATIN SMALL LETTER K WITH CEDILLA)
414 { "\xC4\xB8", 2, "k", 1 }, // U+0138 -> k (LATIN SMALL LETTER KRA)
415 { "\xC4\xB9", 2, "L", 1 }, // U+0139 -> L (LATIN CAPITAL LETTER L WITH ACUTE)
416 { "\xC4\xBA", 2, "l", 1 }, // U+013A -> l (LATIN SMALL LETTER L WITH ACUTE)
417 { "\xC4\xBB", 2, "L", 1 }, // U+013B -> L (LATIN CAPITAL LETTER L WITH CEDILLA)
418 { "\xC4\xBC", 2, "l", 1 }, // U+013C -> l (LATIN SMALL LETTER L WITH CEDILLA)
419 { "\xC4\xBD", 2, "L", 1 }, // U+013D -> L (LATIN CAPITAL LETTER L WITH CARON)
420 { "\xC4\xBE", 2, "l", 1 }, // U+013E -> l (LATIN SMALL LETTER L WITH CARON)
421 { "\xC4\xBF", 2, "L", 1 }, // U+013F -> L (LATIN CAPITAL LETTER L WITH MIDDLE DOT)
422 { "\xC5\x80", 2, "l", 1 }, // U+0140 -> l (LATIN SMALL LETTER L WITH MIDDLE DOT)
423 { "\xC5\x81", 2, "L", 1 }, // U+0141 -> L (LATIN CAPITAL LETTER L WITH STROKE)
424 { "\xC5\x82", 2, "l", 1 }, // U+0142 -> l (LATIN SMALL LETTER L WITH STROKE)
425 { "\xC5\x83", 2, "N", 1 }, // U+0143 -> N (LATIN CAPITAL LETTER N WITH ACUTE)
426 { "\xC5\x84", 2, "n", 1 }, // U+0144 -> n (LATIN SMALL LETTER N WITH ACUTE)
427 { "\xC5\x85", 2, "N", 1 }, // U+0145 -> N (LATIN CAPITAL LETTER N WITH CEDILLA)
428 { "\xC5\x86", 2, "n", 1 }, // U+0146 -> n (LATIN SMALL LETTER N WITH CEDILLA)
429 { "\xC5\x87", 2, "N", 1 }, // U+0147 -> N (LATIN CAPITAL LETTER N WITH CARON)
430 { "\xC5\x88", 2, "n", 1 }, // U+0148 -> n (LATIN SMALL LETTER N WITH CARON)
431 { "\xC5\x89", 2, "'n", 2 }, // U+0149 -> 'n (LATIN SMALL LETTER N PRECEDED BY APOSTROPHE)
432 { "\xC5\x8A", 2, "Ng", 2 }, // U+014A -> Ng (LATIN CAPITAL LETTER ENG)
433 { "\xC5\x8B", 2, "ng", 2 }, // U+014B -> ng (LATIN SMALL LETTER ENG)
434 { "\xC5\x8C", 2, "O", 1 }, // U+014C -> O (LATIN CAPITAL LETTER O WITH MACRON)
435 { "\xC5\x8D", 2, "o", 1 }, // U+014D -> o (LATIN SMALL LETTER O WITH MACRON)
436 { "\xC5\x8E", 2, "O", 1 }, // U+014E -> O (LATIN CAPITAL LETTER O WITH BREVE)
437 { "\xC5\x8F", 2, "o", 1 }, // U+014F -> o (LATIN SMALL LETTER O WITH BREVE)
438 { "\xC5\x90", 2, "O", 1 }, // U+0150 -> O (LATIN CAPITAL LETTER O WITH DOUBLE ACUTE)
439 { "\xC5\x91", 2, "o", 1 }, // U+0151 -> o (LATIN SMALL LETTER O WITH DOUBLE ACUTE)
440 { "\xC5\x92", 2, "Oe", 2 }, // U+0152 -> Oe (LATIN CAPITAL LIGATURE OE)
441 { "\xC5\x93", 2, "oe", 2 }, // U+0153 -> oe (LATIN SMALL LIGATURE OE)
442 { "\xC5\x94", 2, "R", 1 }, // U+0154 -> R (LATIN CAPITAL LETTER R WITH ACUTE)
443 { "\xC5\x95", 2, "r", 1 }, // U+0155 -> r (LATIN SMALL LETTER R WITH ACUTE)
444 { "\xC5\x96", 2, "R", 1 }, // U+0156 -> R (LATIN CAPITAL LETTER R WITH CEDILLA)
445 { "\xC5\x97", 2, "r", 1 }, // U+0157 -> r (LATIN SMALL LETTER R WITH CEDILLA)
446 { "\xC5\x98", 2, "R", 1 }, // U+0158 -> R (LATIN CAPITAL LETTER R WITH CARON)
447 { "\xC5\x99", 2, "r", 1 }, // U+0159 -> r (LATIN SMALL LETTER R WITH CARON)
448 { "\xC5\x9A", 2, "S", 1 }, // U+015A -> S (LATIN CAPITAL LETTER S WITH ACUTE)
449 { "\xC5\x9B", 2, "s", 1 }, // U+015B -> s (LATIN SMALL LETTER S WITH ACUTE)
450 { "\xC5\x9C", 2, "S", 1 }, // U+015C -> S (LATIN CAPITAL LETTER S WITH CIRCUMFLEX)
451 { "\xC5\x9D", 2, "s", 1 }, // U+015D -> s (LATIN SMALL LETTER S WITH CIRCUMFLEX)
452 { "\xC5\x9E", 2, "S", 1 }, // U+015E -> S (LATIN CAPITAL LETTER S WITH CEDILLA)
453 { "\xC5\x9F", 2, "s", 1 }, // U+015F -> s (LATIN SMALL LETTER S WITH CEDILLA)
454 { "\xC5\xA0", 2, "S", 1 }, // U+0160 -> S (LATIN CAPITAL LETTER S WITH CARON)
455 { "\xC5\xA1", 2, "s", 1 }, // U+0161 -> s (LATIN SMALL LETTER S WITH CARON)
456 { "\xC5\xA2", 2, "T", 1 }, // U+0162 -> T (LATIN CAPITAL LETTER T WITH CEDILLA)
457 { "\xC5\xA3", 2, "t", 1 }, // U+0163 -> t (LATIN SMALL LETTER T WITH CEDILLA)
458 { "\xC5\xA4", 2, "T", 1 }, // U+0164 -> T (LATIN CAPITAL LETTER T WITH CARON)
459 { "\xC5\xA5", 2, "t", 1 }, // U+0165 -> t (LATIN SMALL LETTER T WITH CARON)
460 { "\xC5\xA6", 2, "T", 1 }, // U+0166 -> T (LATIN CAPITAL LETTER T WITH STROKE)
461 { "\xC5\xA7", 2, "t", 1 }, // U+0167 -> t (LATIN SMALL LETTER T WITH STROKE)
462 { "\xC5\xA8", 2, "U", 1 }, // U+0168 -> U (LATIN CAPITAL LETTER U WITH TILDE)
463 { "\xC5\xA9", 2, "u", 1 }, // U+0169 -> u (LATIN SMALL LETTER U WITH TILDE)
464 { "\xC5\xAA", 2, "U", 1 }, // U+016A -> U (LATIN CAPITAL LETTER U WITH MACRON)
465 { "\xC5\xAB", 2, "u", 1 }, // U+016B -> u (LATIN SMALL LETTER U WITH MACRON)
466 { "\xC5\xAC", 2, "U", 1 }, // U+016C -> U (LATIN CAPITAL LETTER U WITH BREVE)
467 { "\xC5\xAD", 2, "u", 1 }, // U+016D -> u (LATIN SMALL LETTER U WITH BREVE)
468 { "\xC5\xAE", 2, "U", 1 }, // U+016E -> U (LATIN CAPITAL LETTER U WITH RING ABOVE)
469 { "\xC5\xAF", 2, "u", 1 }, // U+016F -> u (LATIN SMALL LETTER U WITH RING ABOVE)
470 { "\xC5\xB0", 2, "U", 1 }, // U+0170 -> U (LATIN CAPITAL LETTER U WITH DOUBLE ACUTE)
471 { "\xC5\xB1", 2, "u", 1 }, // U+0171 -> u (LATIN SMALL LETTER U WITH DOUBLE ACUTE)
472 { "\xC5\xB2", 2, "U", 1 }, // U+0172 -> U (LATIN CAPITAL LETTER U WITH OGONEK)
473 { "\xC5\xB3", 2, "u", 1 }, // U+0173 -> u (LATIN SMALL LETTER U WITH OGONEK)
474 { "\xC5\xB4", 2, "W", 1 }, // U+0174 -> W (LATIN CAPITAL LETTER W WITH CIRCUMFLEX)
475 { "\xC5\xB5", 2, "w", 1 }, // U+0175 -> w (LATIN SMALL LETTER W WITH CIRCUMFLEX)
476 { "\xC5\xB6", 2, "Y", 1 }, // U+0176 -> Y (LATIN CAPITAL LETTER Y WITH CIRCUMFLEX)
477 { "\xC5\xB7", 2, "y", 1 }, // U+0177 -> y (LATIN SMALL LETTER Y WITH CIRCUMFLEX)
478 { "\xC5\xB8", 2, "Y", 1 }, // U+0178 -> Y (LATIN CAPITAL LETTER Y WITH DIAERESIS)
479 { "\xC5\xB9", 2, "Z", 1 }, // U+0179 -> Z (LATIN CAPITAL LETTER Z WITH ACUTE)
480 { "\xC5\xBA", 2, "z", 1 }, // U+017A -> z (LATIN SMALL LETTER Z WITH ACUTE)
481 { "\xC5\xBB", 2, "Z", 1 }, // U+017B -> Z (LATIN CAPITAL LETTER Z WITH DOT ABOVE)
482 { "\xC5\xBC", 2, "z", 1 }, // U+017C -> z (LATIN SMALL LETTER Z WITH DOT ABOVE)
483 { "\xC5\xBD", 2, "Z", 1 }, // U+017D -> Z (LATIN CAPITAL LETTER Z WITH CARON)
484 { "\xC5\xBE", 2, "z", 1 }, // U+017E -> z (LATIN SMALL LETTER Z WITH CARON)
485 { "\xC5\xBF", 2, "s", 1 }, // U+017F -> s (LATIN SMALL LETTER LONG S)
487 // U+2000 ... U+206F (General Punctuation)
488 { "\xE2\x80\x90", 3, "-", 1 }, // U+2010 -> - (HYPHEN)
489 { "\xE2\x80\x91", 3, "-", 1 }, // U+2011 -> - (NON-BREAKING HYPHEN)
490 { "\xE2\x80\x92", 3, "--", 2 }, // U+2012 -> -- (FIGURE DASH)
491 { "\xE2\x80\x93", 3, "--", 2 }, // U+2013 -> -- (EN DASH)
492 { "\xE2\x80\x94", 3, "---", 3 }, // U+2014 -> --- (EM DASH)
493 { "\xE2\x80\x95", 3, "---", 3 }, // U+2015 -> --- (HORIZONTAL BAR)
494 { "\xE2\x80\x96", 3, "||", 2 }, // U+2016 -> || (DOUBLE VERTICAL LINE)
495 { "\xE2\x80\x97", 3, "_", 1 }, // U+2017 -> _ (DOUBLE LOW LINE)
496 { "\xE2\x80\x98", 3, "`", 1 }, // U+2018 -> ` (LEFT SINGLE QUOTATION MARK)
497 { "\xE2\x80\x99", 3, "'", 1 }, // U+2019 -> ' (RIGHT SINGLE QUOTATION MARK)
498 { "\xE2\x80\x9A", 3, ",", 1 }, // U+201A -> , (SINGLE LOW-9 QUOTATION MARK)
499 { "\xE2\x80\x9B", 3, "'", 1 }, // U+201B -> ' (SINGLE HIGH-REVERSED-9 QUOTATION MARK)
500 { "\xE2\x80\x9C", 3, "\"", 1 }, // U+201C -> " (LEFT DOUBLE QUOTATION MARK)
501 { "\xE2\x80\x9D", 3, "\"", 1 }, // U+201D -> " (RIGHT DOUBLE QUOTATION MARK)
502 { "\xE2\x80\x9E", 3, ",,", 2 }, // U+201E -> ,, (DOUBLE LOW-9 QUOTATION MARK)
503 { "\xE2\x80\x9F", 3, "``", 2 }, // U+201F -> `` (DOUBLE HIGH-REVERSED-9 QUOTATION MARK)
504 { "\xE2\x80\xA0", 3, "+", 1 }, // U+2020 -> + (DAGGER)
505 { "\xE2\x80\xA1", 3, "+", 1 }, // U+2021 -> + (DOUBLE DAGGER)
506 { "\xE2\x80\xA2", 3, "\xC2\xB7", -2 }, // U+2022 -> U+00B7 (BULLET) -> (MIDDLE POINT)
507 { "\xE2\x80\xA3", 3, ".", 1 }, // U+2023 -> . (TRIANGULAR BULLET)
508 { "\xE2\x80\xA4", 3, ".", 1 }, // U+2024 -> . (ONE DOT LEADER)
509 { "\xE2\x80\xA5", 3, "..", 2 }, // U+2025 -> .. (TWO DOT LEADER)
510 { "\xE2\x80\xA6", 3, "...", 3 }, // U+2026 -> ... (HORIZONTAL ELLIPSIS)
511 { "\xE2\x80\xA7", 3, "\xC2\xB7", -2 }, // U+2027 -> U+00B7 (HYPHENATION POINT) -> (MIDDLE POINT)
512 { "\xE2\x80\xB0", 3, "%.", 2 }, // U+2030 -> %. (PER MILLE SIGN)
513 { "\xE2\x80\xB1", 3, "%..", 3 }, // U+2031 -> %.. (PER TEN THOUSAND SIGN)
514 { "\xE2\x80\xB2", 3, "'", 1 }, // U+2032 -> ' (PRIME)
515 { "\xE2\x80\xB3", 3, "''", 2 }, // U+2033 -> '' (DOUBLE PRIME)
516 { "\xE2\x80\xB4", 3, "'''", 3 }, // U+2034 -> ''' (TRIPLE PRIME)
517 { "\xE2\x80\xB5", 3, "`", 1 }, // U+2035 -> ` (REVERSED PRIME)
518 { "\xE2\x80\xB6", 3, "``", 2 }, // U+2036 -> `` (REVERSED DOUBLE PRIME)
519 { "\xE2\x80\xB7", 3, "```", 3 }, // U+2037 -> ``` (REVERSED TRIPLE PRIME)
520 { "\xE2\x80\xB8", 3, "^", 1 }, // U+2038 -> ^ (CARET)
521 { "\xE2\x80\xB9", 3, "<", 1 }, // U+2039 -> < (SINGLE LEFT-POINTING ANGLE QUOTATION MARK)
522 { "\xE2\x80\xBA", 3, ">", 1 }, // U+203A -> > (SINGLE RIGHT-POINTING ANGLE QUOTATION MARK)
523 { "\xE2\x80\xBB", 3, "\xC3\x97", -2 }, // U+203B -> U+00D7 (REFERENCE MARK) -> (MULTIPLICATION SIGN)
524 { "\xE2\x80\xBC", 3, "!!", 2 }, // U+203C -> !! (DOUBLE EXCLAMATION MARK)
525 { "\xE2\x80\xBD", 3, "?", 1 }, // U+203D -> ? (INTERROBANG)
526 { "\xE2\x81\x82", 3, "*", 1 }, // U+2042 -> * (ASTERISM)
527 { "\xE2\x81\x83", 3, ".", 1 }, // U+2043 -> . (HYPHEN BULLET)
528 { "\xE2\x81\x84", 3, "/", 1 }, // U+2044 -> / (FRACTION SLASH)
529 { "\xE2\x81\x87", 3, "??", 2 }, // U+2047 -> ?? (DOUBLE QUESTION MARK)
530 { "\xE2\x81\x88", 3, "?!", 2 }, // U+2048 -> ?! (QUESTION EXCLAMATION MARK)
531 { "\xE2\x81\x89", 3, "!?", 2 }, // U+2049 -> !? (EXCLAMATION QUESTION MARK)
532 { "\xE2\x81\x8E", 3, "*", 1 }, // U+204E -> * (LOW ASTERISK)
533 { "\xE2\x81\x8F", 3, ";", 1 }, // U+204F -> ; (REVERSED SEMICOLON)
534 { "\xE2\x81\x91", 3, "*", 1 }, // U+2051 -> * (TWO ASTERISKS ALIGNED VERTICALLY)
535 { "\xE2\x81\x92", 3, "-", 1 }, // U+2052 -> - (COMMERCIAL MINUS SIGN)
536 { "\xE2\x81\x93", 3, "~", 1 }, // U+2053 -> ~ (SWUNG DASH)
537 { "\xE2\x81\x95", 3, "*", 1 }, // U+2055 -> * (FLOWER PUNCTUATION MARK)
538 { "\xE2\x81\x97", 3, "''''", 4 }, // U+2057 -> '''' (QUADRUPLE PRIME)
539 { "\xE2\x81\x9A", 3, ":", 1 }, // U+205A -> : (TWO DOT PUNCTUATION)
540 { "\xE2\x81\x9C", 3, "+", 1 }, // U+205C -> + (DOTTED CROSS)
542 // U+20A0 ... U+20CF (Currency Symbols)
// NOTE(review): a few replacements below differ from the ISO 4217 code one
// would expect from the sign name (e.g. BEF for the French franc sign, where
// FRF is the French franc code, and VNC for the dong sign, where the code is
// VND) - confirm whether this is intentional.
543 { "\xE2\x82\xA0", 3, "ECU", 3 }, // U+20A0 -> ECU (EURO-CURRENCY SIGN)
544 { "\xE2\x82\xA1", 3, "CRC", 3 }, // U+20A1 -> CRC (COLON SIGN)
545 { "\xE2\x82\xA2", 3, "BRC", 3 }, // U+20A2 -> BRC (CRUZEIRO SIGN)
546 { "\xE2\x82\xA3", 3, "BEF", 3 }, // U+20A3 -> BEF (FRENCH FRANC SIGN)
547 { "\xE2\x82\xA4", 3, "ITL", 3 }, // U+20A4 -> ITL (LIRA SIGN)
548 { "\xE2\x82\xA6", 3, "NGN", 3 }, // U+20A6 -> NGN (NAIRA SIGN)
549 { "\xE2\x82\xA7", 3, "ESP", 3 }, // U+20A7 -> ESP (PESETA SIGN)
550 { "\xE2\x82\xA8", 3, "MVQ", 3 }, // U+20A8 -> MVQ (RUPEE SIGN)
551 { "\xE2\x82\xA9", 3, "KPW", 3 }, // U+20A9 -> KPW (WON SIGN)
552 { "\xE2\x82\xAA", 3, "ILS", 3 }, // U+20AA -> ILS (NEW SHEQEL SIGN)
553 { "\xE2\x82\xAB", 3, "VNC", 3 }, // U+20AB -> VNC (DONG SIGN)
554 { "\xE2\x82\xAC", 3, "EUR", 3 }, // U+20AC -> EUR (EURO SIGN)
555 { "\xE2\x82\xAD", 3, "LAK", 3 }, // U+20AD -> LAK (KIP SIGN)
556 { "\xE2\x82\xAE", 3, "MNT", 3 }, // U+20AE -> MNT (TUGRIK SIGN)
557 { "\xE2\x82\xAF", 3, "GRD", 3 }, // U+20AF -> GRD (DRACHMA SIGN)
558 { "\xE2\x82\xB0", 3, "Pf", 2 }, // U+20B0 -> Pf (GERMAN PENNY SIGN)
559 { "\xE2\x82\xB1", 3, "P", 1 }, // U+20B1 -> P (PESO SIGN)
560 { "\xE2\x82\xB2", 3, "PYG", 3 }, // U+20B2 -> PYG (GUARANI SIGN)
561 { "\xE2\x82\xB3", 3, "ARA", 3 }, // U+20B3 -> ARA (AUSTRAL SIGN)
562 { "\xE2\x82\xB4", 3, "UAH", 3 }, // U+20B4 -> UAH (HRYVNIA SIGN)
563 { "\xE2\x82\xB5", 3, "GHS", 3 }, // U+20B5 -> GHS (CEDI SIGN)
565 // U+2190 ... U+21FF (Arrows)
566 { "\xE2\x86\x90", 3, "<-", 2 }, // U+2190 -> <- (LEFTWARDS ARROW)
567 { "\xE2\x86\x92", 3, "->", 2 }, // U+2192 -> -> (RIGHTWARDS ARROW)
572 // start with no replacement string
575 // perform a binary search in the lookup table
576 if((rep
= bsearch(&key
, utf8map
, sizeof(utf8map
) / sizeof(utf8map
[0]), sizeof(utf8map
[0]), compareUTF8Replacements
)) != NULL
)
578 // if we found something, then copy this over to the result variables
588 /// matchCodesetAlias()
// Associates an official MIME codeset name with its alternative spellings.
590 struct CodesetAliases
592 const char *MIMEname
; // The official and correct MIME name for a codeset
593 const char *Aliases
; // A space separated list of well-known aliases
// Alias table consulted by matchCodesetAlias(). That function iterates until
// it reaches an entry whose MIMEname is NULL, so the table has to end with
// such a terminator (not part of this excerpt).
596 const struct CodesetAliases codesetAliases
[] =
599 { "Amiga-1251", "Ami1251 Amiga1251" },
600 { "AmigaPL", "AmiPL Amiga-PL" },
601 { "ISO-8859-1", "ISO8859-1 8859-1" },
602 { "ISO-8859-2", "ISO8859-2 8859-2" },
603 { "ISO-8859-3", "ISO8859-3 8859-3" },
604 { "ISO-8859-4", "ISO8859-4 8859-4" },
605 { "ISO-8859-5", "ISO8859-5 8859-5" },
606 { "ISO-8859-6", "ISO8859-6 8859-6" },
607 { "ISO-8859-7", "ISO8859-7 8859-7" },
608 { "ISO-8859-8", "ISO8859-8 8859-8" },
609 { "ISO-8859-9", "ISO8859-9 8859-9" },
610 { "ISO-8859-10", "ISO8859-10 8859-10" },
611 { "ISO-8859-11", "ISO8859-11 8859-11" },
612 { "ISO-8859-12", "ISO8859-12 8859-12" },
613 { "ISO-8859-13", "ISO8859-13 8859-13" },
614 { "ISO-8859-14", "ISO8859-14 8859-14" },
615 { "ISO-8859-15", "ISO8859-15 8859-15" },
616 { "ISO-8859-16", "ISO8859-16 8859-16" },
// NOTE(review): the next entry repeats the "ISO-8859-10" entry listed after
// "ISO-8859-9" above; it looks like a redundant duplicate - confirm and remove.
617 { "ISO-8859-10", "ISO8859-10 8859-10" },
618 { "KOI8-R", "KOI8R" },
619 { "US-ASCII", "ASCII" },
620 { "UTF-8", "UTF8 UTF" },
621 { "UTF-16", "UTF16" },
622 { "UTF-32", "UTF32" },
623 { "windows-1250", "cp1250 windows1250" },
624 { "windows-1251", "cp1251 windows1251" },
625 { "windows-1252", "cp1252 windows1252" },
626 { "windows-1253", "cp1253 windows1253" },
627 { "windows-1254", "cp1254 windows1254" },
628 { "windows-1255", "cp1255 windows1255" },
629 { "windows-1256", "cp1256 windows1256" },
630 { "windows-1257", "cp1257 windows1257" },
// matchCodesetAlias(): resolve a codeset name to its official MIME name.
// Walks codesetAliases until an entry with a NULL MIMEname is reached. For
// every entry the MIME name itself is compared first (stricmp), then each
// alias in the space separated Aliases string is compared with
// strnicmp(search, s, len), len being strlen(search); strpbrk() is used to
// advance to the next alias. `result` starts out NULL and is assigned the
// MIMEname of the entry.
// NOTE(review): strnicmp() with len = strlen(search) only compares a prefix of
// the alias, so e.g. "UTF1" would compare equal to the alias "UTF16" unless
// the lines missing from this excerpt also check the character after the
// match - confirm.
634 static const char *matchCodesetAlias(const char *search
)
636 const char *result
= NULL
;
637 size_t len
= strlen(search
);
642 for(i
=0; codesetAliases
[i
].MIMEname
!= NULL
; i
++)
646 // search the MIMEname first
647 if(stricmp(search
, codesetAliases
[i
].MIMEname
) == 0)
651 const char *s
= codesetAliases
[i
].Aliases
;
653 // loop through space separated list of aliases
654 while(s
!= NULL
&& *s
!= '\0')
656 if(strnicmp(search
, s
, len
) == 0)
662 if((s
= strpbrk(s
, " ")) != NULL
)
669 result
= codesetAliases
[i
].MIMEname
;
681 /**************************************************************************/
// defaultCodeset(): determine the codeset to use by default.
// The global variable "codeset_default" is read into buf with GetVar() and
// looked up in CodesetsBase->codesets via codesetsFind(); when buf is empty or
// no such codeset exists, CodesetsBase->systemCodeset is used instead.
// With useSemaphore == TRUE the lookup is bracketed by ObtainSemaphoreShared()
// and ReleaseSemaphore() on CodesetsBase->libSem.
// NOTE(review): the GetVar() result is not checked in the visible code; the
// buf[0] test presumably relies on buf being cleared beforehand in a line that
// is not part of this excerpt - confirm.
684 static struct codeset
*defaultCodeset(BOOL useSemaphore
)
687 struct codeset
*codeset
;
691 if(useSemaphore
== TRUE
)
692 ObtainSemaphoreShared(&CodesetsBase
->libSem
);
695 GetVar("codeset_default" ,buf
, sizeof(buf
), GVF_GLOBAL_ONLY
);
697 if(buf
[0] == '\0' || (codeset
= codesetsFind(&CodesetsBase
->codesets
, buf
)) == NULL
)
698 codeset
= CodesetsBase
->systemCodeset
;
700 if(useSemaphore
== TRUE
)
701 ReleaseSemaphore(&CodesetsBase
->libSem
);
708 /// codesetsCmpUnicode()
709 // The compare function handed to qsort() for struct single_convert entries.
// It orders entries by the bytes stored from utf8[1] onwards, compared as C
// strings with strcmp(). Index 0 is skipped because this file stores the
// length of the encoded sequence in utf8[0].
710 static int codesetsCmpUnicode(const void *a1
, const void *a2
)
712 struct single_convert
*arg1
= (struct single_convert
*)a1
;
713 struct single_convert
*arg2
= (struct single_convert
*)a2
;
715 return strcmp((char*)&arg1
->utf8
[1], (char*)&arg2
->utf8
[1]);
719 /// codesetsReadTable()
// keywords that codesetsReadTable() looks for (via getConfigItem()) at the
// start of a line of a charset table file
721 #define ITEM_STANDARD "Standard"
722 #define ITEM_ALTSTANDARD "AltStandard"
723 #define ITEM_READONLY "ReadOnly"
724 #define ITEM_CHARACTERIZATION "Characterization"
726 // Reads a coding table and adds it
// codesetsReadTable(): open the charset file `name`, allocate a zeroed
// struct codeset whose table starts out as the identity mapping
// (code == ucs4 == i for i in 0..255) and parse the file line by line with
// readLine()/getConfigItem():
//   Standard         -> codeset->name
//   AltStandard      -> codeset->alt_name
//   ReadOnly         -> codeset->read_only (0 or 1, via atoi())
//   Characterization -> codeset->characterization; for the form _("text") only
//                       the text between the quotes is duplicated
// Remaining lines are treated as mappings: a hexadecimal code followed by the
// Unicode value, either as "U+<hex>" or as a number parsed with base 0.
// If the codeset has a name that is not yet present in csList, the UTF-8 form
// of every table entry is generated (length in utf8[0], bytes from utf8[1]),
// a copy of the table sorted with codesetsCmpUnicode() is stored in
// table_sorted and the codeset is appended to csList.
// NOTE(review): the frees near the end presumably form the path for a codeset
// that was not added; the surrounding else is not part of this excerpt.
727 static BOOL
codesetsReadTable(struct codesetList
*csList
, STRPTR name
)
734 D(DBF_STARTUP
, "trying to read charset file '%s'...", name
);
736 if((fh
= Open(name
, MODE_OLDFILE
)) != (BPTR
)NULL
)
738 struct codeset
*codeset
;
740 if((codeset
= (struct codeset
*)allocArbitrateVecPooled(sizeof(*codeset
))) != NULL
)
745 memset(codeset
, 0, sizeof(*codeset
));
747 for(i
= 0; i
<256; i
++)
749 codeset
->table
[i
].code
= i
;
750 codeset
->table
[i
].ucs4
= i
;
753 while(readLine(fh
, buf
, sizeof(buf
)) == TRUE
)
759 if((result
= getConfigItem(buf
, ITEM_STANDARD
)) != NULL
)
760 codeset
->name
= mystrdup(result
);
761 else if(codeset
->name
== NULL
) // a valid file starts with "Standard" and nothing else!!
763 else if((result
= getConfigItem(buf
, ITEM_ALTSTANDARD
)) != NULL
)
764 codeset
->alt_name
= mystrdup(result
);
765 else if((result
= getConfigItem(buf
, ITEM_READONLY
)) != NULL
)
766 codeset
->read_only
= (atoi(result
) == 0) ? 0 : 1;
767 else if((result
= getConfigItem(buf
, ITEM_CHARACTERIZATION
)) != NULL
)
769 if(result
[0] == '_' && result
[1] == '(' && result
[2] == '"')
771 char *end
= strchr(result
+ 3, '"');
774 codeset
->characterization
= mystrndup(result
+3, end
-(result
+3));
777 codeset
->characterization
= mystrdup(result
);
// NOTE(review): `(*p=='0') || (*(p+1)=='x')` is true for any line whose first
// character is '0' or whose second character is 'x'; a test for a "0x" prefix
// would need && instead of || - confirm the intent.
784 if(*p
== '=' || (fmt2
= ((*p
=='0') || (*(p
+1)=='x'))))
// NOTE(review): i comes straight from strtol(); a range check before it is
// used as an index into codeset->table[] is not visible in this excerpt.
789 i
= strtol(p
, &p
, 16);
795 if(strnicmp(p
, "U+", 2) == 0)
798 codeset
->table
[i
].ucs4
= strtol(p
, &p
, 16);
802 codeset
->table
[i
].ucs4
= strtol(p
, &p
, 0);
810 // check if there is not already codeset with the same name in here
811 if(codeset
->name
!= NULL
&& codesetsFind(csList
, codeset
->name
) == NULL
)
815 UTF32 src
= codeset
->table
[i
].ucs4
;
816 UTF32
*src_ptr
= &src
;
817 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
819 CodesetsConvertUTF32toUTF8((const UTF32
**)&src_ptr
, src_ptr
+1, &dest_ptr
, dest_ptr
+6, CSF_StrictConversion
);
// utf8[0] receives the number of bytes the conversion wrote from utf8[1] on
821 codeset
->table
[i
].utf8
[0] = (IPTR
)dest_ptr
-(IPTR
)(&codeset
->table
[i
].utf8
[1]);
824 memcpy(codeset
->table_sorted
, codeset
->table
, sizeof(codeset
->table
));
825 qsort(codeset
->table_sorted
, 256, sizeof(codeset
->table
[0]), codesetsCmpUnicode
);
826 D(DBF_STARTUP
, "adding external codeset '%s'", codeset
->name
);
827 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
834 if(codeset
->name
!= NULL
)
835 freeArbitrateVecPooled(codeset
->name
);
836 if(codeset
->alt_name
!= NULL
)
837 freeArbitrateVecPooled(codeset
->alt_name
);
838 if(codeset
->characterization
!= NULL
)
839 freeArbitrateVecPooled(codeset
->characterization
);
840 freeArbitrateVecPooled(codeset
);
851 /// codesetsScanDir()
// codesetsScanDir(): scan the directory dirPath (ignored when NULL or empty)
// and pass entries to codesetsReadTable(), which adds the codesets it can
// parse to csList. The full path of an entry is built with strlcpy() followed
// by AddPart().
// AmigaOS4 builds iterate with ObtainDirContextTags()/ExamineDir(). The second
// variant (presumably the #else branch, which is not part of this excerpt)
// locks the directory and iterates with ExAll() over a pooled buffer holding
// 10 struct ExAllData; per the existing comment only files (ed_Type < 0) are
// taken there.
852 static void codesetsScanDir(struct codesetList
*csList
, const char *dirPath
)
856 if(dirPath
!= NULL
&& dirPath
[0] != '\0')
858 #if defined(__amigaos4__)
861 if((dirContext
= ObtainDirContextTags(EX_StringNameInput
, dirPath
,
862 EX_DataFields
, EXF_NAME
|EXF_TYPE
,
865 struct ExamineData
*exd
;
867 D(DBF_STARTUP
, "scanning directory '%s' for codesets tables", dirPath
);
869 while((exd
= ExamineDir(dirContext
)) != NULL
)
875 strlcpy(filePath
, dirPath
, sizeof(filePath
));
876 AddPart(filePath
, exd
->Name
, sizeof(filePath
));
878 D(DBF_STARTUP
, "about to read codeset table '%s'", filePath
);
880 codesetsReadTable(csList
, filePath
);
884 ReleaseDirContext(dirContext
);
889 if((dirLock
= Lock(dirPath
, ACCESS_READ
)))
891 struct ExAllControl
*eac
;
893 D(DBF_STARTUP
, "scanning directory '%s' for codesets tables", dirPath
);
895 if((eac
= AllocDosObject(DOS_EXALLCONTROL
, NULL
)) != NULL
)
897 struct ExAllData
*ead
;
898 struct ExAllData
*eabuffer
;
901 eac
->eac_LastKey
= 0;
902 eac
->eac_MatchString
= NULL
;
903 eac
->eac_MatchFunc
= NULL
;
905 if((eabuffer
= allocVecPooled(CodesetsBase
->pool
, 10*sizeof(struct ExAllData
))) != NULL
)
911 more
= ExAll(dirLock
, eabuffer
, 10*sizeof(struct ExAllData
), ED_TYPE
, eac
);
// a FALSE result with an error other than ERROR_NO_MORE_ENTRIES is a failure
912 if(!more
&& IoErr() != ERROR_NO_MORE_ENTRIES
)
915 if(eac
->eac_Entries
== 0)
918 ead
= (struct ExAllData
*)eabuffer
;
921 // we only take that ead if it is a file (ed_Type < 0)
924 strlcpy(filePath
, dirPath
, sizeof(filePath
));
925 AddPart(filePath
, (char *)ead
->ed_Name
, sizeof(filePath
));
927 D(DBF_STARTUP
, "about to read codeset table '%s'", filePath
);
929 codesetsReadTable(csList
, filePath
);
937 freeVecPooled(CodesetsBase
->pool
, eabuffer
);
940 FreeDosObject(DOS_EXALLCONTROL
, eac
);
953 // Initializes and loads the codesets
954 BOOL
codesetsInit(struct codesetList
*csList
)
956 BOOL success
= FALSE
;
957 struct codeset
*codeset
;
960 #if defined(__amigaos4__)
966 NewList((struct List
*)csList
);
968 // to make the list of the supported codesets complete we also add fake
969 // 'UTF-8', 'UTF-16' and 'UTF-32' only so that our users can query for those codesets as well.
970 if((codeset
= allocArbitrateVecPooled(sizeof(*codeset
))) == NULL
)
973 memset(codeset
, 0, sizeof(*codeset
));
974 codeset
->name
= mystrdup("UTF-8");
975 codeset
->alt_name
= mystrdup("UTF8");
976 codeset
->characterization
= mystrdup("Unicode");
977 codeset
->read_only
= 0;
978 D(DBF_STARTUP
, "adding internal codeset 'UTF-8'");
979 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
980 CodesetsBase
->utf8Codeset
= codeset
;
982 if((codeset
= allocArbitrateVecPooled(sizeof(*codeset
))) == NULL
)
985 memset(codeset
, 0, sizeof(*codeset
));
986 codeset
->name
= mystrdup("UTF-16");
987 codeset
->alt_name
= mystrdup("UTF16");
988 codeset
->characterization
= mystrdup("16-bit Unicode");
989 codeset
->read_only
= 0;
990 D(DBF_STARTUP
, "adding internal codeset 'UTF-16'");
991 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
992 CodesetsBase
->utf16Codeset
= codeset
;
994 if((codeset
= allocArbitrateVecPooled(sizeof(*codeset
))) == NULL
)
997 memset(codeset
, 0, sizeof(*codeset
));
998 codeset
->name
= mystrdup("UTF-32");
999 codeset
->alt_name
= mystrdup("UTF32");
1000 codeset
->characterization
= mystrdup("32-bit Unicode");
1001 codeset
->read_only
= 0;
1002 D(DBF_STARTUP
, "adding internal codeset 'UTF-32'");
1003 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
1004 CodesetsBase
->utf32Codeset
= codeset
;
1006 // on AmigaOS4 we can use diskfont.library to inquire charset information as
1007 // it comes with a quite rich implementation of different charsets.
1008 #if defined(__amigaos4__)
1009 D(DBF_STARTUP
, "OS4, asking diskfont.library for codesets");
1015 ULONG curMIB
= nextMIB
;
1017 nextMIB
= ObtainCharsetInfo(DFCS_NUMBER
, curMIB
, DFCS_NEXTNUMBER
);
1021 mapTable
= (ULONG
*)ObtainCharsetInfo(DFCS_NUMBER
, curMIB
, DFCS_MAPTABLE
);
1022 mimename
= (char *)ObtainCharsetInfo(DFCS_NUMBER
, curMIB
, DFCS_MIMENAME
);
1023 ianaName
= (char *)ObtainCharsetInfo(DFCS_NUMBER
, curMIB
, DFCS_NAME
);
1024 if(mapTable
!= NULL
&& mimename
!= NULL
&& codesetsFind(csList
, mimename
) == NULL
)
1026 D(DBF_STARTUP
, "loading charset '%s' from diskfont.library...", mimename
);
1028 if((codeset
= allocArbitrateVecPooled(sizeof(*codeset
))) == NULL
)
1031 codeset
->name
= mystrdup(mimename
);
1032 codeset
->alt_name
= NULL
;
1033 codeset
->characterization
= mystrdup(ianaName
);
1034 codeset
->read_only
= 0;
1036 for(i
=0; i
<256; i
++)
1038 UTF32
*src_ptr
= &src
;
1039 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
1043 codeset
->table
[i
].code
= i
;
1044 codeset
->table
[i
].ucs4
= src
;
1045 CodesetsConvertUTF32toUTF8((const UTF32
**)&src_ptr
, src_ptr
+1, &dest_ptr
, dest_ptr
+6, CSF_StrictConversion
);
1047 codeset
->table
[i
].utf8
[0] = (IPTR
)dest_ptr
-(IPTR
)&codeset
->table
[i
].utf8
[1];
1050 memcpy(codeset
->table_sorted
, codeset
->table
, sizeof(codeset
->table
));
1051 qsort(codeset
->table_sorted
, 256, sizeof(codeset
->table
[0]), codesetsCmpUnicode
);
1053 D(DBF_STARTUP
, "adding diskfont.library codeset '%s'", codeset
->name
);
1054 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
1060 #if defined(__MORPHOS__)
1062 struct Library
*KeymapBase
;
1063 struct Library
*LocaleBase
;
1064 // assume success at first
1065 BOOL success
= TRUE
;
1067 D(DBF_STARTUP
, "MorphOS, asking keymap.library for codesets");
1068 if((KeymapBase
= OpenLibrary("keymap.library", 51)) != NULL
)
1070 if((LocaleBase
= OpenLibrary("locale.library", 51)) != NULL
)
1072 struct KeyMap
*keymap
= AskKeyMapDefault();
1073 // it doesn't matter if this call fails, as we don't depend on the system codesets
1074 CONST_STRPTR name
= GetKeyMapCodepage(keymap
);
1076 // legacy keymaps dont have codepage or Unicode mappings
1077 if(name
!= NULL
&& keymap
!= NULL
)
1079 D(DBF_STARTUP
, "loading charset '%s' from keymap.library...", name
);
1081 if((codeset
= allocArbitrateVecPooled(sizeof(*codeset
))) != NULL
)
1083 codeset
->name
= mystrdup(name
);
1084 codeset
->alt_name
= NULL
;
1085 codeset
->characterization
= mystrdup(name
); // No further information available
1086 codeset
->read_only
= 0;
1088 for(i
=0; i
<256; i
++)
1090 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
1093 codeset
->table
[i
].code
= i
;
1094 codeset
->table
[i
].ucs4
= src
= ToUCS4(i
, keymap
);
1096 // here we use UTF8_Encode() instead of ConvertUCS4ToUTF8() because
1097 // of an internal bug in MorphOS 2.2.
1098 rc
= UTF8_Encode(src
, dest_ptr
);
1099 rc
= rc
> 0 ? rc
: 1;
1101 dest_ptr
[rc
] = '\0';
1102 codeset
->table
[i
].utf8
[0] = rc
;
1105 memcpy(codeset
->table_sorted
, codeset
->table
, sizeof(codeset
->table
));
1106 qsort(codeset
->table_sorted
, 256, sizeof(codeset
->table
[0]), codesetsCmpUnicode
);
1108 D(DBF_STARTUP
, "adding keymap.library codeset '%s'", codeset
->name
);
1109 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
1113 // only failed memory allocations are treated as error
1118 CloseLibrary(LocaleBase
);
1121 CloseLibrary(KeymapBase
);
1124 if(success
== FALSE
)
1129 D(DBF_STARTUP
, "loading charsets from LIBS:Charsets...");
1131 // we try to walk to the LIBS:Charsets directory on our own and readin our
1132 // own charset tables
1133 codesetsScanDir(csList
, "LIBS:Charsets");
1136 // now we go and initialize our internally supported codesets but only if
1137 // we have not already loaded a charset with the same name
1139 D(DBF_STARTUP
, "initializing internal charsets...");
1141 // ISO-8859-1 + EURO
1142 if(codesetsFind(csList
, "ISO-8859-1 + Euro") == NULL
)
1144 if((codeset
= allocArbitrateVecPooled(sizeof(*codeset
))) == NULL
)
1147 codeset
->name
= mystrdup("ISO-8859-1 + Euro");
1148 codeset
->alt_name
= NULL
;
1149 codeset
->characterization
= mystrdup("West European (with EURO)");
1150 codeset
->read_only
= 1;
1152 for(i
= 0; i
<256; i
++)
1154 UTF32
*src_ptr
= &src
;
1155 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
1158 src
= 0x20AC; // the EURO sign
1162 codeset
->table
[i
].code
= i
;
1163 codeset
->table
[i
].ucs4
= src
;
1164 CodesetsConvertUTF32toUTF8((const UTF32
**)&src_ptr
, src_ptr
+1, &dest_ptr
, dest_ptr
+6, CSF_StrictConversion
);
1166 codeset
->table
[i
].utf8
[0] = (IPTR
)dest_ptr
-(IPTR
)&codeset
->table
[i
].utf8
[1];
1168 memcpy(codeset
->table_sorted
, codeset
->table
, sizeof(codeset
->table
));
1169 qsort(codeset
->table_sorted
, 256, sizeof(codeset
->table
[0]), codesetsCmpUnicode
);
1171 D(DBF_STARTUP
, "adding internal codeset '%s'", codeset
->name
);
1172 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
1176 if(codesetsFind(csList
, "ISO-8859-1") == NULL
)
1178 if((codeset
= allocArbitrateVecPooled(sizeof(*codeset
))) == NULL
)
1181 codeset
->name
= mystrdup("ISO-8859-1");
1182 codeset
->alt_name
= mystrdup("ISO8859-1");
1183 codeset
->characterization
= mystrdup("West European");
1184 codeset
->read_only
= 0;
1186 for(i
= 0; i
<256; i
++)
1188 UTF32
*src_ptr
= &src
;
1189 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
1193 codeset
->table
[i
].code
= i
;
1194 codeset
->table
[i
].ucs4
= src
;
1195 CodesetsConvertUTF32toUTF8((const UTF32
**)&src_ptr
, src_ptr
+1, &dest_ptr
, dest_ptr
+6, CSF_StrictConversion
);
1197 codeset
->table
[i
].utf8
[0] = (IPTR
)dest_ptr
-(IPTR
)&codeset
->table
[i
].utf8
[1];
1199 memcpy(codeset
->table_sorted
, codeset
->table
, sizeof(codeset
->table
));
1200 qsort(codeset
->table_sorted
, 256, sizeof(codeset
->table
[0]), codesetsCmpUnicode
);
1202 D(DBF_STARTUP
, "adding internal codeset '%s'", codeset
->name
);
1203 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
1207 if(codesetsFind(csList
, "ISO-8859-2") == NULL
)
1209 if((codeset
= allocArbitrateVecPooled(sizeof(*codeset
))) == NULL
)
1212 codeset
->name
= mystrdup("ISO-8859-2");
1213 codeset
->alt_name
= mystrdup("ISO8859-2");
1214 codeset
->characterization
= mystrdup("Central/East European");
1215 codeset
->read_only
= 0;
1217 for(i
= 0; i
<256; i
++)
1219 UTF32
*src_ptr
= &src
;
1220 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
1225 src
= iso_8859_2_to_ucs4
[i
-0xa0];
1227 codeset
->table
[i
].code
= i
;
1228 codeset
->table
[i
].ucs4
= src
;
1229 CodesetsConvertUTF32toUTF8((const UTF32
**)&src_ptr
, src_ptr
+1, &dest_ptr
,dest_ptr
+6, CSF_StrictConversion
);
1231 codeset
->table
[i
].utf8
[0] = (IPTR
)dest_ptr
-(IPTR
)&codeset
->table
[i
].utf8
[1];
1233 memcpy(codeset
->table_sorted
, codeset
->table
, sizeof(codeset
->table
));
1234 qsort(codeset
->table_sorted
, 256, sizeof(codeset
->table
[0]), codesetsCmpUnicode
);
1236 D(DBF_STARTUP
, "adding internal codeset '%s'", codeset
->name
);
1237 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
1241 if(codesetsFind(csList
, "ISO-8859-3") == NULL
)
1243 if((codeset
= allocArbitrateVecPooled(sizeof(*codeset
))) == NULL
)
1246 codeset
->name
= mystrdup("ISO-8859-3");
1247 codeset
->alt_name
= mystrdup("ISO8859-3");
1248 codeset
->characterization
= mystrdup("South European");
1249 codeset
->read_only
= 0;
1251 for(i
= 0; i
<256; i
++)
1253 UTF32
*src_ptr
= &src
;
1254 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
1259 src
= iso_8859_3_to_ucs4
[i
-0xa0];
1261 codeset
->table
[i
].code
= i
;
1262 codeset
->table
[i
].ucs4
= src
;
1263 CodesetsConvertUTF32toUTF8((const UTF32
**)&src_ptr
,src_ptr
+1,&dest_ptr
,dest_ptr
+6,CSF_StrictConversion
);
1265 codeset
->table
[i
].utf8
[0] = (IPTR
)dest_ptr
-(IPTR
)&codeset
->table
[i
].utf8
[1];
1267 memcpy(codeset
->table_sorted
, codeset
->table
, sizeof(codeset
->table
));
1268 qsort(codeset
->table_sorted
, 256, sizeof(codeset
->table
[0]), codesetsCmpUnicode
);
1270 D(DBF_STARTUP
, "adding internal codeset '%s'", codeset
->name
);
1271 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
1275 if(codesetsFind(csList
, "ISO-8859-4") == NULL
)
1277 if((codeset
= allocArbitrateVecPooled(sizeof(*codeset
))) == NULL
)
1280 codeset
->name
= mystrdup("ISO-8859-4");
1281 codeset
->alt_name
= mystrdup("ISO8859-4");
1282 codeset
->characterization
= mystrdup("North European");
1283 codeset
->read_only
= 0;
1285 for(i
= 0; i
<256; i
++)
1287 UTF32
*src_ptr
= &src
;
1288 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
1293 src
= iso_8859_4_to_ucs4
[i
-0xa0];
1295 codeset
->table
[i
].code
= i
;
1296 codeset
->table
[i
].ucs4
= src
;
1297 CodesetsConvertUTF32toUTF8((const UTF32
**)&src_ptr
,src_ptr
+1,&dest_ptr
,dest_ptr
+6,CSF_StrictConversion
);
1299 codeset
->table
[i
].utf8
[0] = (IPTR
)dest_ptr
-(IPTR
)&codeset
->table
[i
].utf8
[1];
1301 memcpy(codeset
->table_sorted
, codeset
->table
, sizeof(codeset
->table
));
1302 qsort(codeset
->table_sorted
, 256, sizeof(codeset
->table
[0]), codesetsCmpUnicode
);
1304 D(DBF_STARTUP
, "adding internal codeset '%s'", codeset
->name
);
1305 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
1309 if(codesetsFind(csList
, "ISO-8859-5") == NULL
)
1311 if((codeset
= allocArbitrateVecPooled(sizeof(*codeset
))) == NULL
)
1314 codeset
->name
= mystrdup("ISO-8859-5");
1315 codeset
->alt_name
= mystrdup("ISO8859-5");
1316 codeset
->characterization
= mystrdup("Slavic languages");
1317 codeset
->read_only
= 0;
1319 for(i
= 0; i
<256; i
++)
1321 UTF32
*src_ptr
= &src
;
1322 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
1327 src
= iso_8859_5_to_ucs4
[i
-0xa0];
1329 codeset
->table
[i
].code
= i
;
1330 codeset
->table
[i
].ucs4
= src
;
1331 CodesetsConvertUTF32toUTF8((const UTF32
**)&src_ptr
,src_ptr
+1,&dest_ptr
,dest_ptr
+6,CSF_StrictConversion
);
1333 codeset
->table
[i
].utf8
[0] = (IPTR
)dest_ptr
-(IPTR
)&codeset
->table
[i
].utf8
[1];
1335 memcpy(codeset
->table_sorted
, codeset
->table
, sizeof(codeset
->table
));
1336 qsort(codeset
->table_sorted
, 256, sizeof(codeset
->table
[0]), codesetsCmpUnicode
);
1338 D(DBF_STARTUP
, "adding internal codeset '%s'", codeset
->name
);
1339 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
1343 if(codesetsFind(csList
, "ISO-8859-9") == NULL
)
1345 if((codeset
= allocArbitrateVecPooled(sizeof(*codeset
))) == NULL
)
1348 codeset
->name
= mystrdup("ISO-8859-9");
1349 codeset
->alt_name
= mystrdup("ISO8859-9");
1350 codeset
->characterization
= mystrdup("Turkish");
1351 codeset
->read_only
= 0;
1353 for(i
= 0; i
<256; i
++)
1355 UTF32
*src_ptr
= &src
;
1356 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
1361 src
= iso_8859_9_to_ucs4
[i
-0xa0];
1363 codeset
->table
[i
].code
= i
;
1364 codeset
->table
[i
].ucs4
= src
;
1365 CodesetsConvertUTF32toUTF8((const UTF32
**)&src_ptr
,src_ptr
+1,&dest_ptr
,dest_ptr
+6,CSF_StrictConversion
);
1367 codeset
->table
[i
].utf8
[0] = (IPTR
)dest_ptr
-(IPTR
)&codeset
->table
[i
].utf8
[1];
1369 memcpy(codeset
->table_sorted
, codeset
->table
, sizeof(codeset
->table
));
1370 qsort(codeset
->table_sorted
, 256, sizeof(codeset
->table
[0]), codesetsCmpUnicode
);
1372 D(DBF_STARTUP
, "adding internal codeset '%s'", codeset
->name
);
1373 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
1377 if(codesetsFind(csList
, "ISO-8859-15") == NULL
)
1379 if((codeset
= allocArbitrateVecPooled(sizeof(*codeset
))) == NULL
)
1382 codeset
->name
= mystrdup("ISO-8859-15");
1383 codeset
->alt_name
= mystrdup("ISO8859-15");
1384 codeset
->characterization
= mystrdup("West European II");
1385 codeset
->read_only
= 0;
1387 for(i
= 0; i
<256; i
++)
1389 UTF32
*src_ptr
= &src
;
1390 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
1395 src
= iso_8859_15_to_ucs4
[i
-0xa0];
1397 codeset
->table
[i
].code
= i
;
1398 codeset
->table
[i
].ucs4
= src
;
1399 CodesetsConvertUTF32toUTF8((const UTF32
**)&src_ptr
,src_ptr
+1,&dest_ptr
,dest_ptr
+6,CSF_StrictConversion
);
1401 codeset
->table
[i
].utf8
[0] = (IPTR
)dest_ptr
-(IPTR
)&codeset
->table
[i
].utf8
[1];
1403 memcpy(codeset
->table_sorted
,codeset
->table
,sizeof (codeset
->table
));
1404 qsort(codeset
->table_sorted
, 256, sizeof(codeset
->table
[0]), codesetsCmpUnicode
);
1406 D(DBF_STARTUP
, "adding internal codeset '%s'", codeset
->name
);
1407 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
1411 if(codesetsFind(csList
, "ISO-8859-16") == NULL
)
1413 if((codeset
= allocArbitrateVecPooled(sizeof(*codeset
))) == NULL
)
1416 codeset
->name
= mystrdup("ISO-8859-16");
// alternate name that codesetsFind() also accepts for this codeset; spelled
// "ISO8859-16" to follow the ISO8859-N pattern used by the other ISO codesets
1417 codeset
->alt_name
= mystrdup("ISO8859-16");
1418 codeset
->characterization
= mystrdup("South-Eastern European");
1419 codeset
->read_only
= 0;
1423 UTF32
*src_ptr
= &src
;
1424 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
1429 src
= iso_8859_16_to_ucs4
[i
-0xa0];
1431 codeset
->table
[i
].code
= i
;
1432 codeset
->table
[i
].ucs4
= src
;
1433 CodesetsConvertUTF32toUTF8((const UTF32
**)&src_ptr
, src_ptr
+1, &dest_ptr
, dest_ptr
+6, CSF_StrictConversion
);
1435 codeset
->table
[i
].utf8
[0] = (IPTR
)dest_ptr
- (IPTR
)&codeset
->table
[i
].utf8
[1];
1437 memcpy(codeset
->table_sorted
, codeset
->table
, sizeof(codeset
->table
));
1438 qsort(codeset
->table_sorted
, 256, sizeof(codeset
->table
[0]), codesetsCmpUnicode
);
1440 D(DBF_STARTUP
, "adding internal codeset '%s'", codeset
->name
);
1441 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
1445 if(codesetsFind(csList
, "KOI8-R") == NULL
)
1447 if((codeset
= allocArbitrateVecPooled(sizeof(*codeset
))) == NULL
)
1450 codeset
->name
= mystrdup("KOI8-R");
1451 codeset
->alt_name
= mystrdup("KOI8R");
1452 codeset
->characterization
= mystrdup("Russian");
1453 codeset
->read_only
= 0;
1455 for(i
= 0; i
<256; i
++)
1457 UTF32
*src_ptr
= &src
;
1458 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
1463 src
= koi8r_to_ucs4
[i
-0x80];
1465 codeset
->table
[i
].code
= i
;
1466 codeset
->table
[i
].ucs4
= src
;
1467 CodesetsConvertUTF32toUTF8((const UTF32
**)&src_ptr
,src_ptr
+1,&dest_ptr
,dest_ptr
+6,CSF_StrictConversion
);
1469 codeset
->table
[i
].utf8
[0] = (IPTR
)dest_ptr
-(IPTR
)&codeset
->table
[i
].utf8
[1];
1471 memcpy(codeset
->table_sorted
, codeset
->table
, sizeof(codeset
->table
));
1472 qsort(codeset
->table_sorted
, 256, sizeof(codeset
->table
[0]), codesetsCmpUnicode
);
1474 D(DBF_STARTUP
, "adding internal codeset '%s'", codeset
->name
);
1475 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
1479 if(codesetsFind(csList
, "AmigaPL") == NULL
)
1481 if((codeset
= allocArbitrateVecPooled(sizeof(*codeset
))) == NULL
)
1484 codeset
->name
= mystrdup("AmigaPL");
1485 codeset
->alt_name
= mystrdup("AmiPL");
1486 codeset
->characterization
= mystrdup("Polish (Amiga)");
1487 codeset
->read_only
= 1;
1489 for(i
=0; i
<256; i
++)
1491 UTF32
*src_ptr
= &src
;
1492 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
1497 src
= amigapl_to_ucs4
[i
-0xa0];
1499 codeset
->table
[i
].code
= i
;
1500 codeset
->table
[i
].ucs4
= src
;
1501 CodesetsConvertUTF32toUTF8((const UTF32
**)&src_ptr
,src_ptr
+1,&dest_ptr
,dest_ptr
+6,CSF_StrictConversion
);
1503 codeset
->table
[i
].utf8
[0] = (IPTR
)dest_ptr
-(IPTR
)&codeset
->table
[i
].utf8
[1];
1505 memcpy(codeset
->table_sorted
, codeset
->table
, sizeof(codeset
->table
));
1506 qsort(codeset
->table_sorted
, 256, sizeof(codeset
->table
[0]), codesetsCmpUnicode
);
1508 D(DBF_STARTUP
, "adding internal codeset '%s'", codeset
->name
);
1509 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
1513 if(codesetsFind(csList
, "Amiga-1251") == NULL
)
1515 if((codeset
= allocArbitrateVecPooled(sizeof(*codeset
))) == NULL
)
1518 codeset
->name
= mystrdup("Amiga-1251");
1519 codeset
->alt_name
= mystrdup("Ami1251");
1520 codeset
->characterization
= mystrdup("Cyrillic (Amiga)");
1521 codeset
->read_only
= 1;
1523 for(i
=0; i
<256; i
++)
1525 UTF32
*src_ptr
= &src
;
1526 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
1531 src
= amiga1251_to_ucs4
[i
-0xa0];
1533 codeset
->table
[i
].code
= i
;
1534 codeset
->table
[i
].ucs4
= src
;
1535 CodesetsConvertUTF32toUTF8((const UTF32
**)&src_ptr
, src_ptr
+1, &dest_ptr
, dest_ptr
+6, CSF_StrictConversion
);
1537 codeset
->table
[i
].utf8
[0] = (char*)dest_ptr
- (char*)&codeset
->table
[i
].utf8
[1];
1539 memcpy(codeset
->table_sorted
, codeset
->table
, sizeof(codeset
->table
));
1540 qsort(codeset
->table_sorted
, 256, sizeof(codeset
->table
[0]), codesetsCmpUnicode
);
1542 D(DBF_STARTUP
, "adding internal codeset '%s'", codeset
->name
);
1543 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
1554 /// codesetsCleanup()
1555 // Cleanup the memory for the codeset
// Drains csList: every entry is detached with RemHead() and its name,
// alt_name and characterization strings are released (each only when
// non-NULL) before the entry itself is handed to freeArbitrateVecPooled().
1556 void codesetsCleanup(struct codesetList
*csList
)
1558 struct codeset
*code
;
// RemHead() yields NULL once the list is empty, which ends the loop
1562 while((code
= (struct codeset
*)RemHead((struct List
*)csList
)) != NULL
)
// each string member is tested separately; alt_name in particular is set to
// NULL for several of the codesets created in this file
1564 if(code
->name
!= NULL
)
1565 freeArbitrateVecPooled(code
->name
);
1566 if(code
->alt_name
!= NULL
)
1567 freeArbitrateVecPooled(code
->alt_name
);
1568 if(code
->characterization
!= NULL
)
1569 freeArbitrateVecPooled(code
->characterization
);
// finally release the codeset structure itself
1571 freeArbitrateVecPooled(code
);
1579 // Returns the given codeset.
// Searches csList for a codeset called `name`. The result variable starts
// out as NULL. An entry matches when stricmp() reports equality with its
// name or, if it has one, with its alt_name.
1580 struct codeset
*codesetsFind(struct codesetList
*csList
, const char *name
)
1582 struct codeset
*res
= NULL
;
// a NULL or empty name is never looked up
1586 if(name
!= NULL
&& name
[0] != '\0')
1589 const char *matchedName
;
// give matchCodesetAlias() a chance to translate the requested name;
// presumably a non-NULL result replaces `name` for the comparison below -
// TODO confirm
1591 if((matchedName
= matchCodesetAlias(name
)) != NULL
)
// walk all nodes of the list; each node is treated as a struct codeset
1594 for(node
= GetHead((struct List
*)csList
); node
!= NULL
; node
= GetSucc(node
))
1596 struct codeset
*mstate
= (struct codeset
*)node
;
// alt_name is optional, so it is checked for NULL before being compared
1598 if(stricmp(name
, mstate
->name
) == 0 ||
1599 (mstate
->alt_name
!= NULL
&& stricmp(name
, mstate
->alt_name
) == 0))
1613 /// checkTextAgainstSingleCodeset
1614 // check how good a text can be represented by a specific codeset
// Scores `text` (at most textLen bytes) against the conversion table of
// `codeset`. The error counter is initialised to textLen; presumably it is
// also the return value, as the caller stores the result as its error
// count - TODO confirm.
1615 static int checkTextAgainstSingleCodeset(CONST_STRPTR text
, ULONG textLen
, struct codeset
*codeset
)
1617 int errors
= textLen
;
// only codesets with read_only == 0 that are none of the UTF-8/16/32
// codesets are examined; all others end up in the W() warning further down
1621 if(codeset
->read_only
== 0 &&
1622 codeset
!= CodesetsBase
->utf8Codeset
&&
1623 codeset
!= CodesetsBase
->utf16Codeset
&&
1624 codeset
!= CodesetsBase
->utf32Codeset
)
1626 CONST_STRPTR text_ptr
= text
;
1631 // the following identification/detection routine is NOT really smart.
1632 // we just see how each UTF8 string is the representation of each char
1633 // in our source text and then check if they are valid or not. As said,
1634 // not very smart, but we don't have anything better right now :(
1635 for(i
=0; i
< textLen
; i
++)
// read the next byte unsigned so it can index the 256 entry table
1637 unsigned char c
= *text_ptr
++;
1641 struct single_convert
*f
= &codeset
->table
[c
];
// utf8[0] is filled with the byte length of the UTF-8 sequence stored from
// utf8[1] onwards when the tables are built in this file; a zero length or
// a leading NUL byte is presumably what gets counted as an error - TODO
// confirm
1643 if(f
->utf8
[0] == 0x00 || f
->utf8
[1] == 0x00)
1651 W(DBF_STARTUP
, "codeset '%s' is either read-only (%ld) or UTF8/16/32 (%ld)", codeset
->name
, codeset
->read_only
, codeset
== CodesetsBase
->utf8Codeset
|| codeset
== CodesetsBase
->utf16Codeset
|| codeset
== CodesetsBase
->utf32Codeset
);
1653 D(DBF_STARTUP
, "tried to identify text as '%s' text with %ld of %ld errors", codeset
->name
, errors
, textLen
);
1660 /// checkTextAgainstCodesetList
// Runs checkTextAgainstSingleCodeset() for every entry of csList and stores
// the entry with the lowest error count in *bestCodeset. *bestCodeset is
// NULL when no entry scores strictly below textLen. Presumably returns the
// lowest error count, as callers store the result as bestErrorsInList -
// TODO confirm.
1661 static int checkTextAgainstCodesetList(CONST_STRPTR text
, ULONG textLen
, struct codesetList
*csList
, struct codeset
**bestCodeset
)
// start from the worst possible score so that any better codeset wins
1664 int bestErrors
= textLen
;
1668 *bestCodeset
= NULL
;
1670 for(node
= GetHead((struct List
*)csList
); node
!= NULL
; node
= GetSucc(node
))
1672 struct codeset
*codeset
= (struct codeset
*)node
;
1675 errors
= checkTextAgainstSingleCodeset(text
, textLen
, codeset
);
// strict comparison: among equally good codesets the first one in the list
// is kept
1676 if(errors
< bestErrors
)
1678 *bestCodeset
= codeset
;
1679 bestErrors
= errors
;
1691 /// codesetsFindBest()
1692 // Returns the best codeset for the given text
// attrs    - tag list; CSA_CodesetList tags name additional codeset lists
// csFamily - CSV_CodesetFamily_Cyrillic enables the cyrillic analysis
// text     - text to examine, textLen its length
// errorPtr - when non-NULL receives the error count of the chosen codeset
// The library semaphore is obtained in shared mode at the start and released
// again before the result is passed to RETURN().
1693 static struct codeset
*codesetsFindBest(struct TagItem
*attrs
, ULONG csFamily
, CONST_STRPTR text
, ULONG textLen
, int *errorPtr
)
1695 struct codeset
*bestCodeset
= NULL
;
// worst case: every character counts as an error
1696 int bestErrors
= textLen
;
1701 ObtainSemaphoreShared(&CodesetsBase
->libSem
);
1703 // in case the user specified the codeset family as a
1704 // cyrillic one we go and do our cyrillic specific analysis first
1705 if(csFamily
== CSV_CodesetFamily_Cyrillic
)
1707 #define NUM_CYRILLIC 3
1709 struct CodesetSearch
1715 struct CodesetSearch search
[NUM_CYRILLIC
];
// one score per candidate, cleared by the memset() below
1718 int ctr
[NUM_CYRILLIC
];
1725 D(DBF_STARTUP
, "performing cyrillic analysis");
// the three candidates, each paired with the data table that is used for
// scoring it
1727 search
[0].name
= "windows-1251";
1728 search
[0].data
= cp1251_data
;
1729 search
[1].name
= "IBM866";
1730 search
[1].data
= cp866_data
;
1731 search
[2].name
= "KOI8-R";
1732 search
[2].data
= koi8r_data
;
1734 memset(&ctr
, 0, sizeof(ctr
));
1736 tp
= (unsigned char *)text
;
1741 int mid
= max
= -466725766; // TODO: what's the magic behind this constant?
// score the text once per candidate table
1744 for(n
=0; n
< NUM_CYRILLIC
; n
++)
1746 unsigned char la
= 0;
1747 unsigned char *tptr
= (unsigned char *)search
[n
].data
;
// the current byte with bit 7 flipped
1753 unsigned char lb
= (*p
++) ^ 128;
// only when neither la nor lb has bit 7 set is the table consulted; it is
// indexed by the pair (la, lb) and its entry is added as a signed weight
1755 if(!((la
| lb
) & 128))
1756 ctr
[n
] += (signed char)tptr
[(la
<< 7) + lb
];
// fixed thresholds on max and on the gap between max and mid; presumably max
// is the best and mid the second best score - TODO confirm
1771 if((max
>= 500) && ((max
-mid
) >= 1000))
1777 while((*p
) && (!gr
));
1779 if(gr
|| ((!(*p
)) && lr
))
1782 // if our analysis found something, we go and try
1783 // to find the corresponding codeset in our codeset list
1786 struct TagItem
*tstate
= attrs
;
1787 struct TagItem
*tag
;
// NOTE(review): the format string below opens a quote before %s but never
// closes it
1789 D(DBF_STARTUP
, "identified text as '%s", search
[Nmax
-1].name
);
1791 // now we walk through our taglist and check if the user
1793 while((tag
= NextTagItem((APTR
)&tstate
)) != NULL
)
// lists passed with a zero ti_Data are ignored here
1795 if(tag
->ti_Tag
== CSA_CodesetList
&& tag
->ti_Data
!= 0)
1797 struct codesetList
*csList
= (struct codesetList
*)tag
->ti_Data
;
1799 if((bestCodeset
= codesetsFind(csList
, search
[Nmax
-1].name
)) != NULL
)
1804 // if we still haven't found the matching codeset
1805 // we search the internal list
1806 if(bestCodeset
== NULL
)
1807 bestCodeset
= codesetsFind(&CodesetsBase
->codesets
, search
[Nmax
-1].name
);
1815 // if we haven't found the best codeset (through the cyrillic analysis)
1816 // we go and do the dumb latin search in our codesetlist
1819 struct TagItem
*tstate
= attrs
;
1820 struct TagItem
*tag
;
1822 // check text against all codesets in all supplied lists of codesets
1823 while((tag
= NextTagItem((APTR
)&tstate
)) != NULL
)
// NOTE(review): unlike the name lookup above, ti_Data is used here without a
// visible check against 0 - confirm that callers never pass an empty
// CSA_CodesetList tag
1827 case CSA_CodesetList
:
1829 struct codesetList
*csList
= (struct codesetList
*)tag
->ti_Data
;
1830 struct codeset
*bestCodesetInList
;
1831 int bestErrorsInList
;
1833 D(DBF_STARTUP
, "checking against external codeset list");
1834 bestErrorsInList
= checkTextAgainstCodesetList(text
, textLen
, csList
, &bestCodesetInList
);
// keep the list's winner only if it beats everything seen so far
1835 if(bestErrorsInList
< bestErrors
&& bestCodesetInList
!= NULL
)
1837 bestCodeset
= bestCodesetInList
;
1838 bestErrors
= bestErrorsInList
;
1848 // we didn't find a "best" codeset in the supplied codesets lists so far,
1849 // so now we check against our internal list
1852 struct codeset
*bestCodesetInList
;
1853 int bestErrorsInList
;
1855 D(DBF_STARTUP
, "checking against internal codeset list");
1856 bestErrorsInList
= checkTextAgainstCodesetList(text
, textLen
, &CodesetsBase
->codesets
, &bestCodesetInList
);
1857 if(bestErrorsInList
< bestErrors
&& bestCodesetInList
!= NULL
)
1859 bestCodeset
= bestCodesetInList
;
1860 bestErrors
= bestErrorsInList
;
1865 ReleaseSemaphore(&CodesetsBase
->libSem
);
// report the error count of the chosen codeset if the caller asked for it
1867 if(errorPtr
!= NULL
)
1868 *errorPtr
= bestErrors
;
1870 RETURN(bestCodeset
);
1876 /**************************************************************************/
1878 /// CodesetsSupportedA()
// Builds an array with the names of the codesets of the internal list plus
// those of every list supplied through a CSA_CodesetList tag. The entries
// point at the codesets' own name strings, nothing is copied. The array
// comes from allocArbitrateVecPooled() and stays NULL if that fails.
// CSA_AllowMultibyteCodesets (default TRUE) controls whether the
// UTF-8/16/32 codesets are listed.
1879 LIBPROTO(CodesetsSupportedA
, STRPTR
*, REG(a6
, UNUSED __BASE_OR_IFACE
), REG(a0
, struct TagItem
*attrs
))
1881 STRPTR
*array
= NULL
;
1882 struct TagItem
*tstate
= attrs
;
1883 struct TagItem
*tag
;
1884 BOOL allowMultibyte
;
1889 allowMultibyte
= GetTagData(CSA_AllowMultibyteCodesets
, TRUE
, attrs
);
1891 ObtainSemaphoreShared(&CodesetsBase
->libSem
);
1893 // first we need to check how many codesets our supplied
1895 numCodesets
= countCodesets(&CodesetsBase
->codesets
, allowMultibyte
);
1896 while((tag
= NextTagItem((APTR
)&tstate
)) != NULL
)
1900 case CSA_CodesetList
:
1902 numCodesets
+= countCodesets((struct codesetList
*)tag
->ti_Data
, allowMultibyte
);
1908 // now that we know how many codesets we have in our lists we
1909 // can put their names into our string arrays
// one slot more than counted is allocated, presumably for a terminating
// NULL entry - TODO confirm
1912 if((array
= allocArbitrateVecPooled((numCodesets
+1)*sizeof(STRPTR
))) != NULL
)
1917 // first we walk through the internal codesets list and
1919 for(node
= GetHead((struct List
*)&CodesetsBase
->codesets
); node
!= NULL
; node
= GetSucc(node
))
1921 struct codeset
*code
= (struct codeset
*)node
;
// NOTE(review): "== TRUE" only matches the exact value TRUE; any other
// non-zero tag value would be treated like FALSE - confirm this is intended
1923 if(allowMultibyte
== TRUE
||
1924 (code
!= CodesetsBase
->utf8Codeset
&& code
!= CodesetsBase
->utf16Codeset
&& code
!= CodesetsBase
->utf32Codeset
))
1926 array
[i
] = code
->name
;
1934 // then we also iterate through our private codesets list
// NOTE(review): tstate has already been advanced by the counting loop above,
// so it has to be set back to attrs before this second walk - confirm that
// this happens
1935 while((tag
= NextTagItem((APTR
)&tstate
)) != NULL
)
1939 case CSA_CodesetList
:
1941 for(node
= GetHead((struct List
*)tag
->ti_Data
); node
!= NULL
; node
= GetSucc(node
))
1943 struct codeset
*code
= (struct codeset
*)node
;
1945 if(allowMultibyte
== TRUE
||
1946 (code
!= CodesetsBase
->utf8Codeset
&& code
!= CodesetsBase
->utf16Codeset
&& code
!= CodesetsBase
->utf32Codeset
))
1948 array
[i
] = code
->name
;
1961 ReleaseSemaphore(&CodesetsBase
->libSem
);
// Variadic front end of CodesetsSupportedA(), compiled for AmigaOS4 only:
// the first variadic argument is read as a struct TagItem pointer and handed
// to CodesetsSupportedA().
1967 #if defined(__amigaos4__)
1968 LIBPROTOVA(CodesetsSupported
, STRPTR
*, REG(a6
, UNUSED __BASE_OR_IFACE
), ...)
1973 VA_START(args
, ICodesets
);
1974 res
= CodesetsSupportedA(VA_ARG(args
, struct TagItem
*));
// Releases `obj` with freeArbitrateVecPooled(). The tag list parameter is
// declared UNUSED.
1983 LIBPROTO(CodesetsFreeA
, void, REG(a6
, UNUSED __BASE_OR_IFACE
), REG(a0
, APTR obj
), REG(a1
, UNUSED
struct TagItem
*attrs
))
1988 freeArbitrateVecPooled(obj
);
// Variadic front end of CodesetsFreeA(), compiled for AmigaOS4 only: the
// first variadic argument after `obj` is read as a struct TagItem pointer and
// passed on together with `obj`.
1993 #if defined(__amigaos4__)
1994 LIBPROTOVA(CodesetsFree
, void, REG(a6
, UNUSED __BASE_OR_IFACE
), REG(a0
, APTR obj
), ...)
1998 VA_START(args
, obj
);
1999 CodesetsFreeA(obj
, VA_ARG(args
, struct TagItem
*));
2005 /// CodesetsSetDefaultA()
// Looks up `name` in the library's internal codeset list and, when found,
// stores the codeset's own name in the variable "codeset_default" via
// SetVar(). The lookup and the SetVar() call happen between obtaining and
// releasing the library semaphore in shared mode.
2006 LIBPROTO(CodesetsSetDefaultA
, struct codeset
*, REG(a6
, UNUSED __BASE_OR_IFACE
), REG(a0
, STRPTR name
), REG(a1
, struct TagItem
*attrs
))
2008 struct codeset
*codeset
;
2012 ObtainSemaphoreShared(&CodesetsBase
->libSem
);
// only codesets of the internal list can become the default
2014 if((codeset
= codesetsFind(&CodesetsBase
->codesets
, name
)) != NULL
)
// GVF_SAVE_VAR is always part of the flags; GVF_GLOBAL_ONLY is added only
// when the CSA_Save tag (default FALSE) is set
2018 flags
= GVF_SAVE_VAR
;
2019 if(GetTagData(CSA_Save
, FALSE
, attrs
))
2020 SET_FLAG(flags
, GVF_GLOBAL_ONLY
);
// the stored value is the name of the codeset that was found, which need not
// be spelled like the requested `name` (alt_name matches are possible)
2022 SetVar("codeset_default", codeset
->name
, strlen(codeset
->name
), flags
);
2025 ReleaseSemaphore(&CodesetsBase
->libSem
);
// Variadic front end of CodesetsSetDefaultA(), compiled for AmigaOS4 only:
// the first variadic argument after `name` is read as a struct TagItem
// pointer and passed on together with `name`.
2031 #if defined(__amigaos4__)
2032 LIBPROTOVA(CodesetsSetDefault
, struct codeset
*, REG(a6
, UNUSED __BASE_OR_IFACE
), REG(a0
, STRPTR name
), ...)
2037 VA_START(args
, name
);
2038 cs
= CodesetsSetDefaultA(name
, VA_ARG(args
, struct TagItem
*));
// Finds the codeset called `name`: the internal list is searched first, then
// every list supplied through a CSA_CodesetList tag. If nothing was found
// and CSA_FallbackToDefault is set (default TRUE) the result of
// defaultCodeset(FALSE) is used instead. The library semaphore is obtained
// in shared mode first and released after the fallback.
2047 LIBPROTO(CodesetsFindA
, struct codeset
*, REG(a6
, UNUSED __BASE_OR_IFACE
), REG(a0
, STRPTR name
), REG(a1
, struct TagItem
*attrs
))
2049 struct codeset
*codeset
= NULL
;
2053 ObtainSemaphoreShared(&CodesetsBase
->libSem
);
2055 // if no name pointer was supplied we have to return
2056 // the default codeset only.
2059 // we first walk through our internal list and check if we
2060 // can find the requested codeset
2061 codeset
= codesetsFind(&CodesetsBase
->codesets
, name
);
2065 struct TagItem
*tstate
= attrs
;
2066 struct TagItem
*tag
;
2068 // now we walk through our taglist and check if the user
2070 while((tag
= NextTagItem((APTR
)&tstate
)) != NULL
)
// CSA_CodesetList tags with a zero ti_Data are skipped
2072 if(tag
->ti_Tag
== CSA_CodesetList
&& tag
->ti_Data
!= 0)
2074 struct codesetList
*csList
= (struct codesetList
*)tag
->ti_Data
;
2076 if((codeset
= codesetsFind(csList
, name
)) != NULL
)
2083 // check if we found something or not.
2084 if(codeset
== NULL
&& GetTagData(CSA_FallbackToDefault
, TRUE
, attrs
))
2085 codeset
= defaultCodeset(FALSE
);
2087 ReleaseSemaphore(&CodesetsBase
->libSem
);
// Variadic front end of CodesetsFindA(), compiled for AmigaOS4 only: the
// first variadic argument after `name` is read as a struct TagItem pointer
// and passed on together with `name`.
2093 #if defined(__amigaos4__)
2094 LIBPROTOVA(CodesetsFind
, struct codeset
*, REG(a6
, UNUSED __BASE_OR_IFACE
), REG(a0
, STRPTR name
), ...)
2099 VA_START(args
, name
);
2100 cs
= CodesetsFindA(name
, VA_ARG(args
, struct TagItem
*));
2108 /// CodesetsFindBestA()
// Tag based entry point for codeset detection. Tags read here:
//   CSA_Source            - text to examine
//   CSA_SourceLen         - its length, defaults to strlen() of the text
//   CSA_CodesetFamily     - defaults to CSV_CodesetFamily_Latin
//   CSA_ErrPtr            - optional int that receives the error count
//   CSA_FallbackToDefault - defaults to FALSE
// The search itself is done by codesetsFindBest(), which receives the
// complete tag list as well.
2109 LIBPROTO(CodesetsFindBestA
, struct codeset
*, REG(a6
, UNUSED __BASE_OR_IFACE
), REG(a0
, struct TagItem
*attrs
))
2111 struct codeset
*codeset
= NULL
;
2117 ObtainSemaphoreShared(&CodesetsBase
->libSem
);
2119 text
= (char *)GetTagData(CSA_Source
, 0, attrs
);
// strlen() is only evaluated for a non-NULL text
2120 textLen
= GetTagData(CSA_SourceLen
, text
!= NULL
? strlen(text
) : 0, attrs
);
// without a text, or with a zero length, no search is started
2122 if(text
!= NULL
&& textLen
!= 0)
2125 ULONG csFamily
= GetTagData(CSA_CodesetFamily
, CSV_CodesetFamily_Latin
, attrs
);
2126 int *errorPtr
= (int *)GetTagData(CSA_ErrPtr
, 0, attrs
);
// the error count is collected in the local numErrors and copied to the
// caller's variable afterwards
2128 codeset
= codesetsFindBest(attrs
, csFamily
, text
, textLen
, &numErrors
);
2130 if(errorPtr
!= NULL
)
2131 *errorPtr
= numErrors
;
2133 // if we still haven't got the codeset we fallback to the default
2134 if(codeset
== NULL
&& GetTagData(CSA_FallbackToDefault
, FALSE
, attrs
))
2135 codeset
= defaultCodeset(FALSE
);
2138 ReleaseSemaphore(&CodesetsBase
->libSem
);
// Variadic front end of CodesetsFindBestA(), compiled for AmigaOS4 only: the
// first variadic argument is read as a struct TagItem pointer and handed to
// CodesetsFindBestA().
2144 #if defined(__amigaos4__)
2145 LIBPROTOVA(CodesetsFindBest
, struct codeset
*, REG(a6
, UNUSED __BASE_OR_IFACE
), ...)
2150 VA_START(args
, ICodesets
);
2151 cs
= CodesetsFindBestA(VA_ARG(args
, struct TagItem
*));
2159 /// CodesetsUTF8Len()
2160 // Returns the number of characters a utf8 string has. This is not
2161 // identical to the amount of memory required to hold the string.
2162 LIBPROTO(CodesetsUTF8Len
, ULONG
, REG(a6
, UNUSED __BASE_OR_IFACE
), REG(a0
, const UTF8
*str
))
// advance str by the number of trailing bytes that trailingBytesForUTF8[]
// lists for c, so that a multi byte sequence is stepped over as a whole
2174 str
+= trailingBytesForUTF8
[c
];
2183 /// CodesetsStrLenA()
// Works on `str`, which is interpreted in the codeset given by
// CSA_SourceCodeset (defaultCodeset(TRUE) when the tag is missing).
// CSA_SourceLen may override the source length. For the table based
// codesets the utf8[0] table entries, i.e. the UTF-8 sequence lengths, are
// summed up per source byte; presumably the result is the size of the
// string once converted to UTF-8 - TODO confirm.
2184 LIBPROTO(CodesetsStrLenA
, ULONG
, REG(a6
, UNUSED __BASE_OR_IFACE
), REG(a0
, STRPTR str
), REG(a1
, struct TagItem
*attrs
))
2192 struct codeset
*codeset
;
2197 if((codeset
= (struct codeset
*)GetTagData(CSA_SourceCodeset
, 0, attrs
)) == NULL
)
2198 codeset
= defaultCodeset(TRUE
);
// UTF-32 and UTF-16 sources need their own length functions, as `str` is
// then cast to UTF32 or UTF16 elements
2200 if(codeset
== CodesetsBase
->utf32Codeset
)
2203 len
= utf32_strlen((UTF32
*)str
);
2205 else if(codeset
== CodesetsBase
->utf16Codeset
)
2208 len
= utf16_strlen((UTF16
*)str
);
// an explicit CSA_SourceLen takes precedence over the length determined
// above
2216 len
= GetTagData(CSA_SourceLen
, len
, attrs
);
2222 void *srcend
= src
+ len
;
// the destination pointer starts out as NULL and no destination end is
// passed; presumably the converters then only measure and leave the
// required size in dstlen - TODO confirm
2223 UTF8
*dstlen
= NULL
;
// the unions provide the differently typed views of &src and &dstlen that
// the converter prototypes expect
2224 union TypeAliases srcAlias
;
2225 union TypeAliases dstAlias
;
2227 srcAlias
.strptr
= &src
;
2228 dstAlias
.utf8
= &dstlen
;
2233 CodesetsConvertUTF16toUTF8(srcAlias
.cutf16
, srcend
, dstAlias
.utf8
, NULL
, 0);
2237 CodesetsConvertUTF32toUTF8(srcAlias
.cutf32
, srcend
, dstAlias
.utf8
, NULL
, 0);
// table based codesets: stop at the terminating NUL byte or as soon as len
// has reached zero
2248 while((c
= *src
++) != '\0' && len
!= 0)
// utf8[0] holds the byte length of the UTF-8 sequence for character c
2250 res
+= codeset
->table
[c
].utf8
[0];
// Variadic front end of CodesetsStrLenA(), compiled for AmigaOS4 only: the
// first variadic argument after `str` is read as a struct TagItem pointer
// and passed on together with `str`.
2260 #if defined(__amigaos4__)
2261 LIBPROTOVA(CodesetsStrLen
, ULONG
, REG(a6
, UNUSED __BASE_OR_IFACE
), REG(a0
, STRPTR str
), ...)
2266 VA_START(args
, str
);
2267 res
= CodesetsStrLenA(str
, VA_ARG(args
, struct TagItem
*));
2275 /// CodesetsUTF8ToStrA()
2276 // Converts an UTF8 string to a given charset. Return the number of bytes
2277 // written to dest excluding the NULL byte (which is always ensured by this
2278 // function; it means a NULL str will produce "" as dest; anyway you should
2279 // check NULL str to not waste your time!).
// CodesetsUTF8ToStrA(): converts the UTF-8 string passed via CSA_Source to the
// codeset passed via CSA_DestCodeset. All arguments are read from the tag list
// 'attrs' with GetTagData(); the declared result type is STRPTR.
2280 LIBPROTO(CodesetsUTF8ToStrA
, STRPTR
, REG(a6
, UNUSED __BASE_OR_IFACE
), REG(a0
, struct TagItem
*attrs
))
// only go on for a non-NULL source with a length > 0; if CSA_SourceLen is
// not supplied, strlen() of the source is used as the default length
2291 if((src
= (UTF8
*)GetTagData(CSA_Source
, 0, attrs
)) != NULL
&&
2292 (srcLen
= GetTagData(CSA_SourceLen
, src
!= NULL
? strlen((char *)src
) : 0, attrs
)) > 0)
2294 struct convertMsg msg
;
2295 struct codeset
*codeset
;
2296 struct Hook
*destHook
;
2297 struct Hook
*mapForeignCharsHook
;
2299 STRPTR destIter
= NULL
;
// s is the read position, e points to src+srcLen
2302 unsigned char *s
= src
;
2303 unsigned char *e
= (src
+srcLen
);
2304 int numConvErrors
= 0;
2305 int *numConvErrorsPtr
;
2306 BOOL mapForeignChars
;
2308 struct SignalSemaphore
*sem
= NULL
;
2312 // get some more optional attributes
2313 destHook
= (struct Hook
*)GetTagData(CSA_DestHook
, 0, attrs
);
2314 destLen
= GetTagData(CSA_DestLen
, 0, attrs
);
2315 numConvErrorsPtr
= (int *)GetTagData(CSA_ErrPtr
, 0, attrs
);
2316 mapForeignChars
= (BOOL
)GetTagData(CSA_MapForeignChars
, FALSE
, attrs
);
2317 mapForeignCharsHook
= (struct Hook
*)GetTagData(CSA_MapForeignCharsHook
, 0, attrs
);
2319 // get the destination codeset pointer
2320 if((codeset
= (struct codeset
*)GetTagData(CSA_DestCodeset
, 0, attrs
)) == NULL
)
2321 codeset
= defaultCodeset(TRUE
);
// the UTF-32 and UTF-16 codesets of the library base are detected separately
2322 if(codeset
== CodesetsBase
->utf32Codeset
)
2327 else if(codeset
== CodesetsBase
->utf16Codeset
)
2338 // first we make sure we allocate enough memory
2339 // for our destination buffer
2340 if(destHook
!= NULL
)
// a destLen below 16 or above sizeof(buf) is replaced by sizeof(buf)
2342 if(destLen
< 16 || destLen
> sizeof(buf
))
2343 destLen
= sizeof(buf
);
2345 msg
.state
= CSV_Translating
;
2351 // in case the user wants us to dynamically generate the
2352 // destination buffer we do it right now
2353 if((dest
= (STRPTR
)GetTagData(CSA_Dest
, 0, attrs
)) == NULL
||
2354 GetTagData(CSA_AllocIfNeeded
, TRUE
, attrs
) != FALSE
)
2358 // calculate the destLen
2361 void *dstlen
= NULL
;
2362 union TypeAliases srcAlias
;
2363 union TypeAliases dstAlias
;
2365 srcAlias
.uchar
= &s
;
2366 dstAlias
.voidptr
= &dstlen
;
// NOTE(review): both calls pass NULL as the target end - presumably a dry run
// that only measures the required length; confirm against the converters
2371 CodesetsConvertUTF8toUTF16(srcAlias
.cutf8
, e
, dstAlias
.utf16
, NULL
, 0);
2375 CodesetsConvertUTF8toUTF32(srcAlias
.cutf8
, e
, dstAlias
.utf32
, NULL
, 0);
// step over one lead byte plus the trailing bytes trailingBytesForUTF8[] lists for it
2384 unsigned char c
= *s
++;
2387 s
+= trailingBytesForUTF8
[c
];
// a new buffer is needed if there is none or it is smaller than len+1
2391 if(dest
== NULL
|| (destLen
< len
+1))
// a pool given via CSA_Pool is used; CSA_PoolSem optionally names a semaphore for it
2393 if((pool
= (APTR
)GetTagData(CSA_Pool
, 0, attrs
)) != NULL
)
2395 if((sem
= (struct SignalSemaphore
*)GetTagData(CSA_PoolSem
, 0, attrs
)) != NULL
)
2396 ObtainSemaphore(sem
);
2398 // allocate the destination buffer
2399 dest
= allocVecPooled(pool
, len
+char_size
);
2402 ReleaseSemaphore(sem
);
// allocation through allocArbitrateVecPooled() (presumably the case without a pool)
2405 dest
= allocArbitrateVecPooled(len
+char_size
);
2407 destLen
= len
+char_size
;
2420 // now we convert the src string to the
2421 // destination buffer.
2427 if(destHook
!= NULL
)
2429 ULONG r
= CSR_TargetExhausted
;
// the target end leaves room for one character of char_size bytes
2431 dstend
= b
+ destLen
- char_size
;
2434 union TypeAliases srcAlias
;
2435 union TypeAliases dstAlias
;
2437 srcAlias
.uchar
= &s
;
2438 dstAlias
.schar
= &b
;
// convert one chunk; r receives the converter's result code
2443 r
= CodesetsConvertUTF8toUTF16(srcAlias
.cutf8
, e
, dstAlias
.utf16
, dstend
, 0);
2447 r
= CodesetsConvertUTF8toUTF32(srcAlias
.cutf8
, e
, dstAlias
.utf32
, dstend
, 0);
// any result other than CSR_TargetExhausted marks the message as CSV_End
2453 if(r
!= CSR_TargetExhausted
)
2454 msg
.state
= CSV_End
;
2456 CallHookPkt(destHook
,&msg
,buf
);
// repeat while the converter reports an exhausted target
2461 while(r
== CSR_TargetExhausted
);
2465 union TypeAliases srcAlias
;
2466 union TypeAliases dstAlias
;
2468 srcAlias
.uchar
= &s
;
2469 dstAlias
.strptr
= &destIter
;
2470 dstend
= destIter
+ destLen
- char_size
;
2474 CodesetsConvertUTF8toUTF16(srcAlias
.cutf8
, e
, dstAlias
.utf16
, dstend
, 0);
2478 CodesetsConvertUTF8toUTF32(srcAlias
.cutf8
, e
, dstAlias
.utf32
, dstend
, 0);
2488 if(destHook
== NULL
&& n
>= destLen
-1)
2491 // convert until we reach the end of the
// d is the fallback character, repstr an optional replacement string
2495 unsigned char c
= *s
;
2496 unsigned char d
= '?';
2497 const char *repstr
= NULL
;
2500 // check if the char is a >7bit char
2503 struct single_convert
*f
;
// lenStr is the lead byte plus its trailing bytes
2504 int lenAdd
= trailingBytesForUTF8
[c
];
2505 int lenStr
= lenAdd
+1;
2506 unsigned char *src
= s
;
2510 // start each iteration with "no replacement found yet"
2514 // search in the UTF8 conversion table of the current charset if
2515 // we have a replacement character for the char sequence starting at s
2516 BIN_SEARCH(codeset
->table_sorted
, 0, 255, strncmp((char *)src
, (char *)codeset
->table_sorted
[m
].utf8
+1, lenStr
), f
);
2527 // the analysed char sequence (s) is not convertable to a
2528 // single visible char replacement, so we normally have to put
2529 // a ? sign as a "unknown char" sign at the very position.
2531 // For convenience we, however, allow users to replace these
2532 // UTF8 characters with char sequences that "look like" the
2534 if(mapForeignChars
== TRUE
)
2535 replen
= mapUTF8toASCII(&repstr
, src
, lenStr
);
2537 // call the hook only, if the internal table yielded no suitable
2539 if(replen
== 0 && mapForeignCharsHook
!= NULL
)
2541 struct replaceMsg rmsg
;
2543 rmsg
.dst
= (char **)&repstr
;
2545 rmsg
.srclen
= lenStr
;
// the hook's return value is taken as the replacement length
2546 replen
= CallHookPkt(mapForeignCharsHook
, &rmsg
, NULL
);
2551 D(DBF_UTF
, "got UTF8 replacement (%ld)", replen
);
2553 // stay in the loop as long as one replacement function delivers
2554 // further UTF8 replacement sequences
2555 src
= (unsigned char *)repstr
;
2556 // remember the length of the replaced string, as we might do another
2557 // iteration in the loop which might result in a further replacement
2560 else if(replen
== 0)
2562 D(DBF_UTF
, "found no ASCII replacement for UTF8 string (%ld)", replen
);
2566 D(DBF_UTF
, "got replacement string '%s' (%ld)", repstr
? repstr
: "<null>", replen
);
2571 if(repstr
== NULL
|| replen
== 0)
2585 if(destHook
!= NULL
)
// the hook is called whenever i is a multiple of destLen-1
2596 if(i
%(destLen
-1)==0)
2600 CallHookPkt(destHook
, &msg
, buf
);
// emit the first replacement character if there is one, else the fallback d
2610 *b
++ = replen
> 0 ? *repstr
: d
;
2614 if(i
%(destLen
-1)==0)
2618 CallHookPkt(destHook
, &msg
, buf
);
// remember the write offset so it can be re-applied after dest is reallocated
2629 ULONG destPos
= destIter
-dest
;
2634 ObtainSemaphore(sem
);
2636 // grow the destination buffer by replen-1 bytes
2637 dest
= reallocVecPooled(pool
, dest
, destLen
, destLen
+replen
-1);
2640 ReleaseSemaphore(sem
);
2643 dest
= reallocArbitrateVecPooled(dest
, destLen
, destLen
+replen
-1);
// restore the write pointer from the saved offset and copy the replacement
2651 destIter
= dest
+destPos
;
2652 memcpy(destIter
, repstr
, replen
);
2654 // adjust our loop pointer and destination length
2656 destLen
+= replen
-1;
2658 else if(replen
== 1)
2659 *destIter
++ = *repstr
;
// with a destination hook the message is marked CSV_End and the hook is called
2670 if(destHook
!= NULL
)
2672 msg
.state
= CSV_End
;
2675 CallHookPkt(destHook
,&msg
,buf
);
2681 // let us write the number of conversion errors
2682 // to the proper variable pointer, if wanted
2683 if(numConvErrorsPtr
!= NULL
)
2684 *numConvErrorsPtr
= numConvErrors
;
2687 // put the final length of our destination buffer
2688 // into the destLenPtr
2689 if((destLenPtr
= (ULONG
*)GetTagData(CSA_DestLenPtr
, 0, attrs
)) != NULL
)
2692 *destLenPtr
= destLen
-1;
// AmigaOS4-only varargs variant of CodesetsUTF8ToStrA(): the tag list pointer
// is fetched with VA_ARG() and forwarded to the tag-list function.
2701 #if defined(__amigaos4__)
2702 LIBPROTOVA(CodesetsUTF8ToStr
, STRPTR
, REG(a6
, UNUSED __BASE_OR_IFACE
), ...)
2707 VA_START(args
, ICodesets
);
2708 res
= CodesetsUTF8ToStrA(VA_ARG(args
, struct TagItem
*));
2716 /// CodesetsUTF8CreateA()
2717 // Converts a string and a charset to an UTF8. Returns the UTF8.
2718 // If a destination hook is supplied always return 0.
2719 // If from is NULL, it returns NULL and doesn't call the hook.
// All arguments are read from the tag list 'attrs' with GetTagData().
2720 LIBPROTO(CodesetsUTF8CreateA
, UTF8
*, REG(a6
, UNUSED __BASE_OR_IFACE
), REG(a0
, struct TagItem
*attrs
))
2724 struct codeset
*codeset
;
2725 ULONG fromLen
, *destLenPtr
;
// the source codeset comes from CSA_SourceCodeset, else from defaultCodeset(TRUE)
2734 if((codeset
= (struct codeset
*)GetTagData(CSA_SourceCodeset
, 0, attrs
)) == NULL
)
2735 codeset
= defaultCodeset(TRUE
);
// the UTF-32 and UTF-16 codesets of the library base are detected separately
2736 if(codeset
== CodesetsBase
->utf32Codeset
)
2738 else if(codeset
== CodesetsBase
->utf16Codeset
)
2743 from
= (UTF8
*)GetTagData(CSA_Source
, 0, attrs
);
// default source length: utf32_strlen(), utf16_strlen() or strlen() of the source
2749 fromLen
= utf32_strlen((UTF32
*)from
);
2753 fromLen
= utf16_strlen((UTF16
*)from
);
2757 fromLen
= strlen((char *)from
);
// an explicit CSA_SourceLen overrides the computed length
2763 fromLen
= GetTagData(CSA_SourceLen
, fromLen
, attrs
);
// nothing is converted for a NULL source or a length of 0
2765 if(from
!= NULL
&& fromLen
!= 0)
2767 struct convertMsg msg
;
2772 STRPTR src
, destPtr
= NULL
, b
= NULL
;
2775 hook
= (struct Hook
*)GetTagData(CSA_DestHook
, 0, attrs
);
2776 destLen
= GetTagData(CSA_DestLen
, 0, attrs
);
// a destLen below 16 or above sizeof(buf) is replaced by sizeof(buf)
2780 if(destLen
<16 || destLen
>sizeof(buf
))
2781 destLen
= sizeof(buf
);
2783 msg
.state
= CSV_Translating
;
// proceed if a CSA_Dest buffer was given or CSA_AllocIfNeeded (default TRUE) is set
2789 if((dest
= (UTF8
*)GetTagData(CSA_Dest
, 0, attrs
)) != NULL
||
2790 GetTagData(CSA_AllocIfNeeded
, TRUE
, attrs
))
2798 void *srcend
= src
+ fromLen
;
2799 UTF8
*dstlen
= NULL
;
2800 union TypeAliases srcAlias
;
2801 union TypeAliases dstAlias
;
2803 srcAlias
.strptr
= &src
;
2804 dstAlias
.utf8
= &dstlen
;
// NOTE(review): both calls pass NULL as the target end - presumably a dry run
// that only measures the required length; confirm against the converters
2809 CodesetsConvertUTF16toUTF8(srcAlias
.cutf16
, srcend
, dstAlias
.utf8
, NULL
, 0);
2813 CodesetsConvertUTF32toUTF8(srcAlias
.cutf32
, srcend
, dstAlias
.utf8
, NULL
, 0);
2820 ULONG flen
= fromLen
;
// utf8[0] of each source char's table entry is summed up into len; presumably
// it holds the length of that char's UTF-8 sequence - TODO confirm
2823 while((c
= *src
++) != '\0' && flen
!= 0)
2825 len
+= codeset
->table
[c
].utf8
[0];
2829 D(DBF_UTF
, "Calculated output UTF-8 buffer length: %lu", len
);
// a new buffer is needed if there is none or it is smaller than len+1
2831 if(dest
== NULL
|| (destLen
<len
+1))
2834 struct SignalSemaphore
*sem
;
// a pool given via CSA_Pool is used; CSA_PoolSem optionally names a semaphore for it
2836 if((pool
= (APTR
)GetTagData(CSA_Pool
, 0, attrs
)) != NULL
)
2838 if((sem
= (struct SignalSemaphore
*)GetTagData(CSA_PoolSem
, 0, attrs
)) != NULL
)
2839 ObtainSemaphore(sem
);
2841 // allocate the destination buffer
2842 dest
= allocVecPooled(pool
,len
+1);
2845 ReleaseSemaphore(sem
);
// allocation through allocArbitrateVecPooled() (presumably the case without a pool)
2848 dest
= allocArbitrateVecPooled(len
+1);
2860 destPtr
= (STRPTR
)dest
;
2866 void *srcend
= src
+ fromLen
;
2871 ULONG r
= CSR_TargetExhausted
;
2872 union TypeAliases srcAlias
;
2873 union TypeAliases dstAlias
;
2875 srcAlias
.strptr
= &src
;
2876 dstAlias
.strptr
= &b
;
// the target end is one byte before b+destLen
2877 dstend
= (UTF8
*)(b
+ destLen
- 1);
// convert one chunk; r receives the converter's result code
2883 r
= CodesetsConvertUTF16toUTF8(srcAlias
.cutf16
, srcend
, dstAlias
.utf8
, dstend
, 0);
2887 r
= CodesetsConvertUTF32toUTF8(srcAlias
.cutf32
, srcend
, dstAlias
.utf8
, dstend
, 0);
// any result other than CSR_TargetExhausted marks the message as CSV_End
2891 if(r
!= CSR_TargetExhausted
)
2892 msg
.state
= CSV_End
;
2894 CallHookPkt(hook
,&msg
,buf
);
// repeat while the converter reports an exhausted target
2899 while(r
== CSR_TargetExhausted
);
2903 union TypeAliases srcAlias
;
2904 union TypeAliases dstAlias
;
2906 srcAlias
.strptr
= &src
;
2907 dstAlias
.strptr
= &destPtr
;
2908 dstend
= (UTF8
*)(destPtr
+ destLen
);
2912 CodesetsConvertUTF16toUTF8(srcAlias
.cutf16
, srcend
, dstAlias
.utf8
, dstend
, 0);
2916 CodesetsConvertUTF32toUTF8(srcAlias
.cutf32
, srcend
, dstAlias
.utf8
, dstend
, 0);
// n is the distance the write pointer has advanced from the buffer start
2919 n
= destPtr
-(STRPTR
)dest
;
// walk the source until fromLen is used up or a 0 char is read
2924 for(; fromLen
&& (c
= *src
); src
++, fromLen
--)
// iterate over the table entry's bytes from utf8[1] up to the terminating 0
2928 for(utf8_seq
= &codeset
->table
[c
].utf8
[1]; (c
= *utf8_seq
); utf8_seq
++)
// the hook is called whenever i is a multiple of destLen-1
2935 if(i
%(destLen
-1)==0)
2939 CallHookPkt(hook
,&msg
,buf
);
// final hook call with the message marked as CSV_End
2960 msg
.state
= CSV_End
;
2963 CallHookPkt(hook
,&msg
,buf
);
// optional pointer supplied via CSA_DestLenPtr
2972 if((destLenPtr
= (ULONG
*)GetTagData(CSA_DestLenPtr
, 0, attrs
)) != NULL
)
// AmigaOS4-only varargs variant of CodesetsUTF8CreateA(): the tag list pointer
// is fetched with VA_ARG() and forwarded to the tag-list function.
2979 #if defined(__amigaos4__)
2980 LIBPROTOVA(CodesetsUTF8Create
, UTF8
*, REG(a6
, UNUSED __BASE_OR_IFACE
), ...)
2985 VA_START(args
, ICodesets
);
2986 res
= CodesetsUTF8CreateA(VA_ARG(args
, struct TagItem
*));
2994 /// CodesetsIsValidUTF8()
// GOOD_UCS(c) is true for values >= 160 that do not equal 0xd800 once their
// low 10 bits are cleared (i.e. are outside 0xd800-0xdbff) and that are none
// of 0xfeff, 0xfffe and 0xffff.
2995 #define GOOD_UCS(c) \
2996 ((c) >= 160 && ((c) & ~0x3ff) != 0xd800 && \
2997 (c) != 0xfeff && (c) != 0xfffe && (c) != 0xffff)
// Takes the string to check in 's' and has a BOOL result type.
2999 LIBPROTO(CodesetsIsValidUTF8
, BOOL
, REG(a6
, UNUSED __BASE_OR_IFACE
), REG(a0
, CONST_STRPTR s
))
// keep parsing with parseUtf8() until it returns 0
3006 while((n
= parseUtf8(&t
)) != 0)
3020 /// CodesetsConvertStrA()
3021 // Converts a given string from one source Codeset to a given destination
3022 // codeset and returns the convert string
// All arguments are read from the tag list 'attrs' with GetTagData(). Unless
// one side is the UTF-8 codeset, the conversion is done in two passes with
// CodesetsUTF8CreateA() and CodesetsUTF8ToStrA().
3023 LIBPROTO(CodesetsConvertStrA
, STRPTR
, REG(a6
, UNUSED __BASE_OR_IFACE
), REG(a0
, struct TagItem
*attrs
))
3025 struct codeset
*srcCodeset
;
3026 STRPTR srcStr
= NULL
;
3027 STRPTR dstStr
= NULL
;
3034 // get the ptr to the src string we want to convert
3035 // from the source codeset to the dest codeset.
3036 srcStr
= (STRPTR
)GetTagData(CSA_Source
, 0, attrs
);
3038 // get the pointer to the codeset in which the src string is encoded
3039 if((srcCodeset
= (struct codeset
*)GetTagData(CSA_SourceCodeset
, 0, attrs
)) == NULL
)
3040 srcCodeset
= defaultCodeset(TRUE
);
// default source length and character size depend on the source codeset
3044 if(srcCodeset
== CodesetsBase
->utf32Codeset
)
3046 srcLen
= utf32_strlen((UTF32
*)srcStr
);
3047 charSize
= sizeof(UTF32
);
3049 else if(srcCodeset
== CodesetsBase
->utf16Codeset
)
3051 srcLen
= utf16_strlen((UTF16
*)srcStr
);
3052 charSize
= sizeof(UTF16
);
3056 srcLen
= strlen(srcStr
);
3057 charSize
= sizeof(char);
// an explicit CSA_SourceLen overrides the computed length
3062 srcLen
= GetTagData(CSA_SourceLen
, srcLen
, attrs
);
3064 if(srcStr
!= NULL
&& srcLen
> 0)
3066 struct codeset
*dstCodeset
;
3068 // get the pointer to the codeset in which the dst string should be encoded
3069 if((dstCodeset
= (struct codeset
*)GetTagData(CSA_DestCodeset
, 0, attrs
)) == NULL
)
3070 dstCodeset
= defaultCodeset(TRUE
);
// NOTE(review): this debug output reads ->name of both codesets before the
// NULL check that follows - confirm that D() cannot dereference NULL here
3072 D(DBF_UTF
, "srcCodeset: '%s' dstCodeset: '%s'", srcCodeset
->name
, dstCodeset
->name
);
3074 if(srcCodeset
!= NULL
&& dstCodeset
!= NULL
)
3076 // check that the user didn't supply the very same codeset
3077 // or otherwise a conversion is not required.
3078 if(srcCodeset
!= dstCodeset
)
3080 BOOL utf8Create
= FALSE
;
3081 BOOL strCreate
= FALSE
;
3083 ULONG utf8strLen
= 0;
3084 ULONG
*destLenPtr
= NULL
;
3085 BOOL mapForeignChars
;
3086 struct Hook
*mapForeignCharsHook
;
// these two attributes are passed on to CodesetsUTF8ToStrA() below
3088 mapForeignChars
= (BOOL
)GetTagData(CSA_MapForeignChars
, FALSE
, attrs
);
3089 mapForeignCharsHook
= (struct Hook
*)GetTagData(CSA_MapForeignCharsHook
, 0, attrs
);
3091 // if the source codeset is UTF-8 we don't have to use the UTF8Create()
3092 // function and can directly call the UTF8ToStr() function
3093 if(srcCodeset
!= CodesetsBase
->utf8Codeset
)
3095 struct TagItem tags
[] = { { CSA_SourceCodeset
, (IPTR
)srcCodeset
},
3096 { CSA_Source
, (IPTR
)srcStr
},
3097 { CSA_SourceLen
, srcLen
},
3098 { CSA_DestLenPtr
, (IPTR
)&utf8strLen
},
3101 utf8str
= CodesetsUTF8CreateA((struct TagItem
*)&tags
[0]);
// the source string and its length are used directly as the UTF-8 data
3107 utf8str
= (UTF8
*)srcStr
;
3108 utf8strLen
= srcLen
;
3111 // in case the destination codeset is UTF-8 we don't have to actually
3112 // use the UTF8ToStr() function and can immediately return our
3114 if(utf8str
!= NULL
&& utf8strLen
> 0 && dstCodeset
!= CodesetsBase
->utf8Codeset
)
3116 struct TagItem tags
[] = { { CSA_DestCodeset
, (IPTR
)dstCodeset
},
3117 { CSA_Source
, (IPTR
)utf8str
},
3118 { CSA_SourceLen
, utf8strLen
},
3119 { CSA_DestLenPtr
, (IPTR
)&dstLen
},
3120 { CSA_MapForeignChars
, mapForeignChars
},
3121 { CSA_MapForeignCharsHook
, (IPTR
)mapForeignCharsHook
},
3124 dstStr
= CodesetsUTF8ToStrA((struct TagItem
*)&tags
[0]);
// the UTF-8 string and its length are used directly as the result
3130 dstStr
= (STRPTR
)utf8str
;
3131 dstLen
= utf8strLen
;
3134 D(DBF_UTF
, "srcStr: %lx srcLen: %ld dstStr: %lx dstLen: %ld utf8create: %ld strCreate: %ld", srcStr
, srcLen
,
3139 // if everything was successful we can go and finalize everything
3140 if(dstStr
!= NULL
&& utf8str
!= NULL
)
3142 // as the conversion was a two way pass we have to either free the
3143 // memory of the utf8 string or not
3144 if(utf8Create
== TRUE
&& strCreate
== TRUE
)
3145 CodesetsFreeA(utf8str
, NULL
);
3147 // if the user wants to be informed about the length
3148 // of our destination string we store the length now in the supplied ptr.
3149 if((destLenPtr
= (ULONG
*)GetTagData(CSA_DestLenPtr
, 0, attrs
)) != NULL
)
3150 *destLenPtr
= dstLen
;
3152 D(DBF_UTF
, "successfully converted string with len %ld", dstLen
);
3156 W(DBF_ALWAYS
, "an error occurred while trying to convert a string");
3158 // free all memory in case the conversion didn't work out
3159 if(utf8Create
== TRUE
&& utf8str
!= NULL
)
3160 CodesetsFreeA(utf8str
, NULL
);
3162 if(strCreate
== TRUE
&& dstStr
!= NULL
)
3163 CodesetsFreeA(dstStr
, NULL
);
3170 // we got the same source and destination codesets passed in
3171 // instead of failing silently we just create a copy of the source string
3172 ULONG
*destLenPtr
= NULL
;
3174 // allocate memory for the destination string, including a trailing NUL byte
3175 if((dstStr
= allocArbitrateVecPooled(srcLen
+ charSize
)) != NULL
)
3177 // just copy the source string without any further modification
3178 // we must use memcpy() as the source string could be UTF16/32 encoded and
3179 // thus strcpy() would not do what we want.
3180 memcpy(dstStr
, srcStr
, srcLen
+ charSize
);
// NOTE(review): no assignment to dstLen is visible on this path - confirm it
// holds the copied length before it is logged and stored below
3182 D(DBF_UTF
, "successfully copied string with len %ld", dstLen
);
3185 W(DBF_ALWAYS
, "no memory for dest string");
3187 // if the user wants to be informed about the length
3188 // of our destination string we store the length now in the supplied ptr.
3189 if((destLenPtr
= (ULONG
*)GetTagData(CSA_DestLenPtr
, 0, attrs
)) != NULL
)
3190 *destLenPtr
= dstLen
;
// AmigaOS4-only varargs variant of CodesetsConvertStrA(): the tag list pointer
// is fetched with VA_ARG() and forwarded to the tag-list function.
3199 #if defined(__amigaos4__)
3200 LIBPROTOVA(CodesetsConvertStr
, STRPTR
, REG(a6
, UNUSED __BASE_OR_IFACE
), ...)
3205 VA_START(args
, ICodesets
);
3206 res
= CodesetsConvertStrA(VA_ARG(args
, struct TagItem
*));
3214 /// CodesetsFreeVecPooledA()
// Frees 'mem' from 'pool' with freeVecPooled(). If the tag list names a
// semaphore via CSA_PoolSem, it is obtained before the memory is freed.
3215 LIBPROTO(CodesetsFreeVecPooledA
, void, REG(a6
, UNUSED __BASE_OR_IFACE
), REG(a0
, APTR pool
), REG(a1
, APTR mem
), REG(a2
, struct TagItem
*attrs
))
// nothing is done unless both pool and mem are non-NULL
3219 if(pool
!= NULL
&& mem
!= NULL
)
3221 struct SignalSemaphore
*sem
;
3223 if((sem
= (struct SignalSemaphore
*)GetTagData(CSA_PoolSem
, 0, attrs
)) != NULL
)
3224 ObtainSemaphore(sem
);
3226 freeVecPooled(pool
,mem
);
3229 ReleaseSemaphore(sem
);
// AmigaOS4-only varargs variant of CodesetsFreeVecPooledA(): pool and mem are
// passed through, the tag list pointer is fetched with VA_ARG().
3235 #if defined(__amigaos4__)
3236 LIBPROTOVA(CodesetsFreeVecPooled
, void, REG(a6
, UNUSED __BASE_OR_IFACE
), REG(a0
, APTR pool
), REG(a1
, APTR mem
), ...)
3240 VA_START(args
, mem
);
3241 CodesetsFreeVecPooledA(pool
, mem
, VA_ARG(args
, struct TagItem
*));
3247 /// CodesetsListCreateA()
// Allocates a new struct codesetList and fills it according to the tags in
// 'attrs': CSA_CodesetDir, CSA_CodesetFile and CSA_SourceCodeset are handled.
3248 LIBPROTO(CodesetsListCreateA
, struct codesetList
*, REG(a6
, UNUSED __BASE_OR_IFACE
), REG(a0
, struct TagItem
*attrs
))
3250 struct codesetList
*csList
= NULL
;
3254 // no matter what, we create a codesets list we will return to the user
3255 if((csList
= allocArbitrateVecPooled(sizeof(struct codesetList
))) != NULL
)
// scanProgDir is set to FALSE by each of the tags handled below
3257 BOOL scanProgDir
= TRUE
;
3258 struct TagItem
*tstate
= attrs
;
3259 struct TagItem
*tag
;
3261 // initialize the new private codeset list and put it into a separate list
3262 NewList((struct List
*)csList
);
3264 // first we get the path of the directory from which we go
3265 // and scan for charset tables from
3266 while((tag
= NextTagItem((APTR
)&tstate
)) != NULL
)
// ti_Data is passed to codesetsScanDir() as a path string
3270 case CSA_CodesetDir
:
3272 codesetsScanDir(csList
, (STRPTR
)tag
->ti_Data
);
3274 scanProgDir
= FALSE
;
// ti_Data is passed to codesetsReadTable() as a path string
3278 case CSA_CodesetFile
:
3280 codesetsReadTable(csList
, (STRPTR
)tag
->ti_Data
);
3282 scanProgDir
= FALSE
;
// ti_Data is a codeset pointer whose node is appended to the list
3286 case CSA_SourceCodeset
:
3288 struct codeset
*cs
= (struct codeset
*)tag
->ti_Data
;
3290 AddTail((struct List
*)csList
, (struct Node
*)&cs
->node
);
3292 scanProgDir
= FALSE
;
3298 // in case the user also wants us to scan PROGDIR:
3300 if(scanProgDir
== TRUE
)
3301 codesetsScanDir(csList
, "PROGDIR:Charsets");
// AmigaOS4-only varargs variant of CodesetsListCreateA(): the tag list pointer
// is fetched with VA_ARG() and forwarded to the tag-list function.
3308 #if defined(__amigaos4__)
3309 LIBPROTOVA(CodesetsListCreate
, struct codesetList
*, REG(a6
, UNUSED __BASE_OR_IFACE
), ...)
3311 struct codesetList
*res
;
3314 VA_START(args
, ICodesets
);
3315 res
= CodesetsListCreateA(VA_ARG(args
, struct TagItem
*));
3323 /// CodesetsListDeleteA()
// Frees every codeset list passed via a CSA_CodesetList tag in 'attrs'.
// CSA_FreeCodesets (default TRUE) decides whether the codesets inside a list
// are cleaned up as well. The result type is BOOL.
3324 LIBPROTO(CodesetsListDeleteA
, BOOL
, REG(a6
, UNUSED __BASE_OR_IFACE
), REG(a0
, struct TagItem
*attrs
))
3326 BOOL result
= FALSE
;
3327 struct TagItem
*tstate
= attrs
;
3328 struct TagItem
*tag
;
3333 // check if the caller wants us also to free the codesets
3334 freeCodesets
= (BOOL
)GetTagData(CSA_FreeCodesets
, TRUE
, attrs
);
3336 // now we iterate through our tagItems and see what the
3337 // user wants to remove from the list
3338 while((tag
= NextTagItem((APTR
)&tstate
)) != NULL
)
3342 case CSA_CodesetList
:
3344 struct codesetList
*csList
= (struct codesetList
*)tag
->ti_Data
;
3348 // cleanup the codesets within the list
3349 if(freeCodesets
== TRUE
)
3350 codesetsCleanup(csList
);
3352 // then free the list itself
3353 freeArbitrateVecPooled(csList
);
// AmigaOS4-only varargs variant of CodesetsListDeleteA(): the tag list pointer
// is fetched with VA_ARG() and forwarded to the tag-list function.
3366 #if defined(__amigaos4__)
3367 LIBPROTOVA(CodesetsListDelete
, BOOL
, REG(a6
, UNUSED __BASE_OR_IFACE
), ...)
3372 VA_START(args
, ICodesets
);
3373 result
= CodesetsListDeleteA(VA_ARG(args
, struct TagItem
*));
3381 /// CodesetsListAddA()
// Adds codesets to the existing list 'csList' according to the tags in
// 'attrs': CSA_CodesetDir, CSA_CodesetFile and CSA_SourceCodeset are handled.
// The result type is BOOL.
3382 LIBPROTO(CodesetsListAddA
, BOOL
, REG(a6
, UNUSED __BASE_OR_IFACE
), REG(a0
, struct codesetList
*csList
), REG(a1
, struct TagItem
*attrs
))
3384 BOOL result
= FALSE
;
3390 struct TagItem
*tstate
= attrs
;
3391 struct TagItem
*tag
;
3393 // now we iterate through our tagItems and see if the user
3394 // wants to scan a whole directory or just adds a file.
3395 while((tag
= NextTagItem((APTR
)&tstate
)) != NULL
)
// ti_Data is passed to codesetsScanDir() as a path string
3399 case CSA_CodesetDir
:
3401 codesetsScanDir(csList
, (STRPTR
)tag
->ti_Data
);
// ti_Data is passed to codesetsReadTable() as a path string
3406 case CSA_CodesetFile
:
3408 codesetsReadTable(csList
, (STRPTR
)tag
->ti_Data
);
// ti_Data is a codeset pointer whose node is appended to the list
3413 case CSA_SourceCodeset
:
3415 struct codeset
*cs
= (struct codeset
*)tag
->ti_Data
;
3417 AddTail((struct List
*)csList
, (struct Node
*)&cs
->node
);
// AmigaOS4-only varargs variant of CodesetsListAddA(): csList is passed
// through, the tag list pointer is fetched with VA_ARG().
3429 #if defined(__amigaos4__)
3430 LIBPROTOVA(CodesetsListAdd
, BOOL
, REG(a6
, UNUSED __BASE_OR_IFACE
), REG(a0
, struct codesetList
*csList
), ...)
3435 VA_START(args
, csList
);
3436 result
= CodesetsListAddA(csList
, VA_ARG(args
, struct TagItem
*));
3444 /// CodesetsListRemoveA()
// Removes every codeset passed via a CSA_SourceCodeset tag in 'attrs' from
// the list it is linked into, unless it belongs to the library's internal
// codesets list. CSA_FreeCodesets (default TRUE) decides whether the removed
// codeset is freed as well. The result type is BOOL.
3445 LIBPROTO(CodesetsListRemoveA
, BOOL
, REG(a6
, UNUSED __BASE_OR_IFACE
), REG(a0
, struct TagItem
*attrs
))
3447 BOOL result
= FALSE
;
3448 struct TagItem
*tstate
= attrs
;
3449 struct TagItem
*tag
;
3454 // check if the caller wants us also to free the codesets
3455 freeCodesets
= (BOOL
)GetTagData(CSA_FreeCodesets
, TRUE
, attrs
);
3457 // now we iterate through our tagItems and see what the
3458 // user wants to remove from the list
3459 while((tag
= NextTagItem((APTR
)&tstate
)) != NULL
)
3463 case CSA_SourceCodeset
:
3465 struct codeset
*removeCS
= (struct codeset
*)tag
->ti_Data
;
3467 if(removeCS
!= NULL
)
3470 BOOL isExternalNode
= TRUE
;
// the internal list is only walked while the library semaphore is held
3472 ObtainSemaphore(&CodesetsBase
->libSem
);
3474 // iterate over our internal list and check whether the given
3475 // node is part of that list
3476 for(node
= GetHead((struct List
*)&CodesetsBase
->codesets
); node
!= NULL
; node
= GetSucc(node
))
3478 if((struct codeset
*)node
== removeCS
)
3480 isExternalNode
= FALSE
;
3485 ReleaseSemaphore(&CodesetsBase
->libSem
);
// only codesets that are not part of the internal list are unlinked
3487 if(isExternalNode
== TRUE
)
3489 Remove((struct Node
*)removeCS
);
3491 // free all codesets data if requested
3492 if(freeCodesets
== TRUE
)
3494 if(removeCS
->name
!= NULL
)
3495 freeArbitrateVecPooled(removeCS
->name
);
3496 if(removeCS
->alt_name
!= NULL
)
3497 freeArbitrateVecPooled(removeCS
->alt_name
);
3498 if(removeCS
->characterization
!= NULL
)
3499 freeArbitrateVecPooled(removeCS
->characterization
);
3501 freeArbitrateVecPooled(removeCS
);
3507 W(DBF_ALWAYS
, "user tried to remove an internal codeset!");
// AmigaOS4-only varargs variant of CodesetsListRemoveA(): the tag list pointer
// is fetched with VA_ARG() and forwarded to the tag-list function.
3518 #if defined(__amigaos4__)
3519 LIBPROTOVA(CodesetsListRemove
, BOOL
, REG(a6
, UNUSED __BASE_OR_IFACE
), ...)
3524 VA_START(args
, ICodesets
);
3525 result
= CodesetsListRemoveA(VA_ARG(args
, struct TagItem
*));
3534 /**************************************************************************/