2 * Copyright (C) 2024 Mikulas Patocka
4 * This file is part of Ajla.
6 * Ajla is free software: you can redistribute it and/or modify it under the
7 * terms of the GNU General Public License as published by the Free Software
8 * Foundation, either version 3 of the License, or (at your option) any later version.
11 * Ajla is distributed in the hope that it will be useful, but WITHOUT ANY
12 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
13 * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License along with
16 * Ajla. If not, see <https://www.gnu.org/licenses/>.
{ Replacement values: error_char is what the decoders return for input they
  cannot decode, error_ascii is the byte the encoders emit for a character
  they cannot encode. }
24 const error_char : char;
25 const error_ascii : byte;
{ A char value is split into fields of combining_shift (21) bits. The lowest
  field holds the base Unicode code point; each field above it holds one
  combining code point. combining_mask and unicode_mask both evaluate to
  #1FFFFF, i.e. the mask of one field. }
27 const combining_shift : char := 21;
28 const combining_mask : char := (1 shl combining_shift) - 1;
29 const unicode_mask : char := #1FFFFF;
{ Number of combining code points packed into code. }
30 fn n_combining_characters(code : char) : int;
{ Validation, extraction of the base code point, and case conversion of
  single characters and of whole strings. }
32 fn char_validate~inline(c : char) : bool;
33 fn char_to_unicode~inline(c : char) : int;
34 fn char_upcase(c : char) : char;
35 fn char_locase(c : char) : char;
36 fn string_upcase(s : string) : string;
37 fn string_locase(s : string) : string;
{ ASCII-only helpers working on bytes. }
39 fn ascii_upcase~inline(ascii : byte) : byte;
40 fn ascii_locase~inline(ascii : byte) : byte;
41 fn ascii_to_string(ascii : bytes) : string;
42 fn string_to_ascii(str : string) : bytes;
{ UTF-8 validation, single-character decoding, and conversion in both
  directions. utf8_get_char returns the decoded character together with an
  int; NOTE(review): the int looks like the number of bytes consumed -
  confirm against the implementation. }
44 fn utf8_validate(utf8 : bytes) : bool;
45 fn utf8_get_char(utf8 : bytes) : (char, int);
46 fn utf8_to_string(utf8 : bytes) : string;
47 fn string_to_utf8(s : string) : bytes;
{ Character classes returned by classify_character.
  NOTE(review): zero/one/two presumably denote the display width in cells,
  with combining marked separately - confirm. }
49 const class_combining : int := -1;
50 const class_zero : int := 0;
51 const class_one : int := 1;
52 const class_two : int := 2;
53 fn classify_character(code : char) : int;
54 fn char_length~inline(c : char) : int;
55 fn string_length(s : string) : int;
{ Locale API. The locale_* functions and string_to_locale dispatch through
  the function fields stored in the locale value. }
68 fn locale_get_charset~inline(loc : locale) : charset_desc;
69 fn locale_validate_character~inline(loc : locale, code : char) : bool;
70 fn locale_validate~inline(loc : locale, b : bytes) : bool;
71 fn locale_get_char(loc : locale, b : bytes) : (char, int);
72 fn locale_to_string~inline(loc : locale, b : bytes) : string;
73 fn string_to_locale~inline(loc : locale, s : string) : bytes;
{ Charset-name normalisation and construction of locale values, either from
  a locale name (lc) or from an environment map. }
74 fn charset_name_normalize(charset : bytes) : bytes;
75 fn locale_get~cache(lc : bytes) : locale;
76 fn locale_init(env : treemap(bytes, bytes)) : locale;
77 fn locale_console_init(env : treemap(bytes, bytes)) : locale;
{ List of charset descriptors; the result is cached (~cache). }
79 fn charset_list~cache : list(charset_desc);
{ Code 10, the line feed character. }
90 const u_nl : char := 10;
{ Value returned in place of a character that could not be decoded. }
91 const error_char : char := -1;
{ Byte emitted in place of a character that could not be encoded. }
92 const error_ascii : byte := '*';
{ Threshold used by the *_to_string / string_to_* converters in this file:
  once the amount of data handled reaches it, the remainder of the conversion
  is delegated to a ~lazy recursive call. }
94 const lazy_buffer_size := 4096;
{ Lookup tables of an 8-bit charset; they are read through values of type
  charset_8bit by the ch8_* functions. }
{ Indexed with (byte - #80): the Unicode code of each byte in the upper half.
  ch8_get_char tests for a 0 entry explicitly. }
99 chr_to_unicode : array(char, [128]);
{ Indexed with a Unicode code: the corresponding byte of this charset.
  ch8_validate_character reports an entry > 0 as representable. }
100 unicode_to_chr : list(byte);
{ Returns how many combining code points are packed into code. }
103 fn n_combining_characters(code : char) : int
{ "or 1" keeps the argument of bsr non-zero. NOTE(review): bsr presumably
  returns the index of the highest set bit - confirm. }
105 var b := bsr(code or 1);
{ Each field is combining_shift bits wide, so the quotient is the index of
  the highest field in use; field 0 is the base code point. }
106 return b div combining_shift;
{ Validity check for a single Unicode code point (one field, no combining
  characters); used by char_validate and utf8_get_char. }
109 fn validate_unicode~inline(c : char) : bool
{ Range #D800 .. #DFFF, i.e. the UTF-16 surrogates. }
113 if c >= #D800, c < #E000 then
{ Validates a packed character: the base code point in the lowest field and
  every combining code point stored in the fields above it are each passed to
  validate_unicode. }
118 fn char_validate~inline(c : char) : bool
{ Base code point. }
122 if not validate_unicode(c and unicode_mask) then
124 for i := 0 to n_combining_characters(c) do [
{ Field i + 1 holds the i-th combining code point. }
125 var cc := (c shr (i + 1) * combining_shift);
126 if not validate_unicode(cc and unicode_mask) then
{ Returns the base Unicode code point of c, dropping any combining fields. }
132 fn char_to_unicode~inline(c : char) : int
134 return c and unicode_mask;
{ Shared helper of char_upcase and char_locase, which pass uni_upcase and
  uni_locase as the table a. The result is c with its base code point
  replaced by v and its combining fields kept.
  NOTE(review): v is presumably looked up in a at index u - confirm. }
137 fn uplocase~inline(a : array(char, [#110000]), c : char) : char
139 if not char_validate(c) then
{ u is the base code point without the combining fields. }
141 var u := c and unicode_mask;
{ Keep every bit above the base field and put v into the base field. }
145 return c and not unicode_mask or v;
{ Converts one character to upper case via uplocase and the uni_upcase table. }
148 fn char_upcase(c : char) : char
150 return uplocase(uni_upcase, c);
{ Converts one character to lower case via uplocase and the uni_locase table. }
153 fn char_locase(c : char) : char
155 return uplocase(uni_locase, c);
{ Upper-cases a string by applying char_upcase to every element of s. }
158 fn string_upcase(s : string) : string
160 for i := 0 to len(s) do
161 s[i] := char_upcase~inline(s[i]);
{ Lower-cases a string by applying char_locase to every element of s. }
165 fn string_locase(s : string) : string
167 for i := 0 to len(s) do
168 s[i] := char_locase~inline(s[i]);
{ ASCII-only upper-casing of one byte. }
172 fn ascii_upcase~inline(ascii : byte) : byte
{ Only bytes in 'a' .. 'z' are candidates for conversion. }
174 if ascii >= 'a', ascii <= 'z' then
{ ASCII-only lower-casing of one byte. }
179 fn ascii_locase~inline(ascii : byte) : byte
{ Only bytes in 'A' .. 'Z' are candidates for conversion. }
181 if ascii >= 'A', ascii <= 'Z' then
{ Converts a byte sequence to a string: starts from an empty list of char and
  visits every byte of ascii by index. }
186 fn ascii_to_string(ascii : bytes) : string
188 var s := empty(char);
189 for i := 0 to len(ascii) do [
{ Converts a string to bytes: starts from an empty list of byte and visits
  every character of str by index. }
199 fn string_to_ascii(str : string) : bytes
201 var s := empty(byte);
202 for i := 0 to len(str) do [
{ Characters in 0 .. 127 are the ASCII range. NOTE(review): anything else
  presumably becomes error_ascii - confirm. }
204 if c >= 0, c < 128 then
{ Checks whether utf8 is well-formed UTF-8. The input is scanned with index i;
  each sequence is decoded into code, which must also pass char_validate. }
212 fn utf8_validate(utf8 : bytes) : bool
215 while len_greater_than(utf8, i) do [
{ Lead byte below #80: single-byte (ASCII) character. }
218 if utf8[i] < #80 then [
{ #80 .. #bf: a continuation byte where a lead byte is expected. }
221 ] else if utf8[i] < #c0 then [
{ #c0 .. #df: two-byte sequence; taken only when both bytes are present. }
223 ] else if utf8[i] < #e0, len_at_least(utf8, i + 2) then [
{ True when the second byte is not a continuation byte (#80 .. #bf). }
224 if utf8[i + 1] < #80 or utf8[i + 1] >= #c0 then
226 code := (#1f and utf8[i]) shl 6 or (#3f and utf8[i + 1]);
{ #e0 .. #ef: three-byte sequence; taken only when all three bytes are present. }
230 ] else if utf8[i] < #f0, len_at_least(utf8, i + 3) then [
231 if utf8[i + 1] < #80 or utf8[i + 1] >= #c0 then
233 if utf8[i + 2] < #80 or utf8[i + 2] >= #c0 then
235 code := (#0f and utf8[i]) shl 12 or (#3f and utf8[i + 1]) shl 6 or (#3f and utf8[i + 2]);
{ #f0 .. #f7: four-byte sequence; taken only when all four bytes are present. }
239 ] else if utf8[i] < #f8, len_at_least(utf8, i + 4) then [
240 if utf8[i + 1] < #80 or utf8[i + 1] >= #c0 then
242 if utf8[i + 2] < #80 or utf8[i + 2] >= #c0 then
244 if utf8[i + 3] < #80 or utf8[i + 3] >= #c0 then
246 code := (#07 and utf8[i]) shl 18 or (#3f and utf8[i + 1]) shl 12 or (#3f and utf8[i + 2]) shl 6 or (#3f and utf8[i + 3]);
{ A four-byte sequence must encode a value in #10000 .. #10ffff; smaller
  values are overlong, larger ones lie beyond the Unicode range. }
247 if code < #10000 or code > #10ffff then
253 if not char_validate(code) then
{ Combining-class code point while st = 0.
  NOTE(review): st presumably records whether a base character precedes the
  combining one - confirm. }
255 if st = 0, classify_character(code) = class_combining then
{ Decodes one character from the start of utf8: a base code point followed by
  any combining code points, which are packed into the fields above the base.
  Returns the character and an int; on failure the character is error_char.
  NOTE(review): the int looks like the number of bytes consumed - confirm. }
261 fn utf8_get_char(utf8 : bytes) : (char, int)
{ -1 marks "no base code point decoded yet". }
264 var result : char := -1;
265 while len_greater_than(utf8, i) do [
{ Lead byte below #80: single-byte (ASCII) character. }
268 if utf8[i] < #80 then [
{ #80 .. #bf: a continuation byte where a lead byte is expected. }
271 ] else if utf8[i] < #c0 then [
{ #c0 .. #df: two-byte sequence. Unlike utf8_validate, the length is checked
  inside the branch. }
273 ] else if utf8[i] < #e0 then [
274 if not len_at_least(utf8, i + 2) then
{ True when the second byte is not a continuation byte (#80 .. #bf). }
276 if utf8[i + 1] < #80 or utf8[i + 1] >= #c0 then
278 code := (#1f and utf8[i]) shl 6 or (#3f and utf8[i + 1]);
{ #e0 .. #ef: three-byte sequence. }
282 ] else if utf8[i] < #f0 then [
283 if not len_at_least(utf8, i + 3) then
285 if utf8[i + 1] < #80 or utf8[i + 1] >= #c0 then
287 if utf8[i + 2] < #80 or utf8[i + 2] >= #c0 then
289 code := (#0f and utf8[i]) shl 12 or (#3f and utf8[i + 1]) shl 6 or (#3f and utf8[i + 2]);
{ #f0 .. #f7: four-byte sequence. }
293 ] else if utf8[i] < #f8 then [
294 if not len_at_least(utf8, i + 4) then
296 if utf8[i + 1] < #80 or utf8[i + 1] >= #c0 then
298 if utf8[i + 2] < #80 or utf8[i + 2] >= #c0 then
300 if utf8[i + 3] < #80 or utf8[i + 3] >= #c0 then
302 code := (#07 and utf8[i]) shl 18 or (#3f and utf8[i + 1]) shl 12 or (#3f and utf8[i + 2]) shl 6 or (#3f and utf8[i + 3]);
{ Values below #10000 do not need four bytes (overlong encoding). }
303 if code < #10000 then
{ Error result that reports position st + 1.
  NOTE(review): st presumably is the index where the current sequence
  started - confirm. }
310 return error_char, st + 1;
312 if not validate_unicode(code) then
{ Combining code point: it is appended to the character decoded so far. }
314 if classify_character(code) = class_combining then [
316 return error_char, i;
{ Store the combining code point in the first field above those in use. }
317 var n := n_combining_characters(result);
318 result or= code shl (n + 1) * combining_shift;
{ Error result whose second value is chosen between 0 and 1 by select,
  depending on whether utf8 is non-empty. }
326 return error_char, select(len_greater_than(utf8, 0), 0, 1);
{ Converts UTF-8 bytes to a string by calling utf8_get_char repeatedly until
  the input is empty. }
330 fn utf8_to_string(utf8 : bytes) : string
332 var s := empty(char);
334 while len_greater_than(utf8, 0) do [
{ c is the decoded character, i the second value returned by utf8_get_char. }
335 var c, i := utf8_get_char~inline(utf8);
{ After lazy_buffer_size units of input, return what was built so far and
  leave the remainder to a lazily evaluated recursive call. }
339 if processed >= lazy_buffer_size then
340 return s + utf8_to_string~lazy(utf8);
{ Encodes a single Unicode code point as 1 to 4 UTF-8 bytes. Code points that
  cannot be encoded yield the single byte error_ascii. }
345 fn char_to_utf8~inline(code : char) : bytes
348 return bytes.[ error_ascii ];
{ Below #80: one byte, the code itself. }
349 else if code < #80 then
350 return bytes.[ code ];
{ Below #800: two bytes, lead #c0 plus one continuation byte. }
351 else if code < #800 then
352 return bytes.[ code shr 6 or #c0, code and #3f or #80 ];
{ Below #10000: three bytes, lead #e0 plus two continuation bytes. }
353 else if code < #10000 then
354 return bytes.[ code shr 12 or #e0, code shr 6 and #3f or #80, code and #3f or #80 ];
{ Below #110000: four bytes, lead #f0 plus three continuation bytes. }
355 else if code < #110000 then
356 return bytes.[ code shr 18 or #f0, code shr 12 and #3f or #80, code shr 6 and #3f or #80, code and #3f or #80 ];
{ #110000 and above is outside the Unicode range. }
358 return bytes.[ error_ascii ];
{ Converts a string to UTF-8 bytes. Each character is emitted as its base code
  point followed by its combining code points. }
361 fn string_to_utf8(s : string) : bytes
363 var result := empty(byte);
364 for i in list_iterator(s) do [
{ Past lazy_buffer_size characters, the rest of s is converted by a lazily
  evaluated recursive call. }
365 if i >= lazy_buffer_size then
366 return result + string_to_utf8~lazy(s[i .. ]);
{ Encode the lowest field, then shift the next field down into its place. }
369 result += char_to_utf8(code and unicode_mask);
370 code shr= combining_shift;
{ Tells whether character c can be represented in the 8-bit charset ch8. }
377 fn ch8_validate_character(ch8 : charset_8bit, c : char) : bool
379 if not char_validate(c) then
{ Characters that carry combining code points are handled separately.
  NOTE(review): presumably rejected, since one byte cannot hold them - confirm. }
381 if n_combining_characters(c) > 0 then
{ A non-zero entry in the reverse table means the charset has a byte for c. }
386 return ch8.unicode_to_chr[c] > 0;
{ Validates a byte sequence against the 8-bit charset ch8, visiting every byte
  of ch by index. }
389 fn ch8_validate(ch8 : charset_8bit, ch : bytes) : bool
391 for i := 0 to len(ch) do [
{ Bytes from #80 up are translated through the upper-half table. }
396 var uni := ch8.chr_to_unicode[b - #80];
{ Decodes one character from the start of ch using the 8-bit charset ch8.
  Returns the character and an int; NOTE(review): the int presumably is the
  number of bytes consumed - confirm. }
404 fn ch8_get_char(ch8 : charset_8bit, ch : bytes) : (char, int)
{ Empty input: error_char with 0. }
406 if not len_greater_than(ch, 0) then
407 return error_char, 0;
{ Bytes from #80 up are translated through the upper-half table. }
411 var uni := ch8.chr_to_unicode[b - #80];
{ A 0 entry, or a code point classified as combining, is handled specially. }
412 if uni = 0 or classify_character(uni) = class_combining then
{ Converts bytes in the 8-bit charset ch8 to a string by calling ch8_get_char
  repeatedly until the input is empty. }
417 fn ch8_to_string(ch8 : charset_8bit, ch : bytes) : string
419 var s := empty(char);
420 while len_greater_than(ch, 0) do [
421 var c, i := ch8_get_char~inline(ch8, ch);
{ Once lazy_buffer_size characters were produced, the remainder is converted
  by a lazily evaluated recursive call. }
424 if len(s) >= lazy_buffer_size then
425 return s + ch8_to_string~lazy(ch8, ch);
{ Converts a string to bytes in the 8-bit charset ch8. Characters without a
  representation are replaced by error_ascii. }
430 fn string_to_ch8(ch8 : charset_8bit, s : string) : bytes
432 var result := empty(byte);
433 for i in list_iterator(s) do [
{ Past lazy_buffer_size characters, the rest of s is converted by a lazily
  evaluated recursive call. }
434 if i >= lazy_buffer_size then
435 return result + string_to_ch8~lazy(ch8, s[i .. ]);
439 result +<= error_ascii;
{ Codes below #80 form the ASCII half of the charset. }
440 ] else if uni < #80 then [
442 ] else if uni < #110000 then [
{ Reverse lookup: the charset byte for this code point. }
443 var b := ch8.unicode_to_chr[uni];
447 var cls := classify_character(uni);
{ Try the replacement code point listed in the uni_fallback table. }
449 uni := uni_fallback[uni];
452 result +<= error_ascii;
{ 2 is the value of class_two; such characters are replaced by two
  error_ascii bytes instead of one. }
453 ] else if cls = 2 then [
454 result += [ error_ascii, error_ascii ];
458 result +<= error_ascii;
{ Returns the class of the base code point of code (one of the class_*
  constants), found by binary search in uni_table. }
464 fn classify_character(code : char) : int
{ Only the base code point matters; combining fields are dropped. }
466 code and= unicode_mask;
{ Codes #20 .. #7E (printable ASCII) are tested first, before the table search. }
468 if code >= #20, code < #7F then
{ uni_table is read as consecutive pairs: key at index 2k, class at 2k + 1. }
471 var table := uni_table;
{ Number of pairs in the table. }
473 var end := len(table) shr 1;
475 while start < end - 1 do [
476 var mid := start + end shr 1;
477 if table[mid shl 1] > code then [
{ Class stored next to the key at pair index start. }
484 return table[(start shl 1) + 1];
{ Length of a single character, derived from its class.
  NOTE(review): presumably the display width in cells - confirm. }
487 fn char_length~inline(c : char) : int
{ Codes #20 .. #7E (printable ASCII) are tested first, before classify_character. }
489 if c >= #20, c < #7F then
492 var cls := classify_character(c);
{ Length of a string computed from the class of every character in s.
  NOTE(review): presumably the total display width in cells - confirm. }
498 fn string_length(s : string) : int
501 for i := 0 to len(s) do [
502 var cls := classify_character(s[i]);
{ Fields of a locale value: the charset descriptor plus the conversion
  routines bound to that charset. The public functions of the same names
  (taking loc as first argument) simply call these fields. }
511 charset : charset_desc;
512 locale_validate_character : fn(char) : bool;
513 locale_validate : fn(bytes) : bool;
514 locale_get_char : fn(bytes) : (char, int);
515 locale_to_string : fn(bytes) : string;
516 string_to_locale : fn(string) : bytes;
{ Name of the charset used for a language entry when none is given
  explicitly; set for every entry of the languages table. }
521 default_charset : bytes;
{ Table of known locale names with their default charset. find_language
  searches it by bisection using ordered comparison of loc_name, so the
  entries must stay in ascending loc_name order. }
524 const languages~cache : list(language) := [
525 language.[ loc_name : "C", default_charset : "us-ascii" ],
526 language.[ loc_name : "aa_DJ", default_charset : "ISO-8859-1" ],
527 language.[ loc_name : "aa_ER", default_charset : "UTF-8" ],
528 language.[ loc_name : "aa_ER@saaho", default_charset : "UTF-8" ],
529 language.[ loc_name : "aa_ET", default_charset : "UTF-8" ],
530 language.[ loc_name : "af_ZA", default_charset : "ISO-8859-1" ],
531 language.[ loc_name : "agr_PE", default_charset : "UTF-8" ],
532 language.[ loc_name : "ak_GH", default_charset : "UTF-8" ],
533 language.[ loc_name : "am_ET", default_charset : "UTF-8" ],
534 language.[ loc_name : "an_ES", default_charset : "ISO-8859-15" ],
535 language.[ loc_name : "anp_IN", default_charset : "UTF-8" ],
536 language.[ loc_name : "ar_AE", default_charset : "ISO-8859-6" ],
537 language.[ loc_name : "ar_BH", default_charset : "ISO-8859-6" ],
538 language.[ loc_name : "ar_DZ", default_charset : "ISO-8859-6" ],
539 language.[ loc_name : "ar_EG", default_charset : "ISO-8859-6" ],
540 language.[ loc_name : "ar_IN", default_charset : "UTF-8" ],
541 language.[ loc_name : "ar_IQ", default_charset : "ISO-8859-6" ],
542 language.[ loc_name : "ar_JO", default_charset : "ISO-8859-6" ],
543 language.[ loc_name : "ar_KW", default_charset : "ISO-8859-6" ],
544 language.[ loc_name : "ar_LB", default_charset : "ISO-8859-6" ],
545 language.[ loc_name : "ar_LY", default_charset : "ISO-8859-6" ],
546 language.[ loc_name : "ar_MA", default_charset : "ISO-8859-6" ],
547 language.[ loc_name : "ar_OM", default_charset : "ISO-8859-6" ],
548 language.[ loc_name : "ar_QA", default_charset : "ISO-8859-6" ],
549 language.[ loc_name : "ar_SA", default_charset : "ISO-8859-6" ],
550 language.[ loc_name : "ar_SD", default_charset : "ISO-8859-6" ],
551 language.[ loc_name : "ar_SS", default_charset : "UTF-8" ],
552 language.[ loc_name : "ar_SY", default_charset : "ISO-8859-6" ],
553 language.[ loc_name : "ar_TN", default_charset : "ISO-8859-6" ],
554 language.[ loc_name : "ar_YE", default_charset : "ISO-8859-6" ],
555 language.[ loc_name : "as_IN", default_charset : "UTF-8" ],
556 language.[ loc_name : "ast_ES", default_charset : "ISO-8859-15" ],
557 language.[ loc_name : "ayc_PE", default_charset : "UTF-8" ],
558 language.[ loc_name : "az_AZ", default_charset : "UTF-8" ],
559 language.[ loc_name : "az_IR", default_charset : "UTF-8" ],
560 language.[ loc_name : "be_BY", default_charset : "CP1251" ],
561 language.[ loc_name : "be_BY@latin", default_charset : "UTF-8" ],
562 language.[ loc_name : "bem_ZM", default_charset : "UTF-8" ],
563 language.[ loc_name : "ber_DZ", default_charset : "UTF-8" ],
564 language.[ loc_name : "ber_MA", default_charset : "UTF-8" ],
565 language.[ loc_name : "bg_BG", default_charset : "CP1251" ],
566 language.[ loc_name : "bhb_IN", default_charset : "UTF-8" ],
567 language.[ loc_name : "bho_IN", default_charset : "UTF-8" ],
568 language.[ loc_name : "bho_NP", default_charset : "UTF-8" ],
569 language.[ loc_name : "bi_VU", default_charset : "UTF-8" ],
570 language.[ loc_name : "bn_BD", default_charset : "UTF-8" ],
571 language.[ loc_name : "bn_IN", default_charset : "UTF-8" ],
572 language.[ loc_name : "bo_CN", default_charset : "UTF-8" ],
573 language.[ loc_name : "bo_IN", default_charset : "UTF-8" ],
574 language.[ loc_name : "br_FR", default_charset : "ISO-8859-1" ],
575 language.[ loc_name : "br_FR@euro", default_charset : "ISO-8859-15" ],
576 language.[ loc_name : "brx_IN", default_charset : "UTF-8" ],
577 language.[ loc_name : "bs_BA", default_charset : "ISO-8859-2" ],
578 language.[ loc_name : "byn_ER", default_charset : "UTF-8" ],
579 language.[ loc_name : "ca_AD", default_charset : "ISO-8859-15" ],
580 language.[ loc_name : "ca_ES", default_charset : "ISO-8859-1" ],
581 language.[ loc_name : "ca_ES@euro", default_charset : "ISO-8859-15" ],
582 language.[ loc_name : "ca_ES@valencia", default_charset : "UTF-8" ],
583 language.[ loc_name : "ca_FR", default_charset : "ISO-8859-15" ],
584 language.[ loc_name : "ca_IT", default_charset : "ISO-8859-15" ],
585 language.[ loc_name : "ce_RU", default_charset : "UTF-8" ],
586 language.[ loc_name : "chr_US", default_charset : "UTF-8" ],
587 language.[ loc_name : "cmn_TW", default_charset : "UTF-8" ],
588 language.[ loc_name : "crh_UA", default_charset : "UTF-8" ],
589 language.[ loc_name : "cs_CZ", default_charset : "ISO-8859-2" ],
590 language.[ loc_name : "csb_PL", default_charset : "UTF-8" ],
591 language.[ loc_name : "cv_RU", default_charset : "UTF-8" ],
592 language.[ loc_name : "cy_GB", default_charset : "ISO-8859-14" ],
593 language.[ loc_name : "da_DK", default_charset : "ISO-8859-1" ],
594 language.[ loc_name : "de_AT", default_charset : "ISO-8859-1" ],
595 language.[ loc_name : "de_AT@euro", default_charset : "ISO-8859-15" ],
596 language.[ loc_name : "de_BE", default_charset : "ISO-8859-1" ],
597 language.[ loc_name : "de_BE@euro", default_charset : "ISO-8859-15" ],
598 language.[ loc_name : "de_CH", default_charset : "ISO-8859-1" ],
599 language.[ loc_name : "de_DE", default_charset : "ISO-8859-1" ],
600 language.[ loc_name : "de_DE@euro", default_charset : "ISO-8859-15" ],
601 language.[ loc_name : "de_IT", default_charset : "ISO-8859-1" ],
602 language.[ loc_name : "de_LI", default_charset : "UTF-8" ],
603 language.[ loc_name : "de_LU", default_charset : "ISO-8859-1" ],
604 language.[ loc_name : "de_LU@euro", default_charset : "ISO-8859-15" ],
605 language.[ loc_name : "doi_IN", default_charset : "UTF-8" ],
606 language.[ loc_name : "dsb_DE", default_charset : "UTF-8" ],
607 language.[ loc_name : "dv_MV", default_charset : "UTF-8" ],
608 language.[ loc_name : "dz_BT", default_charset : "UTF-8" ],
609 language.[ loc_name : "el_CY", default_charset : "ISO-8859-7" ],
610 language.[ loc_name : "el_GR", default_charset : "ISO-8859-7" ],
611 language.[ loc_name : "el_GR@euro", default_charset : "ISO-8859-7" ],
612 language.[ loc_name : "en_AG", default_charset : "UTF-8" ],
613 language.[ loc_name : "en_AU", default_charset : "ISO-8859-1" ],
614 language.[ loc_name : "en_BW", default_charset : "ISO-8859-1" ],
615 language.[ loc_name : "en_CA", default_charset : "ISO-8859-1" ],
616 language.[ loc_name : "en_DK", default_charset : "ISO-8859-1" ],
617 language.[ loc_name : "en_GB", default_charset : "ISO-8859-1" ],
618 language.[ loc_name : "en_HK", default_charset : "ISO-8859-1" ],
619 language.[ loc_name : "en_IE", default_charset : "ISO-8859-1" ],
620 language.[ loc_name : "en_IE@euro", default_charset : "ISO-8859-15" ],
621 language.[ loc_name : "en_IL", default_charset : "UTF-8" ],
622 language.[ loc_name : "en_IN", default_charset : "UTF-8" ],
623 language.[ loc_name : "en_NG", default_charset : "UTF-8" ],
624 language.[ loc_name : "en_NZ", default_charset : "ISO-8859-1" ],
625 language.[ loc_name : "en_PH", default_charset : "ISO-8859-1" ],
626 language.[ loc_name : "en_SC", default_charset : "UTF-8" ],
627 language.[ loc_name : "en_SG", default_charset : "ISO-8859-1" ],
628 language.[ loc_name : "en_US", default_charset : "ISO-8859-1" ],
629 language.[ loc_name : "en_ZA", default_charset : "ISO-8859-1" ],
630 language.[ loc_name : "en_ZM", default_charset : "UTF-8" ],
631 language.[ loc_name : "en_ZW", default_charset : "ISO-8859-1" ],
632 language.[ loc_name : "eo", default_charset : "UTF-8" ],
633 language.[ loc_name : "es_AR", default_charset : "ISO-8859-1" ],
634 language.[ loc_name : "es_BO", default_charset : "ISO-8859-1" ],
635 language.[ loc_name : "es_CL", default_charset : "ISO-8859-1" ],
636 language.[ loc_name : "es_CO", default_charset : "ISO-8859-1" ],
637 language.[ loc_name : "es_CR", default_charset : "ISO-8859-1" ],
638 language.[ loc_name : "es_CU", default_charset : "UTF-8" ],
639 language.[ loc_name : "es_DO", default_charset : "ISO-8859-1" ],
640 language.[ loc_name : "es_EC", default_charset : "ISO-8859-1" ],
641 language.[ loc_name : "es_ES", default_charset : "ISO-8859-1" ],
642 language.[ loc_name : "es_ES@euro", default_charset : "ISO-8859-15" ],
643 language.[ loc_name : "es_GT", default_charset : "ISO-8859-1" ],
644 language.[ loc_name : "es_HN", default_charset : "ISO-8859-1" ],
645 language.[ loc_name : "es_MX", default_charset : "ISO-8859-1" ],
646 language.[ loc_name : "es_NI", default_charset : "ISO-8859-1" ],
647 language.[ loc_name : "es_PA", default_charset : "ISO-8859-1" ],
648 language.[ loc_name : "es_PE", default_charset : "ISO-8859-1" ],
649 language.[ loc_name : "es_PR", default_charset : "ISO-8859-1" ],
650 language.[ loc_name : "es_PY", default_charset : "ISO-8859-1" ],
651 language.[ loc_name : "es_SV", default_charset : "ISO-8859-1" ],
652 language.[ loc_name : "es_US", default_charset : "ISO-8859-1" ],
653 language.[ loc_name : "es_UY", default_charset : "ISO-8859-1" ],
654 language.[ loc_name : "es_VE", default_charset : "ISO-8859-1" ],
655 language.[ loc_name : "et_EE", default_charset : "ISO-8859-1" ],
656 language.[ loc_name : "eu_ES", default_charset : "ISO-8859-1" ],
657 language.[ loc_name : "eu_ES@euro", default_charset : "ISO-8859-15" ],
658 language.[ loc_name : "eu_FR", default_charset : "ISO-8859-1" ],
659 language.[ loc_name : "eu_FR@euro", default_charset : "ISO-8859-15" ],
660 language.[ loc_name : "fa_IR", default_charset : "UTF-8" ],
661 language.[ loc_name : "ff_SN", default_charset : "UTF-8" ],
662 language.[ loc_name : "fi_FI", default_charset : "ISO-8859-1" ],
663 language.[ loc_name : "fi_FI@euro", default_charset : "ISO-8859-15" ],
664 language.[ loc_name : "fil_PH", default_charset : "UTF-8" ],
665 language.[ loc_name : "fo_FO", default_charset : "ISO-8859-1" ],
666 language.[ loc_name : "fr_BE", default_charset : "ISO-8859-1" ],
667 language.[ loc_name : "fr_BE@euro", default_charset : "ISO-8859-15" ],
668 language.[ loc_name : "fr_CA", default_charset : "ISO-8859-1" ],
669 language.[ loc_name : "fr_CH", default_charset : "ISO-8859-1" ],
670 language.[ loc_name : "fr_FR", default_charset : "ISO-8859-1" ],
671 language.[ loc_name : "fr_FR@euro", default_charset : "ISO-8859-15" ],
672 language.[ loc_name : "fr_LU", default_charset : "ISO-8859-1" ],
673 language.[ loc_name : "fr_LU@euro", default_charset : "ISO-8859-15" ],
674 language.[ loc_name : "fur_IT", default_charset : "UTF-8" ],
675 language.[ loc_name : "fy_DE", default_charset : "UTF-8" ],
676 language.[ loc_name : "fy_NL", default_charset : "UTF-8" ],
677 language.[ loc_name : "ga_IE", default_charset : "ISO-8859-1" ],
678 language.[ loc_name : "ga_IE@euro", default_charset : "ISO-8859-15" ],
679 language.[ loc_name : "gd_GB", default_charset : "ISO-8859-15" ],
680 language.[ loc_name : "gez_ER", default_charset : "UTF-8" ],
681 language.[ loc_name : "gez_ER@abegede", default_charset : "UTF-8" ],
682 language.[ loc_name : "gez_ET", default_charset : "UTF-8" ],
683 language.[ loc_name : "gez_ET@abegede", default_charset : "UTF-8" ],
684 language.[ loc_name : "gl_ES", default_charset : "ISO-8859-1" ],
685 language.[ loc_name : "gl_ES@euro", default_charset : "ISO-8859-15" ],
686 language.[ loc_name : "gu_IN", default_charset : "UTF-8" ],
687 language.[ loc_name : "gv_GB", default_charset : "ISO-8859-1" ],
688 language.[ loc_name : "ha_NG", default_charset : "UTF-8" ],
689 language.[ loc_name : "hak_TW", default_charset : "UTF-8" ],
690 language.[ loc_name : "he_IL", default_charset : "ISO-8859-8" ],
691 language.[ loc_name : "hi_IN", default_charset : "UTF-8" ],
692 language.[ loc_name : "hif_FJ", default_charset : "UTF-8" ],
693 language.[ loc_name : "hne_IN", default_charset : "UTF-8" ],
694 language.[ loc_name : "hr_HR", default_charset : "ISO-8859-2" ],
695 language.[ loc_name : "hsb_DE", default_charset : "ISO-8859-2" ],
696 language.[ loc_name : "ht_HT", default_charset : "UTF-8" ],
697 language.[ loc_name : "hu_HU", default_charset : "ISO-8859-2" ],
698 language.[ loc_name : "hy_AM", default_charset : "UTF-8" ],
699 language.[ loc_name : "ia_FR", default_charset : "UTF-8" ],
700 language.[ loc_name : "id_ID", default_charset : "ISO-8859-1" ],
701 language.[ loc_name : "ig_NG", default_charset : "UTF-8" ],
702 language.[ loc_name : "ik_CA", default_charset : "UTF-8" ],
703 language.[ loc_name : "is_IS", default_charset : "ISO-8859-1" ],
704 language.[ loc_name : "it_CH", default_charset : "ISO-8859-1" ],
705 language.[ loc_name : "it_IT", default_charset : "ISO-8859-1" ],
706 language.[ loc_name : "it_IT@euro", default_charset : "ISO-8859-15" ],
707 language.[ loc_name : "iu_CA", default_charset : "UTF-8" ],
708 language.[ loc_name : "ja_JP", default_charset : "EUC-JP" ],
709 language.[ loc_name : "ka_GE", default_charset : "GEORGIAN-PS" ],
710 language.[ loc_name : "kab_DZ", default_charset : "UTF-8" ],
711 language.[ loc_name : "kk_KZ", default_charset : "PT154" ],
712 language.[ loc_name : "kl_GL", default_charset : "ISO-8859-1" ],
713 language.[ loc_name : "km_KH", default_charset : "UTF-8" ],
714 language.[ loc_name : "kn_IN", default_charset : "UTF-8" ],
715 language.[ loc_name : "ko_KR", default_charset : "EUC-KR" ],
716 language.[ loc_name : "kok_IN", default_charset : "UTF-8" ],
717 language.[ loc_name : "ks_IN", default_charset : "UTF-8" ],
718 language.[ loc_name : "ks_IN@devanagari", default_charset : "UTF-8" ],
719 language.[ loc_name : "ku_TR", default_charset : "ISO-8859-9" ],
720 language.[ loc_name : "kw_GB", default_charset : "ISO-8859-1" ],
721 language.[ loc_name : "ky_KG", default_charset : "UTF-8" ],
722 language.[ loc_name : "lb_LU", default_charset : "UTF-8" ],
723 language.[ loc_name : "lg_UG", default_charset : "ISO-8859-10" ],
724 language.[ loc_name : "li_BE", default_charset : "UTF-8" ],
725 language.[ loc_name : "li_NL", default_charset : "UTF-8" ],
726 language.[ loc_name : "lij_IT", default_charset : "UTF-8" ],
727 language.[ loc_name : "ln_CD", default_charset : "UTF-8" ],
728 language.[ loc_name : "lo_LA", default_charset : "UTF-8" ],
729 language.[ loc_name : "lt_LT", default_charset : "ISO-8859-13" ],
730 language.[ loc_name : "lv_LV", default_charset : "ISO-8859-13" ],
731 language.[ loc_name : "lzh_TW", default_charset : "UTF-8" ],
732 language.[ loc_name : "mag_IN", default_charset : "UTF-8" ],
733 language.[ loc_name : "mai_IN", default_charset : "UTF-8" ],
734 language.[ loc_name : "mai_NP", default_charset : "UTF-8" ],
735 language.[ loc_name : "mfe_MU", default_charset : "UTF-8" ],
736 language.[ loc_name : "mg_MG", default_charset : "ISO-8859-15" ],
737 language.[ loc_name : "mhr_RU", default_charset : "UTF-8" ],
738 language.[ loc_name : "mi_NZ", default_charset : "ISO-8859-13" ],
739 language.[ loc_name : "miq_NI", default_charset : "UTF-8" ],
740 language.[ loc_name : "mjw_IN", default_charset : "UTF-8" ],
741 language.[ loc_name : "mk_MK", default_charset : "ISO-8859-5" ],
742 language.[ loc_name : "ml_IN", default_charset : "UTF-8" ],
743 language.[ loc_name : "mn_MN", default_charset : "UTF-8" ],
744 language.[ loc_name : "mni_IN", default_charset : "UTF-8" ],
745 language.[ loc_name : "mnw_MM", default_charset : "UTF-8" ],
746 language.[ loc_name : "mr_IN", default_charset : "UTF-8" ],
747 language.[ loc_name : "ms_MY", default_charset : "ISO-8859-1" ],
748 language.[ loc_name : "mt_MT", default_charset : "ISO-8859-3" ],
749 language.[ loc_name : "my_MM", default_charset : "UTF-8" ],
750 language.[ loc_name : "nan_TW", default_charset : "UTF-8" ],
751 language.[ loc_name : "nan_TW@latin", default_charset : "UTF-8" ],
752 language.[ loc_name : "nb_NO", default_charset : "ISO-8859-1" ],
753 language.[ loc_name : "nds_DE", default_charset : "UTF-8" ],
754 language.[ loc_name : "nds_NL", default_charset : "UTF-8" ],
755 language.[ loc_name : "ne_NP", default_charset : "UTF-8" ],
756 language.[ loc_name : "nhn_MX", default_charset : "UTF-8" ],
757 language.[ loc_name : "niu_NU", default_charset : "UTF-8" ],
758 language.[ loc_name : "niu_NZ", default_charset : "UTF-8" ],
759 language.[ loc_name : "nl_AW", default_charset : "UTF-8" ],
760 language.[ loc_name : "nl_BE", default_charset : "ISO-8859-1" ],
761 language.[ loc_name : "nl_BE@euro", default_charset : "ISO-8859-15" ],
762 language.[ loc_name : "nl_NL", default_charset : "ISO-8859-1" ],
763 language.[ loc_name : "nl_NL@euro", default_charset : "ISO-8859-15" ],
764 language.[ loc_name : "nn_NO", default_charset : "ISO-8859-1" ],
765 language.[ loc_name : "nr_ZA", default_charset : "UTF-8" ],
766 language.[ loc_name : "nso_ZA", default_charset : "UTF-8" ],
767 language.[ loc_name : "oc_FR", default_charset : "ISO-8859-1" ],
768 language.[ loc_name : "om_ET", default_charset : "UTF-8" ],
769 language.[ loc_name : "om_KE", default_charset : "ISO-8859-1" ],
770 language.[ loc_name : "or_IN", default_charset : "UTF-8" ],
771 language.[ loc_name : "os_RU", default_charset : "UTF-8" ],
772 language.[ loc_name : "pa_IN", default_charset : "UTF-8" ],
773 language.[ loc_name : "pa_PK", default_charset : "UTF-8" ],
774 language.[ loc_name : "pap_AW", default_charset : "UTF-8" ],
775 language.[ loc_name : "pap_CW", default_charset : "UTF-8" ],
776 language.[ loc_name : "pl_PL", default_charset : "ISO-8859-2" ],
777 language.[ loc_name : "ps_AF", default_charset : "UTF-8" ],
778 language.[ loc_name : "pt_BR", default_charset : "ISO-8859-1" ],
779 language.[ loc_name : "pt_PT", default_charset : "ISO-8859-1" ],
780 language.[ loc_name : "pt_PT@euro", default_charset : "ISO-8859-15" ],
781 language.[ loc_name : "quz_PE", default_charset : "UTF-8" ],
782 language.[ loc_name : "raj_IN", default_charset : "UTF-8" ],
783 language.[ loc_name : "ro_RO", default_charset : "ISO-8859-2" ],
784 language.[ loc_name : "ru_RU", default_charset : "ISO-8859-5" ],
785 language.[ loc_name : "ru_UA", default_charset : "KOI8-U" ],
786 language.[ loc_name : "rw_RW", default_charset : "UTF-8" ],
787 language.[ loc_name : "sa_IN", default_charset : "UTF-8" ],
788 language.[ loc_name : "sah_RU", default_charset : "UTF-8" ],
789 language.[ loc_name : "sat_IN", default_charset : "UTF-8" ],
790 language.[ loc_name : "sc_IT", default_charset : "UTF-8" ],
791 language.[ loc_name : "sd_IN", default_charset : "UTF-8" ],
792 language.[ loc_name : "sd_IN@devanagari", default_charset : "UTF-8" ],
793 language.[ loc_name : "se_NO", default_charset : "UTF-8" ],
794 language.[ loc_name : "sgs_LT", default_charset : "UTF-8" ],
795 language.[ loc_name : "shn_MM", default_charset : "UTF-8" ],
796 language.[ loc_name : "shs_CA", default_charset : "UTF-8" ],
797 language.[ loc_name : "si_LK", default_charset : "UTF-8" ],
798 language.[ loc_name : "sid_ET", default_charset : "UTF-8" ],
799 language.[ loc_name : "sk_SK", default_charset : "ISO-8859-2" ],
800 language.[ loc_name : "sl_SI", default_charset : "ISO-8859-2" ],
801 language.[ loc_name : "sm_WS", default_charset : "UTF-8" ],
802 language.[ loc_name : "so_DJ", default_charset : "ISO-8859-1" ],
803 language.[ loc_name : "so_ET", default_charset : "UTF-8" ],
804 language.[ loc_name : "so_KE", default_charset : "ISO-8859-1" ],
805 language.[ loc_name : "so_SO", default_charset : "ISO-8859-1" ],
806 language.[ loc_name : "sq_AL", default_charset : "ISO-8859-1" ],
807 language.[ loc_name : "sq_MK", default_charset : "UTF-8" ],
808 language.[ loc_name : "sr_ME", default_charset : "UTF-8" ],
809 language.[ loc_name : "sr_RS", default_charset : "UTF-8" ],
810 language.[ loc_name : "sr_RS@latin", default_charset : "UTF-8" ],
811 language.[ loc_name : "ss_ZA", default_charset : "UTF-8" ],
812 language.[ loc_name : "st_ZA", default_charset : "ISO-8859-1" ],
813 language.[ loc_name : "sv_FI", default_charset : "ISO-8859-1" ],
814 language.[ loc_name : "sv_FI@euro", default_charset : "ISO-8859-15" ],
815 language.[ loc_name : "sv_SE", default_charset : "ISO-8859-1" ],
816 language.[ loc_name : "sw_KE", default_charset : "UTF-8" ],
817 language.[ loc_name : "sw_TZ", default_charset : "UTF-8" ],
818 language.[ loc_name : "szl_PL", default_charset : "UTF-8" ],
819 language.[ loc_name : "ta_IN", default_charset : "UTF-8" ],
820 language.[ loc_name : "ta_LK", default_charset : "UTF-8" ],
821 language.[ loc_name : "tcy_IN", default_charset : "UTF-8" ],
822 language.[ loc_name : "te_IN", default_charset : "UTF-8" ],
823 language.[ loc_name : "tg_TJ", default_charset : "KOI8-T" ],
824 language.[ loc_name : "th_TH", default_charset : "TIS-620" ],
825 language.[ loc_name : "the_NP", default_charset : "UTF-8" ],
826 language.[ loc_name : "ti_ER", default_charset : "UTF-8" ],
827 language.[ loc_name : "ti_ET", default_charset : "UTF-8" ],
828 language.[ loc_name : "tig_ER", default_charset : "UTF-8" ],
829 language.[ loc_name : "tk_TM", default_charset : "UTF-8" ],
830 language.[ loc_name : "tl_PH", default_charset : "ISO-8859-1" ],
831 language.[ loc_name : "tn_ZA", default_charset : "UTF-8" ],
832 language.[ loc_name : "to_TO", default_charset : "UTF-8" ],
833 language.[ loc_name : "tpi_PG", default_charset : "UTF-8" ],
834 language.[ loc_name : "tr_CY", default_charset : "ISO-8859-9" ],
835 language.[ loc_name : "tr_TR", default_charset : "ISO-8859-9" ],
836 language.[ loc_name : "ts_ZA", default_charset : "UTF-8" ],
837 language.[ loc_name : "tt_RU", default_charset : "UTF-8" ],
838 language.[ loc_name : "tt_RU@iqtelif", default_charset : "UTF-8" ],
839 language.[ loc_name : "ug_CN", default_charset : "UTF-8" ],
840 language.[ loc_name : "uk_UA", default_charset : "KOI8-U" ],
841 language.[ loc_name : "unm_US", default_charset : "UTF-8" ],
842 language.[ loc_name : "ur_IN", default_charset : "UTF-8" ],
843 language.[ loc_name : "ur_PK", default_charset : "UTF-8" ],
844 language.[ loc_name : "uz_UZ", default_charset : "ISO-8859-1" ],
845 language.[ loc_name : "uz_UZ@cyrillic", default_charset : "UTF-8" ],
846 language.[ loc_name : "ve_ZA", default_charset : "UTF-8" ],
847 language.[ loc_name : "vi_VN", default_charset : "UTF-8" ],
848 language.[ loc_name : "wa_BE", default_charset : "ISO-8859-1" ],
849 language.[ loc_name : "wa_BE@euro", default_charset : "ISO-8859-15" ],
850 language.[ loc_name : "wae_CH", default_charset : "UTF-8" ],
851 language.[ loc_name : "wal_ET", default_charset : "UTF-8" ],
852 language.[ loc_name : "wo_SN", default_charset : "UTF-8" ],
853 language.[ loc_name : "xh_ZA", default_charset : "ISO-8859-1" ],
854 language.[ loc_name : "yi_US", default_charset : "CP1255" ],
855 language.[ loc_name : "yo_NG", default_charset : "UTF-8" ],
856 language.[ loc_name : "yue_HK", default_charset : "UTF-8" ],
857 language.[ loc_name : "yuw_PG", default_charset : "UTF-8" ],
858 language.[ loc_name : "zh_CN", default_charset : "GB2312" ],
859 language.[ loc_name : "zh_HK", default_charset : "BIG5-HKSCS" ],
860 language.[ loc_name : "zh_SG", default_charset : "GB2312" ],
861 language.[ loc_name : "zh_TW", default_charset : "BIG5" ],
862 language.[ loc_name : "zu_ZA", default_charset : "ISO-8859-1" ],
{*
 * Looks up the record of the languages table whose loc_name equals the
 * given locale name.
 *
 * The table is searched by repeatedly probing the midpoint of the range
 * start..end and comparing l.loc_name with loc_name, i.e. a binary search.
 * This relies on the table being sorted by loc_name in ascending order.
 *
 * The function can abort with an error_invalid_operation exception carrying
 * the message "Unknown language"; callers such as locale_get test the result
 * with is_exception.
 *}
865 fn find_language(loc_name : bytes) : language
867 var lng := languages;
870 while start < end do [
{ probe the middle of the remaining range; shr 1 halves the sum }
871 var middle := (start + end) shr 1;
872 var l := lng[middle];
{ the probed entry is the requested one }
873 if l.loc_name = loc_name then
{ the probed entry sorts before the requested name }
875 if l.loc_name < loc_name then
{ NOTE(review): presumably reached when the range is exhausted without a match - confirm }
880 abort exception_make_str(language, ec_sync, error_invalid_operation, 0, false, "Unknown language");
{*
 * Accessor that takes a locale and yields its charset descriptor
 * (a charset_desc value).
 *}
884 fn locale_get_charset~inline(loc : locale) : charset_desc
{*
 * Validates the character code against the locale loc.
 *
 * The check is delegated to the locale_validate_character function stored
 * in the locale record. locale_get stores char_validate there for UTF-8 and
 * ch8_validate_character applied to the charset record for 8-bit charsets.
 *}
889 fn locale_validate_character~inline(loc : locale, code : char) : bool
891 return loc.locale_validate_character(code);
{*
 * Validates the byte sequence b against the locale loc.
 *
 * The check is delegated to the locale_validate function stored in the
 * locale record. locale_get stores utf8_validate there for UTF-8 and
 * ch8_validate applied to the charset record for 8-bit charsets.
 *}
894 fn locale_validate~inline(loc : locale, b : bytes) : bool
896 return loc.locale_validate(b);
{*
 * Extracts a character from the byte sequence b using the locale loc.
 *
 * The work is delegated to the locale_get_char function stored in the
 * locale record. locale_get stores utf8_get_char there for UTF-8 and
 * ch8_get_char applied to the charset record for 8-bit charsets.
 *
 * Returns a pair of the character and an int.
 * NOTE(review): the int is presumably the number of bytes consumed - confirm.
 *}
899 fn locale_get_char(loc : locale, b : bytes) : (char, int)
901 return loc.locale_get_char(b);
{*
 * Converts the bytes b, taken to be in the charset of the locale loc, to
 * a string.
 *
 * The work is delegated to the locale_to_string function stored in the
 * locale record. locale_get stores utf8_to_string there for UTF-8 and
 * ch8_to_string applied to the charset record for 8-bit charsets.
 *}
904 fn locale_to_string~inline(loc : locale, b : bytes) : string
906 return loc.locale_to_string(b);
{*
 * Converts the string s to bytes for the locale loc.
 *
 * The work is delegated to the string_to_locale function stored in the
 * locale record. locale_get stores string_to_utf8 there for UTF-8 and
 * string_to_ch8 applied to the charset record for 8-bit charsets.
 *}
909 fn string_to_locale~inline(loc : locale, s : string) : bytes
911 return loc.string_to_locale(s);
{*
 * Produces a normalized form of a charset name so that differently written
 * names can be compared.
 *
 * The name is scanned one element at a time and each element is classified
 * as a decimal digit, an upper-case ASCII letter or a lower-case ASCII
 * letter. The flag was_letter is state carried from one element to the next.
 *
 * NOTE(review): judging by locale_get, which compares the normalized result
 * with "utf8" while the languages table spells the charset "UTF-8", the
 * result is lower-case with punctuation dropped - confirm against the
 * branch bodies.
 *}
914 fn charset_name_normalize(charset : bytes) : bytes
917 var was_letter := false;
918 for i := 0 to len(charset) do [
{ decimal digit }
920 if c >= '0', c <= '9' then [
{ upper-case ASCII letter }
922 ] else if c >= 'A', c <= 'Z' then [
{ lower-case ASCII letter }
925 ] else if c >= 'a', c <= 'z' then [
930 if not was_letter then
{*
 * Loads the description of an 8-bit charset from the "charsets" directory
 * of the library directory.
 *
 * nname is a charset name as produced by charset_name_normalize. The file
 * name is derived from it; names longer than 8 characters are shortened to
 * 8 characters.
 *
 * File layout as read here:
 *   - the label, UTF-8 encoded, terminated by a zero byte
 *   - the MIME name, terminated by a zero byte
 *   - 128 entries of 3 bytes each, least significant byte first, giving the
 *     Unicode value for the byte values 128 to 255
 *
 * Aborts with an error_invalid_operation exception ("Unknown charset") if
 * the file cannot be read or if the normalized MIME name stored in the file
 * differs from nname.
 *}
935 fn charset_8bit_find~cache(nname : bytes) : charset_8bit
937 var file_name := nname;
{ long names are shortened to the first 3 characters, an underscore and the last 4 characters }
938 if len(file_name) > 8 then
939 file_name := file_name[ .. 3] + "_" + file_name[len(file_name) - 4 .. ];
941 var b := read_lazy(ropen_lazy(dlib(unsafe_get_world), path_append("charsets", file_name), 0));
{ an unreadable file is reported as an unknown charset }
942 if is_exception b then
943 abort exception_make_str(charset_8bit, ec_sync, error_invalid_operation, 0, false, "Unknown charset");
{ positions of the zero bytes ending the label and the MIME name; }
{ the second search runs on the tail, so its result is rebased to an index in b }
945 var first_null := list_search(b, 0);
946 var second_null := first_null + 1 + list_search(b[first_null + 1 .. ], 0);
948 var res := charset_8bit.[
949 label : utf8_to_string(b[ .. first_null]),
950 mime_name : b[first_null + 1 .. second_null],
951 chr_to_unicode : array_fill(char, 0, [128]),
952 unicode_to_chr : sparse(byte, 0, #110000),
{ the file must really describe the requested charset }
{ NOTE(review): presumably guards against collisions of shortened file names - confirm }
955 var normalized_name := charset_name_normalize(res.mime_name);
956 if normalized_name <> nname then
957 abort exception_make_str(charset_8bit, ec_sync, error_invalid_operation, 0, false, "Unknown charset");
{ drop both header fields; b now starts at the mapping table }
959 b := b[second_null + 1 ..];
961 for i := 0 to 128 do [
{ three bytes per entry, least significant byte first }
962 var uni := #10000 * b[i * 3 + 2] + #100 * b[i * 3 + 1] + b[i * 3];
{ entry i describes the byte value i + 128; fill both directions of the mapping }
963 res.chr_to_unicode[i] := uni;
964 res.unicode_to_chr[uni] := i + 128;
{*
 * Builds the locale record for a locale specification lc.
 *
 * The charset is taken either from the languages table (default_charset of
 * the language found by find_language, with "C" as the fallback when the
 * lookup yields an exception) or from the part of lc that follows the '.'
 * character, cut at the '@' character. The charset name is then normalized
 * with charset_name_normalize.
 *
 * For the normalized name "utf8" the locale uses the UTF-8 functions of
 * this unit. For any other name the charset is loaded with
 * charset_8bit_find and the locale uses the ch8 functions applied to the
 * loaded charset record.
 *
 * NOTE(review): the conditions selecting between the two charset sources
 * depend on the results of the list_search calls - confirm the exact tests.
 *}
970 fn locale_get~cache(lc : bytes) : locale
973 var dot := list_search(lc, '.');
{ charset derived from the language part of the specification }
975 var lng := find_language(lc);
{ unknown languages fall back to the "C" entry of the table }
976 if is_exception lng then
977 lng := find_language("C");
978 charset := lng.default_charset;
{ charset given explicitly after the dot }
980 charset := lc[dot + 1 .. ];
{ cut the charset at the '@' character }
981 var a := list_search(charset, '@');
983 charset := charset[ .. a];
985 charset := charset_name_normalize(charset);
986 var loc := locale.[ ];
{ UTF-8 is handled by the functions of this unit, no charset file is needed }
987 if charset = "utf8" then [
988 loc.charset := charset_desc.[
991 mode : locale_mode.utf8,
993 loc.locale_validate_character := char_validate;
994 loc.locale_validate := utf8_validate;
995 loc.locale_get_char := utf8_get_char;
996 loc.locale_to_string := utf8_to_string;
997 loc.string_to_locale := string_to_utf8;
{ any other charset is looked up as an 8-bit charset }
999 var ch8 := charset_8bit_find(charset);
1001 loc.charset := charset_desc.[
1003 mime_name : ch8.mime_name,
1004 mode : locale_mode.eightbit,
{ NOTE(review): the trailing comma appears to bind ch8 and leave the remaining arguments open - confirm }
1006 loc.locale_validate_character := ch8_validate_character(ch8,);
1007 loc.locale_validate := ch8_validate(ch8,);
1008 loc.locale_get_char := ch8_get_char(ch8,);
1009 loc.locale_to_string := ch8_to_string(ch8,);
1010 loc.string_to_locale := string_to_ch8(ch8,);
{*
 * Converts a numeric charset identifier to a charset name.
 *
 * Apart from the first special case, identifiers below 1250 give the name
 * "cp" followed by the number and all other identifiers give "windows-"
 * followed by the number.
 *
 * NOTE(review): the identifier is presumably a Windows code page number, as
 * it comes from sysprop in locale_init and locale_console_init - confirm.
 *}
1015 fn charset_to_bytes(charset : int) : bytes
1020 else if charset < 1250 then
1021 chstr := "cp" + ntos(charset);
1023 chstr := "windows-" + ntos(charset);
{*
 * Determines the locale of the process from the system and the environment.
 *
 * If the system property SystemProperty_Charset is non-negative, a locale
 * is built from it by passing "." followed by the charset name to
 * locale_get, and the result is tested with is_exception. Otherwise the
 * environment variables LC_ALL, LC_CTYPE and LANG are consulted, with "C"
 * as the last resort, and the value is passed to locale_get.
 *
 * NOTE(review): presumably each variable is consulted only if the previous
 * lookup found nothing - confirm.
 *}
1027 fn locale_init(env : treemap(bytes, bytes)) : locale
1029 var charset := sysprop(SystemProperty_Charset);
1030 if charset >= 0 then [
{ the leading dot makes locale_get treat the rest as an explicit charset }
1031 var loc := locale_get("." + charset_to_bytes(charset));
1032 if not is_exception loc then
1035 var lc := treemap_search(env, "LC_ALL");
1037 lc := treemap_search(env, "LC_CTYPE");
1039 lc := treemap_search(env, "LANG");
{ nothing usable in the environment: use the "C" locale }
1041 lc := maybe(bytes).j.("C");
1042 return locale_get(lc.j);
{*
 * Determines the locale used for the console.
 *
 * If the system property SystemProperty_Charset_Console is non-negative,
 * a locale is built from it by passing "." followed by the charset name to
 * locale_get, and the result is tested with is_exception. In the remaining
 * cases the result of locale_init(env) is returned.
 *}
1045 fn locale_console_init(env : treemap(bytes, bytes)) : locale
1047 var charset := sysprop(SystemProperty_Charset_Console);
1048 if charset >= 0 then [
{ the leading dot makes locale_get treat the rest as an explicit charset }
1049 var loc := locale_get("." + charset_to_bytes(charset));
1050 if not is_exception loc then
{ fall back to the general locale detection }
1053 return locale_init(env);
{*
 * Ordering instance for charset_desc values.
 * Descriptors are compared by their label only: they are equal when the
 * labels are equal and ordered by the order of the labels.
 *}
1056 fn instance_ord_charset_desc := class_ord(charset_desc).[
1057 equal : lambda(a b : charset_desc) [ return a.label = b.label; ],
1058 less : lambda(a b : charset_desc) [ return a.label < b.label; ],
{*
 * Builds the list of descriptors of the available charsets.
 *
 * The list starts with a descriptor for UTF-8 (mime_name "utf-8", mode
 * locale_mode.utf8). Then the "charsets" directory of the library directory
 * is read, and for a charset file the label and the MIME name are taken
 * from its two leading zero-terminated fields and appended as a descriptor
 * with mode locale_mode.eightbit. Finally the list is sorted with
 * instance_ord_charset_desc, i.e. by label.
 *
 * NOTE(review): file names that do not end with ".c8" are presumably
 * skipped - confirm.
 *}
1061 fn charset_list~cache : list(charset_desc)
1063 var result := [ charset_desc.[
1065 mime_name : "utf-8",
1066 mode : locale_mode.utf8,
1068 var d := dopen_lazy(dlib(unsafe_get_world), "charsets", 0);
1069 var files := dread_lazy(d);
{ charset files are recognized by the ".c8" suffix }
1071 if not list_ends_with(f, ".c8") then
1073 var b := read_lazy(ropen_lazy(d, f, 0));
{ positions of the zero bytes ending the label and the MIME name; }
{ the second search runs on the tail, so its result is rebased to an index in b }
1074 var first_null := list_search(b, 0);
1075 var second_null := first_null + 1 + list_search(b[first_null + 1 .. ], 0);
1076 result +<= charset_desc.[
1077 label : utf8_to_string(b[ .. first_null]),
1078 mime_name : b[first_null + 1 .. second_null],
1079 mode : locale_mode.eightbit,
{ sort the descriptors by label }
1082 result := list_sort(instance_ord_charset_desc, result);