nss: upgrade to release 3.73
[LibreOffice.git] / i18nutil / source / utility / unicode.cxx
blobf944d69c40e27372db936cabcc9d83d3f91c1f48
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This file is part of the LibreOffice project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * This file incorporates work covered by the following license notice:
 *
 *   Licensed to the Apache Software Foundation (ASF) under one or more
 *   contributor license agreements. See the NOTICE file distributed
 *   with this work for additional information regarding copyright
 *   ownership. The ASF licenses this file to you under the Apache
 *   License, Version 2.0 (the "License"); you may not use this file
 *   except in compliance with the License. You may obtain a copy of
 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
 */
20 #include <com/sun/star/i18n/UnicodeType.hpp>
21 #include <com/sun/star/i18n/ScriptType.hpp>
22 #include <i18nlangtag/languagetag.hxx>
23 #include <i18nlangtag/languagetagicu.hxx>
24 #include <i18nutil/unicode.hxx>
25 #include <sal/log.hxx>
26 #include <unicode/numfmt.h>
27 #include "unicode_data.h"
28 #include <rtl/character.hxx>
29 #include <memory>
31 // Workaround for glibc braindamage:
32 // glibc 2.4's langinfo.h does "#define CURRENCY_SYMBOL __CURRENCY_SYMBOL"
33 // which (obviously) breaks UnicodeType::CURRENCY_SYMBOL
34 #undef CURRENCY_SYMBOL
36 using namespace ::com::sun::star::i18n;
38 template<class L, typename T>
39 static T getScriptType( const sal_Unicode ch, const L* typeList, T unknownType ) {
41 sal_Int16 i = 0;
42 css::i18n::UnicodeScript type = typeList[0].to;
43 while (type < UnicodeScript_kScriptCount && ch > UnicodeScriptType[static_cast<int>(type)][UnicodeScriptTypeTo]) {
44 type = typeList[++i].to;
47 return (type < UnicodeScript_kScriptCount &&
48 ch >= UnicodeScriptType[static_cast<int>(typeList[i].from)][int(UnicodeScriptTypeFrom)]) ?
49 typeList[i].value : unknownType;
52 sal_Int16
53 unicode::getUnicodeScriptType( const sal_Unicode ch, const ScriptTypeList* typeList, sal_Int16 unknownType ) {
54 return getScriptType(ch, typeList, unknownType);
57 sal_Unicode
58 unicode::getUnicodeScriptStart( UnicodeScript type) {
59 return UnicodeScriptType[static_cast<int>(type)][UnicodeScriptTypeFrom];
62 sal_Unicode
63 unicode::getUnicodeScriptEnd( UnicodeScript type) {
64 return UnicodeScriptType[static_cast<int>(type)][UnicodeScriptTypeTo];
67 sal_Int16
68 unicode::getUnicodeType( const sal_Unicode ch ) {
69 static sal_Unicode c = 0x00;
70 static sal_Int16 r = 0x00;
72 if (ch == c) return r;
73 else c = ch;
75 sal_Int16 address = UnicodeTypeIndex[ch >> 8];
76 r = static_cast<sal_Int16>(
77 (address < UnicodeTypeNumberBlock)
78 ? UnicodeTypeBlockValue[address]
79 : UnicodeTypeValue[((address - UnicodeTypeNumberBlock) << 8) + (ch & 0xff)]);
80 return r;
83 sal_uInt8
84 unicode::getUnicodeDirection( const sal_Unicode ch ) {
85 static sal_Unicode c = 0x00;
86 static sal_uInt8 r = 0x00;
88 if (ch == c) return r;
89 else c = ch;
91 sal_Int16 address = UnicodeDirectionIndex[ch >> 8];
92 r = (address < UnicodeDirectionNumberBlock)
93 ? UnicodeDirectionBlockValue[address]
94 : UnicodeDirectionValue[((address - UnicodeDirectionNumberBlock) << 8) + (ch & 0xff)];
95 return r;
98 #define bit(name) (1U << name)
100 #define UPPERMASK bit(UnicodeType::UPPERCASE_LETTER)
102 #define LOWERMASK bit(UnicodeType::LOWERCASE_LETTER)
104 #define TITLEMASK bit(UnicodeType::TITLECASE_LETTER)
106 #define ALPHAMASK UPPERMASK|LOWERMASK|TITLEMASK|\
107 bit(UnicodeType::MODIFIER_LETTER)|\
108 bit(UnicodeType::OTHER_LETTER)
110 #define SPACEMASK bit(UnicodeType::SPACE_SEPARATOR)|\
111 bit(UnicodeType::LINE_SEPARATOR)|\
112 bit(UnicodeType::PARAGRAPH_SEPARATOR)
114 #define CONTROLMASK bit(UnicodeType::CONTROL)|\
115 bit(UnicodeType::FORMAT)|\
116 bit(UnicodeType::LINE_SEPARATOR)|\
117 bit(UnicodeType::PARAGRAPH_SEPARATOR)
119 #define IsType(func, mask) \
120 bool func( const sal_Unicode ch) {\
121 return (bit(getUnicodeType(ch)) & (mask)) != 0;\
124 IsType(unicode::isControl, CONTROLMASK)
125 IsType(unicode::isAlpha, ALPHAMASK)
126 IsType(unicode::isSpace, SPACEMASK)
128 #define CONTROLSPACE bit(0x09)|bit(0x0a)|bit(0x0b)|bit(0x0c)|bit(0x0d)|\
129 bit(0x1c)|bit(0x1d)|bit(0x1e)|bit(0x1f)
131 bool unicode::isWhiteSpace( const sal_Unicode ch) {
132 return (ch != 0xa0 && isSpace(ch)) || (ch <= 0x1F && (bit(ch) & (CONTROLSPACE)));
135 sal_Int16 unicode::getScriptClassFromUScriptCode(UScriptCode eScript)
137 //See unicode/uscript.h
138 static const sal_Int16 scriptTypes[] =
140 ScriptType::WEAK, ScriptType::WEAK, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX,
141 ScriptType::ASIAN, ScriptType::LATIN, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX,
142 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::LATIN,
143 // 15
144 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::ASIAN, ScriptType::COMPLEX,
145 ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX,
146 ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN,
147 // 30
148 ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX,
149 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
150 ScriptType::LATIN, ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
151 // 45
152 ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX,
153 ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN,
154 ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
155 // 60
156 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
157 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX,
158 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::ASIAN,
159 // 75
160 ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
161 ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
162 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
163 // 90
164 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
165 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
166 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::WEAK, ScriptType::WEAK, ScriptType::COMPLEX,
167 // 105
168 ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
169 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
170 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN,
171 // 120
172 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
173 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::WEAK, ScriptType::WEAK,
174 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
175 // 135
176 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
177 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
178 ScriptType::COMPLEX,
179 ScriptType::WEAK
182 sal_Int16 nRet;
183 if (eScript < USCRIPT_COMMON)
184 nRet = ScriptType::WEAK;
185 else if (static_cast<size_t>(eScript) >= SAL_N_ELEMENTS(scriptTypes))
186 nRet = ScriptType::COMPLEX; // anything new is going to be pretty wild
187 else
188 nRet = scriptTypes[eScript];
189 return nRet;
192 OString unicode::getExemplarLanguageForUScriptCode(UScriptCode eScript)
194 OString sRet;
195 switch (eScript)
197 case USCRIPT_CODE_LIMIT:
198 case USCRIPT_INVALID_CODE:
199 sRet = "zxx";
200 break;
201 case USCRIPT_COMMON:
202 case USCRIPT_INHERITED:
203 sRet = "und";
204 break;
205 case USCRIPT_MATHEMATICAL_NOTATION:
206 case USCRIPT_SYMBOLS:
207 sRet = "zxx";
208 break;
209 case USCRIPT_UNWRITTEN_LANGUAGES:
210 case USCRIPT_UNKNOWN:
211 sRet = "und";
212 break;
213 case USCRIPT_ARABIC:
214 sRet = "ar";
215 break;
216 case USCRIPT_ARMENIAN:
217 sRet = "hy";
218 break;
219 case USCRIPT_BENGALI:
220 sRet = "bn";
221 break;
222 case USCRIPT_BOPOMOFO:
223 sRet = "zh";
224 break;
225 case USCRIPT_CHEROKEE:
226 sRet = "chr";
227 break;
228 case USCRIPT_COPTIC:
229 sRet = "cop";
230 break;
231 case USCRIPT_CYRILLIC:
232 sRet = "ru";
233 break;
234 case USCRIPT_DESERET:
235 sRet = "en";
236 break;
237 case USCRIPT_DEVANAGARI:
238 sRet = "hi";
239 break;
240 case USCRIPT_ETHIOPIC:
241 sRet = "am";
242 break;
243 case USCRIPT_GEORGIAN:
244 sRet = "ka";
245 break;
246 case USCRIPT_GOTHIC:
247 sRet = "got";
248 break;
249 case USCRIPT_GREEK:
250 sRet = "el";
251 break;
252 case USCRIPT_GUJARATI:
253 sRet = "gu";
254 break;
255 case USCRIPT_GURMUKHI:
256 sRet = "pa";
257 break;
258 case USCRIPT_HAN:
259 sRet = "zh";
260 break;
261 case USCRIPT_HANGUL:
262 sRet = "ko";
263 break;
264 case USCRIPT_HEBREW:
265 sRet = "hr";
266 break;
267 case USCRIPT_HIRAGANA:
268 sRet = "ja";
269 break;
270 case USCRIPT_KANNADA:
271 sRet = "kn";
272 break;
273 case USCRIPT_KATAKANA:
274 sRet = "ja";
275 break;
276 case USCRIPT_KHMER:
277 sRet = "km";
278 break;
279 case USCRIPT_LAO:
280 sRet = "lo";
281 break;
282 case USCRIPT_LATIN:
283 sRet = "en";
284 break;
285 case USCRIPT_MALAYALAM:
286 sRet = "ml";
287 break;
288 case USCRIPT_MONGOLIAN:
289 sRet = "mn";
290 break;
291 case USCRIPT_MYANMAR:
292 sRet = "my";
293 break;
294 case USCRIPT_OGHAM:
295 sRet = "pgl";
296 break;
297 case USCRIPT_OLD_ITALIC:
298 sRet = "osc";
299 break;
300 case USCRIPT_ORIYA:
301 sRet = "or";
302 break;
303 case USCRIPT_RUNIC:
304 sRet = "ang";
305 break;
306 case USCRIPT_SINHALA:
307 sRet = "si";
308 break;
309 case USCRIPT_SYRIAC:
310 sRet = "syr";
311 break;
312 case USCRIPT_TAMIL:
313 sRet = "ta";
314 break;
315 case USCRIPT_TELUGU:
316 sRet = "te";
317 break;
318 case USCRIPT_THAANA:
319 sRet = "dv";
320 break;
321 case USCRIPT_THAI:
322 sRet = "th";
323 break;
324 case USCRIPT_TIBETAN:
325 sRet = "bo";
326 break;
327 case USCRIPT_CANADIAN_ABORIGINAL:
328 sRet = "iu";
329 break;
330 case USCRIPT_YI:
331 sRet = "ii";
332 break;
333 case USCRIPT_TAGALOG:
334 sRet = "tl";
335 break;
336 case USCRIPT_HANUNOO:
337 sRet = "hnn";
338 break;
339 case USCRIPT_BUHID:
340 sRet = "bku";
341 break;
342 case USCRIPT_TAGBANWA:
343 sRet = "tbw";
344 break;
345 case USCRIPT_BRAILLE:
346 sRet = "en";
347 break;
348 case USCRIPT_CYPRIOT:
349 sRet = "ecy";
350 break;
351 case USCRIPT_LIMBU:
352 sRet = "lif";
353 break;
354 case USCRIPT_LINEAR_B:
355 sRet = "gmy";
356 break;
357 case USCRIPT_OSMANYA:
358 sRet = "so";
359 break;
360 case USCRIPT_SHAVIAN:
361 sRet = "en";
362 break;
363 case USCRIPT_TAI_LE:
364 sRet = "tdd";
365 break;
366 case USCRIPT_UGARITIC:
367 sRet = "uga";
368 break;
369 case USCRIPT_KATAKANA_OR_HIRAGANA:
370 sRet = "ja";
371 break;
372 case USCRIPT_BUGINESE:
373 sRet = "bug";
374 break;
375 case USCRIPT_GLAGOLITIC:
376 sRet = "ch";
377 break;
378 case USCRIPT_KHAROSHTHI:
379 sRet = "pra";
380 break;
381 case USCRIPT_SYLOTI_NAGRI:
382 sRet = "syl";
383 break;
384 case USCRIPT_NEW_TAI_LUE:
385 sRet = "khb";
386 break;
387 case USCRIPT_TIFINAGH:
388 sRet = "tmh";
389 break;
390 case USCRIPT_OLD_PERSIAN:
391 sRet = "peo";
392 break;
393 case USCRIPT_BALINESE:
394 sRet = "ban";
395 break;
396 case USCRIPT_BATAK:
397 sRet = "btk";
398 break;
399 case USCRIPT_BLISSYMBOLS:
400 sRet = "en";
401 break;
402 case USCRIPT_BRAHMI:
403 sRet = "pra";
404 break;
405 case USCRIPT_CHAM:
406 sRet = "cja";
407 break;
408 case USCRIPT_CIRTH:
409 sRet = "sjn";
410 break;
411 case USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC:
412 sRet = "cu";
413 break;
414 case USCRIPT_DEMOTIC_EGYPTIAN:
415 case USCRIPT_HIERATIC_EGYPTIAN:
416 case USCRIPT_EGYPTIAN_HIEROGLYPHS:
417 sRet = "egy";
418 break;
419 case USCRIPT_KHUTSURI:
420 sRet = "ka";
421 break;
422 case USCRIPT_SIMPLIFIED_HAN:
423 sRet = "zh";
424 break;
425 case USCRIPT_TRADITIONAL_HAN:
426 sRet = "zh";
427 break;
428 case USCRIPT_PAHAWH_HMONG:
429 sRet = "blu";
430 break;
431 case USCRIPT_OLD_HUNGARIAN:
432 sRet = "ohu";
433 break;
434 case USCRIPT_HARAPPAN_INDUS:
435 sRet = "xiv";
436 break;
437 case USCRIPT_JAVANESE:
438 sRet = "kaw";
439 break;
440 case USCRIPT_KAYAH_LI:
441 sRet = "eky";
442 break;
443 case USCRIPT_LATIN_FRAKTUR:
444 sRet = "de";
445 break;
446 case USCRIPT_LATIN_GAELIC:
447 sRet = "ga";
448 break;
449 case USCRIPT_LEPCHA:
450 sRet = "lep";
451 break;
452 case USCRIPT_LINEAR_A:
453 sRet = "ecr";
454 break;
455 case USCRIPT_MAYAN_HIEROGLYPHS:
456 sRet = "myn";
457 break;
458 case USCRIPT_MEROITIC:
459 sRet = "xmr";
460 break;
461 case USCRIPT_NKO:
462 sRet = "nqo";
463 break;
464 case USCRIPT_ORKHON:
465 sRet = "otk";
466 break;
467 case USCRIPT_OLD_PERMIC:
468 sRet = "kv";
469 break;
470 case USCRIPT_PHAGS_PA:
471 sRet = "xng";
472 break;
473 case USCRIPT_PHOENICIAN:
474 sRet = "phn";
475 break;
476 case USCRIPT_PHONETIC_POLLARD:
477 sRet = "hmd";
478 break;
479 case USCRIPT_RONGORONGO:
480 sRet = "rap";
481 break;
482 case USCRIPT_SARATI:
483 sRet = "qya";
484 break;
485 case USCRIPT_ESTRANGELO_SYRIAC:
486 sRet = "syr";
487 break;
488 case USCRIPT_WESTERN_SYRIAC:
489 sRet = "tru";
490 break;
491 case USCRIPT_EASTERN_SYRIAC:
492 sRet = "aii";
493 break;
494 case USCRIPT_TENGWAR:
495 sRet = "sjn";
496 break;
497 case USCRIPT_VAI:
498 sRet = "vai";
499 break;
500 case USCRIPT_VISIBLE_SPEECH:
501 sRet = "en";
502 break;
503 case USCRIPT_CUNEIFORM:
504 sRet = "akk";
505 break;
506 case USCRIPT_CARIAN:
507 sRet = "xcr";
508 break;
509 case USCRIPT_JAPANESE:
510 sRet = "ja";
511 break;
512 case USCRIPT_LANNA:
513 sRet = "nod";
514 break;
515 case USCRIPT_LYCIAN:
516 sRet = "xlc";
517 break;
518 case USCRIPT_LYDIAN:
519 sRet = "xld";
520 break;
521 case USCRIPT_OL_CHIKI:
522 sRet = "sat";
523 break;
524 case USCRIPT_REJANG:
525 sRet = "rej";
526 break;
527 case USCRIPT_SAURASHTRA:
528 sRet = "saz";
529 break;
530 case USCRIPT_SIGN_WRITING:
531 sRet = "en";
532 break;
533 case USCRIPT_SUNDANESE:
534 sRet = "su";
535 break;
536 case USCRIPT_MOON:
537 sRet = "en";
538 break;
539 case USCRIPT_MEITEI_MAYEK:
540 sRet = "mni";
541 break;
542 case USCRIPT_IMPERIAL_ARAMAIC:
543 sRet = "arc";
544 break;
545 case USCRIPT_AVESTAN:
546 sRet = "ae";
547 break;
548 case USCRIPT_CHAKMA:
549 sRet = "ccp";
550 break;
551 case USCRIPT_KOREAN:
552 sRet = "ko";
553 break;
554 case USCRIPT_KAITHI:
555 sRet = "awa";
556 break;
557 case USCRIPT_MANICHAEAN:
558 sRet = "xmn";
559 break;
560 case USCRIPT_INSCRIPTIONAL_PAHLAVI:
561 case USCRIPT_PSALTER_PAHLAVI:
562 case USCRIPT_BOOK_PAHLAVI:
563 case USCRIPT_INSCRIPTIONAL_PARTHIAN:
564 sRet = "xpr";
565 break;
566 case USCRIPT_SAMARITAN:
567 sRet = "heb";
568 break;
569 case USCRIPT_TAI_VIET:
570 sRet = "blt";
571 break;
572 case USCRIPT_MANDAEAN: /* Aliased to USCRIPT_MANDAIC in icu 4.6. */
573 sRet = "mic";
574 break;
575 #if (U_ICU_VERSION_MAJOR_NUM > 4) || (U_ICU_VERSION_MAJOR_NUM == 4 && U_ICU_VERSION_MINOR_NUM >= 4)
576 case USCRIPT_NABATAEAN: //no language with an assigned code yet
577 sRet = "mis";
578 break;
579 case USCRIPT_PALMYRENE: //no language with an assigned code yet
580 sRet = "mis";
581 break;
582 case USCRIPT_BAMUM:
583 sRet = "bax";
584 break;
585 case USCRIPT_LISU:
586 sRet = "lis";
587 break;
588 case USCRIPT_NAKHI_GEBA:
589 sRet = "nxq";
590 break;
591 case USCRIPT_OLD_SOUTH_ARABIAN:
592 sRet = "xsa";
593 break;
594 case USCRIPT_BASSA_VAH:
595 sRet = "bsq";
596 break;
597 case USCRIPT_DUPLOYAN_SHORTAND:
598 sRet = "fr";
599 break;
600 case USCRIPT_ELBASAN:
601 sRet = "sq";
602 break;
603 case USCRIPT_GRANTHA:
604 sRet = "ta";
605 break;
606 case USCRIPT_KPELLE:
607 sRet = "kpe";
608 break;
609 case USCRIPT_LOMA:
610 sRet = "lom";
611 break;
612 case USCRIPT_MENDE:
613 sRet = "men";
614 break;
615 case USCRIPT_MEROITIC_CURSIVE:
616 sRet = "xmr";
617 break;
618 case USCRIPT_OLD_NORTH_ARABIAN:
619 sRet = "xna";
620 break;
621 case USCRIPT_SINDHI:
622 sRet = "sd";
623 break;
624 case USCRIPT_WARANG_CITI:
625 sRet = "hoc";
626 break;
627 #endif
628 #if (U_ICU_VERSION_MAJOR_NUM > 4) || (U_ICU_VERSION_MAJOR_NUM == 4 && U_ICU_VERSION_MINOR_NUM >= 8)
629 case USCRIPT_AFAKA:
630 sRet = "djk";
631 break;
632 case USCRIPT_JURCHEN:
633 sRet = "juc";
634 break;
635 case USCRIPT_MRO:
636 sRet = "cmr";
637 break;
638 case USCRIPT_NUSHU: //no language with an assigned code yet
639 sRet = "mis";
640 break;
641 case USCRIPT_SHARADA:
642 sRet = "sa";
643 break;
644 case USCRIPT_SORA_SOMPENG:
645 sRet = "srb";
646 break;
647 case USCRIPT_TAKRI:
648 sRet = "doi";
649 break;
650 case USCRIPT_TANGUT:
651 sRet = "txg";
652 break;
653 case USCRIPT_WOLEAI:
654 sRet = "woe";
655 break;
656 #endif
657 #if (U_ICU_VERSION_MAJOR_NUM >= 49)
658 case USCRIPT_ANATOLIAN_HIEROGLYPHS:
659 sRet = "hlu";
660 break;
661 case USCRIPT_KHOJKI:
662 sRet = "gu";
663 break;
664 case USCRIPT_TIRHUTA:
665 sRet = "mai";
666 break;
667 #endif
668 #if (U_ICU_VERSION_MAJOR_NUM >= 52)
669 case USCRIPT_CAUCASIAN_ALBANIAN:
670 sRet = "xag";
671 break;
672 case USCRIPT_MAHAJANI:
673 sRet = "mwr";
674 break;
675 #endif
676 #if (U_ICU_VERSION_MAJOR_NUM >= 54)
677 case USCRIPT_AHOM:
678 sRet = "aho";
679 break;
680 case USCRIPT_HATRAN:
681 sRet = "qly-Hatr";
682 break;
683 case USCRIPT_MODI:
684 sRet = "mr-Modi";
685 break;
686 case USCRIPT_MULTANI:
687 sRet = "skr-Mutl";
688 break;
689 case USCRIPT_PAU_CIN_HAU:
690 sRet = "ctd-Pauc";
691 break;
692 case USCRIPT_SIDDHAM:
693 sRet = "sa-Sidd";
694 break;
695 #endif
696 #if (U_ICU_VERSION_MAJOR_NUM >= 58)
697 case USCRIPT_ADLAM:
698 sRet = "mis"; // Adlm - Adlam for Fulani, no language code
699 break;
700 case USCRIPT_BHAIKSUKI:
701 sRet = "mis"; // Bhks - Bhaiksuki for some Buddhist texts, no language code
702 break;
703 case USCRIPT_MARCHEN:
704 sRet = "bo-Marc";
705 break;
706 case USCRIPT_NEWA:
707 sRet = "new-Newa";
708 break;
709 case USCRIPT_OSAGE:
710 sRet = "osa-Osge";
711 break;
712 case USCRIPT_HAN_WITH_BOPOMOFO:
713 sRet = "mis"; // Hanb - Han with Bopomofo, zh-Hanb ?
714 break;
715 case USCRIPT_JAMO:
716 sRet = "ko"; // Jamo - elements of Hangul Syllables
717 break;
718 case USCRIPT_SYMBOLS_EMOJI:
719 sRet = "mis"; // Zsye - Emoji variant
720 break;
721 #endif
722 #if (U_ICU_VERSION_MAJOR_NUM >= 60)
723 case USCRIPT_MASARAM_GONDI:
724 sRet = "gon-Gonm"; // macro language code, could be wsg,esg,gno
725 break;
726 case USCRIPT_SOYOMBO:
727 sRet = "mn-Soyo"; // abugida to write Mongolian, also Tibetan and Sanskrit
728 break;
729 case USCRIPT_ZANABAZAR_SQUARE:
730 sRet = "mn-Zanb"; // abugida to write Mongolian
731 break;
732 #endif
733 #if (U_ICU_VERSION_MAJOR_NUM >= 62)
734 case USCRIPT_DOGRA:
735 sRet = "dgo"; // Dogri proper
736 break;
737 case USCRIPT_GUNJALA_GONDI:
738 sRet = "wsg"; // Adilabad Gondi
739 break;
740 case USCRIPT_MAKASAR:
741 sRet = "mak";
742 break;
743 case USCRIPT_MEDEFAIDRIN:
744 sRet = "mis-Medf"; // Uncoded with script
745 break;
746 case USCRIPT_HANIFI_ROHINGYA:
747 sRet = "rhg";
748 break;
749 case USCRIPT_SOGDIAN:
750 sRet = "sog";
751 break;
752 case USCRIPT_OLD_SOGDIAN:
753 sRet = "sog";
754 break;
755 #endif
756 #if (U_ICU_VERSION_MAJOR_NUM >= 64)
757 case USCRIPT_ELYMAIC:
758 sRet = "arc-Elym";
759 break;
760 case USCRIPT_NYIAKENG_PUACHUE_HMONG:
761 sRet = "hmn-Hmnp"; // macrolanguage code
762 break;
763 case USCRIPT_NANDINAGARI:
764 sRet = "sa-Nand";
765 break;
766 case USCRIPT_WANCHO:
767 sRet = "nnp-Wcho";
768 break;
769 #endif
770 #if (U_ICU_VERSION_MAJOR_NUM >= 66)
771 case USCRIPT_CHORASMIAN:
772 sRet = "xco-Chrs";
773 break;
774 case USCRIPT_DIVES_AKURU:
775 sRet = "dv-Diak";
776 break;
777 case USCRIPT_KHITAN_SMALL_SCRIPT:
778 sRet = "zkt-Kits";
779 break;
780 case USCRIPT_YEZIDI:
781 sRet = "kmr-Yezi";
782 break;
783 #endif
785 return sRet;
788 //Format a number as a percentage according to the rules of the given
789 //language, e.g. 100 -> "100%" for en-US vs "100 %" for de-DE
790 OUString unicode::formatPercent(double dNumber,
791 const LanguageTag &rLangTag)
793 // get a currency formatter for this locale ID
794 UErrorCode errorCode=U_ZERO_ERROR;
796 LanguageTag aLangTag(rLangTag);
798 // As of CLDR Version 24 these languages were not listed as using spacing
799 // between number and % but are reported as such by our l10n groups
800 // http://www.unicode.org/cldr/charts/24/by_type/numbers.number_formatting_patterns.html
801 // so format using French which has the desired rules
802 if (aLangTag.getLanguage() == "es" || aLangTag.getLanguage() == "sl")
803 aLangTag.reset("fr-FR");
805 icu::Locale aLocale = LanguageTagIcu::getIcuLocale(aLangTag);
807 std::unique_ptr<icu::NumberFormat> xF(
808 icu::NumberFormat::createPercentInstance(aLocale, errorCode));
809 if(U_FAILURE(errorCode))
811 SAL_WARN("i18n", "icu::NumberFormat::createPercentInstance failed");
812 return OUString::number(dNumber) + "%";
815 icu::UnicodeString output;
816 xF->format(dNumber/100, output);
817 OUString aRet(reinterpret_cast<const sal_Unicode *>(output.getBuffer()),
818 output.length());
819 if (rLangTag.getLanguage() == "de")
821 //narrow no-break space instead of (normal) no-break space
822 return aRet.replace(0x00A0, 0x202F);
824 return aRet;
827 bool ToggleUnicodeCodepoint::AllowMoreInput(sal_Unicode uChar)
829 //arbitrarily chosen maximum length allowed - normal max usage would be around 30.
830 if( maInput.getLength() > 255 )
831 mbAllowMoreChars = false;
833 if( !mbAllowMoreChars )
834 return false;
836 bool bPreventNonHex = false;
837 if( maInput.indexOf("U+") != -1 )
838 bPreventNonHex = true;
840 switch ( unicode::getUnicodeType(uChar) )
842 case css::i18n::UnicodeType::SURROGATE:
843 if( bPreventNonHex )
845 mbAllowMoreChars = false;
846 return false;
849 if( rtl::isLowSurrogate(uChar) && maUtf16.isEmpty() && maInput.isEmpty() )
851 maUtf16.append(uChar);
852 return true;
854 if( rtl::isHighSurrogate(uChar) && maInput.isEmpty() )
855 maUtf16.insert(0, uChar );
856 //end of hex strings, or unexpected order of high/low, so don't accept more
857 if( !maUtf16.isEmpty() )
858 maInput.append(maUtf16);
859 if( !maCombining.isEmpty() )
860 maInput.append(maCombining);
861 mbAllowMoreChars = false;
862 break;
864 case css::i18n::UnicodeType::NON_SPACING_MARK:
865 case css::i18n::UnicodeType::COMBINING_SPACING_MARK:
866 if( bPreventNonHex )
868 mbAllowMoreChars = false;
869 return false;
872 //extreme edge case: already invalid high/low surrogates with preceding combining chars, and now an extra combining mark.
873 if( !maUtf16.isEmpty() )
875 maInput = maUtf16;
876 if( !maCombining.isEmpty() )
877 maInput.append(maCombining);
878 mbAllowMoreChars = false;
879 return false;
881 maCombining.insert(0, uChar);
882 break;
884 default:
885 //extreme edge case: already invalid high/low surrogates with preceding combining chars, and now an extra character.
886 if( !maUtf16.isEmpty() )
888 maInput = maUtf16;
889 if( !maCombining.isEmpty() )
890 maInput.append(maCombining);
891 mbAllowMoreChars = false;
892 return false;
895 if( !maCombining.isEmpty() )
897 maCombining.insert(0, uChar);
898 maInput = maCombining;
899 mbAllowMoreChars = false;
900 return false;
903 // 0 - 1f are control characters. Do not process those.
904 if( uChar < 0x20 )
906 mbAllowMoreChars = false;
907 return false;
910 switch( uChar )
912 case 'u':
913 case 'U':
914 // U+ notation found. Continue looking for another one.
915 if( mbRequiresU )
917 mbRequiresU = false;
918 maInput.insert(0,"U+");
920 // treat as a normal character
921 else
923 mbAllowMoreChars = false;
924 if( !bPreventNonHex )
925 maInput.insertUtf32(0, uChar);
927 break;
928 case '+':
929 // + already found: skip when not U, or edge case of +U+xxxx
930 if( mbRequiresU || (maInput.indexOf("U+") == 0) )
931 mbAllowMoreChars = false;
932 // hex chars followed by '+' - now require a 'U'
933 else if ( !maInput.isEmpty() )
934 mbRequiresU = true;
935 // treat as a normal character
936 else
938 mbAllowMoreChars = false;
939 if( !bPreventNonHex )
940 maInput.insertUtf32(0, uChar);
942 break;
943 default:
944 // + already found. Since not U, cancel further input
945 if( mbRequiresU )
946 mbAllowMoreChars = false;
947 // maximum digits per notation is 8: only one notation
948 else if( maInput.indexOf("U+") == -1 && maInput.getLength() == 8 )
949 mbAllowMoreChars = false;
950 // maximum digits per notation is 8: previous notation found
951 else if( maInput.indexOf("U+") == 8 )
952 mbAllowMoreChars = false;
953 // a hex character. Add to string.
954 else if( rtl::isAsciiHexDigit(uChar) )
956 mbIsHexString = true;
957 maInput.insertUtf32(0, uChar);
959 // not a hex character: stop input. keep if it is the first input provided
960 else
962 mbAllowMoreChars = false;
963 if( maInput.isEmpty() )
964 maInput.insertUtf32(0, uChar);
968 return mbAllowMoreChars;
971 OUString ToggleUnicodeCodepoint::StringToReplace()
973 if( maInput.isEmpty() )
975 //edge case - input finished with incomplete low surrogate or combining characters without a base
976 if( mbAllowMoreChars )
978 if( !maUtf16.isEmpty() )
979 maInput = maUtf16;
980 if( !maCombining.isEmpty() )
981 maInput.append(maCombining);
983 return maInput.toString();
986 if( !mbIsHexString )
987 return maInput.toString();
989 //this function potentially modifies the input string. Prevent addition of further characters
990 mbAllowMoreChars = false;
992 //validate unicode notation.
993 OUString sIn;
994 sal_uInt32 nUnicode = 0;
995 sal_Int32 nUPlus = maInput.indexOf("U+");
996 //if U+ notation used, strip off all extra chars added not in U+ notation
997 if( nUPlus != -1 )
999 maInput.remove(0, nUPlus);
1000 sIn = maInput.copy(2).makeStringAndClear();
1001 nUPlus = sIn.indexOf("U+");
1003 else
1004 sIn = maInput.toString();
1005 while( nUPlus != -1 )
1007 nUnicode = sIn.copy(0, nUPlus).toUInt32(16);
1008 //prevent creating control characters or invalid Unicode values
1009 if( !rtl::isUnicodeCodePoint(nUnicode) || nUnicode < 0x20 )
1010 maInput = sIn.copy(nUPlus);
1011 sIn = sIn.copy(nUPlus+2);
1012 nUPlus = sIn.indexOf("U+");
1015 nUnicode = sIn.toUInt32(16);
1016 if( !rtl::isUnicodeCodePoint(nUnicode) || nUnicode < 0x20 )
1017 maInput.truncate().append( sIn[sIn.getLength()-1] );
1018 return maInput.toString();
1021 sal_uInt32 ToggleUnicodeCodepoint::CharsToDelete()
1023 OUString sIn = StringToReplace();
1024 sal_Int32 nPos = 0;
1025 sal_uInt32 counter = 0;
1026 while( nPos < sIn.getLength() )
1028 sIn.iterateCodePoints(&nPos);
1029 ++counter;
1031 return counter;
1034 OUString ToggleUnicodeCodepoint::ReplacementString()
1036 OUString sIn = StringToReplace();
1037 OUStringBuffer output = "";
1038 sal_Int32 nUPlus = sIn.indexOf("U+");
1039 // convert from hex notation to glyph
1040 if( nUPlus != -1 || (sIn.getLength() > 1 && mbIsHexString) )
1042 sal_uInt32 nUnicode = 0;
1043 if( nUPlus == 0)
1045 sIn = sIn.copy(2);
1046 nUPlus = sIn.indexOf("U+");
1048 while( nUPlus > 0 )
1050 nUnicode = sIn.copy(0, nUPlus).toUInt32(16);
1051 output.appendUtf32( nUnicode );
1053 sIn = sIn.copy(nUPlus+2);
1054 nUPlus = sIn.indexOf("U+");
1056 nUnicode = sIn.toUInt32(16);
1057 output.appendUtf32( nUnicode );
1059 // convert from glyph to hex notation
1060 else
1062 sal_Int32 nPos = 0;
1063 while( nPos < sIn.getLength() )
1065 OUStringBuffer aTmp = OUString::number(sIn.iterateCodePoints(&nPos),16);
1066 //pad with zeros - minimum length of 4.
1067 for( sal_Int32 i = 4 - aTmp.getLength(); i > 0; --i )
1068 aTmp.insert( 0,"0" );
1069 output.append( "U+" );
1070 output.append( aTmp );
1073 return output.toString();
1076 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */