Version 7.6.3.2-android, tag libreoffice-7.6.3.2-android
[LibreOffice.git] / i18nutil / source / utility / unicode.cxx
blobb98fa9cb29c40e0fd09f9ea4deeadc8374bea3d3
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <com/sun/star/i18n/UnicodeType.hpp>
21 #include <com/sun/star/i18n/ScriptType.hpp>
22 #include <i18nlangtag/languagetag.hxx>
23 #include <i18nlangtag/languagetagicu.hxx>
24 #include <i18nutil/unicode.hxx>
25 #include <sal/log.hxx>
26 #include <unicode/numfmt.h>
27 #include <unicode/uchar.h>
28 #include "unicode_data.h"
29 #include <rtl/character.hxx>
30 #include <o3tl/string_view.hxx>
31 #include <memory>
33 // Workaround for glibc braindamage:
34 // glibc 2.4's langinfo.h does "#define CURRENCY_SYMBOL __CURRENCY_SYMBOL"
35 // which (obviously) breaks UnicodeType::CURRENCY_SYMBOL
36 #undef CURRENCY_SYMBOL
38 using namespace ::com::sun::star::i18n;
40 template<class L, typename T>
41 static T getScriptType( const sal_Unicode ch, const L* typeList, T unknownType ) {
43 sal_Int16 i = 0;
44 css::i18n::UnicodeScript type = typeList[0].to;
45 while (type < UnicodeScript_kScriptCount && ch > UnicodeScriptType[static_cast<int>(type)][UnicodeScriptTypeTo]) {
46 type = typeList[++i].to;
49 return (type < UnicodeScript_kScriptCount &&
50 ch >= UnicodeScriptType[static_cast<int>(typeList[i].from)][int(UnicodeScriptTypeFrom)]) ?
51 typeList[i].value : unknownType;
54 sal_Int16
55 unicode::getUnicodeScriptType( const sal_Unicode ch, const ScriptTypeList* typeList, sal_Int16 unknownType ) {
56 return getScriptType(ch, typeList, unknownType);
59 sal_Unicode
60 unicode::getUnicodeScriptStart( UnicodeScript type) {
61 return UnicodeScriptType[static_cast<int>(type)][UnicodeScriptTypeFrom];
64 sal_Unicode
65 unicode::getUnicodeScriptEnd( UnicodeScript type) {
66 return UnicodeScriptType[static_cast<int>(type)][UnicodeScriptTypeTo];
69 sal_Int16
70 unicode::getUnicodeType( const sal_Unicode ch ) {
71 static sal_Unicode c = 0x00;
72 static sal_Int16 r = 0x00;
74 if (ch == c) return r;
75 else c = ch;
77 sal_Int16 address = UnicodeTypeIndex[ch >> 8];
78 r = static_cast<sal_Int16>(
79 (address < UnicodeTypeNumberBlock)
80 ? UnicodeTypeBlockValue[address]
81 : UnicodeTypeValue[((address - UnicodeTypeNumberBlock) << 8) + (ch & 0xff)]);
82 return r;
85 sal_uInt8
86 unicode::getUnicodeDirection( const sal_Unicode ch ) {
87 static sal_Unicode c = 0x00;
88 static sal_uInt8 r = 0x00;
90 if (ch == c) return r;
91 else c = ch;
93 sal_Int16 address = UnicodeDirectionIndex[ch >> 8];
94 r = (address < UnicodeDirectionNumberBlock)
95 ? UnicodeDirectionBlockValue[address]
96 : UnicodeDirectionValue[((address - UnicodeDirectionNumberBlock) << 8) + (ch & 0xff)];
97 return r;
100 sal_uInt32 unicode::GetMirroredChar(sal_uInt32 nChar) {
101 nChar = u_charMirror(nChar);
102 return nChar;
105 #define bit(name) (1U << name)
107 #define UPPERMASK bit(UnicodeType::UPPERCASE_LETTER)
109 #define LOWERMASK bit(UnicodeType::LOWERCASE_LETTER)
111 #define TITLEMASK bit(UnicodeType::TITLECASE_LETTER)
113 #define ALPHAMASK UPPERMASK|LOWERMASK|TITLEMASK|\
114 bit(UnicodeType::MODIFIER_LETTER)|\
115 bit(UnicodeType::OTHER_LETTER)
117 #define SPACEMASK bit(UnicodeType::SPACE_SEPARATOR)|\
118 bit(UnicodeType::LINE_SEPARATOR)|\
119 bit(UnicodeType::PARAGRAPH_SEPARATOR)
121 #define CONTROLMASK bit(UnicodeType::CONTROL)|\
122 bit(UnicodeType::FORMAT)|\
123 bit(UnicodeType::LINE_SEPARATOR)|\
124 bit(UnicodeType::PARAGRAPH_SEPARATOR)
126 #define IsType(func, mask) \
127 bool func( const sal_Unicode ch) {\
128 return (bit(getUnicodeType(ch)) & (mask)) != 0;\
131 IsType(unicode::isControl, CONTROLMASK)
132 IsType(unicode::isAlpha, ALPHAMASK)
133 IsType(unicode::isSpace, SPACEMASK)
135 #define CONTROLSPACE bit(0x09)|bit(0x0a)|bit(0x0b)|bit(0x0c)|bit(0x0d)|\
136 bit(0x1c)|bit(0x1d)|bit(0x1e)|bit(0x1f)
138 bool unicode::isWhiteSpace( const sal_Unicode ch) {
139 return (ch != 0xa0 && isSpace(ch)) || (ch <= 0x1F && (bit(ch) & (CONTROLSPACE)));
142 sal_Int16 unicode::getScriptClassFromUScriptCode(UScriptCode eScript)
144 //See unicode/uscript.h
145 sal_Int16 nRet;
146 switch (eScript)
148 case USCRIPT_INVALID_CODE:
149 case USCRIPT_COMMON:
150 case USCRIPT_INHERITED:
151 case USCRIPT_UNWRITTEN_LANGUAGES:
152 case USCRIPT_UNKNOWN:
153 case USCRIPT_MATHEMATICAL_NOTATION:
154 case USCRIPT_SYMBOLS:
155 case USCRIPT_WARANG_CITI:
156 nRet = ScriptType::WEAK;
157 break;
158 case USCRIPT_ARMENIAN:
159 case USCRIPT_CHEROKEE:
160 case USCRIPT_COPTIC:
161 case USCRIPT_CYRILLIC:
162 case USCRIPT_GEORGIAN:
163 case USCRIPT_GOTHIC:
164 case USCRIPT_GREEK:
165 case USCRIPT_LATIN:
166 case USCRIPT_OGHAM:
167 case USCRIPT_OLD_ITALIC:
168 case USCRIPT_RUNIC:
169 case USCRIPT_CANADIAN_ABORIGINAL:
170 case USCRIPT_BRAILLE:
171 case USCRIPT_CYPRIOT:
172 case USCRIPT_OSMANYA:
173 case USCRIPT_SHAVIAN:
174 case USCRIPT_KATAKANA_OR_HIRAGANA:
175 case USCRIPT_GLAGOLITIC:
176 case USCRIPT_CIRTH:
177 case USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC:
178 case USCRIPT_OLD_HUNGARIAN:
179 case USCRIPT_LATIN_FRAKTUR:
180 case USCRIPT_LATIN_GAELIC:
181 nRet = ScriptType::LATIN;
182 break;
183 case USCRIPT_BOPOMOFO:
184 case USCRIPT_HAN:
185 case USCRIPT_HANGUL:
186 case USCRIPT_HIRAGANA:
187 case USCRIPT_KATAKANA:
188 case USCRIPT_YI:
189 case USCRIPT_SIMPLIFIED_HAN:
190 case USCRIPT_TRADITIONAL_HAN:
191 case USCRIPT_JAPANESE:
192 case USCRIPT_KOREAN:
193 #if (U_ICU_VERSION_MAJOR_NUM > 4) || (U_ICU_VERSION_MAJOR_NUM == 4 && U_ICU_VERSION_MINOR_NUM >= 8)
194 case USCRIPT_TANGUT:
195 #endif
196 #if (U_ICU_VERSION_MAJOR_NUM >= 66)
197 case USCRIPT_KHITAN_SMALL_SCRIPT:
198 #endif
199 nRet = ScriptType::ASIAN;
200 break;
201 case USCRIPT_ARABIC:
202 case USCRIPT_BENGALI:
203 case USCRIPT_DESERET:
204 case USCRIPT_DEVANAGARI:
205 case USCRIPT_ETHIOPIC:
206 case USCRIPT_GUJARATI:
207 case USCRIPT_GURMUKHI:
208 case USCRIPT_HEBREW:
209 case USCRIPT_KANNADA:
210 case USCRIPT_KHMER:
211 case USCRIPT_LAO:
212 case USCRIPT_MALAYALAM:
213 case USCRIPT_MONGOLIAN:
214 case USCRIPT_MYANMAR:
215 case USCRIPT_ORIYA:
216 case USCRIPT_SINHALA:
217 case USCRIPT_SYRIAC:
218 case USCRIPT_TAMIL:
219 case USCRIPT_TELUGU:
220 case USCRIPT_THAANA:
221 case USCRIPT_THAI:
222 case USCRIPT_TIBETAN:
223 case USCRIPT_TAGALOG:
224 case USCRIPT_HANUNOO:
225 case USCRIPT_BUHID:
226 case USCRIPT_TAGBANWA:
227 case USCRIPT_LIMBU:
228 case USCRIPT_LINEAR_B:
229 case USCRIPT_TAI_LE:
230 case USCRIPT_UGARITIC:
231 case USCRIPT_BUGINESE:
232 case USCRIPT_KHAROSHTHI:
233 case USCRIPT_SYLOTI_NAGRI:
234 case USCRIPT_NEW_TAI_LUE:
235 case USCRIPT_TIFINAGH:
236 case USCRIPT_OLD_PERSIAN:
237 case USCRIPT_BALINESE:
238 case USCRIPT_BATAK:
239 case USCRIPT_BLISSYMBOLS:
240 case USCRIPT_BRAHMI:
241 case USCRIPT_CHAM:
242 case USCRIPT_DEMOTIC_EGYPTIAN:
243 case USCRIPT_HIERATIC_EGYPTIAN:
244 case USCRIPT_EGYPTIAN_HIEROGLYPHS:
245 case USCRIPT_KHUTSURI:
246 case USCRIPT_PAHAWH_HMONG:
247 case USCRIPT_HARAPPAN_INDUS:
248 case USCRIPT_JAVANESE:
249 case USCRIPT_KAYAH_LI:
250 case USCRIPT_LEPCHA:
251 case USCRIPT_LINEAR_A:
252 case USCRIPT_MANDAEAN:
253 case USCRIPT_MAYAN_HIEROGLYPHS:
254 case USCRIPT_MEROITIC:
255 case USCRIPT_NKO:
256 case USCRIPT_ORKHON:
257 case USCRIPT_OLD_PERMIC:
258 case USCRIPT_PHAGS_PA:
259 case USCRIPT_PHOENICIAN:
260 case USCRIPT_PHONETIC_POLLARD:
261 case USCRIPT_RONGORONGO:
262 case USCRIPT_SARATI:
263 case USCRIPT_ESTRANGELO_SYRIAC:
264 case USCRIPT_WESTERN_SYRIAC:
265 case USCRIPT_EASTERN_SYRIAC:
266 case USCRIPT_TENGWAR:
267 case USCRIPT_VAI:
268 case USCRIPT_VISIBLE_SPEECH:
269 case USCRIPT_CUNEIFORM:
270 case USCRIPT_CARIAN:
271 case USCRIPT_LANNA:
272 case USCRIPT_LYCIAN:
273 case USCRIPT_LYDIAN:
274 case USCRIPT_OL_CHIKI:
275 case USCRIPT_REJANG:
276 case USCRIPT_SAURASHTRA:
277 case USCRIPT_SIGN_WRITING:
278 case USCRIPT_SUNDANESE:
279 case USCRIPT_MOON:
280 case USCRIPT_MEITEI_MAYEK:
281 case USCRIPT_IMPERIAL_ARAMAIC:
282 case USCRIPT_AVESTAN:
283 case USCRIPT_CHAKMA:
284 case USCRIPT_KAITHI:
285 case USCRIPT_MANICHAEAN:
286 case USCRIPT_INSCRIPTIONAL_PAHLAVI:
287 case USCRIPT_PSALTER_PAHLAVI:
288 case USCRIPT_BOOK_PAHLAVI:
289 case USCRIPT_INSCRIPTIONAL_PARTHIAN:
290 case USCRIPT_SAMARITAN:
291 case USCRIPT_TAI_VIET:
292 case USCRIPT_BAMUM:
293 case USCRIPT_LISU:
294 case USCRIPT_NAKHI_GEBA:
295 case USCRIPT_OLD_SOUTH_ARABIAN:
296 case USCRIPT_BASSA_VAH:
297 case USCRIPT_DUPLOYAN_SHORTAND:
298 case USCRIPT_ELBASAN:
299 case USCRIPT_GRANTHA:
300 case USCRIPT_KPELLE:
301 case USCRIPT_LOMA:
302 case USCRIPT_MENDE:
303 case USCRIPT_MEROITIC_CURSIVE:
304 case USCRIPT_OLD_NORTH_ARABIAN:
305 case USCRIPT_NABATAEAN:
306 case USCRIPT_PALMYRENE:
307 case USCRIPT_SINDHI:
308 default: // anything new is going to be pretty wild
309 nRet = ScriptType::COMPLEX;
310 break;
312 return nRet;
315 sal_Int16 unicode::getScriptClassFromLanguageTag( const LanguageTag& rLanguageTag )
317 constexpr int32_t nBuf = 42;
318 UScriptCode aBuf[nBuf];
319 if (rLanguageTag.hasScript())
321 aBuf[0] = static_cast<UScriptCode>(u_getPropertyValueEnum( UCHAR_SCRIPT,
322 OUStringToOString( rLanguageTag.getScript(), RTL_TEXTENCODING_ASCII_US).getStr()));
324 else
326 OUString aName;
327 if (rLanguageTag.getCountry().isEmpty())
328 aName = rLanguageTag.getLanguage();
329 else
330 aName = rLanguageTag.getLanguage() + "-" + rLanguageTag.getCountry();
331 UErrorCode status = U_ZERO_ERROR;
332 const int32_t nScripts = uscript_getCode(
333 OUStringToOString( aName, RTL_TEXTENCODING_ASCII_US).getStr(),
334 aBuf, nBuf, &status);
335 // U_BUFFER_OVERFLOW_ERROR would be set with too many scripts for buffer
336 // and required capacity returned, but really..
337 if (nScripts == 0 || !U_SUCCESS(status))
338 return css::i18n::ScriptType::LATIN;
340 return getScriptClassFromUScriptCode( aBuf[0]);
343 OString unicode::getExemplarLanguageForUScriptCode(UScriptCode eScript)
345 OString sRet;
346 switch (eScript)
348 case USCRIPT_CODE_LIMIT:
349 case USCRIPT_INVALID_CODE:
350 sRet = "zxx";
351 break;
352 case USCRIPT_COMMON:
353 case USCRIPT_INHERITED:
354 sRet = "und";
355 break;
356 case USCRIPT_MATHEMATICAL_NOTATION:
357 case USCRIPT_SYMBOLS:
358 sRet = "zxx";
359 break;
360 case USCRIPT_UNWRITTEN_LANGUAGES:
361 case USCRIPT_UNKNOWN:
362 sRet = "und";
363 break;
364 case USCRIPT_ARABIC:
365 sRet = "ar";
366 break;
367 case USCRIPT_ARMENIAN:
368 sRet = "hy";
369 break;
370 case USCRIPT_BENGALI:
371 sRet = "bn";
372 break;
373 case USCRIPT_BOPOMOFO:
374 sRet = "zh";
375 break;
376 case USCRIPT_CHEROKEE:
377 sRet = "chr";
378 break;
379 case USCRIPT_COPTIC:
380 sRet = "cop";
381 break;
382 case USCRIPT_CYRILLIC:
383 sRet = "ru";
384 break;
385 case USCRIPT_DESERET:
386 sRet = "en";
387 break;
388 case USCRIPT_DEVANAGARI:
389 sRet = "hi";
390 break;
391 case USCRIPT_ETHIOPIC:
392 sRet = "am";
393 break;
394 case USCRIPT_GEORGIAN:
395 sRet = "ka";
396 break;
397 case USCRIPT_GOTHIC:
398 sRet = "got";
399 break;
400 case USCRIPT_GREEK:
401 sRet = "el";
402 break;
403 case USCRIPT_GUJARATI:
404 sRet = "gu";
405 break;
406 case USCRIPT_GURMUKHI:
407 sRet = "pa";
408 break;
409 case USCRIPT_HAN:
410 sRet = "zh";
411 break;
412 case USCRIPT_HANGUL:
413 sRet = "ko";
414 break;
415 case USCRIPT_HEBREW:
416 sRet = "hr";
417 break;
418 case USCRIPT_HIRAGANA:
419 sRet = "ja";
420 break;
421 case USCRIPT_KANNADA:
422 sRet = "kn";
423 break;
424 case USCRIPT_KATAKANA:
425 sRet = "ja";
426 break;
427 case USCRIPT_KHMER:
428 sRet = "km";
429 break;
430 case USCRIPT_LAO:
431 sRet = "lo";
432 break;
433 case USCRIPT_LATIN:
434 sRet = "en";
435 break;
436 case USCRIPT_MALAYALAM:
437 sRet = "ml";
438 break;
439 case USCRIPT_MONGOLIAN:
440 sRet = "mn";
441 break;
442 case USCRIPT_MYANMAR:
443 sRet = "my";
444 break;
445 case USCRIPT_OGHAM:
446 sRet = "pgl";
447 break;
448 case USCRIPT_OLD_ITALIC:
449 sRet = "osc";
450 break;
451 case USCRIPT_ORIYA:
452 sRet = "or";
453 break;
454 case USCRIPT_RUNIC:
455 sRet = "ang";
456 break;
457 case USCRIPT_SINHALA:
458 sRet = "si";
459 break;
460 case USCRIPT_SYRIAC:
461 sRet = "syr";
462 break;
463 case USCRIPT_TAMIL:
464 sRet = "ta";
465 break;
466 case USCRIPT_TELUGU:
467 sRet = "te";
468 break;
469 case USCRIPT_THAANA:
470 sRet = "dv";
471 break;
472 case USCRIPT_THAI:
473 sRet = "th";
474 break;
475 case USCRIPT_TIBETAN:
476 sRet = "bo";
477 break;
478 case USCRIPT_CANADIAN_ABORIGINAL:
479 sRet = "iu";
480 break;
481 case USCRIPT_YI:
482 sRet = "ii";
483 break;
484 case USCRIPT_TAGALOG:
485 sRet = "tl";
486 break;
487 case USCRIPT_HANUNOO:
488 sRet = "hnn";
489 break;
490 case USCRIPT_BUHID:
491 sRet = "bku";
492 break;
493 case USCRIPT_TAGBANWA:
494 sRet = "tbw";
495 break;
496 case USCRIPT_BRAILLE:
497 sRet = "en";
498 break;
499 case USCRIPT_CYPRIOT:
500 sRet = "ecy";
501 break;
502 case USCRIPT_LIMBU:
503 sRet = "lif";
504 break;
505 case USCRIPT_LINEAR_B:
506 sRet = "gmy";
507 break;
508 case USCRIPT_OSMANYA:
509 sRet = "so";
510 break;
511 case USCRIPT_SHAVIAN:
512 sRet = "en";
513 break;
514 case USCRIPT_TAI_LE:
515 sRet = "tdd";
516 break;
517 case USCRIPT_UGARITIC:
518 sRet = "uga";
519 break;
520 case USCRIPT_KATAKANA_OR_HIRAGANA:
521 sRet = "ja";
522 break;
523 case USCRIPT_BUGINESE:
524 sRet = "bug";
525 break;
526 case USCRIPT_GLAGOLITIC:
527 sRet = "ch";
528 break;
529 case USCRIPT_KHAROSHTHI:
530 sRet = "pra";
531 break;
532 case USCRIPT_SYLOTI_NAGRI:
533 sRet = "syl";
534 break;
535 case USCRIPT_NEW_TAI_LUE:
536 sRet = "khb";
537 break;
538 case USCRIPT_TIFINAGH:
539 sRet = "tmh";
540 break;
541 case USCRIPT_OLD_PERSIAN:
542 sRet = "peo";
543 break;
544 case USCRIPT_BALINESE:
545 sRet = "ban";
546 break;
547 case USCRIPT_BATAK:
548 sRet = "btk";
549 break;
550 case USCRIPT_BLISSYMBOLS:
551 sRet = "en";
552 break;
553 case USCRIPT_BRAHMI:
554 sRet = "pra";
555 break;
556 case USCRIPT_CHAM:
557 sRet = "cja";
558 break;
559 case USCRIPT_CIRTH:
560 sRet = "sjn";
561 break;
562 case USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC:
563 sRet = "cu";
564 break;
565 case USCRIPT_DEMOTIC_EGYPTIAN:
566 case USCRIPT_HIERATIC_EGYPTIAN:
567 case USCRIPT_EGYPTIAN_HIEROGLYPHS:
568 sRet = "egy";
569 break;
570 case USCRIPT_KHUTSURI:
571 sRet = "ka";
572 break;
573 case USCRIPT_SIMPLIFIED_HAN:
574 sRet = "zh";
575 break;
576 case USCRIPT_TRADITIONAL_HAN:
577 sRet = "zh";
578 break;
579 case USCRIPT_PAHAWH_HMONG:
580 sRet = "blu";
581 break;
582 case USCRIPT_OLD_HUNGARIAN:
583 sRet = "ohu";
584 break;
585 case USCRIPT_HARAPPAN_INDUS:
586 sRet = "xiv";
587 break;
588 case USCRIPT_JAVANESE:
589 sRet = "kaw";
590 break;
591 case USCRIPT_KAYAH_LI:
592 sRet = "eky";
593 break;
594 case USCRIPT_LATIN_FRAKTUR:
595 sRet = "de";
596 break;
597 case USCRIPT_LATIN_GAELIC:
598 sRet = "ga";
599 break;
600 case USCRIPT_LEPCHA:
601 sRet = "lep";
602 break;
603 case USCRIPT_LINEAR_A:
604 sRet = "ecr";
605 break;
606 case USCRIPT_MAYAN_HIEROGLYPHS:
607 sRet = "myn";
608 break;
609 case USCRIPT_MEROITIC:
610 sRet = "xmr";
611 break;
612 case USCRIPT_NKO:
613 sRet = "nqo";
614 break;
615 case USCRIPT_ORKHON:
616 sRet = "otk";
617 break;
618 case USCRIPT_OLD_PERMIC:
619 sRet = "kv";
620 break;
621 case USCRIPT_PHAGS_PA:
622 sRet = "xng";
623 break;
624 case USCRIPT_PHOENICIAN:
625 sRet = "phn";
626 break;
627 case USCRIPT_PHONETIC_POLLARD:
628 sRet = "hmd";
629 break;
630 case USCRIPT_RONGORONGO:
631 sRet = "rap";
632 break;
633 case USCRIPT_SARATI:
634 sRet = "qya";
635 break;
636 case USCRIPT_ESTRANGELO_SYRIAC:
637 sRet = "syr";
638 break;
639 case USCRIPT_WESTERN_SYRIAC:
640 sRet = "tru";
641 break;
642 case USCRIPT_EASTERN_SYRIAC:
643 sRet = "aii";
644 break;
645 case USCRIPT_TENGWAR:
646 sRet = "sjn";
647 break;
648 case USCRIPT_VAI:
649 sRet = "vai";
650 break;
651 case USCRIPT_VISIBLE_SPEECH:
652 sRet = "en";
653 break;
654 case USCRIPT_CUNEIFORM:
655 sRet = "akk";
656 break;
657 case USCRIPT_CARIAN:
658 sRet = "xcr";
659 break;
660 case USCRIPT_JAPANESE:
661 sRet = "ja";
662 break;
663 case USCRIPT_LANNA:
664 sRet = "nod";
665 break;
666 case USCRIPT_LYCIAN:
667 sRet = "xlc";
668 break;
669 case USCRIPT_LYDIAN:
670 sRet = "xld";
671 break;
672 case USCRIPT_OL_CHIKI:
673 sRet = "sat";
674 break;
675 case USCRIPT_REJANG:
676 sRet = "rej";
677 break;
678 case USCRIPT_SAURASHTRA:
679 sRet = "saz";
680 break;
681 case USCRIPT_SIGN_WRITING:
682 sRet = "en";
683 break;
684 case USCRIPT_SUNDANESE:
685 sRet = "su";
686 break;
687 case USCRIPT_MOON:
688 sRet = "en";
689 break;
690 case USCRIPT_MEITEI_MAYEK:
691 sRet = "mni";
692 break;
693 case USCRIPT_IMPERIAL_ARAMAIC:
694 sRet = "arc";
695 break;
696 case USCRIPT_AVESTAN:
697 sRet = "ae";
698 break;
699 case USCRIPT_CHAKMA:
700 sRet = "ccp";
701 break;
702 case USCRIPT_KOREAN:
703 sRet = "ko";
704 break;
705 case USCRIPT_KAITHI:
706 sRet = "awa";
707 break;
708 case USCRIPT_MANICHAEAN:
709 sRet = "xmn";
710 break;
711 case USCRIPT_INSCRIPTIONAL_PAHLAVI:
712 case USCRIPT_PSALTER_PAHLAVI:
713 case USCRIPT_BOOK_PAHLAVI:
714 case USCRIPT_INSCRIPTIONAL_PARTHIAN:
715 sRet = "xpr";
716 break;
717 case USCRIPT_SAMARITAN:
718 sRet = "heb";
719 break;
720 case USCRIPT_TAI_VIET:
721 sRet = "blt";
722 break;
723 case USCRIPT_MANDAEAN: /* Aliased to USCRIPT_MANDAIC in icu 4.6. */
724 sRet = "mic";
725 break;
726 case USCRIPT_NABATAEAN:
727 sRet = "mis-Nbat"; // Uncoded with script
728 break;
729 case USCRIPT_PALMYRENE:
730 sRet = "mis-Palm"; // Uncoded with script
731 break;
732 case USCRIPT_BAMUM:
733 sRet = "bax";
734 break;
735 case USCRIPT_LISU:
736 sRet = "lis";
737 break;
738 case USCRIPT_NAKHI_GEBA:
739 sRet = "nxq";
740 break;
741 case USCRIPT_OLD_SOUTH_ARABIAN:
742 sRet = "xsa";
743 break;
744 case USCRIPT_BASSA_VAH:
745 sRet = "bsq";
746 break;
747 case USCRIPT_DUPLOYAN_SHORTAND:
748 sRet = "fr";
749 break;
750 case USCRIPT_ELBASAN:
751 sRet = "sq";
752 break;
753 case USCRIPT_GRANTHA:
754 sRet = "ta";
755 break;
756 case USCRIPT_KPELLE:
757 sRet = "kpe";
758 break;
759 case USCRIPT_LOMA:
760 sRet = "lom";
761 break;
762 case USCRIPT_MENDE:
763 sRet = "men";
764 break;
765 case USCRIPT_MEROITIC_CURSIVE:
766 sRet = "xmr";
767 break;
768 case USCRIPT_OLD_NORTH_ARABIAN:
769 sRet = "xna";
770 break;
771 case USCRIPT_SINDHI:
772 sRet = "sd";
773 break;
774 case USCRIPT_WARANG_CITI:
775 sRet = "hoc";
776 break;
777 #if (U_ICU_VERSION_MAJOR_NUM > 4) || (U_ICU_VERSION_MAJOR_NUM == 4 && U_ICU_VERSION_MINOR_NUM >= 8)
778 case USCRIPT_AFAKA:
779 sRet = "djk";
780 break;
781 case USCRIPT_JURCHEN:
782 sRet = "juc";
783 break;
784 case USCRIPT_MRO:
785 sRet = "cmr";
786 break;
787 case USCRIPT_NUSHU:
788 sRet = "mis-Nshu"; // Uncoded with script
789 break;
790 case USCRIPT_SHARADA:
791 sRet = "sa";
792 break;
793 case USCRIPT_SORA_SOMPENG:
794 sRet = "srb";
795 break;
796 case USCRIPT_TAKRI:
797 sRet = "doi";
798 break;
799 case USCRIPT_TANGUT:
800 sRet = "txg";
801 break;
802 case USCRIPT_WOLEAI:
803 sRet = "woe";
804 break;
805 #endif
806 #if (U_ICU_VERSION_MAJOR_NUM >= 49)
807 case USCRIPT_ANATOLIAN_HIEROGLYPHS:
808 sRet = "hlu";
809 break;
810 case USCRIPT_KHOJKI:
811 sRet = "gu";
812 break;
813 case USCRIPT_TIRHUTA:
814 sRet = "mai";
815 break;
816 #endif
817 #if (U_ICU_VERSION_MAJOR_NUM >= 52)
818 case USCRIPT_CAUCASIAN_ALBANIAN:
819 sRet = "xag";
820 break;
821 case USCRIPT_MAHAJANI:
822 sRet = "mwr";
823 break;
824 #endif
825 #if (U_ICU_VERSION_MAJOR_NUM >= 54)
826 case USCRIPT_AHOM:
827 sRet = "aho";
828 break;
829 case USCRIPT_HATRAN:
830 sRet = "qly-Hatr";
831 break;
832 case USCRIPT_MODI:
833 sRet = "mr-Modi";
834 break;
835 case USCRIPT_MULTANI:
836 sRet = "skr-Mutl";
837 break;
838 case USCRIPT_PAU_CIN_HAU:
839 sRet = "ctd-Pauc";
840 break;
841 case USCRIPT_SIDDHAM:
842 sRet = "sa-Sidd";
843 break;
844 #endif
845 #if (U_ICU_VERSION_MAJOR_NUM >= 58)
846 case USCRIPT_ADLAM:
847 sRet = "mis-Adlm"; // Adlam for Fulani, no language code
848 break;
849 case USCRIPT_BHAIKSUKI:
850 sRet = "mis-Bhks"; // Bhaiksuki for some Buddhist texts, no language code
851 break;
852 case USCRIPT_MARCHEN:
853 sRet = "bo-Marc";
854 break;
855 case USCRIPT_NEWA:
856 sRet = "new-Newa";
857 break;
858 case USCRIPT_OSAGE:
859 sRet = "osa-Osge";
860 break;
861 case USCRIPT_HAN_WITH_BOPOMOFO:
862 sRet = "mis-Hanb"; // Han with Bopomofo, zh-Hanb ?
863 break;
864 case USCRIPT_JAMO:
865 sRet = "ko"; // Jamo - elements of Hangul Syllables
866 break;
867 case USCRIPT_SYMBOLS_EMOJI:
868 sRet = "mis-Zsye"; // Emoji variant
869 break;
870 #endif
871 #if (U_ICU_VERSION_MAJOR_NUM >= 60)
872 case USCRIPT_MASARAM_GONDI:
873 sRet = "gon-Gonm"; // macro language code, could be wsg,esg,gno
874 break;
875 case USCRIPT_SOYOMBO:
876 sRet = "mn-Soyo"; // abugida to write Mongolian, also Tibetan and Sanskrit
877 break;
878 case USCRIPT_ZANABAZAR_SQUARE:
879 sRet = "mn-Zanb"; // abugida to write Mongolian
880 break;
881 #endif
882 #if (U_ICU_VERSION_MAJOR_NUM >= 62)
883 case USCRIPT_DOGRA:
884 sRet = "dgo"; // Dogri proper
885 break;
886 case USCRIPT_GUNJALA_GONDI:
887 sRet = "wsg"; // Adilabad Gondi
888 break;
889 case USCRIPT_MAKASAR:
890 sRet = "mak";
891 break;
892 case USCRIPT_MEDEFAIDRIN:
893 sRet = "dmf-Medf";
894 break;
895 case USCRIPT_HANIFI_ROHINGYA:
896 sRet = "rhg";
897 break;
898 case USCRIPT_SOGDIAN:
899 sRet = "sog";
900 break;
901 case USCRIPT_OLD_SOGDIAN:
902 sRet = "sog";
903 break;
904 #endif
905 #if (U_ICU_VERSION_MAJOR_NUM >= 64)
906 case USCRIPT_ELYMAIC:
907 sRet = "arc-Elym";
908 break;
909 case USCRIPT_NYIAKENG_PUACHUE_HMONG:
910 sRet = "hmn-Hmnp"; // macrolanguage code
911 break;
912 case USCRIPT_NANDINAGARI:
913 sRet = "sa-Nand";
914 break;
915 case USCRIPT_WANCHO:
916 sRet = "nnp-Wcho";
917 break;
918 #endif
919 #if (U_ICU_VERSION_MAJOR_NUM >= 66)
920 case USCRIPT_CHORASMIAN:
921 sRet = "xco-Chrs";
922 break;
923 case USCRIPT_DIVES_AKURU:
924 sRet = "dv-Diak";
925 break;
926 case USCRIPT_KHITAN_SMALL_SCRIPT:
927 sRet = "zkt-Kits";
928 break;
929 case USCRIPT_YEZIDI:
930 sRet = "kmr-Yezi";
931 break;
932 #endif
933 #if (U_ICU_VERSION_MAJOR_NUM >= 70)
934 case USCRIPT_CYPRO_MINOAN:
935 sRet = "mis-Cpmn"; // Uncoded with script
936 break;
937 case USCRIPT_OLD_UYGHUR:
938 sRet = "oui-Ougr";
939 break;
940 case USCRIPT_TANGSA:
941 sRet = "nst-Tnsa";
942 break;
943 case USCRIPT_TOTO:
944 sRet = "txo-Toto";
945 break;
946 case USCRIPT_VITHKUQI:
947 sRet = "sq-Vith"; // macrolanguage code
948 break;
949 #endif
950 #if (U_ICU_VERSION_MAJOR_NUM >= 72)
951 case USCRIPT_KAWI:
952 sRet = "mis-Kawi"; // Uncoded with script
953 break;
954 case USCRIPT_NAG_MUNDARI:
955 sRet = "unr-Nagm";
956 break;
957 #endif
959 return sRet;
962 //Format a number as a percentage according to the rules of the given
963 //language, e.g. 100 -> "100%" for en-US vs "100 %" for de-DE
964 OUString unicode::formatPercent(double dNumber,
965 const LanguageTag &rLangTag)
967 // get a currency formatter for this locale ID
968 UErrorCode errorCode=U_ZERO_ERROR;
970 LanguageTag aLangTag(rLangTag);
972 // As of CLDR Version 24 these languages were not listed as using spacing
973 // between number and % but are reported as such by our l10n groups
974 // http://www.unicode.org/cldr/charts/24/by_type/numbers.number_formatting_patterns.html
975 // so format using French which has the desired rules
976 if (aLangTag.getLanguage() == "es" || aLangTag.getLanguage() == "sl")
977 aLangTag.reset("fr-FR");
979 icu::Locale aLocale = LanguageTagIcu::getIcuLocale(aLangTag);
981 std::unique_ptr<icu::NumberFormat> xF(
982 icu::NumberFormat::createPercentInstance(aLocale, errorCode));
983 if(U_FAILURE(errorCode))
985 SAL_WARN("i18n", "icu::NumberFormat::createPercentInstance failed");
986 return OUString::number(dNumber) + "%";
989 icu::UnicodeString output;
990 xF->format(dNumber/100, output);
991 OUString aRet(reinterpret_cast<const sal_Unicode *>(output.getBuffer()),
992 output.length());
993 if (rLangTag.getLanguage() == "de")
995 //narrow no-break space instead of (normal) no-break space
996 return aRet.replace(0x00A0, 0x202F);
998 return aRet;
1001 bool ToggleUnicodeCodepoint::AllowMoreInput(sal_Unicode uChar)
1003 //arbitrarily chosen maximum length allowed - normal max usage would be around 30.
1004 if( maInput.getLength() > 255 )
1005 mbAllowMoreChars = false;
1007 if( !mbAllowMoreChars )
1008 return false;
1010 bool bPreventNonHex = false;
1011 if( maInput.indexOf("U+") != -1 )
1012 bPreventNonHex = true;
1014 switch ( unicode::getUnicodeType(uChar) )
1016 case css::i18n::UnicodeType::SURROGATE:
1017 if( bPreventNonHex )
1019 mbAllowMoreChars = false;
1020 return false;
1023 if( rtl::isLowSurrogate(uChar) && maUtf16.isEmpty() && maInput.isEmpty() )
1025 maUtf16.append(uChar);
1026 return true;
1028 if( rtl::isHighSurrogate(uChar) && maInput.isEmpty() )
1029 maUtf16.insert(0, uChar );
1030 //end of hex strings, or unexpected order of high/low, so don't accept more
1031 if( !maUtf16.isEmpty() )
1032 maInput.append(maUtf16);
1033 if( !maCombining.isEmpty() )
1034 maInput.append(maCombining);
1035 mbAllowMoreChars = false;
1036 break;
1038 case css::i18n::UnicodeType::NON_SPACING_MARK:
1039 case css::i18n::UnicodeType::COMBINING_SPACING_MARK:
1040 if( bPreventNonHex )
1042 mbAllowMoreChars = false;
1043 return false;
1046 //extreme edge case: already invalid high/low surrogates with preceding combining chars, and now an extra combining mark.
1047 if( !maUtf16.isEmpty() )
1049 maInput = maUtf16;
1050 if( !maCombining.isEmpty() )
1051 maInput.append(maCombining);
1052 mbAllowMoreChars = false;
1053 return false;
1055 maCombining.insert(0, uChar);
1056 break;
1058 default:
1059 //extreme edge case: already invalid high/low surrogates with preceding combining chars, and now an extra character.
1060 if( !maUtf16.isEmpty() )
1062 maInput = maUtf16;
1063 if( !maCombining.isEmpty() )
1064 maInput.append(maCombining);
1065 mbAllowMoreChars = false;
1066 return false;
1069 if( !maCombining.isEmpty() )
1071 maCombining.insert(0, uChar);
1072 maInput = maCombining;
1073 mbAllowMoreChars = false;
1074 return false;
1077 // 0 - 1f are control characters. Do not process those.
1078 if( uChar < 0x20 )
1080 mbAllowMoreChars = false;
1081 return false;
1084 switch( uChar )
1086 case 'u':
1087 case 'U':
1088 // U+ notation found. Continue looking for another one.
1089 if( mbRequiresU )
1091 mbRequiresU = false;
1092 maInput.insert(0,"U+");
1094 // treat as a normal character
1095 else
1097 mbAllowMoreChars = false;
1098 if( !bPreventNonHex )
1099 maInput.insertUtf32(0, uChar);
1101 break;
1102 case '+':
1103 // + already found: skip when not U, or edge case of +U+xxxx
1104 if( mbRequiresU || (maInput.indexOf("U+") == 0) )
1105 mbAllowMoreChars = false;
1106 // hex chars followed by '+' - now require a 'U'
1107 else if ( !maInput.isEmpty() )
1108 mbRequiresU = true;
1109 // treat as a normal character
1110 else
1112 mbAllowMoreChars = false;
1113 if( !bPreventNonHex )
1114 maInput.insertUtf32(0, uChar);
1116 break;
1117 default:
1118 // + already found. Since not U, cancel further input
1119 if( mbRequiresU )
1120 mbAllowMoreChars = false;
1121 // maximum digits per notation is 8: only one notation
1122 else if( maInput.indexOf("U+") == -1 && maInput.getLength() == 8 )
1123 mbAllowMoreChars = false;
1124 // maximum digits per notation is 8: previous notation found
1125 else if( maInput.indexOf("U+") == 8 )
1126 mbAllowMoreChars = false;
1127 // a hex character. Add to string.
1128 else if( rtl::isAsciiHexDigit(uChar) )
1130 mbIsHexString = true;
1131 maInput.insertUtf32(0, uChar);
1133 // not a hex character: stop input. keep if it is the first input provided
1134 else
1136 mbAllowMoreChars = false;
1137 if( maInput.isEmpty() )
1138 maInput.insertUtf32(0, uChar);
1142 return mbAllowMoreChars;
1145 OUString ToggleUnicodeCodepoint::StringToReplace()
1147 if( maInput.isEmpty() )
1149 //edge case - input finished with incomplete low surrogate or combining characters without a base
1150 if( mbAllowMoreChars )
1152 if( !maUtf16.isEmpty() )
1153 maInput = maUtf16;
1154 if( !maCombining.isEmpty() )
1155 maInput.append(maCombining);
1157 return maInput.toString();
1160 if( !mbIsHexString )
1161 return maInput.toString();
1163 //this function potentially modifies the input string. Prevent addition of further characters
1164 mbAllowMoreChars = false;
1166 //validate unicode notation.
1167 OUString sIn;
1168 sal_uInt32 nUnicode = 0;
1169 sal_Int32 nUPlus = maInput.indexOf("U+");
1170 //if U+ notation used, strip off all extra chars added not in U+ notation
1171 if( nUPlus != -1 )
1173 maInput.remove(0, nUPlus);
1174 sIn = maInput.copy(2).makeStringAndClear();
1175 nUPlus = sIn.indexOf("U+");
1177 else
1178 sIn = maInput.toString();
1179 while( nUPlus != -1 )
1181 nUnicode = o3tl::toUInt32(sIn.subView(0, nUPlus), 16);
1182 //prevent creating control characters or invalid Unicode values
1183 if( !rtl::isUnicodeCodePoint(nUnicode) || nUnicode < 0x20 )
1184 maInput = sIn.subView(nUPlus);
1185 sIn = sIn.copy(nUPlus+2);
1186 nUPlus = sIn.indexOf("U+");
1189 nUnicode = sIn.toUInt32(16);
1190 if( !rtl::isUnicodeCodePoint(nUnicode) || nUnicode < 0x20 )
1191 maInput.truncate().append( sIn[sIn.getLength()-1] );
1192 return maInput.toString();
1195 sal_uInt32 ToggleUnicodeCodepoint::CharsToDelete()
1197 OUString sIn = StringToReplace();
1198 sal_Int32 nPos = 0;
1199 sal_uInt32 counter = 0;
1200 while( nPos < sIn.getLength() )
1202 sIn.iterateCodePoints(&nPos);
1203 ++counter;
1205 return counter;
1208 OUString ToggleUnicodeCodepoint::ReplacementString()
1210 OUString sIn = StringToReplace();
1211 OUStringBuffer output = "";
1212 sal_Int32 nUPlus = sIn.indexOf("U+");
1213 // convert from hex notation to glyph
1214 if( nUPlus != -1 || (sIn.getLength() > 1 && mbIsHexString) )
1216 sal_uInt32 nUnicode = 0;
1217 if( nUPlus == 0)
1219 sIn = sIn.copy(2);
1220 nUPlus = sIn.indexOf("U+");
1222 while( nUPlus > 0 )
1224 nUnicode = o3tl::toUInt32(sIn.subView(0, nUPlus), 16);
1225 output.appendUtf32( nUnicode );
1227 sIn = sIn.copy(nUPlus+2);
1228 nUPlus = sIn.indexOf("U+");
1230 nUnicode = sIn.toUInt32(16);
1231 output.appendUtf32( nUnicode );
1233 // convert from glyph to hex notation
1234 else
1236 sal_Int32 nPos = 0;
1237 while( nPos < sIn.getLength() )
1239 OUStringBuffer aTmp = OUString::number(sIn.iterateCodePoints(&nPos),16);
1240 //pad with zeros - minimum length of 4.
1241 for( sal_Int32 i = 4 - aTmp.getLength(); i > 0; --i )
1242 aTmp.insert( 0,"0" );
1243 output.append( "U+" + aTmp );
1246 return output.makeStringAndClear();
1249 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */