1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <com/sun/star/i18n/UnicodeType.hpp>
21 #include <com/sun/star/i18n/ScriptType.hpp>
22 #include <i18nlangtag/languagetag.hxx>
23 #include <i18nlangtag/languagetagicu.hxx>
24 #include <i18nutil/unicode.hxx>
25 #include <sal/log.hxx>
26 #include <unicode/numfmt.h>
27 #include <unicode/uchar.h>
28 #include "unicode_data.h"
29 #include <rtl/character.hxx>
30 #include <o3tl/string_view.hxx>
33 // Workaround for glibc braindamage:
34 // glibc 2.4's langinfo.h does "#define CURRENCY_SYMBOL __CURRENCY_SYMBOL"
35 // which (obviously) breaks UnicodeType::CURRENCY_SYMBOL
36 #undef CURRENCY_SYMBOL
38 using namespace ::com::sun::star::i18n
;
40 template<class L
, typename T
>
41 static T
getScriptType( const sal_Unicode ch
, const L
* typeList
, T unknownType
) {
44 css::i18n::UnicodeScript type
= typeList
[0].to
;
45 while (type
< UnicodeScript_kScriptCount
&& ch
> UnicodeScriptType
[static_cast<int>(type
)][UnicodeScriptTypeTo
]) {
46 type
= typeList
[++i
].to
;
49 return (type
< UnicodeScript_kScriptCount
&&
50 ch
>= UnicodeScriptType
[static_cast<int>(typeList
[i
].from
)][int(UnicodeScriptTypeFrom
)]) ?
51 typeList
[i
].value
: unknownType
;
55 unicode::getUnicodeScriptType( const sal_Unicode ch
, const ScriptTypeList
* typeList
, sal_Int16 unknownType
) {
56 return getScriptType(ch
, typeList
, unknownType
);
60 unicode::getUnicodeScriptStart( UnicodeScript type
) {
61 return UnicodeScriptType
[static_cast<int>(type
)][UnicodeScriptTypeFrom
];
65 unicode::getUnicodeScriptEnd( UnicodeScript type
) {
66 return UnicodeScriptType
[static_cast<int>(type
)][UnicodeScriptTypeTo
];
70 unicode::getUnicodeType(const sal_uInt32 ch
)
72 static sal_uInt32 c
= 0x00;
73 static sal_uInt32 r
= 0x00;
75 if (ch
== c
) return r
;
78 switch (u_charType(ch
))
81 r
= css::i18n::UnicodeType::UNASSIGNED
;
83 case U_UPPERCASE_LETTER
:
84 r
= css::i18n::UnicodeType::UPPERCASE_LETTER
;
86 case U_LOWERCASE_LETTER
:
87 r
= css::i18n::UnicodeType::LOWERCASE_LETTER
;
89 case U_TITLECASE_LETTER
:
90 r
= css::i18n::UnicodeType::TITLECASE_LETTER
;
92 case U_MODIFIER_LETTER
:
93 r
= css::i18n::UnicodeType::MODIFIER_LETTER
;
96 r
= css::i18n::UnicodeType::OTHER_LETTER
;
98 case U_NON_SPACING_MARK
:
99 r
= css::i18n::UnicodeType::NON_SPACING_MARK
;
101 case U_ENCLOSING_MARK
:
102 r
= css::i18n::UnicodeType::ENCLOSING_MARK
;
104 case U_COMBINING_SPACING_MARK
:
105 r
= css::i18n::UnicodeType::COMBINING_SPACING_MARK
;
107 case U_DECIMAL_DIGIT_NUMBER
:
108 r
= css::i18n::UnicodeType::DECIMAL_DIGIT_NUMBER
;
110 case U_LETTER_NUMBER
:
111 r
= css::i18n::UnicodeType::LETTER_NUMBER
;
114 r
= css::i18n::UnicodeType::OTHER_NUMBER
;
116 case U_SPACE_SEPARATOR
:
117 r
= css::i18n::UnicodeType::SPACE_SEPARATOR
;
119 case U_LINE_SEPARATOR
:
120 r
= css::i18n::UnicodeType::LINE_SEPARATOR
;
122 case U_PARAGRAPH_SEPARATOR
:
123 r
= css::i18n::UnicodeType::PARAGRAPH_SEPARATOR
;
126 r
= css::i18n::UnicodeType::CONTROL
;
129 r
= css::i18n::UnicodeType::FORMAT
;
131 case U_PRIVATE_USE_CHAR
:
132 r
= css::i18n::UnicodeType::PRIVATE_USE
;
135 r
= css::i18n::UnicodeType::SURROGATE
;
137 case U_DASH_PUNCTUATION
:
138 r
= css::i18n::UnicodeType::DASH_PUNCTUATION
;
140 case U_INITIAL_PUNCTUATION
:
141 r
= css::i18n::UnicodeType::INITIAL_PUNCTUATION
;
143 case U_FINAL_PUNCTUATION
:
144 r
= css::i18n::UnicodeType::FINAL_PUNCTUATION
;
146 case U_CONNECTOR_PUNCTUATION
:
147 r
= css::i18n::UnicodeType::CONNECTOR_PUNCTUATION
;
149 case U_OTHER_PUNCTUATION
:
150 r
= css::i18n::UnicodeType::OTHER_PUNCTUATION
;
153 r
= css::i18n::UnicodeType::MATH_SYMBOL
;
155 case U_CURRENCY_SYMBOL
:
156 r
= css::i18n::UnicodeType::CURRENCY_SYMBOL
;
158 case U_MODIFIER_SYMBOL
:
159 r
= css::i18n::UnicodeType::MODIFIER_SYMBOL
;
162 r
= css::i18n::UnicodeType::OTHER_SYMBOL
;
164 case U_START_PUNCTUATION
:
165 r
= css::i18n::UnicodeType::START_PUNCTUATION
;
167 case U_END_PUNCTUATION
:
168 r
= css::i18n::UnicodeType::END_PUNCTUATION
;
176 unicode::getUnicodeDirection( const sal_Unicode ch
) {
177 static sal_Unicode c
= 0x00;
178 static sal_uInt8 r
= 0x00;
180 if (ch
== c
) return r
;
183 sal_Int16 address
= UnicodeDirectionIndex
[ch
>> 8];
184 r
= (address
< UnicodeDirectionNumberBlock
)
185 ? UnicodeDirectionBlockValue
[address
]
186 : UnicodeDirectionValue
[((address
- UnicodeDirectionNumberBlock
) << 8) + (ch
& 0xff)];
190 sal_uInt32
unicode::GetMirroredChar(sal_uInt32 nChar
) {
191 nChar
= u_charMirror(nChar
);
195 #define bit(name) (1U << name)
197 #define UPPERMASK bit(UnicodeType::UPPERCASE_LETTER)
199 #define LOWERMASK bit(UnicodeType::LOWERCASE_LETTER)
201 #define TITLEMASK bit(UnicodeType::TITLECASE_LETTER)
203 #define ALPHAMASK UPPERMASK|LOWERMASK|TITLEMASK|\
204 bit(UnicodeType::MODIFIER_LETTER)|\
205 bit(UnicodeType::OTHER_LETTER)
207 #define SPACEMASK bit(UnicodeType::SPACE_SEPARATOR)|\
208 bit(UnicodeType::LINE_SEPARATOR)|\
209 bit(UnicodeType::PARAGRAPH_SEPARATOR)
211 #define CONTROLMASK bit(UnicodeType::CONTROL)|\
212 bit(UnicodeType::FORMAT)|\
213 bit(UnicodeType::LINE_SEPARATOR)|\
214 bit(UnicodeType::PARAGRAPH_SEPARATOR)
216 #define IsType(func, mask) \
217 bool func( const sal_uInt32 ch) {\
218 return (bit(getUnicodeType(ch)) & (mask)) != 0;\
221 IsType(unicode::isControl
, CONTROLMASK
)
222 IsType(unicode::isAlpha
, ALPHAMASK
)
223 IsType(unicode::isSpace
, SPACEMASK
)
225 #define CONTROLSPACE bit(0x09)|bit(0x0a)|bit(0x0b)|bit(0x0c)|bit(0x0d)|\
226 bit(0x1c)|bit(0x1d)|bit(0x1e)|bit(0x1f)
228 bool unicode::isWhiteSpace(const sal_uInt32 ch
)
230 return (ch
!= 0xa0 && isSpace(ch
)) || (ch
<= 0x1F && (bit(ch
) & (CONTROLSPACE
)));
// Classify an ICU UScriptCode into one of the css::i18n::ScriptType classes.
// The switch below groups script codes into four buckets and stores the
// bucket in nRet: WEAK, LATIN, ASIAN or COMPLEX.  Script codes that are not
// listed explicitly take the default branch and are treated as COMPLEX.
// NOTE(review): nRet is presumably what the function returns - the return
// statement is not part of the lines shown here.
233 sal_Int16
unicode::getScriptClassFromUScriptCode(UScriptCode eScript
)
235 //See unicode/uscript.h
239 case USCRIPT_INVALID_CODE
:
241 case USCRIPT_INHERITED
:
242 case USCRIPT_UNWRITTEN_LANGUAGES
:
243 case USCRIPT_UNKNOWN
:
244 case USCRIPT_MATHEMATICAL_NOTATION
:
245 case USCRIPT_SYMBOLS
:
246 case USCRIPT_CODE_LIMIT
:
// Pseudo-scripts and sentinels carry no script class of their own.
247 nRet
= ScriptType::WEAK
;
249 case USCRIPT_ARMENIAN
:
250 case USCRIPT_CHEROKEE
:
252 case USCRIPT_CYRILLIC
:
253 case USCRIPT_GEORGIAN
:
258 case USCRIPT_OLD_ITALIC
:
260 case USCRIPT_CANADIAN_ABORIGINAL
:
261 case USCRIPT_BRAILLE
:
262 case USCRIPT_CYPRIOT
:
263 case USCRIPT_OSMANYA
:
264 case USCRIPT_SHAVIAN
:
265 case USCRIPT_KATAKANA_OR_HIRAGANA
:
266 case USCRIPT_GLAGOLITIC
:
268 case USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC
:
269 case USCRIPT_OLD_HUNGARIAN
:
270 case USCRIPT_LATIN_FRAKTUR
:
271 case USCRIPT_LATIN_GAELIC
:
// Everything listed above is handled as ScriptType::LATIN.
272 nRet
= ScriptType::LATIN
;
274 case USCRIPT_BOPOMOFO
:
277 case USCRIPT_HIRAGANA
:
278 case USCRIPT_KATAKANA
:
280 case USCRIPT_SIMPLIFIED_HAN
:
281 case USCRIPT_TRADITIONAL_HAN
:
282 case USCRIPT_JAPANESE
:
285 case USCRIPT_KHITAN_SMALL_SCRIPT
:
// Everything listed above is handled as ScriptType::ASIAN.
286 nRet
= ScriptType::ASIAN
;
289 case USCRIPT_BENGALI
:
290 case USCRIPT_DESERET
:
291 case USCRIPT_DEVANAGARI
:
292 case USCRIPT_ETHIOPIC
:
293 case USCRIPT_GUJARATI
:
294 case USCRIPT_GURMUKHI
:
296 case USCRIPT_KANNADA
:
299 case USCRIPT_MALAYALAM
:
300 case USCRIPT_MONGOLIAN
:
301 case USCRIPT_MYANMAR
:
303 case USCRIPT_SINHALA
:
309 case USCRIPT_TIBETAN
:
310 case USCRIPT_TAGALOG
:
311 case USCRIPT_HANUNOO
:
313 case USCRIPT_TAGBANWA
:
315 case USCRIPT_LINEAR_B
:
317 case USCRIPT_UGARITIC
:
318 case USCRIPT_BUGINESE
:
319 case USCRIPT_KHAROSHTHI
:
320 case USCRIPT_SYLOTI_NAGRI
:
321 case USCRIPT_NEW_TAI_LUE
:
322 case USCRIPT_TIFINAGH
:
323 case USCRIPT_OLD_PERSIAN
:
324 case USCRIPT_BALINESE
:
326 case USCRIPT_BLISSYMBOLS
:
329 case USCRIPT_DEMOTIC_EGYPTIAN
:
330 case USCRIPT_HIERATIC_EGYPTIAN
:
331 case USCRIPT_EGYPTIAN_HIEROGLYPHS
:
332 case USCRIPT_KHUTSURI
:
333 case USCRIPT_PAHAWH_HMONG
:
334 case USCRIPT_HARAPPAN_INDUS
:
335 case USCRIPT_JAVANESE
:
336 case USCRIPT_KAYAH_LI
:
338 case USCRIPT_LINEAR_A
:
339 case USCRIPT_MANDAEAN
:
340 case USCRIPT_MAYAN_HIEROGLYPHS
:
341 case USCRIPT_MEROITIC
:
344 case USCRIPT_OLD_PERMIC
:
345 case USCRIPT_PHAGS_PA
:
346 case USCRIPT_PHOENICIAN
:
347 case USCRIPT_PHONETIC_POLLARD
:
348 case USCRIPT_RONGORONGO
:
350 case USCRIPT_ESTRANGELO_SYRIAC
:
351 case USCRIPT_WESTERN_SYRIAC
:
352 case USCRIPT_EASTERN_SYRIAC
:
353 case USCRIPT_TENGWAR
:
355 case USCRIPT_VISIBLE_SPEECH
:
356 case USCRIPT_CUNEIFORM
:
361 case USCRIPT_OL_CHIKI
:
363 case USCRIPT_SAURASHTRA
:
364 case USCRIPT_SIGN_WRITING
:
365 case USCRIPT_SUNDANESE
:
367 case USCRIPT_MEITEI_MAYEK
:
368 case USCRIPT_IMPERIAL_ARAMAIC
:
369 case USCRIPT_AVESTAN
:
372 case USCRIPT_MANICHAEAN
:
373 case USCRIPT_INSCRIPTIONAL_PAHLAVI
:
374 case USCRIPT_PSALTER_PAHLAVI
:
375 case USCRIPT_BOOK_PAHLAVI
:
376 case USCRIPT_INSCRIPTIONAL_PARTHIAN
:
377 case USCRIPT_SAMARITAN
:
378 case USCRIPT_TAI_VIET
:
381 case USCRIPT_NAKHI_GEBA
:
382 case USCRIPT_OLD_SOUTH_ARABIAN
:
383 case USCRIPT_BASSA_VAH
:
384 case USCRIPT_DUPLOYAN_SHORTAND
:
385 case USCRIPT_ELBASAN
:
386 case USCRIPT_GRANTHA
:
390 case USCRIPT_MEROITIC_CURSIVE
:
391 case USCRIPT_OLD_NORTH_ARABIAN
:
392 case USCRIPT_NABATAEAN
:
393 case USCRIPT_PALMYRENE
:
395 case USCRIPT_WARANG_CITI
:
396 default: // anything new is going to be pretty wild
// All remaining scripts, including ones unknown to this table, are COMPLEX.
397 nRet
= ScriptType::COMPLEX
;
403 sal_Int16
unicode::getScriptClassFromLanguageTag( const LanguageTag
& rLanguageTag
)
405 constexpr int32_t nBuf
= 42;
406 UScriptCode aBuf
[nBuf
];
407 if (rLanguageTag
.hasScript())
409 aBuf
[0] = static_cast<UScriptCode
>(u_getPropertyValueEnum( UCHAR_SCRIPT
,
410 OUStringToOString( rLanguageTag
.getScript(), RTL_TEXTENCODING_ASCII_US
).getStr()));
415 if (rLanguageTag
.getCountry().isEmpty())
416 aName
= rLanguageTag
.getLanguage();
418 aName
= rLanguageTag
.getLanguage() + "-" + rLanguageTag
.getCountry();
419 UErrorCode status
= U_ZERO_ERROR
;
420 const int32_t nScripts
= uscript_getCode(
421 OUStringToOString( aName
, RTL_TEXTENCODING_ASCII_US
).getStr(),
422 aBuf
, nBuf
, &status
);
423 // U_BUFFER_OVERFLOW_ERROR would be set with too many scripts for buffer
424 // and required capacity returned, but really..
425 if (nScripts
== 0 || !U_SUCCESS(status
))
426 return css::i18n::ScriptType::LATIN
;
428 return getScriptClassFromUScriptCode( aBuf
[0]);
// Pick an exemplar language tag for an ICU UScriptCode.
// The switch stores an ASCII language tag in sRet, either a plain language
// code ("ko") or a language-script pair ("mis-Nbat"); the "mis" prefix marks
// scripts for which no language code is available ("Uncoded with script").
// Script codes that only exist in newer ICU releases are guarded by
// U_ICU_VERSION_MAJOR_NUM checks.
// NOTE(review): sRet is presumably what the function returns - the return
// statement is not part of the lines shown here.
431 OString
unicode::getExemplarLanguageForUScriptCode(UScriptCode eScript
)
436 case USCRIPT_CODE_LIMIT
:
437 case USCRIPT_INVALID_CODE
:
438 case USCRIPT_MATHEMATICAL_NOTATION
:
439 case USCRIPT_SYMBOLS
:
443 case USCRIPT_INHERITED
:
444 case USCRIPT_UNWRITTEN_LANGUAGES
:
445 case USCRIPT_UNKNOWN
:
451 case USCRIPT_ARMENIAN
:
454 case USCRIPT_BENGALI
:
457 case USCRIPT_BOPOMOFO
:
460 case USCRIPT_CHEROKEE
:
466 case USCRIPT_CYRILLIC
:
469 case USCRIPT_DESERET
:
472 case USCRIPT_DEVANAGARI
:
475 case USCRIPT_ETHIOPIC
:
478 case USCRIPT_GEORGIAN
:
479 case USCRIPT_KHUTSURI
:
488 case USCRIPT_GUJARATI
:
492 case USCRIPT_GURMUKHI
:
501 sRet
= "ko"_ostr
; // Jamo - elements of Hangul Syllables
506 case USCRIPT_HIRAGANA
:
509 case USCRIPT_KANNADA
:
512 case USCRIPT_KATAKANA
:
524 case USCRIPT_MALAYALAM
:
527 case USCRIPT_MONGOLIAN
:
530 case USCRIPT_MYANMAR
:
536 case USCRIPT_OLD_ITALIC
:
545 case USCRIPT_SINHALA
:
549 case USCRIPT_ESTRANGELO_SYRIAC
:
553 case USCRIPT_GRANTHA
:
565 case USCRIPT_TIBETAN
:
568 case USCRIPT_CANADIAN_ABORIGINAL
:
574 case USCRIPT_TAGALOG
:
577 case USCRIPT_HANUNOO
:
583 case USCRIPT_TAGBANWA
:
586 case USCRIPT_BRAILLE
:
589 case USCRIPT_CYPRIOT
:
595 case USCRIPT_LINEAR_B
:
598 case USCRIPT_OSMANYA
:
601 case USCRIPT_SHAVIAN
:
607 case USCRIPT_UGARITIC
:
610 case USCRIPT_KATAKANA_OR_HIRAGANA
:
613 case USCRIPT_BUGINESE
:
616 case USCRIPT_GLAGOLITIC
:
619 case USCRIPT_KHAROSHTHI
:
623 case USCRIPT_SYLOTI_NAGRI
:
626 case USCRIPT_NEW_TAI_LUE
:
629 case USCRIPT_TIFINAGH
:
632 case USCRIPT_OLD_PERSIAN
:
635 case USCRIPT_BALINESE
:
641 case USCRIPT_BLISSYMBOLS
:
648 case USCRIPT_TENGWAR
:
651 case USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC
:
654 case USCRIPT_DEMOTIC_EGYPTIAN
:
655 case USCRIPT_HIERATIC_EGYPTIAN
:
656 case USCRIPT_EGYPTIAN_HIEROGLYPHS
:
659 case USCRIPT_SIMPLIFIED_HAN
:
662 case USCRIPT_TRADITIONAL_HAN
:
665 case USCRIPT_PAHAWH_HMONG
:
668 case USCRIPT_OLD_HUNGARIAN
:
671 case USCRIPT_HARAPPAN_INDUS
:
674 case USCRIPT_JAVANESE
:
677 case USCRIPT_KAYAH_LI
:
680 case USCRIPT_LATIN_FRAKTUR
:
683 case USCRIPT_LATIN_GAELIC
:
689 case USCRIPT_LINEAR_A
:
692 case USCRIPT_MAYAN_HIEROGLYPHS
:
695 case USCRIPT_MEROITIC_CURSIVE
:
696 case USCRIPT_MEROITIC
:
705 case USCRIPT_OLD_PERMIC
:
708 case USCRIPT_PHAGS_PA
:
711 case USCRIPT_PHOENICIAN
:
714 case USCRIPT_PHONETIC_POLLARD
:
717 case USCRIPT_RONGORONGO
:
723 case USCRIPT_WESTERN_SYRIAC
:
726 case USCRIPT_EASTERN_SYRIAC
:
732 case USCRIPT_VISIBLE_SPEECH
:
735 case USCRIPT_CUNEIFORM
:
741 case USCRIPT_JAPANESE
:
753 case USCRIPT_OL_CHIKI
:
759 case USCRIPT_SAURASHTRA
:
762 case USCRIPT_SIGN_WRITING
:
765 case USCRIPT_SUNDANESE
:
771 case USCRIPT_MEITEI_MAYEK
:
774 case USCRIPT_IMPERIAL_ARAMAIC
:
777 case USCRIPT_AVESTAN
:
786 case USCRIPT_MANICHAEAN
:
789 case USCRIPT_INSCRIPTIONAL_PAHLAVI
:
790 case USCRIPT_PSALTER_PAHLAVI
:
791 case USCRIPT_BOOK_PAHLAVI
:
792 case USCRIPT_INSCRIPTIONAL_PARTHIAN
:
795 case USCRIPT_SAMARITAN
:
798 case USCRIPT_TAI_VIET
:
801 case USCRIPT_MANDAEAN
: /* Aliased to USCRIPT_MANDAIC in icu 4.6. */
804 case USCRIPT_NABATAEAN
:
805 sRet
= "mis-Nbat"_ostr
; // Uncoded with script
807 case USCRIPT_PALMYRENE
:
808 sRet
= "mis-Palm"_ostr
; // Uncoded with script
816 case USCRIPT_NAKHI_GEBA
:
819 case USCRIPT_OLD_SOUTH_ARABIAN
:
822 case USCRIPT_BASSA_VAH
:
825 case USCRIPT_DUPLOYAN_SHORTAND
:
828 case USCRIPT_ELBASAN
:
840 case USCRIPT_OLD_NORTH_ARABIAN
:
846 case USCRIPT_WARANG_CITI
:
852 case USCRIPT_JURCHEN
:
859 sRet
= "mis-Nshu"_ostr
; // Uncoded with script
861 case USCRIPT_SHARADA
:
864 case USCRIPT_SORA_SOMPENG
:
876 case USCRIPT_ANATOLIAN_HIEROGLYPHS
:
879 case USCRIPT_TIRHUTA
:
882 case USCRIPT_CAUCASIAN_ALBANIAN
:
885 case USCRIPT_MAHAJANI
:
892 sRet
= "qly-Hatr"_ostr
;
895 sRet
= "mr-Modi"_ostr
;
897 case USCRIPT_MULTANI
:
898 sRet
= "skr-Mutl"_ostr
;
900 case USCRIPT_PAU_CIN_HAU
:
901 sRet
= "ctd-Pauc"_ostr
;
903 case USCRIPT_SIDDHAM
:
904 sRet
= "sa-Sidd"_ostr
;
907 sRet
= "mis-Adlm"_ostr
; // Adlam for Fulani, no language code
909 case USCRIPT_BHAIKSUKI
:
910 sRet
= "mis-Bhks"_ostr
; // Bhaiksuki for some Buddhist texts, no language code
912 case USCRIPT_MARCHEN
:
913 sRet
= "bo-Marc"_ostr
;
916 sRet
= "new-Newa"_ostr
;
919 sRet
= "osa-Osge"_ostr
;
921 case USCRIPT_HAN_WITH_BOPOMOFO
:
922 sRet
= "mis-Hanb"_ostr
; // Han with Bopomofo, zh-Hanb ?
924 case USCRIPT_SYMBOLS_EMOJI
:
925 sRet
= "mis-Zsye"_ostr
; // Emoji variant
927 case USCRIPT_MASARAM_GONDI
:
928 sRet
= "gon-Gonm"_ostr
; // macro language code, could be wsg,esg,gno
930 case USCRIPT_SOYOMBO
:
931 sRet
= "mn-Soyo"_ostr
; // abugida to write Mongolian, also Tibetan and Sanskrit
933 case USCRIPT_ZANABAZAR_SQUARE
:
934 sRet
= "mn-Zanb"_ostr
; // abugida to write Mongolian
937 sRet
= "dgo"_ostr
; // Dogri proper
939 case USCRIPT_GUNJALA_GONDI
:
940 sRet
= "wsg"_ostr
; // Adilabad Gondi
942 case USCRIPT_MAKASAR
:
945 case USCRIPT_MEDEFAIDRIN
:
946 sRet
= "dmf-Medf"_ostr
;
948 case USCRIPT_HANIFI_ROHINGYA
:
951 case USCRIPT_SOGDIAN
:
952 case USCRIPT_OLD_SOGDIAN
:
955 case USCRIPT_ELYMAIC
:
956 sRet
= "arc-Elym"_ostr
;
958 case USCRIPT_NYIAKENG_PUACHUE_HMONG
:
959 sRet
= "hmn-Hmnp"_ostr
; // macrolanguage code
961 case USCRIPT_NANDINAGARI
:
962 sRet
= "sa-Nand"_ostr
;
965 sRet
= "nnp-Wcho"_ostr
;
967 case USCRIPT_CHORASMIAN
:
968 sRet
= "xco-Chrs"_ostr
;
970 case USCRIPT_DIVES_AKURU
:
971 sRet
= "dv-Diak"_ostr
;
973 case USCRIPT_KHITAN_SMALL_SCRIPT
:
974 sRet
= "zkt-Kits"_ostr
;
977 sRet
= "kmr-Yezi"_ostr
;
// The script codes below exist only from the stated ICU major version on.
979 #if (U_ICU_VERSION_MAJOR_NUM >= 70)
980 case USCRIPT_CYPRO_MINOAN
:
981 sRet
= "mis-Cpmn"_ostr
; // Uncoded with script
983 case USCRIPT_OLD_UYGHUR
:
984 sRet
= "oui-Ougr"_ostr
;
987 sRet
= "nst-Tnsa"_ostr
;
990 sRet
= "txo-Toto"_ostr
;
992 case USCRIPT_VITHKUQI
:
993 sRet
= "sq-Vith"_ostr
; // macrolanguage code
996 #if (U_ICU_VERSION_MAJOR_NUM >= 72)
998 sRet
= "mis-Kawi"_ostr
; // Uncoded with script
1000 case USCRIPT_NAG_MUNDARI
:
1001 sRet
= "unr-Nagm"_ostr
;
1004 #if (U_ICU_VERSION_MAJOR_NUM >= 75)
1005 case USCRIPT_ARABIC_NASTALIQ
:
1006 sRet
= "fa-Aran"_ostr
;
1013 //Format a number as a percentage according to the rules of the given
1014 //language, e.g. 100 -> "100%" for en-US vs "100 %" for de-DE
1015 OUString
unicode::formatPercent(double dNumber
,
1016 const LanguageTag
&rLangTag
)
1018 // get a currency formatter for this locale ID
1019 UErrorCode errorCode
=U_ZERO_ERROR
;
1021 LanguageTag
aLangTag(rLangTag
);
1023 // As of CLDR Version 24 these languages were not listed as using spacing
1024 // between number and % but are reported as such by our l10n groups
1025 // http://www.unicode.org/cldr/charts/24/by_type/numbers.number_formatting_patterns.html
1026 // so format using French which has the desired rules
1027 if (aLangTag
.getLanguage() == "es" || aLangTag
.getLanguage() == "sl")
1028 aLangTag
.reset(u
"fr-FR"_ustr
);
1030 icu::Locale aLocale
= LanguageTagIcu::getIcuLocale(aLangTag
);
1032 std::unique_ptr
<icu::NumberFormat
> xF(
1033 icu::NumberFormat::createPercentInstance(aLocale
, errorCode
));
1034 if(U_FAILURE(errorCode
))
1036 SAL_WARN("i18n", "icu::NumberFormat::createPercentInstance failed");
1037 return OUString::number(dNumber
) + "%";
1040 icu::UnicodeString output
;
1041 xF
->format(dNumber
/100, output
);
1042 OUString
aRet(reinterpret_cast<const sal_Unicode
*>(output
.getBuffer()),
1044 if (rLangTag
.getLanguage() == "de")
1046 //narrow no-break space instead of (normal) no-break space
1047 return aRet
.replace(0x00A0, 0x202F);
// Feed one more character into the toggle state and report whether further
// characters may still be supplied.
//
// State touched here: maInput (the collected text), maUtf16 (pending
// surrogate code units), maCombining (pending combining marks), and the
// flags mbAllowMoreChars, mbRequiresU and mbIsHexString.
//
// Accepted characters are inserted at index 0 of maInput, so the buffer
// grows at the front.  NOTE(review): this suggests callers hand in the
// characters walking backwards through the text - confirm against callers.
//
// Returns mbAllowMoreChars.
1052 bool ToggleUnicodeCodepoint::AllowMoreInput(sal_uInt32 uChar
)
1054 //arbitrarily chosen maximum length allowed - normal max usage would be around 30.
1055 if( maInput
.getLength() > 255 )
1056 mbAllowMoreChars
= false;
1058 if( !mbAllowMoreChars
)
// Once a "U+" has been collected, only hex notation may follow.
1061 bool bPreventNonHex
= false;
1062 if( maInput
.indexOf("U+") != -1 )
1063 bPreventNonHex
= true;
1065 switch ( unicode::getUnicodeType(uChar
) )
1067 case css::i18n::UnicodeType::SURROGATE
:
1068 if( bPreventNonHex
)
1070 mbAllowMoreChars
= false;
1074 if( rtl::isLowSurrogate(uChar
) && maUtf16
.isEmpty() && maInput
.isEmpty() )
1076 maUtf16
.append(sal_Unicode(uChar
));
1079 if( rtl::isHighSurrogate(uChar
) && maInput
.isEmpty() )
1080 maUtf16
.insert(0, sal_Unicode(uChar
));
// A complete high/low pair is combined and re-evaluated as one code point.
1081 if (maUtf16
.getLength() == 2)
1083 assert(rtl::isHighSurrogate(maUtf16
[0]) && rtl::isLowSurrogate(maUtf16
[1]));
1084 // The resulting codepoint may itself be combining, so may allow more
1085 sal_uInt32 nUCS4
= rtl::combineSurrogates(maUtf16
[0], maUtf16
[1]);
1086 maUtf16
.setLength(0);
1087 return AllowMoreInput(nUCS4
);
1089 // unexpected order of high/low, so don't accept more
1090 if( !maUtf16
.isEmpty() )
1091 maInput
.append(maUtf16
);
1092 if( !maCombining
.isEmpty() )
1093 maInput
.append(maCombining
);
1094 mbAllowMoreChars
= false;
1097 case css::i18n::UnicodeType::NON_SPACING_MARK
:
1098 case css::i18n::UnicodeType::COMBINING_SPACING_MARK
:
1099 if( bPreventNonHex
)
1101 mbAllowMoreChars
= false;
1105 //extreme edge case: already invalid high/low surrogates with preceding combining chars, and now an extra combining mark.
1106 if( !maUtf16
.isEmpty() )
1109 if( !maCombining
.isEmpty() )
1110 maInput
.append(maCombining
);
1111 mbAllowMoreChars
= false;
1114 maCombining
.insertUtf32(0, uChar
);
1118 //extreme edge case: already invalid high/low surrogates with preceding combining chars, and now an extra character.
1119 if( !maUtf16
.isEmpty() )
1122 if( !maCombining
.isEmpty() )
1123 maInput
.append(maCombining
);
1124 mbAllowMoreChars
= false;
1128 if( !maCombining
.isEmpty() )
1130 maCombining
.insertUtf32(0, uChar
);
1131 maInput
= maCombining
;
1132 mbAllowMoreChars
= false;
1136 // 0 - 1f are control characters. Do not process those.
1139 mbAllowMoreChars
= false;
1147 // U+ notation found. Continue looking for another one.
1150 mbRequiresU
= false;
1151 maInput
.insert(0,"U+");
1153 // treat as a normal character
1156 mbAllowMoreChars
= false;
1157 if( !bPreventNonHex
)
1158 maInput
.insertUtf32(0, uChar
);
1162 // + already found: skip when not U, or edge case of +U+xxxx
1163 if( mbRequiresU
|| (maInput
.indexOf("U+") == 0) )
1164 mbAllowMoreChars
= false;
1165 // hex chars followed by '+' - now require a 'U'
1166 else if ( !maInput
.isEmpty() )
1168 // treat as a normal character
1171 mbAllowMoreChars
= false;
1172 if( !bPreventNonHex
)
1173 maInput
.insertUtf32(0, uChar
);
1177 // + already found. Since not U, cancel further input
1179 mbAllowMoreChars
= false;
1180 // maximum digits per notation is 8: only one notation
1181 else if( maInput
.indexOf("U+") == -1 && maInput
.getLength() == 8 )
1182 mbAllowMoreChars
= false;
1183 // maximum digits per notation is 8: previous notation found
1184 else if( maInput
.indexOf("U+") == 8 )
1185 mbAllowMoreChars
= false;
1186 // a hex character. Add to string.
1187 else if( rtl::isAsciiHexDigit(uChar
) )
1189 mbIsHexString
= true;
1190 maInput
.insertUtf32(0, uChar
);
1192 // not a hex character: stop input. keep if it is the first input provided
1195 mbAllowMoreChars
= false;
1196 if( maInput
.isEmpty() )
1197 maInput
.insertUtf32(0, uChar
);
1201 return mbAllowMoreChars
;
// Finalise the collected input and return the text that is to be replaced.
//
// For non-hex input, maInput is returned as collected (after flushing any
// pending combining marks when maInput was still empty).  For hex input,
// mbAllowMoreChars is cleared because maInput may be rewritten below: text
// in front of the first "U+" is stripped, and each hex value is parsed and
// checked with rtl::isUnicodeCodePoint() and against the 0x20 control
// character limit; maInput is adjusted where a value fails that check.
1204 OUString
ToggleUnicodeCodepoint::StringToReplace()
1206 if( maInput
.isEmpty() )
1208 //edge case - input finished with incomplete low surrogate or combining characters without a base
1209 if( mbAllowMoreChars
)
1211 if( !maUtf16
.isEmpty() )
1213 if( !maCombining
.isEmpty() )
1214 maInput
.append(maCombining
);
1216 return maInput
.toString();
1219 if( !mbIsHexString
)
1220 return maInput
.toString();
1222 //this function potentially modifies the input string. Prevent addition of further characters
1223 mbAllowMoreChars
= false;
1225 //validate unicode notation.
1227 sal_uInt32 nUnicode
= 0;
1228 sal_Int32 nUPlus
= maInput
.indexOf("U+");
1229 //if U+ notation used, strip off all extra chars added not in U+ notation
1232 maInput
.remove(0, nUPlus
);
1233 sIn
= maInput
.copy(2).makeStringAndClear();
1234 nUPlus
= sIn
.indexOf("U+");
1237 sIn
= maInput
.toString();
// Walk the remaining "U+"-separated hex values one at a time.
1238 while( nUPlus
!= -1 )
1240 nUnicode
= o3tl::toUInt32(sIn
.subView(0, nUPlus
), 16);
1241 //prevent creating control characters or invalid Unicode values
1242 if( !rtl::isUnicodeCodePoint(nUnicode
) || nUnicode
< 0x20 )
1243 maInput
= sIn
.subView(nUPlus
);
1244 sIn
= sIn
.copy(nUPlus
+2);
1245 nUPlus
= sIn
.indexOf("U+");
// The last (or only) hex value: if it is unusable, keep only the final character.
1248 nUnicode
= sIn
.toUInt32(16);
1249 if( !rtl::isUnicodeCodePoint(nUnicode
) || nUnicode
< 0x20 )
1250 maInput
.truncate().append( sIn
[sIn
.getLength()-1] );
1251 return maInput
.toString();
// Produce the text that replaces StringToReplace().
//
// The conversion works in both directions:
//  - hex notation (one or more "U+"-separated hex values, or a bare hex
//    string longer than one character) becomes the characters with those
//    code points;
//  - any other text becomes "U+" hex notation, one group per code point,
//    zero-padded to at least four hex digits.
1254 OUString
ToggleUnicodeCodepoint::ReplacementString()
1256 OUString sIn
= StringToReplace();
1257 OUStringBuffer output
= "";
1258 sal_Int32 nUPlus
= sIn
.indexOf("U+");
1259 // convert from hex notation to glyph
1260 if( nUPlus
!= -1 || (sIn
.getLength() > 1 && mbIsHexString
) )
1262 sal_uInt32 nUnicode
= 0;
1266 nUPlus
= sIn
.indexOf("U+");
// Emit the value in front of each "U+" separator, then drop it from sIn.
1270 nUnicode
= o3tl::toUInt32(sIn
.subView(0, nUPlus
), 16);
1271 output
.appendUtf32( nUnicode
);
1273 sIn
= sIn
.copy(nUPlus
+2);
1274 nUPlus
= sIn
.indexOf("U+");
// Whatever is left is the final hex value.
1276 nUnicode
= sIn
.toUInt32(16);
1277 output
.appendUtf32( nUnicode
);
1279 // convert from glyph to hex notation
1283 while( nPos
< sIn
.getLength() )
// iterateCodePoints() advances nPos past the code point it returns.
1285 OUStringBuffer aTmp
= OUString::number(sIn
.iterateCodePoints(&nPos
),16);
1286 //pad with zeros - minimum length of 4.
1287 for( sal_Int32 i
= 4 - aTmp
.getLength(); i
> 0; --i
)
1288 aTmp
.insert( 0,"0" );
1289 output
.append( "U+" + aTmp
);
1292 return output
.makeStringAndClear();
1295 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */