1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <com/sun/star/i18n/UnicodeType.hpp>
21 #include <com/sun/star/i18n/ScriptType.hpp>
22 #include <i18nlangtag/languagetag.hxx>
23 #include <i18nlangtag/languagetagicu.hxx>
24 #include <i18nutil/unicode.hxx>
25 #include <sal/log.hxx>
26 #include <unicode/numfmt.h>
27 #include <unicode/uchar.h>
28 #include "unicode_data.h"
29 #include <rtl/character.hxx>
30 #include <o3tl/string_view.hxx>
33 // Workaround for glibc braindamage:
34 // glibc 2.4's langinfo.h does "#define CURRENCY_SYMBOL __CURRENCY_SYMBOL"
35 // which (obviously) breaks UnicodeType::CURRENCY_SYMBOL
36 #undef CURRENCY_SYMBOL
38 using namespace ::com::sun::star::i18n
;
40 template<class L
, typename T
>
41 static T
getScriptType( const sal_Unicode ch
, const L
* typeList
, T unknownType
) {
44 css::i18n::UnicodeScript type
= typeList
[0].to
;
45 while (type
< UnicodeScript_kScriptCount
&& ch
> UnicodeScriptType
[static_cast<int>(type
)][UnicodeScriptTypeTo
]) {
46 type
= typeList
[++i
].to
;
49 return (type
< UnicodeScript_kScriptCount
&&
50 ch
>= UnicodeScriptType
[static_cast<int>(typeList
[i
].from
)][int(UnicodeScriptTypeFrom
)]) ?
51 typeList
[i
].value
: unknownType
;
55 unicode::getUnicodeScriptType( const sal_Unicode ch
, const ScriptTypeList
* typeList
, sal_Int16 unknownType
) {
56 return getScriptType(ch
, typeList
, unknownType
);
60 unicode::getUnicodeScriptStart( UnicodeScript type
) {
61 return UnicodeScriptType
[static_cast<int>(type
)][UnicodeScriptTypeFrom
];
65 unicode::getUnicodeScriptEnd( UnicodeScript type
) {
66 return UnicodeScriptType
[static_cast<int>(type
)][UnicodeScriptTypeTo
];
70 unicode::getUnicodeType( const sal_Unicode ch
) {
71 static sal_Unicode c
= 0x00;
72 static sal_Int16 r
= 0x00;
74 if (ch
== c
) return r
;
77 sal_Int16 address
= UnicodeTypeIndex
[ch
>> 8];
78 r
= static_cast<sal_Int16
>(
79 (address
< UnicodeTypeNumberBlock
)
80 ? UnicodeTypeBlockValue
[address
]
81 : UnicodeTypeValue
[((address
- UnicodeTypeNumberBlock
) << 8) + (ch
& 0xff)]);
86 unicode::getUnicodeDirection( const sal_Unicode ch
) {
87 static sal_Unicode c
= 0x00;
88 static sal_uInt8 r
= 0x00;
90 if (ch
== c
) return r
;
93 sal_Int16 address
= UnicodeDirectionIndex
[ch
>> 8];
94 r
= (address
< UnicodeDirectionNumberBlock
)
95 ? UnicodeDirectionBlockValue
[address
]
96 : UnicodeDirectionValue
[((address
- UnicodeDirectionNumberBlock
) << 8) + (ch
& 0xff)];
100 sal_uInt32
unicode::GetMirroredChar(sal_uInt32 nChar
) {
101 nChar
= u_charMirror(nChar
);
// Single-bit mask for a small non-negative value. The argument is
// parenthesised so that expressions such as bit(a | b) shift by the whole
// expression.
#define bit(name) (1U << (name))

// Masks over css::i18n::UnicodeType values. Every multi-term mask is wrapped
// in parentheses so it can be combined with other operators (&, ==, ...)
// without the caller having to add its own.
#define UPPERMASK   bit(UnicodeType::UPPERCASE_LETTER)

#define LOWERMASK   bit(UnicodeType::LOWERCASE_LETTER)

#define TITLEMASK   bit(UnicodeType::TITLECASE_LETTER)

#define ALPHAMASK   (UPPERMASK|LOWERMASK|TITLEMASK|\
            bit(UnicodeType::MODIFIER_LETTER)|\
            bit(UnicodeType::OTHER_LETTER))

#define SPACEMASK   (bit(UnicodeType::SPACE_SEPARATOR)|\
            bit(UnicodeType::LINE_SEPARATOR)|\
            bit(UnicodeType::PARAGRAPH_SEPARATOR))

#define CONTROLMASK (bit(UnicodeType::CONTROL)|\
            bit(UnicodeType::FORMAT)|\
            bit(UnicodeType::LINE_SEPARATOR)|\
            bit(UnicodeType::PARAGRAPH_SEPARATOR))
126 #define IsType(func, mask) \
127 bool func( const sal_Unicode ch) {\
128 return (bit(getUnicodeType(ch)) & (mask)) != 0;\
131 IsType(unicode::isControl
, CONTROLMASK
)
132 IsType(unicode::isAlpha
, ALPHAMASK
)
133 IsType(unicode::isSpace
, SPACEMASK
)
135 #define CONTROLSPACE bit(0x09)|bit(0x0a)|bit(0x0b)|bit(0x0c)|bit(0x0d)|\
136 bit(0x1c)|bit(0x1d)|bit(0x1e)|bit(0x1f)
138 bool unicode::isWhiteSpace( const sal_Unicode ch
) {
139 return (ch
!= 0xa0 && isSpace(ch
)) || (ch
<= 0x1F && (bit(ch
) & (CONTROLSPACE
)));
142 sal_Int16
unicode::getScriptClassFromUScriptCode(UScriptCode eScript
)
144 //See unicode/uscript.h
148 case USCRIPT_INVALID_CODE
:
150 case USCRIPT_INHERITED
:
151 case USCRIPT_UNWRITTEN_LANGUAGES
:
152 case USCRIPT_UNKNOWN
:
153 case USCRIPT_MATHEMATICAL_NOTATION
:
154 case USCRIPT_SYMBOLS
:
155 case USCRIPT_WARANG_CITI
:
156 nRet
= ScriptType::WEAK
;
158 case USCRIPT_ARMENIAN
:
159 case USCRIPT_CHEROKEE
:
161 case USCRIPT_CYRILLIC
:
162 case USCRIPT_GEORGIAN
:
167 case USCRIPT_OLD_ITALIC
:
169 case USCRIPT_CANADIAN_ABORIGINAL
:
170 case USCRIPT_BRAILLE
:
171 case USCRIPT_CYPRIOT
:
172 case USCRIPT_OSMANYA
:
173 case USCRIPT_SHAVIAN
:
174 case USCRIPT_KATAKANA_OR_HIRAGANA
:
175 case USCRIPT_GLAGOLITIC
:
177 case USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC
:
178 case USCRIPT_OLD_HUNGARIAN
:
179 case USCRIPT_LATIN_FRAKTUR
:
180 case USCRIPT_LATIN_GAELIC
:
181 nRet
= ScriptType::LATIN
;
183 case USCRIPT_BOPOMOFO
:
186 case USCRIPT_HIRAGANA
:
187 case USCRIPT_KATAKANA
:
189 case USCRIPT_SIMPLIFIED_HAN
:
190 case USCRIPT_TRADITIONAL_HAN
:
191 case USCRIPT_JAPANESE
:
193 #if (U_ICU_VERSION_MAJOR_NUM > 4) || (U_ICU_VERSION_MAJOR_NUM == 4 && U_ICU_VERSION_MINOR_NUM >= 8)
196 #if (U_ICU_VERSION_MAJOR_NUM >= 66)
197 case USCRIPT_KHITAN_SMALL_SCRIPT
:
199 nRet
= ScriptType::ASIAN
;
202 case USCRIPT_BENGALI
:
203 case USCRIPT_DESERET
:
204 case USCRIPT_DEVANAGARI
:
205 case USCRIPT_ETHIOPIC
:
206 case USCRIPT_GUJARATI
:
207 case USCRIPT_GURMUKHI
:
209 case USCRIPT_KANNADA
:
212 case USCRIPT_MALAYALAM
:
213 case USCRIPT_MONGOLIAN
:
214 case USCRIPT_MYANMAR
:
216 case USCRIPT_SINHALA
:
222 case USCRIPT_TIBETAN
:
223 case USCRIPT_TAGALOG
:
224 case USCRIPT_HANUNOO
:
226 case USCRIPT_TAGBANWA
:
228 case USCRIPT_LINEAR_B
:
230 case USCRIPT_UGARITIC
:
231 case USCRIPT_BUGINESE
:
232 case USCRIPT_KHAROSHTHI
:
233 case USCRIPT_SYLOTI_NAGRI
:
234 case USCRIPT_NEW_TAI_LUE
:
235 case USCRIPT_TIFINAGH
:
236 case USCRIPT_OLD_PERSIAN
:
237 case USCRIPT_BALINESE
:
239 case USCRIPT_BLISSYMBOLS
:
242 case USCRIPT_DEMOTIC_EGYPTIAN
:
243 case USCRIPT_HIERATIC_EGYPTIAN
:
244 case USCRIPT_EGYPTIAN_HIEROGLYPHS
:
245 case USCRIPT_KHUTSURI
:
246 case USCRIPT_PAHAWH_HMONG
:
247 case USCRIPT_HARAPPAN_INDUS
:
248 case USCRIPT_JAVANESE
:
249 case USCRIPT_KAYAH_LI
:
251 case USCRIPT_LINEAR_A
:
252 case USCRIPT_MANDAEAN
:
253 case USCRIPT_MAYAN_HIEROGLYPHS
:
254 case USCRIPT_MEROITIC
:
257 case USCRIPT_OLD_PERMIC
:
258 case USCRIPT_PHAGS_PA
:
259 case USCRIPT_PHOENICIAN
:
260 case USCRIPT_PHONETIC_POLLARD
:
261 case USCRIPT_RONGORONGO
:
263 case USCRIPT_ESTRANGELO_SYRIAC
:
264 case USCRIPT_WESTERN_SYRIAC
:
265 case USCRIPT_EASTERN_SYRIAC
:
266 case USCRIPT_TENGWAR
:
268 case USCRIPT_VISIBLE_SPEECH
:
269 case USCRIPT_CUNEIFORM
:
274 case USCRIPT_OL_CHIKI
:
276 case USCRIPT_SAURASHTRA
:
277 case USCRIPT_SIGN_WRITING
:
278 case USCRIPT_SUNDANESE
:
280 case USCRIPT_MEITEI_MAYEK
:
281 case USCRIPT_IMPERIAL_ARAMAIC
:
282 case USCRIPT_AVESTAN
:
285 case USCRIPT_MANICHAEAN
:
286 case USCRIPT_INSCRIPTIONAL_PAHLAVI
:
287 case USCRIPT_PSALTER_PAHLAVI
:
288 case USCRIPT_BOOK_PAHLAVI
:
289 case USCRIPT_INSCRIPTIONAL_PARTHIAN
:
290 case USCRIPT_SAMARITAN
:
291 case USCRIPT_TAI_VIET
:
294 case USCRIPT_NAKHI_GEBA
:
295 case USCRIPT_OLD_SOUTH_ARABIAN
:
296 case USCRIPT_BASSA_VAH
:
297 case USCRIPT_DUPLOYAN_SHORTAND
:
298 case USCRIPT_ELBASAN
:
299 case USCRIPT_GRANTHA
:
303 case USCRIPT_MEROITIC_CURSIVE
:
304 case USCRIPT_OLD_NORTH_ARABIAN
:
305 case USCRIPT_NABATAEAN
:
306 case USCRIPT_PALMYRENE
:
308 default: // anything new is going to be pretty wild
309 nRet
= ScriptType::COMPLEX
;
315 sal_Int16
unicode::getScriptClassFromLanguageTag( const LanguageTag
& rLanguageTag
)
317 constexpr int32_t nBuf
= 42;
318 UScriptCode aBuf
[nBuf
];
319 if (rLanguageTag
.hasScript())
321 aBuf
[0] = static_cast<UScriptCode
>(u_getPropertyValueEnum( UCHAR_SCRIPT
,
322 OUStringToOString( rLanguageTag
.getScript(), RTL_TEXTENCODING_ASCII_US
).getStr()));
327 if (rLanguageTag
.getCountry().isEmpty())
328 aName
= rLanguageTag
.getLanguage();
330 aName
= rLanguageTag
.getLanguage() + "-" + rLanguageTag
.getCountry();
331 UErrorCode status
= U_ZERO_ERROR
;
332 const int32_t nScripts
= uscript_getCode(
333 OUStringToOString( aName
, RTL_TEXTENCODING_ASCII_US
).getStr(),
334 aBuf
, nBuf
, &status
);
335 // U_BUFFER_OVERFLOW_ERROR would be set with too many scripts for buffer
336 // and required capacity returned, but really..
337 if (nScripts
== 0 || !U_SUCCESS(status
))
338 return css::i18n::ScriptType::LATIN
;
340 return getScriptClassFromUScriptCode( aBuf
[0]);
343 OString
unicode::getExemplarLanguageForUScriptCode(UScriptCode eScript
)
348 case USCRIPT_CODE_LIMIT
:
349 case USCRIPT_INVALID_CODE
:
353 case USCRIPT_INHERITED
:
356 case USCRIPT_MATHEMATICAL_NOTATION
:
357 case USCRIPT_SYMBOLS
:
360 case USCRIPT_UNWRITTEN_LANGUAGES
:
361 case USCRIPT_UNKNOWN
:
367 case USCRIPT_ARMENIAN
:
370 case USCRIPT_BENGALI
:
373 case USCRIPT_BOPOMOFO
:
376 case USCRIPT_CHEROKEE
:
382 case USCRIPT_CYRILLIC
:
385 case USCRIPT_DESERET
:
388 case USCRIPT_DEVANAGARI
:
391 case USCRIPT_ETHIOPIC
:
394 case USCRIPT_GEORGIAN
:
403 case USCRIPT_GUJARATI
:
406 case USCRIPT_GURMUKHI
:
418 case USCRIPT_HIRAGANA
:
421 case USCRIPT_KANNADA
:
424 case USCRIPT_KATAKANA
:
436 case USCRIPT_MALAYALAM
:
439 case USCRIPT_MONGOLIAN
:
442 case USCRIPT_MYANMAR
:
448 case USCRIPT_OLD_ITALIC
:
457 case USCRIPT_SINHALA
:
475 case USCRIPT_TIBETAN
:
478 case USCRIPT_CANADIAN_ABORIGINAL
:
484 case USCRIPT_TAGALOG
:
487 case USCRIPT_HANUNOO
:
493 case USCRIPT_TAGBANWA
:
496 case USCRIPT_BRAILLE
:
499 case USCRIPT_CYPRIOT
:
505 case USCRIPT_LINEAR_B
:
508 case USCRIPT_OSMANYA
:
511 case USCRIPT_SHAVIAN
:
517 case USCRIPT_UGARITIC
:
520 case USCRIPT_KATAKANA_OR_HIRAGANA
:
523 case USCRIPT_BUGINESE
:
526 case USCRIPT_GLAGOLITIC
:
529 case USCRIPT_KHAROSHTHI
:
532 case USCRIPT_SYLOTI_NAGRI
:
535 case USCRIPT_NEW_TAI_LUE
:
538 case USCRIPT_TIFINAGH
:
541 case USCRIPT_OLD_PERSIAN
:
544 case USCRIPT_BALINESE
:
550 case USCRIPT_BLISSYMBOLS
:
562 case USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC
:
565 case USCRIPT_DEMOTIC_EGYPTIAN
:
566 case USCRIPT_HIERATIC_EGYPTIAN
:
567 case USCRIPT_EGYPTIAN_HIEROGLYPHS
:
570 case USCRIPT_KHUTSURI
:
573 case USCRIPT_SIMPLIFIED_HAN
:
576 case USCRIPT_TRADITIONAL_HAN
:
579 case USCRIPT_PAHAWH_HMONG
:
582 case USCRIPT_OLD_HUNGARIAN
:
585 case USCRIPT_HARAPPAN_INDUS
:
588 case USCRIPT_JAVANESE
:
591 case USCRIPT_KAYAH_LI
:
594 case USCRIPT_LATIN_FRAKTUR
:
597 case USCRIPT_LATIN_GAELIC
:
603 case USCRIPT_LINEAR_A
:
606 case USCRIPT_MAYAN_HIEROGLYPHS
:
609 case USCRIPT_MEROITIC
:
618 case USCRIPT_OLD_PERMIC
:
621 case USCRIPT_PHAGS_PA
:
624 case USCRIPT_PHOENICIAN
:
627 case USCRIPT_PHONETIC_POLLARD
:
630 case USCRIPT_RONGORONGO
:
636 case USCRIPT_ESTRANGELO_SYRIAC
:
639 case USCRIPT_WESTERN_SYRIAC
:
642 case USCRIPT_EASTERN_SYRIAC
:
645 case USCRIPT_TENGWAR
:
651 case USCRIPT_VISIBLE_SPEECH
:
654 case USCRIPT_CUNEIFORM
:
660 case USCRIPT_JAPANESE
:
672 case USCRIPT_OL_CHIKI
:
678 case USCRIPT_SAURASHTRA
:
681 case USCRIPT_SIGN_WRITING
:
684 case USCRIPT_SUNDANESE
:
690 case USCRIPT_MEITEI_MAYEK
:
693 case USCRIPT_IMPERIAL_ARAMAIC
:
696 case USCRIPT_AVESTAN
:
708 case USCRIPT_MANICHAEAN
:
711 case USCRIPT_INSCRIPTIONAL_PAHLAVI
:
712 case USCRIPT_PSALTER_PAHLAVI
:
713 case USCRIPT_BOOK_PAHLAVI
:
714 case USCRIPT_INSCRIPTIONAL_PARTHIAN
:
717 case USCRIPT_SAMARITAN
:
720 case USCRIPT_TAI_VIET
:
723 case USCRIPT_MANDAEAN
: /* Aliased to USCRIPT_MANDAIC in icu 4.6. */
726 case USCRIPT_NABATAEAN
:
727 sRet
= "mis-Nbat"; // Uncoded with script
729 case USCRIPT_PALMYRENE
:
730 sRet
= "mis-Palm"; // Uncoded with script
738 case USCRIPT_NAKHI_GEBA
:
741 case USCRIPT_OLD_SOUTH_ARABIAN
:
744 case USCRIPT_BASSA_VAH
:
747 case USCRIPT_DUPLOYAN_SHORTAND
:
750 case USCRIPT_ELBASAN
:
753 case USCRIPT_GRANTHA
:
765 case USCRIPT_MEROITIC_CURSIVE
:
768 case USCRIPT_OLD_NORTH_ARABIAN
:
774 case USCRIPT_WARANG_CITI
:
777 #if (U_ICU_VERSION_MAJOR_NUM > 4) || (U_ICU_VERSION_MAJOR_NUM == 4 && U_ICU_VERSION_MINOR_NUM >= 8)
781 case USCRIPT_JURCHEN
:
788 sRet
= "mis-Nshu"; // Uncoded with script
790 case USCRIPT_SHARADA
:
793 case USCRIPT_SORA_SOMPENG
:
806 #if (U_ICU_VERSION_MAJOR_NUM >= 49)
807 case USCRIPT_ANATOLIAN_HIEROGLYPHS
:
813 case USCRIPT_TIRHUTA
:
817 #if (U_ICU_VERSION_MAJOR_NUM >= 52)
818 case USCRIPT_CAUCASIAN_ALBANIAN
:
821 case USCRIPT_MAHAJANI
:
825 #if (U_ICU_VERSION_MAJOR_NUM >= 54)
835 case USCRIPT_MULTANI
:
838 case USCRIPT_PAU_CIN_HAU
:
841 case USCRIPT_SIDDHAM
:
845 #if (U_ICU_VERSION_MAJOR_NUM >= 58)
847 sRet
= "mis-Adlm"; // Adlam for Fulani, no language code
849 case USCRIPT_BHAIKSUKI
:
850 sRet
= "mis-Bhks"; // Bhaiksuki for some Buddhist texts, no language code
852 case USCRIPT_MARCHEN
:
861 case USCRIPT_HAN_WITH_BOPOMOFO
:
862 sRet
= "mis-Hanb"; // Han with Bopomofo, zh-Hanb ?
865 sRet
= "ko"; // Jamo - elements of Hangul Syllables
867 case USCRIPT_SYMBOLS_EMOJI
:
868 sRet
= "mis-Zsye"; // Emoji variant
871 #if (U_ICU_VERSION_MAJOR_NUM >= 60)
872 case USCRIPT_MASARAM_GONDI
:
873 sRet
= "gon-Gonm"; // macro language code, could be wsg,esg,gno
875 case USCRIPT_SOYOMBO
:
876 sRet
= "mn-Soyo"; // abugida to write Mongolian, also Tibetan and Sanskrit
878 case USCRIPT_ZANABAZAR_SQUARE
:
879 sRet
= "mn-Zanb"; // abugida to write Mongolian
882 #if (U_ICU_VERSION_MAJOR_NUM >= 62)
884 sRet
= "dgo"; // Dogri proper
886 case USCRIPT_GUNJALA_GONDI
:
887 sRet
= "wsg"; // Adilabad Gondi
889 case USCRIPT_MAKASAR
:
892 case USCRIPT_MEDEFAIDRIN
:
895 case USCRIPT_HANIFI_ROHINGYA
:
898 case USCRIPT_SOGDIAN
:
901 case USCRIPT_OLD_SOGDIAN
:
905 #if (U_ICU_VERSION_MAJOR_NUM >= 64)
906 case USCRIPT_ELYMAIC
:
909 case USCRIPT_NYIAKENG_PUACHUE_HMONG
:
910 sRet
= "hmn-Hmnp"; // macrolanguage code
912 case USCRIPT_NANDINAGARI
:
919 #if (U_ICU_VERSION_MAJOR_NUM >= 66)
920 case USCRIPT_CHORASMIAN
:
923 case USCRIPT_DIVES_AKURU
:
926 case USCRIPT_KHITAN_SMALL_SCRIPT
:
933 #if (U_ICU_VERSION_MAJOR_NUM >= 70)
934 case USCRIPT_CYPRO_MINOAN
:
935 sRet
= "mis-Cpmn"; // Uncoded with script
937 case USCRIPT_OLD_UYGHUR
:
946 case USCRIPT_VITHKUQI
:
947 sRet
= "sq-Vith"; // macrolanguage code
950 #if (U_ICU_VERSION_MAJOR_NUM >= 72)
952 sRet
= "mis-Kawi"; // Uncoded with script
954 case USCRIPT_NAG_MUNDARI
:
962 //Format a number as a percentage according to the rules of the given
963 //language, e.g. 100 -> "100%" for en-US vs "100 %" for de-DE
964 OUString
unicode::formatPercent(double dNumber
,
965 const LanguageTag
&rLangTag
)
967 // get a currency formatter for this locale ID
968 UErrorCode errorCode
=U_ZERO_ERROR
;
970 LanguageTag
aLangTag(rLangTag
);
972 // As of CLDR Version 24 these languages were not listed as using spacing
973 // between number and % but are reported as such by our l10n groups
974 // http://www.unicode.org/cldr/charts/24/by_type/numbers.number_formatting_patterns.html
975 // so format using French which has the desired rules
976 if (aLangTag
.getLanguage() == "es" || aLangTag
.getLanguage() == "sl")
977 aLangTag
.reset("fr-FR");
979 icu::Locale aLocale
= LanguageTagIcu::getIcuLocale(aLangTag
);
981 std::unique_ptr
<icu::NumberFormat
> xF(
982 icu::NumberFormat::createPercentInstance(aLocale
, errorCode
));
983 if(U_FAILURE(errorCode
))
985 SAL_WARN("i18n", "icu::NumberFormat::createPercentInstance failed");
986 return OUString::number(dNumber
) + "%";
989 icu::UnicodeString output
;
990 xF
->format(dNumber
/100, output
);
991 OUString
aRet(reinterpret_cast<const sal_Unicode
*>(output
.getBuffer()),
993 if (rLangTag
.getLanguage() == "de")
995 //narrow no-break space instead of (normal) no-break space
996 return aRet
.replace(0x00A0, 0x202F);
1001 bool ToggleUnicodeCodepoint::AllowMoreInput(sal_Unicode uChar
)
1003 //arbitrarily chosen maximum length allowed - normal max usage would be around 30.
1004 if( maInput
.getLength() > 255 )
1005 mbAllowMoreChars
= false;
1007 if( !mbAllowMoreChars
)
1010 bool bPreventNonHex
= false;
1011 if( maInput
.indexOf("U+") != -1 )
1012 bPreventNonHex
= true;
1014 switch ( unicode::getUnicodeType(uChar
) )
1016 case css::i18n::UnicodeType::SURROGATE
:
1017 if( bPreventNonHex
)
1019 mbAllowMoreChars
= false;
1023 if( rtl::isLowSurrogate(uChar
) && maUtf16
.isEmpty() && maInput
.isEmpty() )
1025 maUtf16
.append(uChar
);
1028 if( rtl::isHighSurrogate(uChar
) && maInput
.isEmpty() )
1029 maUtf16
.insert(0, uChar
);
1030 //end of hex strings, or unexpected order of high/low, so don't accept more
1031 if( !maUtf16
.isEmpty() )
1032 maInput
.append(maUtf16
);
1033 if( !maCombining
.isEmpty() )
1034 maInput
.append(maCombining
);
1035 mbAllowMoreChars
= false;
1038 case css::i18n::UnicodeType::NON_SPACING_MARK
:
1039 case css::i18n::UnicodeType::COMBINING_SPACING_MARK
:
1040 if( bPreventNonHex
)
1042 mbAllowMoreChars
= false;
1046 //extreme edge case: already invalid high/low surrogates with preceding combining chars, and now an extra combining mark.
1047 if( !maUtf16
.isEmpty() )
1050 if( !maCombining
.isEmpty() )
1051 maInput
.append(maCombining
);
1052 mbAllowMoreChars
= false;
1055 maCombining
.insert(0, uChar
);
1059 //extreme edge case: already invalid high/low surrogates with preceding combining chars, and now an extra character.
1060 if( !maUtf16
.isEmpty() )
1063 if( !maCombining
.isEmpty() )
1064 maInput
.append(maCombining
);
1065 mbAllowMoreChars
= false;
1069 if( !maCombining
.isEmpty() )
1071 maCombining
.insert(0, uChar
);
1072 maInput
= maCombining
;
1073 mbAllowMoreChars
= false;
1077 // 0 - 1f are control characters. Do not process those.
1080 mbAllowMoreChars
= false;
1088 // U+ notation found. Continue looking for another one.
1091 mbRequiresU
= false;
1092 maInput
.insert(0,"U+");
1094 // treat as a normal character
1097 mbAllowMoreChars
= false;
1098 if( !bPreventNonHex
)
1099 maInput
.insertUtf32(0, uChar
);
1103 // + already found: skip when not U, or edge case of +U+xxxx
1104 if( mbRequiresU
|| (maInput
.indexOf("U+") == 0) )
1105 mbAllowMoreChars
= false;
1106 // hex chars followed by '+' - now require a 'U'
1107 else if ( !maInput
.isEmpty() )
1109 // treat as a normal character
1112 mbAllowMoreChars
= false;
1113 if( !bPreventNonHex
)
1114 maInput
.insertUtf32(0, uChar
);
1118 // + already found. Since not U, cancel further input
1120 mbAllowMoreChars
= false;
1121 // maximum digits per notation is 8: only one notation
1122 else if( maInput
.indexOf("U+") == -1 && maInput
.getLength() == 8 )
1123 mbAllowMoreChars
= false;
1124 // maximum digits per notation is 8: previous notation found
1125 else if( maInput
.indexOf("U+") == 8 )
1126 mbAllowMoreChars
= false;
1127 // a hex character. Add to string.
1128 else if( rtl::isAsciiHexDigit(uChar
) )
1130 mbIsHexString
= true;
1131 maInput
.insertUtf32(0, uChar
);
1133 // not a hex character: stop input. keep if it is the first input provided
1136 mbAllowMoreChars
= false;
1137 if( maInput
.isEmpty() )
1138 maInput
.insertUtf32(0, uChar
);
1142 return mbAllowMoreChars
;
1145 OUString
ToggleUnicodeCodepoint::StringToReplace()
1147 if( maInput
.isEmpty() )
1149 //edge case - input finished with incomplete low surrogate or combining characters without a base
1150 if( mbAllowMoreChars
)
1152 if( !maUtf16
.isEmpty() )
1154 if( !maCombining
.isEmpty() )
1155 maInput
.append(maCombining
);
1157 return maInput
.toString();
1160 if( !mbIsHexString
)
1161 return maInput
.toString();
1163 //this function potentially modifies the input string. Prevent addition of further characters
1164 mbAllowMoreChars
= false;
1166 //validate unicode notation.
1168 sal_uInt32 nUnicode
= 0;
1169 sal_Int32 nUPlus
= maInput
.indexOf("U+");
1170 //if U+ notation used, strip off all extra chars added not in U+ notation
1173 maInput
.remove(0, nUPlus
);
1174 sIn
= maInput
.copy(2).makeStringAndClear();
1175 nUPlus
= sIn
.indexOf("U+");
1178 sIn
= maInput
.toString();
1179 while( nUPlus
!= -1 )
1181 nUnicode
= o3tl::toUInt32(sIn
.subView(0, nUPlus
), 16);
1182 //prevent creating control characters or invalid Unicode values
1183 if( !rtl::isUnicodeCodePoint(nUnicode
) || nUnicode
< 0x20 )
1184 maInput
= sIn
.subView(nUPlus
);
1185 sIn
= sIn
.copy(nUPlus
+2);
1186 nUPlus
= sIn
.indexOf("U+");
1189 nUnicode
= sIn
.toUInt32(16);
1190 if( !rtl::isUnicodeCodePoint(nUnicode
) || nUnicode
< 0x20 )
1191 maInput
.truncate().append( sIn
[sIn
.getLength()-1] );
1192 return maInput
.toString();
1195 sal_uInt32
ToggleUnicodeCodepoint::CharsToDelete()
1197 OUString sIn
= StringToReplace();
1199 sal_uInt32 counter
= 0;
1200 while( nPos
< sIn
.getLength() )
1202 sIn
.iterateCodePoints(&nPos
);
1208 OUString
ToggleUnicodeCodepoint::ReplacementString()
1210 OUString sIn
= StringToReplace();
1211 OUStringBuffer output
= "";
1212 sal_Int32 nUPlus
= sIn
.indexOf("U+");
1213 // convert from hex notation to glyph
1214 if( nUPlus
!= -1 || (sIn
.getLength() > 1 && mbIsHexString
) )
1216 sal_uInt32 nUnicode
= 0;
1220 nUPlus
= sIn
.indexOf("U+");
1224 nUnicode
= o3tl::toUInt32(sIn
.subView(0, nUPlus
), 16);
1225 output
.appendUtf32( nUnicode
);
1227 sIn
= sIn
.copy(nUPlus
+2);
1228 nUPlus
= sIn
.indexOf("U+");
1230 nUnicode
= sIn
.toUInt32(16);
1231 output
.appendUtf32( nUnicode
);
1233 // convert from glyph to hex notation
1237 while( nPos
< sIn
.getLength() )
1239 OUStringBuffer aTmp
= OUString::number(sIn
.iterateCodePoints(&nPos
),16);
1240 //pad with zeros - minimum length of 4.
1241 for( sal_Int32 i
= 4 - aTmp
.getLength(); i
> 0; --i
)
1242 aTmp
.insert( 0,"0" );
1243 output
.append( "U+" + aTmp
);
1246 return output
.makeStringAndClear();
1249 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */