1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <com/sun/star/i18n/UnicodeType.hpp>
21 #include <com/sun/star/i18n/ScriptType.hpp>
22 #include <i18nlangtag/languagetag.hxx>
23 #include <i18nlangtag/languagetagicu.hxx>
24 #include <i18nutil/unicode.hxx>
25 #include <sal/log.hxx>
26 #include <unicode/numfmt.h>
27 #include "unicode_data.h"
28 #include <rtl/character.hxx>
31 // Workaround for glibc braindamage:
32 // glibc 2.4's langinfo.h does "#define CURRENCY_SYMBOL __CURRENCY_SYMBOL"
33 // which (obviously) breaks UnicodeType::CURRENCY_SYMBOL
34 #undef CURRENCY_SYMBOL
36 using namespace ::com::sun::star::i18n
;
38 template<class L
, typename T
>
39 static T
getScriptType( const sal_Unicode ch
, const L
* typeList
, T unknownType
) {
42 css::i18n::UnicodeScript type
= typeList
[0].to
;
43 while (type
< UnicodeScript_kScriptCount
&& ch
> UnicodeScriptType
[static_cast<int>(type
)][UnicodeScriptTypeTo
]) {
44 type
= typeList
[++i
].to
;
47 return (type
< UnicodeScript_kScriptCount
&&
48 ch
>= UnicodeScriptType
[static_cast<int>(typeList
[i
].from
)][int(UnicodeScriptTypeFrom
)]) ?
49 typeList
[i
].value
: unknownType
;
53 unicode::getUnicodeScriptType( const sal_Unicode ch
, const ScriptTypeList
* typeList
, sal_Int16 unknownType
) {
54 return getScriptType(ch
, typeList
, unknownType
);
58 unicode::getUnicodeScriptStart( UnicodeScript type
) {
59 return UnicodeScriptType
[static_cast<int>(type
)][UnicodeScriptTypeFrom
];
63 unicode::getUnicodeScriptEnd( UnicodeScript type
) {
64 return UnicodeScriptType
[static_cast<int>(type
)][UnicodeScriptTypeTo
];
68 unicode::getUnicodeType( const sal_Unicode ch
) {
69 static sal_Unicode c
= 0x00;
70 static sal_Int16 r
= 0x00;
72 if (ch
== c
) return r
;
75 sal_Int16 address
= UnicodeTypeIndex
[ch
>> 8];
76 r
= static_cast<sal_Int16
>(
77 (address
< UnicodeTypeNumberBlock
)
78 ? UnicodeTypeBlockValue
[address
]
79 : UnicodeTypeValue
[((address
- UnicodeTypeNumberBlock
) << 8) + (ch
& 0xff)]);
84 unicode::getUnicodeDirection( const sal_Unicode ch
) {
85 static sal_Unicode c
= 0x00;
86 static sal_uInt8 r
= 0x00;
88 if (ch
== c
) return r
;
91 sal_Int16 address
= UnicodeDirectionIndex
[ch
>> 8];
92 r
= (address
< UnicodeDirectionNumberBlock
)
93 ? UnicodeDirectionBlockValue
[address
]
94 : UnicodeDirectionValue
[((address
- UnicodeDirectionNumberBlock
) << 8) + (ch
& 0xff)];
98 #define bit(name) (1U << name)
100 #define UPPERMASK bit(UnicodeType::UPPERCASE_LETTER)
102 #define LOWERMASK bit(UnicodeType::LOWERCASE_LETTER)
104 #define TITLEMASK bit(UnicodeType::TITLECASE_LETTER)
106 #define ALPHAMASK UPPERMASK|LOWERMASK|TITLEMASK|\
107 bit(UnicodeType::MODIFIER_LETTER)|\
108 bit(UnicodeType::OTHER_LETTER)
110 #define SPACEMASK bit(UnicodeType::SPACE_SEPARATOR)|\
111 bit(UnicodeType::LINE_SEPARATOR)|\
112 bit(UnicodeType::PARAGRAPH_SEPARATOR)
114 #define CONTROLMASK bit(UnicodeType::CONTROL)|\
115 bit(UnicodeType::FORMAT)|\
116 bit(UnicodeType::LINE_SEPARATOR)|\
117 bit(UnicodeType::PARAGRAPH_SEPARATOR)
119 #define IsType(func, mask) \
120 bool func( const sal_Unicode ch) {\
121 return (bit(getUnicodeType(ch)) & (mask)) != 0;\
124 IsType(unicode::isControl
, CONTROLMASK
)
125 IsType(unicode::isAlpha
, ALPHAMASK
)
126 IsType(unicode::isSpace
, SPACEMASK
)
128 #define CONTROLSPACE bit(0x09)|bit(0x0a)|bit(0x0b)|bit(0x0c)|bit(0x0d)|\
129 bit(0x1c)|bit(0x1d)|bit(0x1e)|bit(0x1f)
131 bool unicode::isWhiteSpace( const sal_Unicode ch
) {
132 return (ch
!= 0xa0 && isSpace(ch
)) || (ch
<= 0x1F && (bit(ch
) & (CONTROLSPACE
)));
135 sal_Int16
unicode::getScriptClassFromUScriptCode(UScriptCode eScript
)
137 //See unicode/uscript.h
138 static const sal_Int16 scriptTypes
[] =
140 ScriptType::WEAK
, ScriptType::WEAK
, ScriptType::COMPLEX
, ScriptType::LATIN
, ScriptType::COMPLEX
,
141 ScriptType::ASIAN
, ScriptType::LATIN
, ScriptType::LATIN
, ScriptType::LATIN
, ScriptType::COMPLEX
,
142 ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::LATIN
, ScriptType::LATIN
, ScriptType::LATIN
,
144 ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::ASIAN
, ScriptType::ASIAN
, ScriptType::COMPLEX
,
145 ScriptType::ASIAN
, ScriptType::COMPLEX
, ScriptType::ASIAN
, ScriptType::COMPLEX
, ScriptType::COMPLEX
,
146 ScriptType::LATIN
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::LATIN
,
148 ScriptType::LATIN
, ScriptType::COMPLEX
, ScriptType::LATIN
, ScriptType::COMPLEX
, ScriptType::COMPLEX
,
149 ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
,
150 ScriptType::LATIN
, ScriptType::ASIAN
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
,
152 ScriptType::COMPLEX
, ScriptType::LATIN
, ScriptType::LATIN
, ScriptType::COMPLEX
, ScriptType::COMPLEX
,
153 ScriptType::LATIN
, ScriptType::LATIN
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::LATIN
,
154 ScriptType::COMPLEX
, ScriptType::LATIN
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
,
156 ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
,
157 ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::LATIN
, ScriptType::LATIN
, ScriptType::COMPLEX
,
158 ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::ASIAN
, ScriptType::ASIAN
,
160 ScriptType::COMPLEX
, ScriptType::LATIN
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
,
161 ScriptType::LATIN
, ScriptType::LATIN
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
,
162 ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
,
164 ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
,
165 ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
,
166 ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::WEAK
, ScriptType::WEAK
, ScriptType::COMPLEX
,
168 ScriptType::ASIAN
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
,
169 ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
,
170 ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::ASIAN
,
172 ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
,
173 ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::WEAK
, ScriptType::WEAK
,
174 ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
,
176 ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
,
177 ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
,
183 if (eScript
< USCRIPT_COMMON
)
184 nRet
= ScriptType::WEAK
;
185 else if (static_cast<size_t>(eScript
) >= SAL_N_ELEMENTS(scriptTypes
))
186 nRet
= ScriptType::COMPLEX
; // anything new is going to be pretty wild
188 nRet
= scriptTypes
[eScript
];
192 OString
unicode::getExemplarLanguageForUScriptCode(UScriptCode eScript
)
197 case USCRIPT_CODE_LIMIT
:
198 case USCRIPT_INVALID_CODE
:
202 case USCRIPT_INHERITED
:
205 case USCRIPT_MATHEMATICAL_NOTATION
:
206 case USCRIPT_SYMBOLS
:
209 case USCRIPT_UNWRITTEN_LANGUAGES
:
210 case USCRIPT_UNKNOWN
:
216 case USCRIPT_ARMENIAN
:
219 case USCRIPT_BENGALI
:
222 case USCRIPT_BOPOMOFO
:
225 case USCRIPT_CHEROKEE
:
231 case USCRIPT_CYRILLIC
:
234 case USCRIPT_DESERET
:
237 case USCRIPT_DEVANAGARI
:
240 case USCRIPT_ETHIOPIC
:
243 case USCRIPT_GEORGIAN
:
252 case USCRIPT_GUJARATI
:
255 case USCRIPT_GURMUKHI
:
267 case USCRIPT_HIRAGANA
:
270 case USCRIPT_KANNADA
:
273 case USCRIPT_KATAKANA
:
285 case USCRIPT_MALAYALAM
:
288 case USCRIPT_MONGOLIAN
:
291 case USCRIPT_MYANMAR
:
297 case USCRIPT_OLD_ITALIC
:
306 case USCRIPT_SINHALA
:
324 case USCRIPT_TIBETAN
:
327 case USCRIPT_CANADIAN_ABORIGINAL
:
333 case USCRIPT_TAGALOG
:
336 case USCRIPT_HANUNOO
:
342 case USCRIPT_TAGBANWA
:
345 case USCRIPT_BRAILLE
:
348 case USCRIPT_CYPRIOT
:
354 case USCRIPT_LINEAR_B
:
357 case USCRIPT_OSMANYA
:
360 case USCRIPT_SHAVIAN
:
366 case USCRIPT_UGARITIC
:
369 case USCRIPT_KATAKANA_OR_HIRAGANA
:
372 case USCRIPT_BUGINESE
:
375 case USCRIPT_GLAGOLITIC
:
378 case USCRIPT_KHAROSHTHI
:
381 case USCRIPT_SYLOTI_NAGRI
:
384 case USCRIPT_NEW_TAI_LUE
:
387 case USCRIPT_TIFINAGH
:
390 case USCRIPT_OLD_PERSIAN
:
393 case USCRIPT_BALINESE
:
399 case USCRIPT_BLISSYMBOLS
:
411 case USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC
:
414 case USCRIPT_DEMOTIC_EGYPTIAN
:
415 case USCRIPT_HIERATIC_EGYPTIAN
:
416 case USCRIPT_EGYPTIAN_HIEROGLYPHS
:
419 case USCRIPT_KHUTSURI
:
422 case USCRIPT_SIMPLIFIED_HAN
:
425 case USCRIPT_TRADITIONAL_HAN
:
428 case USCRIPT_PAHAWH_HMONG
:
431 case USCRIPT_OLD_HUNGARIAN
:
434 case USCRIPT_HARAPPAN_INDUS
:
437 case USCRIPT_JAVANESE
:
440 case USCRIPT_KAYAH_LI
:
443 case USCRIPT_LATIN_FRAKTUR
:
446 case USCRIPT_LATIN_GAELIC
:
452 case USCRIPT_LINEAR_A
:
455 case USCRIPT_MAYAN_HIEROGLYPHS
:
458 case USCRIPT_MEROITIC
:
467 case USCRIPT_OLD_PERMIC
:
470 case USCRIPT_PHAGS_PA
:
473 case USCRIPT_PHOENICIAN
:
476 case USCRIPT_PHONETIC_POLLARD
:
479 case USCRIPT_RONGORONGO
:
485 case USCRIPT_ESTRANGELO_SYRIAC
:
488 case USCRIPT_WESTERN_SYRIAC
:
491 case USCRIPT_EASTERN_SYRIAC
:
494 case USCRIPT_TENGWAR
:
500 case USCRIPT_VISIBLE_SPEECH
:
503 case USCRIPT_CUNEIFORM
:
509 case USCRIPT_JAPANESE
:
521 case USCRIPT_OL_CHIKI
:
527 case USCRIPT_SAURASHTRA
:
530 case USCRIPT_SIGN_WRITING
:
533 case USCRIPT_SUNDANESE
:
539 case USCRIPT_MEITEI_MAYEK
:
542 case USCRIPT_IMPERIAL_ARAMAIC
:
545 case USCRIPT_AVESTAN
:
557 case USCRIPT_MANICHAEAN
:
560 case USCRIPT_INSCRIPTIONAL_PAHLAVI
:
561 case USCRIPT_PSALTER_PAHLAVI
:
562 case USCRIPT_BOOK_PAHLAVI
:
563 case USCRIPT_INSCRIPTIONAL_PARTHIAN
:
566 case USCRIPT_SAMARITAN
:
569 case USCRIPT_TAI_VIET
:
572 case USCRIPT_MANDAEAN
: /* Aliased to USCRIPT_MANDAIC in icu 4.6. */
575 #if (U_ICU_VERSION_MAJOR_NUM > 4) || (U_ICU_VERSION_MAJOR_NUM == 4 && U_ICU_VERSION_MINOR_NUM >= 4)
576 case USCRIPT_NABATAEAN
: //no language with an assigned code yet
579 case USCRIPT_PALMYRENE
: //no language with an assigned code yet
588 case USCRIPT_NAKHI_GEBA
:
591 case USCRIPT_OLD_SOUTH_ARABIAN
:
594 case USCRIPT_BASSA_VAH
:
597 case USCRIPT_DUPLOYAN_SHORTAND
:
600 case USCRIPT_ELBASAN
:
603 case USCRIPT_GRANTHA
:
615 case USCRIPT_MEROITIC_CURSIVE
:
618 case USCRIPT_OLD_NORTH_ARABIAN
:
624 case USCRIPT_WARANG_CITI
:
628 #if (U_ICU_VERSION_MAJOR_NUM > 4) || (U_ICU_VERSION_MAJOR_NUM == 4 && U_ICU_VERSION_MINOR_NUM >= 8)
632 case USCRIPT_JURCHEN
:
638 case USCRIPT_NUSHU
: //no language with an assigned code yet
641 case USCRIPT_SHARADA
:
644 case USCRIPT_SORA_SOMPENG
:
657 #if (U_ICU_VERSION_MAJOR_NUM >= 49)
658 case USCRIPT_ANATOLIAN_HIEROGLYPHS
:
664 case USCRIPT_TIRHUTA
:
668 #if (U_ICU_VERSION_MAJOR_NUM >= 52)
669 case USCRIPT_CAUCASIAN_ALBANIAN
:
672 case USCRIPT_MAHAJANI
:
676 #if (U_ICU_VERSION_MAJOR_NUM >= 54)
686 case USCRIPT_MULTANI
:
689 case USCRIPT_PAU_CIN_HAU
:
692 case USCRIPT_SIDDHAM
:
696 #if (U_ICU_VERSION_MAJOR_NUM >= 58)
698 sRet
= "mis"; // Adlm - Adlam for Fulani, no language code
700 case USCRIPT_BHAIKSUKI
:
701 sRet
= "mis"; // Bhks - Bhaiksuki for some Buddhist texts, no language code
703 case USCRIPT_MARCHEN
:
712 case USCRIPT_HAN_WITH_BOPOMOFO
:
713 sRet
= "mis"; // Hanb - Han with Bopomofo, zh-Hanb ?
716 sRet
= "ko"; // Jamo - elements of Hangul Syllables
718 case USCRIPT_SYMBOLS_EMOJI
:
719 sRet
= "mis"; // Zsye - Emoji variant
722 #if (U_ICU_VERSION_MAJOR_NUM >= 60)
723 case USCRIPT_MASARAM_GONDI
:
724 sRet
= "gon-Gonm"; // macro language code, could be wsg,esg,gno
726 case USCRIPT_SOYOMBO
:
727 sRet
= "mn-Soyo"; // abugida to write Mongolian, also Tibetan and Sanskrit
729 case USCRIPT_ZANABAZAR_SQUARE
:
730 sRet
= "mn-Zanb"; // abugida to write Mongolian
733 #if (U_ICU_VERSION_MAJOR_NUM >= 62)
735 sRet
= "dgo"; // Dogri proper
737 case USCRIPT_GUNJALA_GONDI
:
738 sRet
= "wsg"; // Adilabad Gondi
740 case USCRIPT_MAKASAR
:
743 case USCRIPT_MEDEFAIDRIN
:
744 sRet
= "mis-Medf"; // Uncoded with script
746 case USCRIPT_HANIFI_ROHINGYA
:
749 case USCRIPT_SOGDIAN
:
752 case USCRIPT_OLD_SOGDIAN
:
756 #if (U_ICU_VERSION_MAJOR_NUM >= 64)
757 case USCRIPT_ELYMAIC
:
760 case USCRIPT_NYIAKENG_PUACHUE_HMONG
:
761 sRet
= "hmn-Hmnp"; // macrolanguage code
763 case USCRIPT_NANDINAGARI
:
770 #if (U_ICU_VERSION_MAJOR_NUM >= 66)
771 case USCRIPT_CHORASMIAN
:
774 case USCRIPT_DIVES_AKURU
:
777 case USCRIPT_KHITAN_SMALL_SCRIPT
:
788 //Format a number as a percentage according to the rules of the given
789 //language, e.g. 100 -> "100%" for en-US vs "100 %" for de-DE
790 OUString
unicode::formatPercent(double dNumber
,
791 const LanguageTag
&rLangTag
)
793 // get a currency formatter for this locale ID
794 UErrorCode errorCode
=U_ZERO_ERROR
;
796 LanguageTag
aLangTag(rLangTag
);
798 // As of CLDR Version 24 these languages were not listed as using spacing
799 // between number and % but are reported as such by our l10n groups
800 // http://www.unicode.org/cldr/charts/24/by_type/numbers.number_formatting_patterns.html
801 // so format using French which has the desired rules
802 if (aLangTag
.getLanguage() == "es" || aLangTag
.getLanguage() == "sl")
803 aLangTag
.reset("fr-FR");
805 icu::Locale aLocale
= LanguageTagIcu::getIcuLocale(aLangTag
);
807 std::unique_ptr
<icu::NumberFormat
> xF(
808 icu::NumberFormat::createPercentInstance(aLocale
, errorCode
));
809 if(U_FAILURE(errorCode
))
811 SAL_WARN("i18n", "icu::NumberFormat::createPercentInstance failed");
812 return OUString::number(dNumber
) + "%";
815 icu::UnicodeString output
;
816 xF
->format(dNumber
/100, output
);
817 OUString
aRet(reinterpret_cast<const sal_Unicode
*>(output
.getBuffer()),
819 if (rLangTag
.getLanguage() == "de")
821 //narrow no-break space instead of (normal) no-break space
822 return aRet
.replace(0x00A0, 0x202F);
827 bool ToggleUnicodeCodepoint::AllowMoreInput(sal_Unicode uChar
)
829 //arbitrarily chosen maximum length allowed - normal max usage would be around 30.
830 if( maInput
.getLength() > 255 )
831 mbAllowMoreChars
= false;
833 if( !mbAllowMoreChars
)
836 bool bPreventNonHex
= false;
837 if( maInput
.indexOf("U+") != -1 )
838 bPreventNonHex
= true;
840 switch ( unicode::getUnicodeType(uChar
) )
842 case css::i18n::UnicodeType::SURROGATE
:
845 mbAllowMoreChars
= false;
849 if( rtl::isLowSurrogate(uChar
) && maUtf16
.isEmpty() && maInput
.isEmpty() )
851 maUtf16
.append(uChar
);
854 if( rtl::isHighSurrogate(uChar
) && maInput
.isEmpty() )
855 maUtf16
.insert(0, uChar
);
856 //end of hex strings, or unexpected order of high/low, so don't accept more
857 if( !maUtf16
.isEmpty() )
858 maInput
.append(maUtf16
);
859 if( !maCombining
.isEmpty() )
860 maInput
.append(maCombining
);
861 mbAllowMoreChars
= false;
864 case css::i18n::UnicodeType::NON_SPACING_MARK
:
865 case css::i18n::UnicodeType::COMBINING_SPACING_MARK
:
868 mbAllowMoreChars
= false;
872 //extreme edge case: already invalid high/low surrogates with preceding combining chars, and now an extra combining mark.
873 if( !maUtf16
.isEmpty() )
876 if( !maCombining
.isEmpty() )
877 maInput
.append(maCombining
);
878 mbAllowMoreChars
= false;
881 maCombining
.insert(0, uChar
);
885 //extreme edge case: already invalid high/low surrogates with preceding combining chars, and now an extra character.
886 if( !maUtf16
.isEmpty() )
889 if( !maCombining
.isEmpty() )
890 maInput
.append(maCombining
);
891 mbAllowMoreChars
= false;
895 if( !maCombining
.isEmpty() )
897 maCombining
.insert(0, uChar
);
898 maInput
= maCombining
;
899 mbAllowMoreChars
= false;
903 // 0 - 1f are control characters. Do not process those.
906 mbAllowMoreChars
= false;
914 // U+ notation found. Continue looking for another one.
918 maInput
.insert(0,"U+");
920 // treat as a normal character
923 mbAllowMoreChars
= false;
924 if( !bPreventNonHex
)
925 maInput
.insertUtf32(0, uChar
);
929 // + already found: skip when not U, or edge case of +U+xxxx
930 if( mbRequiresU
|| (maInput
.indexOf("U+") == 0) )
931 mbAllowMoreChars
= false;
932 // hex chars followed by '+' - now require a 'U'
933 else if ( !maInput
.isEmpty() )
935 // treat as a normal character
938 mbAllowMoreChars
= false;
939 if( !bPreventNonHex
)
940 maInput
.insertUtf32(0, uChar
);
944 // + already found. Since not U, cancel further input
946 mbAllowMoreChars
= false;
947 // maximum digits per notation is 8: only one notation
948 else if( maInput
.indexOf("U+") == -1 && maInput
.getLength() == 8 )
949 mbAllowMoreChars
= false;
950 // maximum digits per notation is 8: previous notation found
951 else if( maInput
.indexOf("U+") == 8 )
952 mbAllowMoreChars
= false;
953 // a hex character. Add to string.
954 else if( rtl::isAsciiHexDigit(uChar
) )
956 mbIsHexString
= true;
957 maInput
.insertUtf32(0, uChar
);
959 // not a hex character: stop input. keep if it is the first input provided
962 mbAllowMoreChars
= false;
963 if( maInput
.isEmpty() )
964 maInput
.insertUtf32(0, uChar
);
968 return mbAllowMoreChars
;
971 OUString
ToggleUnicodeCodepoint::StringToReplace()
973 if( maInput
.isEmpty() )
975 //edge case - input finished with incomplete low surrogate or combining characters without a base
976 if( mbAllowMoreChars
)
978 if( !maUtf16
.isEmpty() )
980 if( !maCombining
.isEmpty() )
981 maInput
.append(maCombining
);
983 return maInput
.toString();
987 return maInput
.toString();
989 //this function potentially modifies the input string. Prevent addition of further characters
990 mbAllowMoreChars
= false;
992 //validate unicode notation.
994 sal_uInt32 nUnicode
= 0;
995 sal_Int32 nUPlus
= maInput
.indexOf("U+");
996 //if U+ notation used, strip off all extra chars added not in U+ notation
999 maInput
.remove(0, nUPlus
);
1000 sIn
= maInput
.copy(2).makeStringAndClear();
1001 nUPlus
= sIn
.indexOf("U+");
1004 sIn
= maInput
.toString();
1005 while( nUPlus
!= -1 )
1007 nUnicode
= sIn
.copy(0, nUPlus
).toUInt32(16);
1008 //prevent creating control characters or invalid Unicode values
1009 if( !rtl::isUnicodeCodePoint(nUnicode
) || nUnicode
< 0x20 )
1010 maInput
= sIn
.copy(nUPlus
);
1011 sIn
= sIn
.copy(nUPlus
+2);
1012 nUPlus
= sIn
.indexOf("U+");
1015 nUnicode
= sIn
.toUInt32(16);
1016 if( !rtl::isUnicodeCodePoint(nUnicode
) || nUnicode
< 0x20 )
1017 maInput
.truncate().append( sIn
[sIn
.getLength()-1] );
1018 return maInput
.toString();
1021 sal_uInt32
ToggleUnicodeCodepoint::CharsToDelete()
1023 OUString sIn
= StringToReplace();
1025 sal_uInt32 counter
= 0;
1026 while( nPos
< sIn
.getLength() )
1028 sIn
.iterateCodePoints(&nPos
);
1034 OUString
ToggleUnicodeCodepoint::ReplacementString()
1036 OUString sIn
= StringToReplace();
1037 OUStringBuffer output
= "";
1038 sal_Int32 nUPlus
= sIn
.indexOf("U+");
1039 // convert from hex notation to glyph
1040 if( nUPlus
!= -1 || (sIn
.getLength() > 1 && mbIsHexString
) )
1042 sal_uInt32 nUnicode
= 0;
1046 nUPlus
= sIn
.indexOf("U+");
1050 nUnicode
= sIn
.copy(0, nUPlus
).toUInt32(16);
1051 output
.appendUtf32( nUnicode
);
1053 sIn
= sIn
.copy(nUPlus
+2);
1054 nUPlus
= sIn
.indexOf("U+");
1056 nUnicode
= sIn
.toUInt32(16);
1057 output
.appendUtf32( nUnicode
);
1059 // convert from glyph to hex notation
1063 while( nPos
< sIn
.getLength() )
1065 OUStringBuffer aTmp
= OUString::number(sIn
.iterateCodePoints(&nPos
),16);
1066 //pad with zeros - minimum length of 4.
1067 for( sal_Int32 i
= 4 - aTmp
.getLength(); i
> 0; --i
)
1068 aTmp
.insert( 0,"0" );
1069 output
.append( "U+" );
1070 output
.append( aTmp
);
1073 return output
.toString();
1076 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */