1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <com/sun/star/i18n/UnicodeType.hpp>
21 #include <com/sun/star/i18n/KCharacterType.hpp>
22 #include <com/sun/star/i18n/ScriptType.hpp>
23 #include <i18nlangtag/languagetag.hxx>
24 #include <i18nlangtag/languagetagicu.hxx>
25 #include <i18nutil/unicode.hxx>
26 #include <sal/log.hxx>
27 #include <unicode/numfmt.h>
28 #include "unicode_data.h"
29 #include <com/sun/star/i18n/UnicodeType.hpp>
30 #include <rtl/character.hxx>
33 // Workaround for glibc braindamage:
34 // glibc 2.4's langinfo.h does "#define CURRENCY_SYMBOL __CURRENCY_SYMBOL"
35 // which (obviously) breaks UnicodeType::CURRENCY_SYMBOL
36 #undef CURRENCY_SYMBOL
38 using namespace ::com::sun::star::i18n
;
40 static const ScriptTypeList defaultTypeList
[] = {
41 { UnicodeScript_kBasicLatin
,
42 UnicodeScript_kBasicLatin
,
43 UnicodeScript_kBasicLatin
}, // 0,
44 { UnicodeScript_kLatin1Supplement
,
45 UnicodeScript_kLatin1Supplement
,
46 UnicodeScript_kLatin1Supplement
},// 1,
47 { UnicodeScript_kLatinExtendedA
,
48 UnicodeScript_kLatinExtendedA
,
49 UnicodeScript_kLatinExtendedA
}, // 2,
50 { UnicodeScript_kLatinExtendedB
,
51 UnicodeScript_kLatinExtendedB
,
52 UnicodeScript_kLatinExtendedB
}, // 3,
53 { UnicodeScript_kIPAExtension
,
54 UnicodeScript_kIPAExtension
,
55 UnicodeScript_kIPAExtension
}, // 4,
56 { UnicodeScript_kSpacingModifier
,
57 UnicodeScript_kSpacingModifier
,
58 UnicodeScript_kSpacingModifier
}, // 5,
59 { UnicodeScript_kCombiningDiacritical
,
60 UnicodeScript_kCombiningDiacritical
,
61 UnicodeScript_kCombiningDiacritical
}, // 6,
62 { UnicodeScript_kGreek
,
64 UnicodeScript_kGreek
}, // 7,
65 { UnicodeScript_kCyrillic
,
66 UnicodeScript_kCyrillic
,
67 UnicodeScript_kCyrillic
}, // 8,
68 { UnicodeScript_kArmenian
,
69 UnicodeScript_kArmenian
,
70 UnicodeScript_kArmenian
}, // 9,
71 { UnicodeScript_kHebrew
,
72 UnicodeScript_kHebrew
,
73 UnicodeScript_kHebrew
}, // 10,
74 { UnicodeScript_kArabic
,
75 UnicodeScript_kArabic
,
76 UnicodeScript_kArabic
}, // 11,
77 { UnicodeScript_kSyriac
,
78 UnicodeScript_kSyriac
,
79 UnicodeScript_kSyriac
}, // 12,
80 { UnicodeScript_kThaana
,
81 UnicodeScript_kThaana
,
82 UnicodeScript_kThaana
}, // 13,
83 { UnicodeScript_kDevanagari
,
84 UnicodeScript_kDevanagari
,
85 UnicodeScript_kDevanagari
}, // 14,
86 { UnicodeScript_kBengali
,
87 UnicodeScript_kBengali
,
88 UnicodeScript_kBengali
}, // 15,
89 { UnicodeScript_kGurmukhi
,
90 UnicodeScript_kGurmukhi
,
91 UnicodeScript_kGurmukhi
}, // 16,
92 { UnicodeScript_kGujarati
,
93 UnicodeScript_kGujarati
,
94 UnicodeScript_kGujarati
}, // 17,
95 { UnicodeScript_kOriya
,
97 UnicodeScript_kOriya
}, // 18,
98 { UnicodeScript_kTamil
,
100 UnicodeScript_kTamil
}, // 19,
101 { UnicodeScript_kTelugu
,
102 UnicodeScript_kTelugu
,
103 UnicodeScript_kTelugu
}, // 20,
104 { UnicodeScript_kKannada
,
105 UnicodeScript_kKannada
,
106 UnicodeScript_kKannada
}, // 21,
107 { UnicodeScript_kMalayalam
,
108 UnicodeScript_kMalayalam
,
109 UnicodeScript_kMalayalam
}, // 22,
110 { UnicodeScript_kSinhala
,
111 UnicodeScript_kSinhala
,
112 UnicodeScript_kSinhala
}, // 23,
113 { UnicodeScript_kThai
,
115 UnicodeScript_kThai
}, // 24,
116 { UnicodeScript_kLao
,
118 UnicodeScript_kLao
}, // 25,
119 { UnicodeScript_kTibetan
,
120 UnicodeScript_kTibetan
,
121 UnicodeScript_kTibetan
}, // 26,
122 { UnicodeScript_kMyanmar
,
123 UnicodeScript_kMyanmar
,
124 UnicodeScript_kMyanmar
}, // 27,
125 { UnicodeScript_kGeorgian
,
126 UnicodeScript_kGeorgian
,
127 UnicodeScript_kGeorgian
}, // 28,
128 { UnicodeScript_kHangulJamo
,
129 UnicodeScript_kHangulJamo
,
130 UnicodeScript_kHangulJamo
}, // 29,
131 { UnicodeScript_kEthiopic
,
132 UnicodeScript_kEthiopic
,
133 UnicodeScript_kEthiopic
}, // 30,
134 { UnicodeScript_kCherokee
,
135 UnicodeScript_kCherokee
,
136 UnicodeScript_kCherokee
}, // 31,
137 { UnicodeScript_kUnifiedCanadianAboriginalSyllabics
,
138 UnicodeScript_kUnifiedCanadianAboriginalSyllabics
,
139 UnicodeScript_kUnifiedCanadianAboriginalSyllabics
}, // 32,
140 { UnicodeScript_kOgham
,
141 UnicodeScript_kOgham
,
142 UnicodeScript_kOgham
}, // 33,
143 { UnicodeScript_kRunic
,
144 UnicodeScript_kRunic
,
145 UnicodeScript_kRunic
}, // 34,
146 { UnicodeScript_kKhmer
,
147 UnicodeScript_kKhmer
,
148 UnicodeScript_kKhmer
}, // 35,
149 { UnicodeScript_kMongolian
,
150 UnicodeScript_kMongolian
,
151 UnicodeScript_kMongolian
}, // 36,
152 { UnicodeScript_kLatinExtendedAdditional
,
153 UnicodeScript_kLatinExtendedAdditional
,
154 UnicodeScript_kLatinExtendedAdditional
}, // 37,
155 { UnicodeScript_kGreekExtended
,
156 UnicodeScript_kGreekExtended
,
157 UnicodeScript_kGreekExtended
}, // 38,
158 { UnicodeScript_kGeneralPunctuation
,
159 UnicodeScript_kGeneralPunctuation
,
160 UnicodeScript_kGeneralPunctuation
}, // 39,
161 { UnicodeScript_kSuperSubScript
,
162 UnicodeScript_kSuperSubScript
,
163 UnicodeScript_kSuperSubScript
}, // 40,
164 { UnicodeScript_kCurrencySymbolScript
,
165 UnicodeScript_kCurrencySymbolScript
,
166 UnicodeScript_kCurrencySymbolScript
}, // 41,
167 { UnicodeScript_kSymbolCombiningMark
,
168 UnicodeScript_kSymbolCombiningMark
,
169 UnicodeScript_kSymbolCombiningMark
}, // 42,
170 { UnicodeScript_kLetterlikeSymbol
,
171 UnicodeScript_kLetterlikeSymbol
,
172 UnicodeScript_kLetterlikeSymbol
}, // 43,
173 { UnicodeScript_kNumberForm
,
174 UnicodeScript_kNumberForm
,
175 UnicodeScript_kNumberForm
}, // 44,
176 { UnicodeScript_kArrow
,
177 UnicodeScript_kArrow
,
178 UnicodeScript_kArrow
}, // 45,
179 { UnicodeScript_kMathOperator
,
180 UnicodeScript_kMathOperator
,
181 UnicodeScript_kMathOperator
}, // 46,
182 { UnicodeScript_kMiscTechnical
,
183 UnicodeScript_kMiscTechnical
,
184 UnicodeScript_kMiscTechnical
}, // 47,
185 { UnicodeScript_kControlPicture
,
186 UnicodeScript_kControlPicture
,
187 UnicodeScript_kControlPicture
}, // 48,
188 { UnicodeScript_kOpticalCharacter
,
189 UnicodeScript_kOpticalCharacter
,
190 UnicodeScript_kOpticalCharacter
}, // 49,
191 { UnicodeScript_kEnclosedAlphanumeric
,
192 UnicodeScript_kEnclosedAlphanumeric
,
193 UnicodeScript_kEnclosedAlphanumeric
}, // 50,
194 { UnicodeScript_kBoxDrawing
,
195 UnicodeScript_kBoxDrawing
,
196 UnicodeScript_kBoxDrawing
}, // 51,
197 { UnicodeScript_kBlockElement
,
198 UnicodeScript_kBlockElement
,
199 UnicodeScript_kBlockElement
}, // 52,
200 { UnicodeScript_kGeometricShape
,
201 UnicodeScript_kGeometricShape
,
202 UnicodeScript_kGeometricShape
}, // 53,
203 { UnicodeScript_kMiscSymbol
,
204 UnicodeScript_kMiscSymbol
,
205 UnicodeScript_kMiscSymbol
}, // 54,
206 { UnicodeScript_kDingbat
,
207 UnicodeScript_kDingbat
,
208 UnicodeScript_kDingbat
}, // 55,
209 { UnicodeScript_kBraillePatterns
,
210 UnicodeScript_kBraillePatterns
,
211 UnicodeScript_kBraillePatterns
}, // 56,
212 { UnicodeScript_kCJKRadicalsSupplement
,
213 UnicodeScript_kCJKRadicalsSupplement
,
214 UnicodeScript_kCJKRadicalsSupplement
}, // 57,
215 { UnicodeScript_kKangxiRadicals
,
216 UnicodeScript_kKangxiRadicals
,
217 UnicodeScript_kKangxiRadicals
}, // 58,
218 { UnicodeScript_kIdeographicDescriptionCharacters
,
219 UnicodeScript_kIdeographicDescriptionCharacters
,
220 UnicodeScript_kIdeographicDescriptionCharacters
}, // 59,
221 { UnicodeScript_kCJKSymbolPunctuation
,
222 UnicodeScript_kCJKSymbolPunctuation
,
223 UnicodeScript_kCJKSymbolPunctuation
}, // 60,
224 { UnicodeScript_kHiragana
,
225 UnicodeScript_kHiragana
,
226 UnicodeScript_kHiragana
}, // 61,
227 { UnicodeScript_kKatakana
,
228 UnicodeScript_kKatakana
,
229 UnicodeScript_kKatakana
}, // 62,
230 { UnicodeScript_kBopomofo
,
231 UnicodeScript_kBopomofo
,
232 UnicodeScript_kBopomofo
}, // 63,
233 { UnicodeScript_kHangulCompatibilityJamo
,
234 UnicodeScript_kHangulCompatibilityJamo
,
235 UnicodeScript_kHangulCompatibilityJamo
}, // 64,
236 { UnicodeScript_kKanbun
,
237 UnicodeScript_kKanbun
,
238 UnicodeScript_kKanbun
}, // 65,
239 { UnicodeScript_kBopomofoExtended
,
240 UnicodeScript_kBopomofoExtended
,
241 UnicodeScript_kBopomofoExtended
}, // 66,
242 { UnicodeScript_kEnclosedCJKLetterMonth
,
243 UnicodeScript_kEnclosedCJKLetterMonth
,
244 UnicodeScript_kEnclosedCJKLetterMonth
}, // 67,
245 { UnicodeScript_kCJKCompatibility
,
246 UnicodeScript_kCJKCompatibility
,
247 UnicodeScript_kCJKCompatibility
}, // 68,
248 { UnicodeScript_k_CJKUnifiedIdeographsExtensionA
,
249 UnicodeScript_k_CJKUnifiedIdeographsExtensionA
,
250 UnicodeScript_k_CJKUnifiedIdeographsExtensionA
}, // 69,
251 { UnicodeScript_kCJKUnifiedIdeograph
,
252 UnicodeScript_kCJKUnifiedIdeograph
,
253 UnicodeScript_kCJKUnifiedIdeograph
}, // 70,
254 { UnicodeScript_kYiSyllables
,
255 UnicodeScript_kYiSyllables
,
256 UnicodeScript_kYiSyllables
}, // 71,
257 { UnicodeScript_kYiRadicals
,
258 UnicodeScript_kYiRadicals
,
259 UnicodeScript_kYiRadicals
}, // 72,
260 { UnicodeScript_kHangulSyllable
,
261 UnicodeScript_kHangulSyllable
,
262 UnicodeScript_kHangulSyllable
}, // 73,
263 { UnicodeScript_kHighSurrogate
,
264 UnicodeScript_kHighSurrogate
,
265 UnicodeScript_kHighSurrogate
}, // 74,
266 { UnicodeScript_kHighPrivateUseSurrogate
,
267 UnicodeScript_kHighPrivateUseSurrogate
,
268 UnicodeScript_kHighPrivateUseSurrogate
}, // 75,
269 { UnicodeScript_kLowSurrogate
,
270 UnicodeScript_kLowSurrogate
,
271 UnicodeScript_kLowSurrogate
}, // 76,
272 { UnicodeScript_kPrivateUse
,
273 UnicodeScript_kPrivateUse
,
274 UnicodeScript_kPrivateUse
}, // 77,
275 { UnicodeScript_kCJKCompatibilityIdeograph
,
276 UnicodeScript_kCJKCompatibilityIdeograph
,
277 UnicodeScript_kCJKCompatibilityIdeograph
}, // 78,
278 { UnicodeScript_kAlphabeticPresentation
,
279 UnicodeScript_kAlphabeticPresentation
,
280 UnicodeScript_kAlphabeticPresentation
}, // 79,
281 { UnicodeScript_kArabicPresentationA
,
282 UnicodeScript_kArabicPresentationA
,
283 UnicodeScript_kArabicPresentationA
}, // 80,
284 { UnicodeScript_kCombiningHalfMark
,
285 UnicodeScript_kCombiningHalfMark
,
286 UnicodeScript_kCombiningHalfMark
}, // 81,
287 { UnicodeScript_kCJKCompatibilityForm
,
288 UnicodeScript_kCJKCompatibilityForm
,
289 UnicodeScript_kCJKCompatibilityForm
}, // 82,
290 { UnicodeScript_kSmallFormVariant
,
291 UnicodeScript_kSmallFormVariant
,
292 UnicodeScript_kSmallFormVariant
}, // 83,
293 { UnicodeScript_kArabicPresentationB
,
294 UnicodeScript_kArabicPresentationB
,
295 UnicodeScript_kArabicPresentationB
}, // 84,
296 { UnicodeScript_kNoScript
,
297 UnicodeScript_kNoScript
,
298 UnicodeScript_kNoScript
}, // 85,
299 { UnicodeScript_kHalfwidthFullwidthForm
,
300 UnicodeScript_kHalfwidthFullwidthForm
,
301 UnicodeScript_kHalfwidthFullwidthForm
}, // 86,
302 { UnicodeScript_kScriptCount
,
303 UnicodeScript_kScriptCount
,
304 UnicodeScript_kNoScript
} // 87,
308 unicode::getUnicodeScriptType( const sal_Unicode ch
, const ScriptTypeList
* typeList
, sal_Int16 unknownType
) {
311 typeList
= defaultTypeList
;
312 unknownType
= UnicodeScript_kNoScript
;
315 sal_Int16 i
= 0, type
= typeList
[0].to
;
316 while (type
< UnicodeScript_kScriptCount
&& ch
> UnicodeScriptType
[type
][UnicodeScriptTypeTo
]) {
317 type
= typeList
[++i
].to
;
320 return (type
< UnicodeScript_kScriptCount
&&
321 ch
>= UnicodeScriptType
[typeList
[i
].from
][UnicodeScriptTypeFrom
]) ?
322 typeList
[i
].value
: unknownType
;
326 unicode::getUnicodeScriptStart( UnicodeScript type
) {
327 return UnicodeScriptType
[type
][UnicodeScriptTypeFrom
];
331 unicode::getUnicodeScriptEnd( UnicodeScript type
) {
332 return UnicodeScriptType
[type
][UnicodeScriptTypeTo
];
336 unicode::getUnicodeType( const sal_Unicode ch
) {
337 static sal_Unicode c
= 0x00;
338 static sal_Int16 r
= 0x00;
340 if (ch
== c
) return r
;
343 sal_Int16 address
= UnicodeTypeIndex
[ch
>> 8];
344 return r
= (sal_Int16
)((address
< UnicodeTypeNumberBlock
) ? UnicodeTypeBlockValue
[address
] :
345 UnicodeTypeValue
[((address
- UnicodeTypeNumberBlock
) << 8) + (ch
& 0xff)]);
349 unicode::getUnicodeDirection( const sal_Unicode ch
) {
350 static sal_Unicode c
= 0x00;
351 static sal_uInt8 r
= 0x00;
353 if (ch
== c
) return r
;
356 sal_Int16 address
= UnicodeDirectionIndex
[ch
>> 8];
357 return r
= ((address
< UnicodeDirectionNumberBlock
) ? UnicodeDirectionBlockValue
[address
] :
358 UnicodeDirectionValue
[((address
- UnicodeDirectionNumberBlock
) << 8) + (ch
& 0xff)]);
362 #define bit(name) (1U << name)
364 #define UPPERMASK bit(UnicodeType::UPPERCASE_LETTER)
366 #define LOWERMASK bit(UnicodeType::LOWERCASE_LETTER)
368 #define TITLEMASK bit(UnicodeType::TITLECASE_LETTER)
370 #define ALPHAMASK UPPERMASK|LOWERMASK|TITLEMASK|\
371 bit(UnicodeType::MODIFIER_LETTER)|\
372 bit(UnicodeType::OTHER_LETTER)
374 #define SPACEMASK bit(UnicodeType::SPACE_SEPARATOR)|\
375 bit(UnicodeType::LINE_SEPARATOR)|\
376 bit(UnicodeType::PARAGRAPH_SEPARATOR)
378 #define CONTROLMASK bit(UnicodeType::CONTROL)|\
379 bit(UnicodeType::FORMAT)|\
380 bit(UnicodeType::LINE_SEPARATOR)|\
381 bit(UnicodeType::PARAGRAPH_SEPARATOR)
383 #define IsType(func, mask) \
384 bool SAL_CALL func( const sal_Unicode ch) {\
385 return (bit(getUnicodeType(ch)) & (mask)) != 0;\
388 IsType(unicode::isControl
, CONTROLMASK
)
389 IsType(unicode::isAlpha
, ALPHAMASK
)
390 IsType(unicode::isSpace
, SPACEMASK
)
392 #define CONTROLSPACE bit(0x09)|bit(0x0a)|bit(0x0b)|bit(0x0c)|bit(0x0d)|\
393 bit(0x1c)|bit(0x1d)|bit(0x1e)|bit(0x1f)
395 bool SAL_CALL
unicode::isWhiteSpace( const sal_Unicode ch
) {
396 return (ch
!= 0xa0 && isSpace(ch
)) || (ch
<= 0x1F && (bit(ch
) & (CONTROLSPACE
)));
399 sal_Int16 SAL_CALL
unicode::getScriptClassFromUScriptCode(UScriptCode eScript
)
401 //See unicode/uscript.h
402 static const sal_Int16 scriptTypes
[] =
404 ScriptType::WEAK
, ScriptType::WEAK
, ScriptType::COMPLEX
, ScriptType::LATIN
, ScriptType::COMPLEX
,
405 ScriptType::ASIAN
, ScriptType::LATIN
, ScriptType::LATIN
, ScriptType::LATIN
, ScriptType::COMPLEX
,
406 ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::LATIN
, ScriptType::LATIN
, ScriptType::LATIN
,
408 ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::ASIAN
, ScriptType::ASIAN
, ScriptType::COMPLEX
,
409 ScriptType::ASIAN
, ScriptType::COMPLEX
, ScriptType::ASIAN
, ScriptType::COMPLEX
, ScriptType::COMPLEX
,
410 ScriptType::LATIN
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::LATIN
,
412 ScriptType::LATIN
, ScriptType::COMPLEX
, ScriptType::LATIN
, ScriptType::COMPLEX
, ScriptType::COMPLEX
,
413 ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
,
414 ScriptType::LATIN
, ScriptType::ASIAN
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
,
416 ScriptType::COMPLEX
, ScriptType::LATIN
, ScriptType::LATIN
, ScriptType::COMPLEX
, ScriptType::COMPLEX
,
417 ScriptType::LATIN
, ScriptType::LATIN
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::LATIN
,
418 ScriptType::COMPLEX
, ScriptType::LATIN
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
,
420 ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
,
421 ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::LATIN
, ScriptType::LATIN
, ScriptType::COMPLEX
,
422 ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::ASIAN
, ScriptType::ASIAN
,
424 ScriptType::COMPLEX
, ScriptType::LATIN
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
,
425 ScriptType::LATIN
, ScriptType::LATIN
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
,
426 ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
,
428 ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
,
429 ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
,
430 ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::WEAK
, ScriptType::WEAK
, ScriptType::COMPLEX
,
432 ScriptType::ASIAN
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
,
433 ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
,
434 ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::ASIAN
,
436 ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
,
437 ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::WEAK
, ScriptType::WEAK
,
438 ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
,
440 ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
,
441 ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
, ScriptType::COMPLEX
,
447 if (eScript
< USCRIPT_COMMON
)
448 nRet
= ScriptType::WEAK
;
449 else if (static_cast<size_t>(eScript
) >= SAL_N_ELEMENTS(scriptTypes
))
450 nRet
= ScriptType::COMPLEX
; // anything new is going to be pretty wild
452 nRet
= scriptTypes
[eScript
];
456 OString SAL_CALL
unicode::getExemplarLanguageForUScriptCode(UScriptCode eScript
)
461 case USCRIPT_CODE_LIMIT
:
462 case USCRIPT_INVALID_CODE
:
466 case USCRIPT_INHERITED
:
469 case USCRIPT_MATHEMATICAL_NOTATION
:
470 case USCRIPT_SYMBOLS
:
473 case USCRIPT_UNWRITTEN_LANGUAGES
:
474 case USCRIPT_UNKNOWN
:
480 case USCRIPT_ARMENIAN
:
483 case USCRIPT_BENGALI
:
486 case USCRIPT_BOPOMOFO
:
489 case USCRIPT_CHEROKEE
:
495 case USCRIPT_CYRILLIC
:
498 case USCRIPT_DESERET
:
501 case USCRIPT_DEVANAGARI
:
504 case USCRIPT_ETHIOPIC
:
507 case USCRIPT_GEORGIAN
:
516 case USCRIPT_GUJARATI
:
519 case USCRIPT_GURMUKHI
:
531 case USCRIPT_HIRAGANA
:
534 case USCRIPT_KANNADA
:
537 case USCRIPT_KATAKANA
:
549 case USCRIPT_MALAYALAM
:
552 case USCRIPT_MONGOLIAN
:
555 case USCRIPT_MYANMAR
:
561 case USCRIPT_OLD_ITALIC
:
570 case USCRIPT_SINHALA
:
588 case USCRIPT_TIBETAN
:
591 case USCRIPT_CANADIAN_ABORIGINAL
:
597 case USCRIPT_TAGALOG
:
600 case USCRIPT_HANUNOO
:
606 case USCRIPT_TAGBANWA
:
609 case USCRIPT_BRAILLE
:
612 case USCRIPT_CYPRIOT
:
618 case USCRIPT_LINEAR_B
:
621 case USCRIPT_OSMANYA
:
624 case USCRIPT_SHAVIAN
:
630 case USCRIPT_UGARITIC
:
633 case USCRIPT_KATAKANA_OR_HIRAGANA
:
636 case USCRIPT_BUGINESE
:
639 case USCRIPT_GLAGOLITIC
:
642 case USCRIPT_KHAROSHTHI
:
645 case USCRIPT_SYLOTI_NAGRI
:
648 case USCRIPT_NEW_TAI_LUE
:
651 case USCRIPT_TIFINAGH
:
654 case USCRIPT_OLD_PERSIAN
:
657 case USCRIPT_BALINESE
:
663 case USCRIPT_BLISSYMBOLS
:
675 case USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC
:
678 case USCRIPT_DEMOTIC_EGYPTIAN
:
679 case USCRIPT_HIERATIC_EGYPTIAN
:
680 case USCRIPT_EGYPTIAN_HIEROGLYPHS
:
683 case USCRIPT_KHUTSURI
:
686 case USCRIPT_SIMPLIFIED_HAN
:
689 case USCRIPT_TRADITIONAL_HAN
:
692 case USCRIPT_PAHAWH_HMONG
:
695 case USCRIPT_OLD_HUNGARIAN
:
698 case USCRIPT_HARAPPAN_INDUS
:
701 case USCRIPT_JAVANESE
:
704 case USCRIPT_KAYAH_LI
:
707 case USCRIPT_LATIN_FRAKTUR
:
710 case USCRIPT_LATIN_GAELIC
:
716 case USCRIPT_LINEAR_A
:
719 case USCRIPT_MAYAN_HIEROGLYPHS
:
722 case USCRIPT_MEROITIC
:
731 case USCRIPT_OLD_PERMIC
:
734 case USCRIPT_PHAGS_PA
:
737 case USCRIPT_PHOENICIAN
:
740 case USCRIPT_PHONETIC_POLLARD
:
743 case USCRIPT_RONGORONGO
:
749 case USCRIPT_ESTRANGELO_SYRIAC
:
752 case USCRIPT_WESTERN_SYRIAC
:
755 case USCRIPT_EASTERN_SYRIAC
:
758 case USCRIPT_TENGWAR
:
764 case USCRIPT_VISIBLE_SPEECH
:
767 case USCRIPT_CUNEIFORM
:
773 case USCRIPT_JAPANESE
:
785 case USCRIPT_OL_CHIKI
:
791 case USCRIPT_SAURASHTRA
:
794 case USCRIPT_SIGN_WRITING
:
797 case USCRIPT_SUNDANESE
:
803 case USCRIPT_MEITEI_MAYEK
:
806 case USCRIPT_IMPERIAL_ARAMAIC
:
809 case USCRIPT_AVESTAN
:
821 case USCRIPT_MANICHAEAN
:
824 case USCRIPT_INSCRIPTIONAL_PAHLAVI
:
825 case USCRIPT_PSALTER_PAHLAVI
:
826 case USCRIPT_BOOK_PAHLAVI
:
827 case USCRIPT_INSCRIPTIONAL_PARTHIAN
:
830 case USCRIPT_SAMARITAN
:
833 case USCRIPT_TAI_VIET
:
836 case USCRIPT_MANDAEAN
: /* Aliased to USCRIPT_MANDAIC in icu 4.6. */
839 #if (U_ICU_VERSION_MAJOR_NUM > 4) || (U_ICU_VERSION_MAJOR_NUM == 4 && U_ICU_VERSION_MINOR_NUM >= 4)
840 case USCRIPT_NABATAEAN
: //no language with an assigned code yet
843 case USCRIPT_PALMYRENE
: //no language with an assigned code yet
852 case USCRIPT_NAKHI_GEBA
:
855 case USCRIPT_OLD_SOUTH_ARABIAN
:
858 case USCRIPT_BASSA_VAH
:
861 case USCRIPT_DUPLOYAN_SHORTAND
:
864 case USCRIPT_ELBASAN
:
867 case USCRIPT_GRANTHA
:
879 case USCRIPT_MEROITIC_CURSIVE
:
882 case USCRIPT_OLD_NORTH_ARABIAN
:
888 case USCRIPT_WARANG_CITI
:
892 #if (U_ICU_VERSION_MAJOR_NUM > 4) || (U_ICU_VERSION_MAJOR_NUM == 4 && U_ICU_VERSION_MINOR_NUM >= 8)
896 case USCRIPT_JURCHEN
:
902 case USCRIPT_NUSHU
: //no language with an assigned code yet
905 case USCRIPT_SHARADA
:
908 case USCRIPT_SORA_SOMPENG
:
921 #if (U_ICU_VERSION_MAJOR_NUM >= 49)
922 case USCRIPT_ANATOLIAN_HIEROGLYPHS
:
928 case USCRIPT_TIRHUTA
:
932 #if (U_ICU_VERSION_MAJOR_NUM >= 52)
933 case USCRIPT_CAUCASIAN_ALBANIAN
:
936 case USCRIPT_MAHAJANI
:
940 #if (U_ICU_VERSION_MAJOR_NUM >= 54)
950 case USCRIPT_MULTANI
:
953 case USCRIPT_PAU_CIN_HAU
:
956 case USCRIPT_SIDDHAM
:
964 //Format a number as a percentage according to the rules of the given
965 //language, e.g. 100 -> "100%" for en-US vs "100 %" for de-DE
966 OUString SAL_CALL
unicode::formatPercent(double dNumber
,
967 const LanguageTag
&rLangTag
)
969 // get a currency formatter for this locale ID
970 UErrorCode errorCode
=U_ZERO_ERROR
;
972 LanguageTag
aLangTag(rLangTag
);
974 // As of CLDR Version 24 these languages were not listed as using spacing
975 // between number and % but are reported as such by our l10n groups
976 // http://www.unicode.org/cldr/charts/24/by_type/numbers.number_formatting_patterns.html
977 // so format using French which has the desired rules
978 if (aLangTag
.getLanguage() == "es" || aLangTag
.getLanguage() == "sl")
979 aLangTag
= LanguageTag("fr-FR");
981 icu::Locale aLocale
= LanguageTagIcu::getIcuLocale(aLangTag
);
983 std::unique_ptr
<NumberFormat
> xF(
984 NumberFormat::createPercentInstance(aLocale
, errorCode
));
985 if(U_FAILURE(errorCode
))
987 SAL_WARN("i18n", "NumberFormat::createPercentInstance failed");
988 return OUString::number(dNumber
) + "%";
991 UnicodeString output
;
992 xF
->format(dNumber
/100, output
);
993 OUString
aRet(reinterpret_cast<const sal_Unicode
*>(output
.getBuffer()),
995 if (rLangTag
.getLanguage() == "de")
997 //narrow no-break space instead of (normal) no-break space
998 return aRet
.replace(0x00A0, 0x202F);
1003 ToggleUnicodeCodepoint::ToggleUnicodeCodepoint ()
1005 maInput
= OUStringBuffer();
1006 maOutput
= OUStringBuffer();
1007 maUtf16
= OUStringBuffer();
1008 maCombining
= OUStringBuffer();
1011 bool ToggleUnicodeCodepoint::AllowMoreInput(sal_Unicode uChar
)
1013 //arbitrarily chosen maximum length allowed - normal max usage would be around 30.
1014 if( maInput
.getLength() > 255 )
1015 mbAllowMoreChars
= false;
1017 if( !mbAllowMoreChars
)
1020 bool bPreventNonHex
= false;
1021 if( maInput
.indexOf("U+") != -1 )
1022 bPreventNonHex
= true;
1024 switch ( unicode::getUnicodeType(uChar
) )
1026 case css::i18n::UnicodeType::SURROGATE
:
1027 if( bPreventNonHex
)
1029 mbAllowMoreChars
= false;
1033 if( rtl::isLowSurrogate(uChar
) && maUtf16
.isEmpty() && maInput
.isEmpty() )
1035 maUtf16
.append(uChar
);
1038 if( rtl::isHighSurrogate(uChar
) && maInput
.isEmpty() )
1039 maUtf16
.insert(0, uChar
);
1040 //end of hex strings, or unexpected order of high/low, so don't accept more
1041 if( !maUtf16
.isEmpty() )
1042 maInput
.append(maUtf16
);
1043 if( !maCombining
.isEmpty() )
1044 maInput
.append(maCombining
);
1045 mbAllowMoreChars
= false;
1048 case css::i18n::UnicodeType::NON_SPACING_MARK
:
1049 case css::i18n::UnicodeType::COMBINING_SPACING_MARK
:
1050 if( bPreventNonHex
)
1052 mbAllowMoreChars
= false;
1056 //extreme edge case: already invalid high/low surrogates with preceding combining chars, and now an extra combining mark.
1057 if( !maUtf16
.isEmpty() )
1060 if( !maCombining
.isEmpty() )
1061 maInput
.append(maCombining
);
1062 mbAllowMoreChars
= false;
1065 maCombining
.insert(0, uChar
);
1069 //extreme edge case: already invalid high/low surrogates with preceding combining chars, and now an extra character.
1070 if( !maUtf16
.isEmpty() )
1073 if( !maCombining
.isEmpty() )
1074 maInput
.append(maCombining
);
1075 mbAllowMoreChars
= false;
1079 if( !maCombining
.isEmpty() )
1081 maCombining
.insert(0, uChar
);
1082 maInput
= maCombining
;
1083 mbAllowMoreChars
= false;
1087 // 0 - 1f are control characters. Do not process those.
1090 mbAllowMoreChars
= false;
1098 // U+ notation found. Continue looking for another one.
1101 mbRequiresU
= false;
1102 maInput
.insert(0,"U+");
1104 // treat as a normal character
1107 mbAllowMoreChars
= false;
1108 if( !bPreventNonHex
)
1109 maInput
.insertUtf32(0, uChar
);
1113 // + already found: skip when not U, or edge case of +U+xxxx
1114 if( mbRequiresU
|| (maInput
.indexOf("U+") == 0) )
1115 mbAllowMoreChars
= false;
1116 // hex chars followed by '+' - now require a 'U'
1117 else if ( !maInput
.isEmpty() )
1119 // treat as a normal character
1122 mbAllowMoreChars
= false;
1123 if( !bPreventNonHex
)
1124 maInput
.insertUtf32(0, uChar
);
1128 // + already found. Since not U, cancel further input
1130 mbAllowMoreChars
= false;
1131 // maximum digits per notation is 8: only one notation
1132 else if( maInput
.indexOf("U+") == -1 && maInput
.getLength() == 8 )
1133 mbAllowMoreChars
= false;
1134 // maximum digits per notation is 8: previous notation found
1135 else if( maInput
.indexOf("U+") == 8 )
1136 mbAllowMoreChars
= false;
1137 // a hex character. Add to string.
1138 else if( isxdigit(uChar
) )
1140 mbIsHexString
= true;
1141 maInput
.insertUtf32(0, uChar
);
1143 // not a hex character: stop input. keep if it is the first input provided
1146 mbAllowMoreChars
= false;
1147 if( maInput
.isEmpty() )
1148 maInput
.insertUtf32(0, uChar
);
1152 return mbAllowMoreChars
;
1155 OUString
ToggleUnicodeCodepoint::StringToReplace()
1157 if( maInput
.isEmpty() )
1159 //edge case - input finished with incomplete low surrogate or combining characters without a base
1160 if( mbAllowMoreChars
)
1162 if( !maUtf16
.isEmpty() )
1164 if( !maCombining
.isEmpty() )
1165 maInput
.append(maCombining
);
1167 return maInput
.toString();
1170 if( !mbIsHexString
)
1171 return maInput
.toString();
1173 //this function potentially modifies the input string. Prevent addition of further characters
1174 mbAllowMoreChars
= false;
1176 //validate unicode notation.
1178 sal_uInt32 nUnicode
= 0;
1179 sal_Int32 nUPlus
= maInput
.indexOf("U+");
1180 //if U+ notation used, strip off all extra chars added not in U+ notation
1183 maInput
= maInput
.copy(nUPlus
);
1184 sIn
= maInput
.copy(2);
1185 nUPlus
= sIn
.indexOf("U+");
1189 while( nUPlus
!= -1 )
1191 nUnicode
= sIn
.copy(0, nUPlus
).toString().toUInt32(16);
1192 //prevent creating control characters or invalid Unicode values
1193 if( !rtl::isUnicodeCodePoint(nUnicode
) || nUnicode
< 0x20 )
1194 maInput
= sIn
.copy(nUPlus
);
1195 sIn
= sIn
.copy(nUPlus
+2);
1196 nUPlus
= sIn
.indexOf("U+");
1199 nUnicode
= sIn
.toString().toUInt32(16);
1200 if( !rtl::isUnicodeCodePoint(nUnicode
) || nUnicode
< 0x20 )
1201 maInput
.truncate().append( sIn
[sIn
.getLength()-1] );
1202 return maInput
.toString();
1205 sal_uInt32
ToggleUnicodeCodepoint::CharsToDelete()
1207 OUString sIn
= StringToReplace();
1209 sal_uInt32 counter
= 0;
1210 while( nPos
< sIn
.getLength() )
1212 sIn
.iterateCodePoints(&nPos
);
1218 OUString
ToggleUnicodeCodepoint::ReplacementString()
1220 OUString sIn
= StringToReplace();
1222 sal_Int32 nUPlus
= sIn
.indexOf("U+");
1223 // convert from hex notation to glyph
1224 if( nUPlus
!= -1 || (sIn
.getLength() > 1 && mbIsHexString
) )
1226 sal_uInt32 nUnicode
= 0;
1230 nUPlus
= sIn
.indexOf("U+");
1234 nUnicode
= sIn
.copy(0, nUPlus
).toUInt32(16);
1235 maOutput
.appendUtf32( nUnicode
);
1237 sIn
= sIn
.copy(nUPlus
+2);
1238 nUPlus
= sIn
.indexOf("U+");
1240 nUnicode
= sIn
.toUInt32(16);
1241 maOutput
.appendUtf32( nUnicode
);
1243 // convert from glyph to hex notation
1247 while( nPos
< sIn
.getLength() )
1249 OUStringBuffer aTmp
= OUString::number(sIn
.iterateCodePoints(&nPos
),16);
1250 //pad with zeros - minimum length of 4.
1251 for( sal_Int32 i
= 4 - aTmp
.getLength(); i
> 0; --i
)
1252 aTmp
.insert( 0,"0" );
1253 maOutput
.append( "U+" );
1254 maOutput
.append( aTmp
);
1257 return maOutput
.toString();
1260 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */