Bump version to 5.0-14
[LibreOffice.git] / i18nutil / source / utility / unicode.cxx
blob458a417d64965a7f093ab8b5d72a1e41bbc952ec
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <boost/scoped_ptr.hpp>
21 #include <com/sun/star/i18n/UnicodeType.hpp>
22 #include <com/sun/star/i18n/KCharacterType.hpp>
23 #include <com/sun/star/i18n/ScriptType.hpp>
24 #include <i18nlangtag/languagetag.hxx>
25 #include <i18nlangtag/languagetagicu.hxx>
26 #include <i18nutil/unicode.hxx>
27 #include <sal/log.hxx>
28 #include <unicode/numfmt.h>
29 #include "unicode_data.h"
31 // Workaround for glibc braindamage:
32 // glibc 2.4's langinfo.h does "#define CURRENCY_SYMBOL __CURRENCY_SYMBOL"
33 // which (obviously) breaks UnicodeType::CURRENCY_SYMBOL
34 #undef CURRENCY_SYMBOL
36 using namespace ::com::sun::star::i18n;
38 static const ScriptTypeList defaultTypeList[] = {
39 { UnicodeScript_kBasicLatin,
40 UnicodeScript_kBasicLatin,
41 UnicodeScript_kBasicLatin }, // 0,
42 { UnicodeScript_kLatin1Supplement,
43 UnicodeScript_kLatin1Supplement,
44 UnicodeScript_kLatin1Supplement },// 1,
45 { UnicodeScript_kLatinExtendedA,
46 UnicodeScript_kLatinExtendedA,
47 UnicodeScript_kLatinExtendedA }, // 2,
48 { UnicodeScript_kLatinExtendedB,
49 UnicodeScript_kLatinExtendedB,
50 UnicodeScript_kLatinExtendedB }, // 3,
51 { UnicodeScript_kIPAExtension,
52 UnicodeScript_kIPAExtension,
53 UnicodeScript_kIPAExtension }, // 4,
54 { UnicodeScript_kSpacingModifier,
55 UnicodeScript_kSpacingModifier,
56 UnicodeScript_kSpacingModifier }, // 5,
57 { UnicodeScript_kCombiningDiacritical,
58 UnicodeScript_kCombiningDiacritical,
59 UnicodeScript_kCombiningDiacritical }, // 6,
60 { UnicodeScript_kGreek,
61 UnicodeScript_kGreek,
62 UnicodeScript_kGreek }, // 7,
63 { UnicodeScript_kCyrillic,
64 UnicodeScript_kCyrillic,
65 UnicodeScript_kCyrillic }, // 8,
66 { UnicodeScript_kArmenian,
67 UnicodeScript_kArmenian,
68 UnicodeScript_kArmenian }, // 9,
69 { UnicodeScript_kHebrew,
70 UnicodeScript_kHebrew,
71 UnicodeScript_kHebrew }, // 10,
72 { UnicodeScript_kArabic,
73 UnicodeScript_kArabic,
74 UnicodeScript_kArabic }, // 11,
75 { UnicodeScript_kSyriac,
76 UnicodeScript_kSyriac,
77 UnicodeScript_kSyriac }, // 12,
78 { UnicodeScript_kThaana,
79 UnicodeScript_kThaana,
80 UnicodeScript_kThaana }, // 13,
81 { UnicodeScript_kDevanagari,
82 UnicodeScript_kDevanagari,
83 UnicodeScript_kDevanagari }, // 14,
84 { UnicodeScript_kBengali,
85 UnicodeScript_kBengali,
86 UnicodeScript_kBengali }, // 15,
87 { UnicodeScript_kGurmukhi,
88 UnicodeScript_kGurmukhi,
89 UnicodeScript_kGurmukhi }, // 16,
90 { UnicodeScript_kGujarati,
91 UnicodeScript_kGujarati,
92 UnicodeScript_kGujarati }, // 17,
93 { UnicodeScript_kOriya,
94 UnicodeScript_kOriya,
95 UnicodeScript_kOriya }, // 18,
96 { UnicodeScript_kTamil,
97 UnicodeScript_kTamil,
98 UnicodeScript_kTamil }, // 19,
99 { UnicodeScript_kTelugu,
100 UnicodeScript_kTelugu,
101 UnicodeScript_kTelugu }, // 20,
102 { UnicodeScript_kKannada,
103 UnicodeScript_kKannada,
104 UnicodeScript_kKannada }, // 21,
105 { UnicodeScript_kMalayalam,
106 UnicodeScript_kMalayalam,
107 UnicodeScript_kMalayalam }, // 22,
108 { UnicodeScript_kSinhala,
109 UnicodeScript_kSinhala,
110 UnicodeScript_kSinhala }, // 23,
111 { UnicodeScript_kThai,
112 UnicodeScript_kThai,
113 UnicodeScript_kThai }, // 24,
114 { UnicodeScript_kLao,
115 UnicodeScript_kLao,
116 UnicodeScript_kLao }, // 25,
117 { UnicodeScript_kTibetan,
118 UnicodeScript_kTibetan,
119 UnicodeScript_kTibetan }, // 26,
120 { UnicodeScript_kMyanmar,
121 UnicodeScript_kMyanmar,
122 UnicodeScript_kMyanmar }, // 27,
123 { UnicodeScript_kGeorgian,
124 UnicodeScript_kGeorgian,
125 UnicodeScript_kGeorgian }, // 28,
126 { UnicodeScript_kHangulJamo,
127 UnicodeScript_kHangulJamo,
128 UnicodeScript_kHangulJamo }, // 29,
129 { UnicodeScript_kEthiopic,
130 UnicodeScript_kEthiopic,
131 UnicodeScript_kEthiopic }, // 30,
132 { UnicodeScript_kCherokee,
133 UnicodeScript_kCherokee,
134 UnicodeScript_kCherokee }, // 31,
135 { UnicodeScript_kUnifiedCanadianAboriginalSyllabics,
136 UnicodeScript_kUnifiedCanadianAboriginalSyllabics,
137 UnicodeScript_kUnifiedCanadianAboriginalSyllabics }, // 32,
138 { UnicodeScript_kOgham,
139 UnicodeScript_kOgham,
140 UnicodeScript_kOgham }, // 33,
141 { UnicodeScript_kRunic,
142 UnicodeScript_kRunic,
143 UnicodeScript_kRunic }, // 34,
144 { UnicodeScript_kKhmer,
145 UnicodeScript_kKhmer,
146 UnicodeScript_kKhmer }, // 35,
147 { UnicodeScript_kMongolian,
148 UnicodeScript_kMongolian,
149 UnicodeScript_kMongolian }, // 36,
150 { UnicodeScript_kLatinExtendedAdditional,
151 UnicodeScript_kLatinExtendedAdditional,
152 UnicodeScript_kLatinExtendedAdditional }, // 37,
153 { UnicodeScript_kGreekExtended,
154 UnicodeScript_kGreekExtended,
155 UnicodeScript_kGreekExtended }, // 38,
156 { UnicodeScript_kGeneralPunctuation,
157 UnicodeScript_kGeneralPunctuation,
158 UnicodeScript_kGeneralPunctuation }, // 39,
159 { UnicodeScript_kSuperSubScript,
160 UnicodeScript_kSuperSubScript,
161 UnicodeScript_kSuperSubScript }, // 40,
162 { UnicodeScript_kCurrencySymbolScript,
163 UnicodeScript_kCurrencySymbolScript,
164 UnicodeScript_kCurrencySymbolScript }, // 41,
165 { UnicodeScript_kSymbolCombiningMark,
166 UnicodeScript_kSymbolCombiningMark,
167 UnicodeScript_kSymbolCombiningMark }, // 42,
168 { UnicodeScript_kLetterlikeSymbol,
169 UnicodeScript_kLetterlikeSymbol,
170 UnicodeScript_kLetterlikeSymbol }, // 43,
171 { UnicodeScript_kNumberForm,
172 UnicodeScript_kNumberForm,
173 UnicodeScript_kNumberForm }, // 44,
174 { UnicodeScript_kArrow,
175 UnicodeScript_kArrow,
176 UnicodeScript_kArrow }, // 45,
177 { UnicodeScript_kMathOperator,
178 UnicodeScript_kMathOperator,
179 UnicodeScript_kMathOperator }, // 46,
180 { UnicodeScript_kMiscTechnical,
181 UnicodeScript_kMiscTechnical,
182 UnicodeScript_kMiscTechnical }, // 47,
183 { UnicodeScript_kControlPicture,
184 UnicodeScript_kControlPicture,
185 UnicodeScript_kControlPicture }, // 48,
186 { UnicodeScript_kOpticalCharacter,
187 UnicodeScript_kOpticalCharacter,
188 UnicodeScript_kOpticalCharacter }, // 49,
189 { UnicodeScript_kEnclosedAlphanumeric,
190 UnicodeScript_kEnclosedAlphanumeric,
191 UnicodeScript_kEnclosedAlphanumeric }, // 50,
192 { UnicodeScript_kBoxDrawing,
193 UnicodeScript_kBoxDrawing,
194 UnicodeScript_kBoxDrawing }, // 51,
195 { UnicodeScript_kBlockElement,
196 UnicodeScript_kBlockElement,
197 UnicodeScript_kBlockElement }, // 52,
198 { UnicodeScript_kGeometricShape,
199 UnicodeScript_kGeometricShape,
200 UnicodeScript_kGeometricShape }, // 53,
201 { UnicodeScript_kMiscSymbol,
202 UnicodeScript_kMiscSymbol,
203 UnicodeScript_kMiscSymbol }, // 54,
204 { UnicodeScript_kDingbat,
205 UnicodeScript_kDingbat,
206 UnicodeScript_kDingbat }, // 55,
207 { UnicodeScript_kBraillePatterns,
208 UnicodeScript_kBraillePatterns,
209 UnicodeScript_kBraillePatterns }, // 56,
210 { UnicodeScript_kCJKRadicalsSupplement,
211 UnicodeScript_kCJKRadicalsSupplement,
212 UnicodeScript_kCJKRadicalsSupplement }, // 57,
213 { UnicodeScript_kKangxiRadicals,
214 UnicodeScript_kKangxiRadicals,
215 UnicodeScript_kKangxiRadicals }, // 58,
216 { UnicodeScript_kIdeographicDescriptionCharacters,
217 UnicodeScript_kIdeographicDescriptionCharacters,
218 UnicodeScript_kIdeographicDescriptionCharacters }, // 59,
219 { UnicodeScript_kCJKSymbolPunctuation,
220 UnicodeScript_kCJKSymbolPunctuation,
221 UnicodeScript_kCJKSymbolPunctuation }, // 60,
222 { UnicodeScript_kHiragana,
223 UnicodeScript_kHiragana,
224 UnicodeScript_kHiragana }, // 61,
225 { UnicodeScript_kKatakana,
226 UnicodeScript_kKatakana,
227 UnicodeScript_kKatakana }, // 62,
228 { UnicodeScript_kBopomofo,
229 UnicodeScript_kBopomofo,
230 UnicodeScript_kBopomofo }, // 63,
231 { UnicodeScript_kHangulCompatibilityJamo,
232 UnicodeScript_kHangulCompatibilityJamo,
233 UnicodeScript_kHangulCompatibilityJamo }, // 64,
234 { UnicodeScript_kKanbun,
235 UnicodeScript_kKanbun,
236 UnicodeScript_kKanbun }, // 65,
237 { UnicodeScript_kBopomofoExtended,
238 UnicodeScript_kBopomofoExtended,
239 UnicodeScript_kBopomofoExtended }, // 66,
240 { UnicodeScript_kEnclosedCJKLetterMonth,
241 UnicodeScript_kEnclosedCJKLetterMonth,
242 UnicodeScript_kEnclosedCJKLetterMonth }, // 67,
243 { UnicodeScript_kCJKCompatibility,
244 UnicodeScript_kCJKCompatibility,
245 UnicodeScript_kCJKCompatibility }, // 68,
246 { UnicodeScript_k_CJKUnifiedIdeographsExtensionA,
247 UnicodeScript_k_CJKUnifiedIdeographsExtensionA,
248 UnicodeScript_k_CJKUnifiedIdeographsExtensionA }, // 69,
249 { UnicodeScript_kCJKUnifiedIdeograph,
250 UnicodeScript_kCJKUnifiedIdeograph,
251 UnicodeScript_kCJKUnifiedIdeograph }, // 70,
252 { UnicodeScript_kYiSyllables,
253 UnicodeScript_kYiSyllables,
254 UnicodeScript_kYiSyllables }, // 71,
255 { UnicodeScript_kYiRadicals,
256 UnicodeScript_kYiRadicals,
257 UnicodeScript_kYiRadicals }, // 72,
258 { UnicodeScript_kHangulSyllable,
259 UnicodeScript_kHangulSyllable,
260 UnicodeScript_kHangulSyllable }, // 73,
261 { UnicodeScript_kHighSurrogate,
262 UnicodeScript_kHighSurrogate,
263 UnicodeScript_kHighSurrogate }, // 74,
264 { UnicodeScript_kHighPrivateUseSurrogate,
265 UnicodeScript_kHighPrivateUseSurrogate,
266 UnicodeScript_kHighPrivateUseSurrogate }, // 75,
267 { UnicodeScript_kLowSurrogate,
268 UnicodeScript_kLowSurrogate,
269 UnicodeScript_kLowSurrogate }, // 76,
270 { UnicodeScript_kPrivateUse,
271 UnicodeScript_kPrivateUse,
272 UnicodeScript_kPrivateUse }, // 77,
273 { UnicodeScript_kCJKCompatibilityIdeograph,
274 UnicodeScript_kCJKCompatibilityIdeograph,
275 UnicodeScript_kCJKCompatibilityIdeograph }, // 78,
276 { UnicodeScript_kAlphabeticPresentation,
277 UnicodeScript_kAlphabeticPresentation,
278 UnicodeScript_kAlphabeticPresentation }, // 79,
279 { UnicodeScript_kArabicPresentationA,
280 UnicodeScript_kArabicPresentationA,
281 UnicodeScript_kArabicPresentationA }, // 80,
282 { UnicodeScript_kCombiningHalfMark,
283 UnicodeScript_kCombiningHalfMark,
284 UnicodeScript_kCombiningHalfMark }, // 81,
285 { UnicodeScript_kCJKCompatibilityForm,
286 UnicodeScript_kCJKCompatibilityForm,
287 UnicodeScript_kCJKCompatibilityForm }, // 82,
288 { UnicodeScript_kSmallFormVariant,
289 UnicodeScript_kSmallFormVariant,
290 UnicodeScript_kSmallFormVariant }, // 83,
291 { UnicodeScript_kArabicPresentationB,
292 UnicodeScript_kArabicPresentationB,
293 UnicodeScript_kArabicPresentationB }, // 84,
294 { UnicodeScript_kNoScript,
295 UnicodeScript_kNoScript,
296 UnicodeScript_kNoScript }, // 85,
297 { UnicodeScript_kHalfwidthFullwidthForm,
298 UnicodeScript_kHalfwidthFullwidthForm,
299 UnicodeScript_kHalfwidthFullwidthForm }, // 86,
300 { UnicodeScript_kScriptCount,
301 UnicodeScript_kScriptCount,
302 UnicodeScript_kNoScript } // 87,
305 sal_Int16 SAL_CALL
306 unicode::getUnicodeScriptType( const sal_Unicode ch, const ScriptTypeList* typeList, sal_Int16 unknownType ) {
308 if (!typeList) {
309 typeList = defaultTypeList;
310 unknownType = UnicodeScript_kNoScript;
313 sal_Int16 i = 0, type = typeList[0].to;
314 while (type < UnicodeScript_kScriptCount && ch > UnicodeScriptType[type][UnicodeScriptTypeTo]) {
315 type = typeList[++i].to;
318 return (type < UnicodeScript_kScriptCount &&
319 ch >= UnicodeScriptType[typeList[i].from][UnicodeScriptTypeFrom]) ?
320 typeList[i].value : unknownType;
323 sal_Unicode SAL_CALL
324 unicode::getUnicodeScriptStart( UnicodeScript type) {
325 return UnicodeScriptType[type][UnicodeScriptTypeFrom];
328 sal_Unicode SAL_CALL
329 unicode::getUnicodeScriptEnd( UnicodeScript type) {
330 return UnicodeScriptType[type][UnicodeScriptTypeTo];
333 sal_Int16 SAL_CALL
334 unicode::getUnicodeType( const sal_Unicode ch ) {
335 static sal_Unicode c = 0x00;
336 static sal_Int16 r = 0x00;
338 if (ch == c) return r;
339 else c = ch;
341 sal_Int16 address = UnicodeTypeIndex[ch >> 8];
342 return r = (sal_Int16)((address < UnicodeTypeNumberBlock) ? UnicodeTypeBlockValue[address] :
343 UnicodeTypeValue[((address - UnicodeTypeNumberBlock) << 8) + (ch & 0xff)]);
346 sal_uInt8 SAL_CALL
347 unicode::getUnicodeDirection( const sal_Unicode ch ) {
348 static sal_Unicode c = 0x00;
349 static sal_uInt8 r = 0x00;
351 if (ch == c) return r;
352 else c = ch;
354 sal_Int16 address = UnicodeDirectionIndex[ch >> 8];
355 return r = ((address < UnicodeDirectionNumberBlock) ? UnicodeDirectionBlockValue[address] :
356 UnicodeDirectionValue[((address - UnicodeDirectionNumberBlock) << 8) + (ch & 0xff)]);
// Bit-mask helpers for testing the value returned by getUnicodeType().
//
// bit(name) yields an unsigned mask with only bit number 'name' set.  The
// argument and every multi-term mask body are parenthesized so that the
// macros expand safely inside larger expressions (e.g. "x & ALPHAMASK").
#define bit(name) (1U << (name))

#define UPPERMASK   bit(UnicodeType::UPPERCASE_LETTER)

#define LOWERMASK   bit(UnicodeType::LOWERCASE_LETTER)

#define TITLEMASK   bit(UnicodeType::TITLECASE_LETTER)

// Any letter: upper-, lower- or titlecase, modifier letter, other letter.
#define ALPHAMASK   (UPPERMASK|LOWERMASK|TITLEMASK|\
                    bit(UnicodeType::MODIFIER_LETTER)|\
                    bit(UnicodeType::OTHER_LETTER))

// Space, line and paragraph separators.
#define SPACEMASK   (bit(UnicodeType::SPACE_SEPARATOR)|\
                    bit(UnicodeType::LINE_SEPARATOR)|\
                    bit(UnicodeType::PARAGRAPH_SEPARATOR))

// Control and format characters plus the line and paragraph separators.
#define CONTROLMASK (bit(UnicodeType::CONTROL)|\
                    bit(UnicodeType::FORMAT)|\
                    bit(UnicodeType::LINE_SEPARATOR)|\
                    bit(UnicodeType::PARAGRAPH_SEPARATOR))
381 #define IsType(func, mask) \
382 bool SAL_CALL func( const sal_Unicode ch) {\
383 return (bit(getUnicodeType(ch)) & (mask)) != 0;\
386 IsType(unicode::isControl, CONTROLMASK)
387 IsType(unicode::isAlpha, ALPHAMASK)
388 IsType(unicode::isSpace, SPACEMASK)
390 #define CONTROLSPACE bit(0x09)|bit(0x0a)|bit(0x0b)|bit(0x0c)|bit(0x0d)|\
391 bit(0x1c)|bit(0x1d)|bit(0x1e)|bit(0x1f)
393 bool SAL_CALL unicode::isWhiteSpace( const sal_Unicode ch) {
394 return (ch != 0xa0 && isSpace(ch)) || (ch <= 0x1F && (bit(ch) & (CONTROLSPACE)));
397 sal_Int16 SAL_CALL unicode::getScriptClassFromUScriptCode(UScriptCode eScript)
399 //See unicode/uscript.h
400 static const sal_Int16 scriptTypes[] =
402 ScriptType::WEAK, ScriptType::WEAK, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX,
403 ScriptType::ASIAN, ScriptType::LATIN, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX,
404 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::LATIN,
405 // 15
406 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::ASIAN, ScriptType::COMPLEX,
407 ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX,
408 ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN,
409 // 30
410 ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX,
411 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
412 ScriptType::LATIN, ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
413 // 45
414 ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX,
415 ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN,
416 ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
417 // 60
418 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
419 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX,
420 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::ASIAN,
421 // 75
422 ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
423 ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
424 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
425 // 90
426 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
427 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
428 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::WEAK, ScriptType::WEAK, ScriptType::COMPLEX,
429 // 105
430 ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
431 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
432 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN,
433 // 120
434 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
435 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::WEAK, ScriptType::WEAK,
436 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
437 // 135
438 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
439 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
440 ScriptType::COMPLEX,
441 ScriptType::WEAK
444 sal_Int16 nRet;
445 if (eScript < USCRIPT_COMMON)
446 nRet = ScriptType::WEAK;
447 else if (static_cast<size_t>(eScript) >= SAL_N_ELEMENTS(scriptTypes))
448 nRet = ScriptType::COMPLEX; // anything new is going to be pretty wild
449 else
450 nRet = scriptTypes[eScript];
451 return nRet;
454 OString SAL_CALL unicode::getExemplarLanguageForUScriptCode(UScriptCode eScript)
456 OString sRet;
457 switch (eScript)
459 case USCRIPT_CODE_LIMIT:
460 case USCRIPT_INVALID_CODE:
461 sRet = "zxx";
462 break;
463 case USCRIPT_COMMON:
464 case USCRIPT_INHERITED:
465 sRet = "und";
466 break;
467 case USCRIPT_MATHEMATICAL_NOTATION:
468 case USCRIPT_SYMBOLS:
469 sRet = "zxx";
470 break;
471 case USCRIPT_UNWRITTEN_LANGUAGES:
472 case USCRIPT_UNKNOWN:
473 sRet = "und";
474 break;
475 case USCRIPT_ARABIC:
476 sRet = "ar";
477 break;
478 case USCRIPT_ARMENIAN:
479 sRet = "hy";
480 break;
481 case USCRIPT_BENGALI:
482 sRet = "bn";
483 break;
484 case USCRIPT_BOPOMOFO:
485 sRet = "zh";
486 break;
487 case USCRIPT_CHEROKEE:
488 sRet = "chr";
489 break;
490 case USCRIPT_COPTIC:
491 sRet = "cop";
492 break;
493 case USCRIPT_CYRILLIC:
494 sRet = "ru";
495 break;
496 case USCRIPT_DESERET:
497 sRet = "en";
498 break;
499 case USCRIPT_DEVANAGARI:
500 sRet = "hi";
501 break;
502 case USCRIPT_ETHIOPIC:
503 sRet = "am";
504 break;
505 case USCRIPT_GEORGIAN:
506 sRet = "ka";
507 break;
508 case USCRIPT_GOTHIC:
509 sRet = "got";
510 break;
511 case USCRIPT_GREEK:
512 sRet = "el";
513 break;
514 case USCRIPT_GUJARATI:
515 sRet = "gu";
516 break;
517 case USCRIPT_GURMUKHI:
518 sRet = "pa";
519 break;
520 case USCRIPT_HAN:
521 sRet = "zh";
522 break;
523 case USCRIPT_HANGUL:
524 sRet = "ko";
525 break;
526 case USCRIPT_HEBREW:
527 sRet = "hr";
528 break;
529 case USCRIPT_HIRAGANA:
530 sRet = "ja";
531 break;
532 case USCRIPT_KANNADA:
533 sRet = "kn";
534 break;
535 case USCRIPT_KATAKANA:
536 sRet = "ja";
537 break;
538 case USCRIPT_KHMER:
539 sRet = "km";
540 break;
541 case USCRIPT_LAO:
542 sRet = "lo";
543 break;
544 case USCRIPT_LATIN:
545 sRet = "en";
546 break;
547 case USCRIPT_MALAYALAM:
548 sRet = "ml";
549 break;
550 case USCRIPT_MONGOLIAN:
551 sRet = "mn";
552 break;
553 case USCRIPT_MYANMAR:
554 sRet = "my";
555 break;
556 case USCRIPT_OGHAM:
557 sRet = "pgl";
558 break;
559 case USCRIPT_OLD_ITALIC:
560 sRet = "osc";
561 break;
562 case USCRIPT_ORIYA:
563 sRet = "or";
564 break;
565 case USCRIPT_RUNIC:
566 sRet = "ang";
567 break;
568 case USCRIPT_SINHALA:
569 sRet = "si";
570 break;
571 case USCRIPT_SYRIAC:
572 sRet = "syr";
573 break;
574 case USCRIPT_TAMIL:
575 sRet = "ta";
576 break;
577 case USCRIPT_TELUGU:
578 sRet = "te";
579 break;
580 case USCRIPT_THAANA:
581 sRet = "dv";
582 break;
583 case USCRIPT_THAI:
584 sRet = "th";
585 break;
586 case USCRIPT_TIBETAN:
587 sRet = "bo";
588 break;
589 case USCRIPT_CANADIAN_ABORIGINAL:
590 sRet = "iu";
591 break;
592 case USCRIPT_YI:
593 sRet = "ii";
594 break;
595 case USCRIPT_TAGALOG:
596 sRet = "tl";
597 break;
598 case USCRIPT_HANUNOO:
599 sRet = "hnn";
600 break;
601 case USCRIPT_BUHID:
602 sRet = "bku";
603 break;
604 case USCRIPT_TAGBANWA:
605 sRet = "tbw";
606 break;
607 case USCRIPT_BRAILLE:
608 sRet = "en";
609 break;
610 case USCRIPT_CYPRIOT:
611 sRet = "ecy";
612 break;
613 case USCRIPT_LIMBU:
614 sRet = "lif";
615 break;
616 case USCRIPT_LINEAR_B:
617 sRet = "gmy";
618 break;
619 case USCRIPT_OSMANYA:
620 sRet = "so";
621 break;
622 case USCRIPT_SHAVIAN:
623 sRet = "en";
624 break;
625 case USCRIPT_TAI_LE:
626 sRet = "tdd";
627 break;
628 case USCRIPT_UGARITIC:
629 sRet = "uga";
630 break;
631 case USCRIPT_KATAKANA_OR_HIRAGANA:
632 sRet = "ja";
633 break;
634 case USCRIPT_BUGINESE:
635 sRet = "bug";
636 break;
637 case USCRIPT_GLAGOLITIC:
638 sRet = "ch";
639 break;
640 case USCRIPT_KHAROSHTHI:
641 sRet = "pra";
642 break;
643 case USCRIPT_SYLOTI_NAGRI:
644 sRet = "syl";
645 break;
646 case USCRIPT_NEW_TAI_LUE:
647 sRet = "khb";
648 break;
649 case USCRIPT_TIFINAGH:
650 sRet = "tmh";
651 break;
652 case USCRIPT_OLD_PERSIAN:
653 sRet = "peo";
654 break;
655 case USCRIPT_BALINESE:
656 sRet = "ban";
657 break;
658 case USCRIPT_BATAK:
659 sRet = "btk";
660 break;
661 case USCRIPT_BLISSYMBOLS:
662 sRet = "en";
663 break;
664 case USCRIPT_BRAHMI:
665 sRet = "pra";
666 break;
667 case USCRIPT_CHAM:
668 sRet = "cja";
669 break;
670 case USCRIPT_CIRTH:
671 sRet = "sjn";
672 break;
673 case USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC:
674 sRet = "cu";
675 break;
676 case USCRIPT_DEMOTIC_EGYPTIAN:
677 case USCRIPT_HIERATIC_EGYPTIAN:
678 case USCRIPT_EGYPTIAN_HIEROGLYPHS:
679 sRet = "egy";
680 break;
681 case USCRIPT_KHUTSURI:
682 sRet = "ka";
683 break;
684 case USCRIPT_SIMPLIFIED_HAN:
685 sRet = "zh";
686 break;
687 case USCRIPT_TRADITIONAL_HAN:
688 sRet = "zh";
689 break;
690 case USCRIPT_PAHAWH_HMONG:
691 sRet = "blu";
692 break;
693 case USCRIPT_OLD_HUNGARIAN:
694 sRet = "ohu";
695 break;
696 case USCRIPT_HARAPPAN_INDUS:
697 sRet = "xiv";
698 break;
699 case USCRIPT_JAVANESE:
700 sRet = "kaw";
701 break;
702 case USCRIPT_KAYAH_LI:
703 sRet = "eky";
704 break;
705 case USCRIPT_LATIN_FRAKTUR:
706 sRet = "de";
707 break;
708 case USCRIPT_LATIN_GAELIC:
709 sRet = "ga";
710 break;
711 case USCRIPT_LEPCHA:
712 sRet = "lep";
713 break;
714 case USCRIPT_LINEAR_A:
715 sRet = "ecr";
716 break;
717 case USCRIPT_MAYAN_HIEROGLYPHS:
718 sRet = "myn";
719 break;
720 case USCRIPT_MEROITIC:
721 sRet = "xmr";
722 break;
723 case USCRIPT_NKO:
724 sRet = "nqo";
725 break;
726 case USCRIPT_ORKHON:
727 sRet = "otk";
728 break;
729 case USCRIPT_OLD_PERMIC:
730 sRet = "kv";
731 break;
732 case USCRIPT_PHAGS_PA:
733 sRet = "xng";
734 break;
735 case USCRIPT_PHOENICIAN:
736 sRet = "phn";
737 break;
738 case USCRIPT_PHONETIC_POLLARD:
739 sRet = "hmd";
740 break;
741 case USCRIPT_RONGORONGO:
742 sRet = "rap";
743 break;
744 case USCRIPT_SARATI:
745 sRet = "qya";
746 break;
747 case USCRIPT_ESTRANGELO_SYRIAC:
748 sRet = "syr";
749 break;
750 case USCRIPT_WESTERN_SYRIAC:
751 sRet = "tru";
752 break;
753 case USCRIPT_EASTERN_SYRIAC:
754 sRet = "aii";
755 break;
756 case USCRIPT_TENGWAR:
757 sRet = "sjn";
758 break;
759 case USCRIPT_VAI:
760 sRet = "vai";
761 break;
762 case USCRIPT_VISIBLE_SPEECH:
763 sRet = "en";
764 break;
765 case USCRIPT_CUNEIFORM:
766 sRet = "akk";
767 break;
768 case USCRIPT_CARIAN:
769 sRet = "xcr";
770 break;
771 case USCRIPT_JAPANESE:
772 sRet = "ja";
773 break;
774 case USCRIPT_LANNA:
775 sRet = "nod";
776 break;
777 case USCRIPT_LYCIAN:
778 sRet = "xlc";
779 break;
780 case USCRIPT_LYDIAN:
781 sRet = "xld";
782 break;
783 case USCRIPT_OL_CHIKI:
784 sRet = "sat";
785 break;
786 case USCRIPT_REJANG:
787 sRet = "rej";
788 break;
789 case USCRIPT_SAURASHTRA:
790 sRet = "saz";
791 break;
792 case USCRIPT_SIGN_WRITING:
793 sRet = "en";
794 break;
795 case USCRIPT_SUNDANESE:
796 sRet = "su";
797 break;
798 case USCRIPT_MOON:
799 sRet = "en";
800 break;
801 case USCRIPT_MEITEI_MAYEK:
802 sRet = "mni";
803 break;
804 case USCRIPT_IMPERIAL_ARAMAIC:
805 sRet = "arc";
806 break;
807 case USCRIPT_AVESTAN:
808 sRet = "ae";
809 break;
810 case USCRIPT_CHAKMA:
811 sRet = "ccp";
812 break;
813 case USCRIPT_KOREAN:
814 sRet = "ko";
815 break;
816 case USCRIPT_KAITHI:
817 sRet = "awa";
818 break;
819 case USCRIPT_MANICHAEAN:
820 sRet = "xmn";
821 break;
822 case USCRIPT_INSCRIPTIONAL_PAHLAVI:
823 case USCRIPT_PSALTER_PAHLAVI:
824 case USCRIPT_BOOK_PAHLAVI:
825 case USCRIPT_INSCRIPTIONAL_PARTHIAN:
826 sRet = "xpr";
827 break;
828 case USCRIPT_SAMARITAN:
829 sRet = "heb";
830 break;
831 case USCRIPT_TAI_VIET:
832 sRet = "blt";
833 break;
834 case USCRIPT_MANDAEAN: /* Aliased to USCRIPT_MANDAIC in icu 4.6. */
835 sRet = "mic";
836 break;
837 #if (U_ICU_VERSION_MAJOR_NUM > 4) || (U_ICU_VERSION_MAJOR_NUM == 4 && U_ICU_VERSION_MINOR_NUM >= 4)
838 case USCRIPT_NABATAEAN: //no language with an assigned code yet
839 sRet = "mis";
840 break;
841 case USCRIPT_PALMYRENE: //no language with an assigned code yet
842 sRet = "mis";
843 break;
844 case USCRIPT_BAMUM:
845 sRet = "bax";
846 break;
847 case USCRIPT_LISU:
848 sRet = "lis";
849 break;
850 case USCRIPT_NAKHI_GEBA:
851 sRet = "nxq";
852 break;
853 case USCRIPT_OLD_SOUTH_ARABIAN:
854 sRet = "xsa";
855 break;
856 case USCRIPT_BASSA_VAH:
857 sRet = "bsq";
858 break;
859 case USCRIPT_DUPLOYAN_SHORTAND:
860 sRet = "fr";
861 break;
862 case USCRIPT_ELBASAN:
863 sRet = "sq";
864 break;
865 case USCRIPT_GRANTHA:
866 sRet = "ta";
867 break;
868 case USCRIPT_KPELLE:
869 sRet = "kpe";
870 break;
871 case USCRIPT_LOMA:
872 sRet = "lom";
873 break;
874 case USCRIPT_MENDE:
875 sRet = "men";
876 break;
877 case USCRIPT_MEROITIC_CURSIVE:
878 sRet = "xmr";
879 break;
880 case USCRIPT_OLD_NORTH_ARABIAN:
881 sRet = "xna";
882 break;
883 case USCRIPT_SINDHI:
884 sRet = "sd";
885 break;
886 case USCRIPT_WARANG_CITI:
887 sRet = "hoc";
888 break;
889 #endif
890 #if (U_ICU_VERSION_MAJOR_NUM > 4) || (U_ICU_VERSION_MAJOR_NUM == 4 && U_ICU_VERSION_MINOR_NUM >= 8)
891 case USCRIPT_AFAKA:
892 sRet = "djk";
893 break;
894 case USCRIPT_JURCHEN:
895 sRet = "juc";
896 break;
897 case USCRIPT_MRO:
898 sRet = "cmr";
899 break;
900 case USCRIPT_NUSHU: //no language with an assigned code yet
901 sRet = "mis";
902 break;
903 case USCRIPT_SHARADA:
904 sRet = "sa";
905 break;
906 case USCRIPT_SORA_SOMPENG:
907 sRet = "srb";
908 break;
909 case USCRIPT_TAKRI:
910 sRet = "doi";
911 break;
912 case USCRIPT_TANGUT:
913 sRet = "txg";
914 break;
915 case USCRIPT_WOLEAI:
916 sRet = "woe";
917 break;
918 #endif
919 #if (U_ICU_VERSION_MAJOR_NUM >= 49)
920 case USCRIPT_ANATOLIAN_HIEROGLYPHS:
921 sRet = "hlu";
922 break;
923 case USCRIPT_KHOJKI:
924 sRet = "gu";
925 break;
926 case USCRIPT_TIRHUTA:
927 sRet = "mai";
928 break;
929 #endif
930 #if (U_ICU_VERSION_MAJOR_NUM >= 52)
931 case USCRIPT_CAUCASIAN_ALBANIAN:
932 sRet = "xag";
933 break;
934 case USCRIPT_MAHAJANI:
935 sRet = "mwr";
936 break;
937 #endif
938 #if (U_ICU_VERSION_MAJOR_NUM >= 54)
939 case USCRIPT_AHOM:
940 sRet = "aho";
941 break;
942 case USCRIPT_HATRAN:
943 sRet = "qly-Hatr";
944 break;
945 case USCRIPT_MODI:
946 sRet = "mr-Modi";
947 break;
948 case USCRIPT_MULTANI:
949 sRet = "skr-Mutl";
950 break;
951 case USCRIPT_PAU_CIN_HAU:
952 sRet = "ctd-Pauc";
953 break;
954 case USCRIPT_SIDDHAM:
955 sRet = "sa-Sidd";
956 break;
957 #endif
959 return sRet;
962 //Format a number as a percentage according to the rules of the given
963 //language, e.g. 100 -> "100%" for en-US vs "100 %" for de-DE
964 OUString SAL_CALL unicode::formatPercent(double dNumber,
965 const LanguageTag &rLangTag)
967 // get a currency formatter for this locale ID
968 UErrorCode errorCode=U_ZERO_ERROR;
970 LanguageTag aLangTag(rLangTag);
972 // As of CLDR Version 24 these languages were not listed as using spacing
973 // between number and % but are reported as such by our l10n groups
974 // http://www.unicode.org/cldr/charts/24/by_type/numbers.number_formatting_patterns.html
975 // so format using French which has the desired rules
976 if (aLangTag.getLanguage() == "es" || aLangTag.getLanguage() == "sl")
977 aLangTag = LanguageTag("fr-FR");
979 icu::Locale aLocale = LanguageTagIcu::getIcuLocale(aLangTag);
981 boost::scoped_ptr<NumberFormat> xF(
982 NumberFormat::createPercentInstance(aLocale, errorCode));
983 if(U_FAILURE(errorCode))
985 SAL_WARN("i18n", "NumberFormat::createPercentInstance failed");
986 return OUString::number(dNumber) + "%";
989 UnicodeString output;
990 xF->format(dNumber/100, output);
991 OUString aRet(reinterpret_cast<const sal_Unicode *>(output.getBuffer()),
992 output.length());
993 if (rLangTag.getLanguage() == "de")
995 //narrow no-break space instead of (normal) no-break space
996 return aRet.replace(0x00A0, 0x202F);
998 return aRet;
1001 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */