Version 4.0.2.1, tag libreoffice-4.0.2.1
[LibreOffice.git] / i18nutil / source / utility / unicode.cxx
blob2f6f6dd1795299911640ec252aafb1c2b5a646eb
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <com/sun/star/i18n/UnicodeType.hpp>
21 #include <com/sun/star/i18n/KCharacterType.hpp>
22 #include <com/sun/star/i18n/ScriptType.hpp>
23 #include <i18nutil/unicode.hxx>
24 #include "unicode_data.h"
26 // Workaround for glibc braindamage:
27 // glibc 2.4's langinfo.h does "#define CURRENCY_SYMBOL __CURRENCY_SYMBOL"
28 // which (obviously) breaks UnicodeType::CURRENCY_SYMBOL
29 #undef CURRENCY_SYMBOL
31 using namespace ::com::sun::star::i18n;
33 static ScriptTypeList defaultTypeList[] = {
34 { UnicodeScript_kBasicLatin,
35 UnicodeScript_kBasicLatin,
36 UnicodeScript_kBasicLatin }, // 0,
37 { UnicodeScript_kLatin1Supplement,
38 UnicodeScript_kLatin1Supplement,
39 UnicodeScript_kLatin1Supplement },// 1,
40 { UnicodeScript_kLatinExtendedA,
41 UnicodeScript_kLatinExtendedA,
42 UnicodeScript_kLatinExtendedA }, // 2,
43 { UnicodeScript_kLatinExtendedB,
44 UnicodeScript_kLatinExtendedB,
45 UnicodeScript_kLatinExtendedB }, // 3,
46 { UnicodeScript_kIPAExtension,
47 UnicodeScript_kIPAExtension,
48 UnicodeScript_kIPAExtension }, // 4,
49 { UnicodeScript_kSpacingModifier,
50 UnicodeScript_kSpacingModifier,
51 UnicodeScript_kSpacingModifier }, // 5,
52 { UnicodeScript_kCombiningDiacritical,
53 UnicodeScript_kCombiningDiacritical,
54 UnicodeScript_kCombiningDiacritical }, // 6,
55 { UnicodeScript_kGreek,
56 UnicodeScript_kGreek,
57 UnicodeScript_kGreek }, // 7,
58 { UnicodeScript_kCyrillic,
59 UnicodeScript_kCyrillic,
60 UnicodeScript_kCyrillic }, // 8,
61 { UnicodeScript_kArmenian,
62 UnicodeScript_kArmenian,
63 UnicodeScript_kArmenian }, // 9,
64 { UnicodeScript_kHebrew,
65 UnicodeScript_kHebrew,
66 UnicodeScript_kHebrew }, // 10,
67 { UnicodeScript_kArabic,
68 UnicodeScript_kArabic,
69 UnicodeScript_kArabic }, // 11,
70 { UnicodeScript_kSyriac,
71 UnicodeScript_kSyriac,
72 UnicodeScript_kSyriac }, // 12,
73 { UnicodeScript_kThaana,
74 UnicodeScript_kThaana,
75 UnicodeScript_kThaana }, // 13,
76 { UnicodeScript_kDevanagari,
77 UnicodeScript_kDevanagari,
78 UnicodeScript_kDevanagari }, // 14,
79 { UnicodeScript_kBengali,
80 UnicodeScript_kBengali,
81 UnicodeScript_kBengali }, // 15,
82 { UnicodeScript_kGurmukhi,
83 UnicodeScript_kGurmukhi,
84 UnicodeScript_kGurmukhi }, // 16,
85 { UnicodeScript_kGujarati,
86 UnicodeScript_kGujarati,
87 UnicodeScript_kGujarati }, // 17,
88 { UnicodeScript_kOriya,
89 UnicodeScript_kOriya,
90 UnicodeScript_kOriya }, // 18,
91 { UnicodeScript_kTamil,
92 UnicodeScript_kTamil,
93 UnicodeScript_kTamil }, // 19,
94 { UnicodeScript_kTelugu,
95 UnicodeScript_kTelugu,
96 UnicodeScript_kTelugu }, // 20,
97 { UnicodeScript_kKannada,
98 UnicodeScript_kKannada,
99 UnicodeScript_kKannada }, // 21,
100 { UnicodeScript_kMalayalam,
101 UnicodeScript_kMalayalam,
102 UnicodeScript_kMalayalam }, // 22,
103 { UnicodeScript_kSinhala,
104 UnicodeScript_kSinhala,
105 UnicodeScript_kSinhala }, // 23,
106 { UnicodeScript_kThai,
107 UnicodeScript_kThai,
108 UnicodeScript_kThai }, // 24,
109 { UnicodeScript_kLao,
110 UnicodeScript_kLao,
111 UnicodeScript_kLao }, // 25,
112 { UnicodeScript_kTibetan,
113 UnicodeScript_kTibetan,
114 UnicodeScript_kTibetan }, // 26,
115 { UnicodeScript_kMyanmar,
116 UnicodeScript_kMyanmar,
117 UnicodeScript_kMyanmar }, // 27,
118 { UnicodeScript_kGeorgian,
119 UnicodeScript_kGeorgian,
120 UnicodeScript_kGeorgian }, // 28,
121 { UnicodeScript_kHangulJamo,
122 UnicodeScript_kHangulJamo,
123 UnicodeScript_kHangulJamo }, // 29,
124 { UnicodeScript_kEthiopic,
125 UnicodeScript_kEthiopic,
126 UnicodeScript_kEthiopic }, // 30,
127 { UnicodeScript_kCherokee,
128 UnicodeScript_kCherokee,
129 UnicodeScript_kCherokee }, // 31,
130 { UnicodeScript_kUnifiedCanadianAboriginalSyllabics,
131 UnicodeScript_kUnifiedCanadianAboriginalSyllabics,
132 UnicodeScript_kUnifiedCanadianAboriginalSyllabics }, // 32,
133 { UnicodeScript_kOgham,
134 UnicodeScript_kOgham,
135 UnicodeScript_kOgham }, // 33,
136 { UnicodeScript_kRunic,
137 UnicodeScript_kRunic,
138 UnicodeScript_kRunic }, // 34,
139 { UnicodeScript_kKhmer,
140 UnicodeScript_kKhmer,
141 UnicodeScript_kKhmer }, // 35,
142 { UnicodeScript_kMongolian,
143 UnicodeScript_kMongolian,
144 UnicodeScript_kMongolian }, // 36,
145 { UnicodeScript_kLatinExtendedAdditional,
146 UnicodeScript_kLatinExtendedAdditional,
147 UnicodeScript_kLatinExtendedAdditional }, // 37,
148 { UnicodeScript_kGreekExtended,
149 UnicodeScript_kGreekExtended,
150 UnicodeScript_kGreekExtended }, // 38,
151 { UnicodeScript_kGeneralPunctuation,
152 UnicodeScript_kGeneralPunctuation,
153 UnicodeScript_kGeneralPunctuation }, // 39,
154 { UnicodeScript_kSuperSubScript,
155 UnicodeScript_kSuperSubScript,
156 UnicodeScript_kSuperSubScript }, // 40,
157 { UnicodeScript_kCurrencySymbolScript,
158 UnicodeScript_kCurrencySymbolScript,
159 UnicodeScript_kCurrencySymbolScript }, // 41,
160 { UnicodeScript_kSymbolCombiningMark,
161 UnicodeScript_kSymbolCombiningMark,
162 UnicodeScript_kSymbolCombiningMark }, // 42,
163 { UnicodeScript_kLetterlikeSymbol,
164 UnicodeScript_kLetterlikeSymbol,
165 UnicodeScript_kLetterlikeSymbol }, // 43,
166 { UnicodeScript_kNumberForm,
167 UnicodeScript_kNumberForm,
168 UnicodeScript_kNumberForm }, // 44,
169 { UnicodeScript_kArrow,
170 UnicodeScript_kArrow,
171 UnicodeScript_kArrow }, // 45,
172 { UnicodeScript_kMathOperator,
173 UnicodeScript_kMathOperator,
174 UnicodeScript_kMathOperator }, // 46,
175 { UnicodeScript_kMiscTechnical,
176 UnicodeScript_kMiscTechnical,
177 UnicodeScript_kMiscTechnical }, // 47,
178 { UnicodeScript_kControlPicture,
179 UnicodeScript_kControlPicture,
180 UnicodeScript_kControlPicture }, // 48,
181 { UnicodeScript_kOpticalCharacter,
182 UnicodeScript_kOpticalCharacter,
183 UnicodeScript_kOpticalCharacter }, // 49,
184 { UnicodeScript_kEnclosedAlphanumeric,
185 UnicodeScript_kEnclosedAlphanumeric,
186 UnicodeScript_kEnclosedAlphanumeric }, // 50,
187 { UnicodeScript_kBoxDrawing,
188 UnicodeScript_kBoxDrawing,
189 UnicodeScript_kBoxDrawing }, // 51,
190 { UnicodeScript_kBlockElement,
191 UnicodeScript_kBlockElement,
192 UnicodeScript_kBlockElement }, // 52,
193 { UnicodeScript_kGeometricShape,
194 UnicodeScript_kGeometricShape,
195 UnicodeScript_kGeometricShape }, // 53,
196 { UnicodeScript_kMiscSymbol,
197 UnicodeScript_kMiscSymbol,
198 UnicodeScript_kMiscSymbol }, // 54,
199 { UnicodeScript_kDingbat,
200 UnicodeScript_kDingbat,
201 UnicodeScript_kDingbat }, // 55,
202 { UnicodeScript_kBraillePatterns,
203 UnicodeScript_kBraillePatterns,
204 UnicodeScript_kBraillePatterns }, // 56,
205 { UnicodeScript_kCJKRadicalsSupplement,
206 UnicodeScript_kCJKRadicalsSupplement,
207 UnicodeScript_kCJKRadicalsSupplement }, // 57,
208 { UnicodeScript_kKangxiRadicals,
209 UnicodeScript_kKangxiRadicals,
210 UnicodeScript_kKangxiRadicals }, // 58,
211 { UnicodeScript_kIdeographicDescriptionCharacters,
212 UnicodeScript_kIdeographicDescriptionCharacters,
213 UnicodeScript_kIdeographicDescriptionCharacters }, // 59,
214 { UnicodeScript_kCJKSymbolPunctuation,
215 UnicodeScript_kCJKSymbolPunctuation,
216 UnicodeScript_kCJKSymbolPunctuation }, // 60,
217 { UnicodeScript_kHiragana,
218 UnicodeScript_kHiragana,
219 UnicodeScript_kHiragana }, // 61,
220 { UnicodeScript_kKatakana,
221 UnicodeScript_kKatakana,
222 UnicodeScript_kKatakana }, // 62,
223 { UnicodeScript_kBopomofo,
224 UnicodeScript_kBopomofo,
225 UnicodeScript_kBopomofo }, // 63,
226 { UnicodeScript_kHangulCompatibilityJamo,
227 UnicodeScript_kHangulCompatibilityJamo,
228 UnicodeScript_kHangulCompatibilityJamo }, // 64,
229 { UnicodeScript_kKanbun,
230 UnicodeScript_kKanbun,
231 UnicodeScript_kKanbun }, // 65,
232 { UnicodeScript_kBopomofoExtended,
233 UnicodeScript_kBopomofoExtended,
234 UnicodeScript_kBopomofoExtended }, // 66,
235 { UnicodeScript_kEnclosedCJKLetterMonth,
236 UnicodeScript_kEnclosedCJKLetterMonth,
237 UnicodeScript_kEnclosedCJKLetterMonth }, // 67,
238 { UnicodeScript_kCJKCompatibility,
239 UnicodeScript_kCJKCompatibility,
240 UnicodeScript_kCJKCompatibility }, // 68,
241 { UnicodeScript_k_CJKUnifiedIdeographsExtensionA,
242 UnicodeScript_k_CJKUnifiedIdeographsExtensionA,
243 UnicodeScript_k_CJKUnifiedIdeographsExtensionA }, // 69,
244 { UnicodeScript_kCJKUnifiedIdeograph,
245 UnicodeScript_kCJKUnifiedIdeograph,
246 UnicodeScript_kCJKUnifiedIdeograph }, // 70,
247 { UnicodeScript_kYiSyllables,
248 UnicodeScript_kYiSyllables,
249 UnicodeScript_kYiSyllables }, // 71,
250 { UnicodeScript_kYiRadicals,
251 UnicodeScript_kYiRadicals,
252 UnicodeScript_kYiRadicals }, // 72,
253 { UnicodeScript_kHangulSyllable,
254 UnicodeScript_kHangulSyllable,
255 UnicodeScript_kHangulSyllable }, // 73,
256 { UnicodeScript_kHighSurrogate,
257 UnicodeScript_kHighSurrogate,
258 UnicodeScript_kHighSurrogate }, // 74,
259 { UnicodeScript_kHighPrivateUseSurrogate,
260 UnicodeScript_kHighPrivateUseSurrogate,
261 UnicodeScript_kHighPrivateUseSurrogate }, // 75,
262 { UnicodeScript_kLowSurrogate,
263 UnicodeScript_kLowSurrogate,
264 UnicodeScript_kLowSurrogate }, // 76,
265 { UnicodeScript_kPrivateUse,
266 UnicodeScript_kPrivateUse,
267 UnicodeScript_kPrivateUse }, // 77,
268 { UnicodeScript_kCJKCompatibilityIdeograph,
269 UnicodeScript_kCJKCompatibilityIdeograph,
270 UnicodeScript_kCJKCompatibilityIdeograph }, // 78,
271 { UnicodeScript_kAlphabeticPresentation,
272 UnicodeScript_kAlphabeticPresentation,
273 UnicodeScript_kAlphabeticPresentation }, // 79,
274 { UnicodeScript_kArabicPresentationA,
275 UnicodeScript_kArabicPresentationA,
276 UnicodeScript_kArabicPresentationA }, // 80,
277 { UnicodeScript_kCombiningHalfMark,
278 UnicodeScript_kCombiningHalfMark,
279 UnicodeScript_kCombiningHalfMark }, // 81,
280 { UnicodeScript_kCJKCompatibilityForm,
281 UnicodeScript_kCJKCompatibilityForm,
282 UnicodeScript_kCJKCompatibilityForm }, // 82,
283 { UnicodeScript_kSmallFormVariant,
284 UnicodeScript_kSmallFormVariant,
285 UnicodeScript_kSmallFormVariant }, // 83,
286 { UnicodeScript_kArabicPresentationB,
287 UnicodeScript_kArabicPresentationB,
288 UnicodeScript_kArabicPresentationB }, // 84,
289 { UnicodeScript_kNoScript,
290 UnicodeScript_kNoScript,
291 UnicodeScript_kNoScript }, // 85,
292 { UnicodeScript_kHalfwidthFullwidthForm,
293 UnicodeScript_kHalfwidthFullwidthForm,
294 UnicodeScript_kHalfwidthFullwidthForm }, // 86,
295 { UnicodeScript_kScriptCount,
296 UnicodeScript_kScriptCount,
297 UnicodeScript_kNoScript } // 87,
300 sal_Int16 SAL_CALL
301 unicode::getUnicodeScriptType( const sal_Unicode ch, ScriptTypeList* typeList, sal_Int16 unknownType ) {
303 if (!typeList) {
304 typeList = defaultTypeList;
305 unknownType = UnicodeScript_kNoScript;
308 sal_Int16 i = 0, type = typeList[0].to;
309 while (type < UnicodeScript_kScriptCount && ch > UnicodeScriptType[type][UnicodeScriptTypeTo]) {
310 type = typeList[++i].to;
313 return (type < UnicodeScript_kScriptCount &&
314 ch >= UnicodeScriptType[typeList[i].from][UnicodeScriptTypeFrom]) ?
315 typeList[i].value : unknownType;
318 sal_Unicode SAL_CALL
319 unicode::getUnicodeScriptStart( UnicodeScript type) {
320 return UnicodeScriptType[type][UnicodeScriptTypeFrom];
323 sal_Unicode SAL_CALL
324 unicode::getUnicodeScriptEnd( UnicodeScript type) {
325 return UnicodeScriptType[type][UnicodeScriptTypeTo];
328 sal_Int16 SAL_CALL
329 unicode::getUnicodeType( const sal_Unicode ch ) {
330 static sal_Unicode c = 0x00;
331 static sal_Int16 r = 0x00;
333 if (ch == c) return r;
334 else c = ch;
336 sal_Int16 address = UnicodeTypeIndex[ch >> 8];
337 return r = (sal_Int16)((address < UnicodeTypeNumberBlock) ? UnicodeTypeBlockValue[address] :
338 UnicodeTypeValue[((address - UnicodeTypeNumberBlock) << 8) + (ch & 0xff)]);
341 sal_uInt8 SAL_CALL
342 unicode::getUnicodeDirection( const sal_Unicode ch ) {
343 static sal_Unicode c = 0x00;
344 static sal_uInt8 r = 0x00;
346 if (ch == c) return r;
347 else c = ch;
349 sal_Int16 address = UnicodeDirectionIndex[ch >> 8];
350 return r = ((address < UnicodeDirectionNumberBlock) ? UnicodeDirectionBlockValue[address] :
351 UnicodeDirectionValue[((address - UnicodeDirectionNumberBlock) << 8) + (ch & 0xff)]);
// Bit masks over UnicodeType values.  bit(t) is the mask with only bit number
// t set; each *MASK below ORs together the bits of the types it accepts, so
// "bit(type) & MASK" is non-zero exactly when 'type' is one of them.
//
// The macro argument and every multi-term expansion are parenthesized so that
// the macros keep their meaning inside larger expressions (bit(a | b), ~MASK,
// MASK & x, ...) instead of being re-associated by operator precedence.
#define bit(name) (1 << (name))

#define UPPERMASK   bit(UnicodeType::UPPERCASE_LETTER)

#define LOWERMASK   bit(UnicodeType::LOWERCASE_LETTER)

#define TITLEMASK   bit(UnicodeType::TITLECASE_LETTER)

#define DIGITMASK   (bit(UnicodeType::DECIMAL_DIGIT_NUMBER)|\
                    bit(UnicodeType::LETTER_NUMBER)|\
                    bit(UnicodeType::OTHER_NUMBER))

#define ALPHAMASK   (UPPERMASK|LOWERMASK|TITLEMASK|\
                    bit(UnicodeType::MODIFIER_LETTER)|\
                    bit(UnicodeType::OTHER_LETTER))

#define BASEMASK    (DIGITMASK|ALPHAMASK|\
                    bit(UnicodeType::NON_SPACING_MARK)|\
                    bit(UnicodeType::ENCLOSING_MARK)|\
                    bit(UnicodeType::COMBINING_SPACING_MARK))

#define SPACEMASK   (bit(UnicodeType::SPACE_SEPARATOR)|\
                    bit(UnicodeType::LINE_SEPARATOR)|\
                    bit(UnicodeType::PARAGRAPH_SEPARATOR))

#define PUNCTUATIONMASK (bit(UnicodeType::DASH_PUNCTUATION)|\
                    bit(UnicodeType::INITIAL_PUNCTUATION)|\
                    bit(UnicodeType::FINAL_PUNCTUATION)|\
                    bit(UnicodeType::CONNECTOR_PUNCTUATION)|\
                    bit(UnicodeType::OTHER_PUNCTUATION))

#define SYMBOLMASK  (bit(UnicodeType::MATH_SYMBOL)|\
                    bit(UnicodeType::CURRENCY_SYMBOL)|\
                    bit(UnicodeType::MODIFIER_SYMBOL)|\
                    bit(UnicodeType::OTHER_SYMBOL))

#define PRINTMASK   (BASEMASK|SPACEMASK|PUNCTUATIONMASK|SYMBOLMASK)

// Line and paragraph separators count as control characters here as well as
// belonging to SPACEMASK.
#define CONTROLMASK (bit(UnicodeType::CONTROL)|\
                    bit(UnicodeType::FORMAT)|\
                    bit(UnicodeType::LINE_SEPARATOR)|\
                    bit(UnicodeType::PARAGRAPH_SEPARATOR))
398 #define IsType(func, mask) \
399 sal_Bool SAL_CALL func( const sal_Unicode ch) {\
400 return (bit(getUnicodeType(ch)) & (mask)) != 0;\
403 IsType(unicode::isUpper, UPPERMASK)
404 IsType(unicode::isLower, LOWERMASK)
405 IsType(unicode::isControl, CONTROLMASK)
406 IsType(unicode::isPrint, PRINTMASK)
407 IsType(unicode::isAlpha, ALPHAMASK)
408 IsType(unicode::isDigit, DIGITMASK)
409 IsType(unicode::isAlphaDigit, ALPHAMASK|DIGITMASK)
410 IsType(unicode::isSpace, SPACEMASK)
412 #define CONTROLSPACE bit(0x09)|bit(0x0a)|bit(0x0b)|bit(0x0c)|bit(0x0d)|\
413 bit(0x1c)|bit(0x1d)|bit(0x1e)|bit(0x1f)
415 sal_Bool SAL_CALL unicode::isWhiteSpace( const sal_Unicode ch) {
416 return (ch != 0xa0 && isSpace(ch)) || (ch <= 0x1F && (bit(ch) & (CONTROLSPACE)));
419 sal_Int16 SAL_CALL unicode::getScriptClassFromUScriptCode(UScriptCode eScript)
421 //See unicode/uscript.h
422 static sal_Int16 scriptTypes[] =
424 ScriptType::WEAK, ScriptType::WEAK, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX,
425 ScriptType::ASIAN, ScriptType::LATIN, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX,
426 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::LATIN,
427 // 15
428 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::ASIAN, ScriptType::COMPLEX,
429 ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX,
430 ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN,
431 // 30
432 ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX,
433 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
434 ScriptType::LATIN, ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
435 // 45
436 ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX,
437 ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN,
438 ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
439 // 60
440 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
441 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX,
442 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::ASIAN,
443 // 75
444 ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
445 ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
446 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
447 // 90
448 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
449 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
450 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::WEAK, ScriptType::WEAK, ScriptType::COMPLEX,
451 // 105
452 ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
453 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
454 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN,
455 // 120
456 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
457 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::WEAK, ScriptType::WEAK,
458 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
459 // 135
460 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
461 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
462 ScriptType::COMPLEX,
463 ScriptType::WEAK
466 sal_Int16 nRet;
467 if (eScript < USCRIPT_COMMON)
468 nRet = ScriptType::WEAK;
469 else if (static_cast<size_t>(eScript) >= SAL_N_ELEMENTS(scriptTypes))
470 nRet = ScriptType::COMPLEX; // anything new is going to be pretty wild
471 else
472 nRet = scriptTypes[eScript];
473 return nRet;
476 OString SAL_CALL unicode::getExemplerLanguageForUScriptCode(UScriptCode eScript)
478 OString sRet;
479 switch (eScript)
481 case USCRIPT_CODE_LIMIT:
482 case USCRIPT_INVALID_CODE:
483 sRet = "zxx";
484 break;
485 case USCRIPT_COMMON:
486 case USCRIPT_INHERITED:
487 sRet = "und";
488 break;
489 case USCRIPT_MATHEMATICAL_NOTATION:
490 case USCRIPT_SYMBOLS:
491 sRet = "zxx";
492 break;
493 case USCRIPT_UNWRITTEN_LANGUAGES:
494 case USCRIPT_UNKNOWN:
495 sRet = "und";
496 break;
497 case USCRIPT_NABATAEAN: //no language with an assigned code yet
498 sRet = "mis";
499 break;
500 case USCRIPT_PALMYRENE: //no language with an assigned code yet
501 sRet = "mis";
502 break;
503 case USCRIPT_ARABIC:
504 sRet = "ar";
505 break;
506 case USCRIPT_ARMENIAN:
507 sRet = "hy";
508 break;
509 case USCRIPT_BENGALI:
510 sRet = "bn";
511 break;
512 case USCRIPT_BOPOMOFO:
513 sRet = "zh";
514 break;
515 case USCRIPT_CHEROKEE:
516 sRet = "chr";
517 break;
518 case USCRIPT_COPTIC:
519 sRet = "cop";
520 break;
521 case USCRIPT_CYRILLIC:
522 sRet = "ru";
523 break;
524 case USCRIPT_DESERET:
525 sRet = "en";
526 break;
527 case USCRIPT_DEVANAGARI:
528 sRet = "hi";
529 break;
530 case USCRIPT_ETHIOPIC:
531 sRet = "am";
532 break;
533 case USCRIPT_GEORGIAN:
534 sRet = "ka";
535 break;
536 case USCRIPT_GOTHIC:
537 sRet = "got";
538 break;
539 case USCRIPT_GREEK:
540 sRet = "el";
541 break;
542 case USCRIPT_GUJARATI:
543 sRet = "gu";
544 break;
545 case USCRIPT_GURMUKHI:
546 sRet = "pa";
547 break;
548 case USCRIPT_HAN:
549 sRet = "zh";
550 break;
551 case USCRIPT_HANGUL:
552 sRet = "ko";
553 break;
554 case USCRIPT_HEBREW:
555 sRet = "hr";
556 break;
557 case USCRIPT_HIRAGANA:
558 sRet = "ja";
559 break;
560 case USCRIPT_KANNADA:
561 sRet = "kn";
562 break;
563 case USCRIPT_KATAKANA:
564 sRet = "ja";
565 break;
566 case USCRIPT_KHMER:
567 sRet = "km";
568 break;
569 case USCRIPT_LAO:
570 sRet = "lo";
571 break;
572 case USCRIPT_LATIN:
573 sRet = "en";
574 break;
575 case USCRIPT_MALAYALAM:
576 sRet = "ml";
577 break;
578 case USCRIPT_MONGOLIAN:
579 sRet = "mn";
580 break;
581 case USCRIPT_MYANMAR:
582 sRet = "my";
583 break;
584 case USCRIPT_OGHAM:
585 sRet = "pgl";
586 break;
587 case USCRIPT_OLD_ITALIC:
588 sRet = "osc";
589 break;
590 case USCRIPT_ORIYA:
591 sRet = "or";
592 break;
593 case USCRIPT_RUNIC:
594 sRet = "ang";
595 break;
596 case USCRIPT_SINHALA:
597 sRet = "si";
598 break;
599 case USCRIPT_SYRIAC:
600 sRet = "syr";
601 break;
602 case USCRIPT_TAMIL:
603 sRet = "ta";
604 break;
605 case USCRIPT_TELUGU:
606 sRet = "te";
607 break;
608 case USCRIPT_THAANA:
609 sRet = "dv";
610 break;
611 case USCRIPT_THAI:
612 sRet = "th";
613 break;
614 case USCRIPT_TIBETAN:
615 sRet = "bo";
616 break;
617 case USCRIPT_CANADIAN_ABORIGINAL:
618 sRet = "iu";
619 break;
620 case USCRIPT_YI:
621 sRet = "ii";
622 break;
623 case USCRIPT_TAGALOG:
624 sRet = "tl";
625 break;
626 case USCRIPT_HANUNOO:
627 sRet = "hnn";
628 break;
629 case USCRIPT_BUHID:
630 sRet = "bku";
631 break;
632 case USCRIPT_TAGBANWA:
633 sRet = "tbw";
634 break;
635 case USCRIPT_BRAILLE:
636 sRet = "en";
637 break;
638 case USCRIPT_CYPRIOT:
639 sRet = "ecy";
640 break;
641 case USCRIPT_LIMBU:
642 sRet = "lif";
643 break;
644 case USCRIPT_LINEAR_B:
645 sRet = "gmy";
646 break;
647 case USCRIPT_OSMANYA:
648 sRet = "so";
649 break;
650 case USCRIPT_SHAVIAN:
651 sRet = "en";
652 break;
653 case USCRIPT_TAI_LE:
654 sRet = "tdd";
655 break;
656 case USCRIPT_UGARITIC:
657 sRet = "uga";
658 break;
659 case USCRIPT_KATAKANA_OR_HIRAGANA:
660 sRet = "ja";
661 break;
662 case USCRIPT_BUGINESE:
663 sRet = "bug";
664 break;
665 case USCRIPT_GLAGOLITIC:
666 sRet = "ch";
667 break;
668 case USCRIPT_KHAROSHTHI:
669 sRet = "pra";
670 break;
671 case USCRIPT_SYLOTI_NAGRI:
672 sRet = "syl";
673 break;
674 case USCRIPT_NEW_TAI_LUE:
675 sRet = "khb";
676 break;
677 case USCRIPT_TIFINAGH:
678 sRet = "tmh";
679 break;
680 case USCRIPT_OLD_PERSIAN:
681 sRet = "peo";
682 break;
683 case USCRIPT_BALINESE:
684 sRet = "ban";
685 break;
686 case USCRIPT_BATAK:
687 sRet = "btk";
688 break;
689 case USCRIPT_BLISSYMBOLS:
690 sRet = "en";
691 break;
692 case USCRIPT_BRAHMI:
693 sRet = "pra";
694 break;
695 case USCRIPT_CHAM:
696 sRet = "cja";
697 break;
698 case USCRIPT_CIRTH:
699 sRet = "sjn";
700 break;
701 case USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC:
702 sRet = "cu";
703 break;
704 case USCRIPT_DEMOTIC_EGYPTIAN:
705 case USCRIPT_HIERATIC_EGYPTIAN:
706 case USCRIPT_EGYPTIAN_HIEROGLYPHS:
707 sRet = "egy";
708 break;
709 case USCRIPT_KHUTSURI:
710 sRet = "ka";
711 break;
712 case USCRIPT_SIMPLIFIED_HAN:
713 sRet = "zh";
714 break;
715 case USCRIPT_TRADITIONAL_HAN:
716 sRet = "zh";
717 break;
718 case USCRIPT_PAHAWH_HMONG:
719 sRet = "blu";
720 break;
721 case USCRIPT_OLD_HUNGARIAN:
722 sRet = "ohu";
723 break;
724 case USCRIPT_HARAPPAN_INDUS:
725 sRet = "xiv";
726 break;
727 case USCRIPT_JAVANESE:
728 sRet = "kaw";
729 break;
730 case USCRIPT_KAYAH_LI:
731 sRet = "eky";
732 break;
733 case USCRIPT_LATIN_FRAKTUR:
734 sRet = "de";
735 break;
736 case USCRIPT_LATIN_GAELIC:
737 sRet = "ga";
738 break;
739 case USCRIPT_LEPCHA:
740 sRet = "lep";
741 break;
742 case USCRIPT_LINEAR_A:
743 sRet = "ecr";
744 break;
745 case USCRIPT_MANDAIC:
746 sRet = "mic";
747 break;
748 case USCRIPT_MAYAN_HIEROGLYPHS:
749 sRet = "myn";
750 break;
751 case USCRIPT_MEROITIC:
752 sRet = "xmr";
753 break;
754 case USCRIPT_NKO:
755 sRet = "nqo";
756 break;
757 case USCRIPT_ORKHON:
758 sRet = "otk";
759 break;
760 case USCRIPT_OLD_PERMIC:
761 sRet = "kv";
762 break;
763 case USCRIPT_PHAGS_PA:
764 sRet = "xng";
765 break;
766 case USCRIPT_PHOENICIAN:
767 sRet = "phn";
768 break;
769 case USCRIPT_PHONETIC_POLLARD:
770 sRet = "hmd";
771 break;
772 case USCRIPT_RONGORONGO:
773 sRet = "rap";
774 break;
775 case USCRIPT_SARATI:
776 sRet = "qya";
777 break;
778 case USCRIPT_ESTRANGELO_SYRIAC:
779 sRet = "syr";
780 break;
781 case USCRIPT_WESTERN_SYRIAC:
782 sRet = "tru";
783 break;
784 case USCRIPT_EASTERN_SYRIAC:
785 sRet = "aii";
786 break;
787 case USCRIPT_TENGWAR:
788 sRet = "sjn";
789 break;
790 case USCRIPT_VAI:
791 sRet = "vai";
792 break;
793 case USCRIPT_VISIBLE_SPEECH:
794 sRet = "en";
795 break;
796 case USCRIPT_CUNEIFORM:
797 sRet = "akk";
798 break;
799 case USCRIPT_CARIAN:
800 sRet = "xcr";
801 break;
802 case USCRIPT_JAPANESE:
803 sRet = "ja";
804 break;
805 case USCRIPT_LANNA:
806 sRet = "nod";
807 break;
808 case USCRIPT_LYCIAN:
809 sRet = "xlc";
810 break;
811 case USCRIPT_LYDIAN:
812 sRet = "xld";
813 break;
814 case USCRIPT_OL_CHIKI:
815 sRet = "sat";
816 break;
817 case USCRIPT_REJANG:
818 sRet = "rej";
819 break;
820 case USCRIPT_SAURASHTRA:
821 sRet = "saz";
822 break;
823 case USCRIPT_SIGN_WRITING:
824 sRet = "en";
825 break;
826 case USCRIPT_SUNDANESE:
827 sRet = "su";
828 break;
829 case USCRIPT_MOON:
830 sRet = "en";
831 break;
832 case USCRIPT_MEITEI_MAYEK:
833 sRet = "mni";
834 break;
835 case USCRIPT_IMPERIAL_ARAMAIC:
836 sRet = "arc";
837 break;
838 case USCRIPT_AVESTAN:
839 sRet = "ae";
840 break;
841 case USCRIPT_CHAKMA:
842 sRet = "ccp";
843 break;
844 case USCRIPT_KOREAN:
845 sRet = "ko";
846 break;
847 case USCRIPT_KAITHI:
848 sRet = "awa";
849 break;
850 case USCRIPT_MANICHAEAN:
851 sRet = "xmn";
852 break;
853 case USCRIPT_INSCRIPTIONAL_PAHLAVI:
854 case USCRIPT_PSALTER_PAHLAVI:
855 case USCRIPT_BOOK_PAHLAVI:
856 case USCRIPT_INSCRIPTIONAL_PARTHIAN:
857 sRet = "xpr";
858 break;
859 case USCRIPT_SAMARITAN:
860 sRet = "heb";
861 break;
862 case USCRIPT_TAI_VIET:
863 sRet = "blt";
864 break;
865 case USCRIPT_BAMUM:
866 sRet = "bax";
867 break;
868 case USCRIPT_LISU:
869 sRet = "lis";
870 break;
871 case USCRIPT_NAKHI_GEBA:
872 sRet = "nxq";
873 break;
874 case USCRIPT_OLD_SOUTH_ARABIAN:
875 sRet = "xsa";
876 break;
877 case USCRIPT_BASSA_VAH:
878 sRet = "bsq";
879 break;
880 case USCRIPT_DUPLOYAN_SHORTAND:
881 sRet = "fr";
882 break;
883 case USCRIPT_ELBASAN:
884 sRet = "sq";
885 break;
886 case USCRIPT_GRANTHA:
887 sRet = "ta";
888 break;
889 case USCRIPT_KPELLE:
890 sRet = "kpe";
891 break;
892 case USCRIPT_LOMA:
893 sRet = "lom";
894 break;
895 case USCRIPT_MENDE:
896 sRet = "men";
897 break;
898 case USCRIPT_MEROITIC_CURSIVE:
899 sRet = "xmr";
900 break;
901 case USCRIPT_OLD_NORTH_ARABIAN:
902 sRet = "xna";
903 break;
904 case USCRIPT_SINDHI:
905 sRet = "sd";
906 break;
907 case USCRIPT_WARANG_CITI:
908 sRet = "hoc";
909 break;
910 #if (U_ICU_VERSION_MAJOR_NUM > 4) || (U_ICU_VERSION_MAJOR_NUM == 4 && U_ICU_VERSION_MINOR_NUM >= 8)
911 case USCRIPT_AFAKA:
912 sRet = "djk";
913 break;
914 case USCRIPT_JURCHEN:
915 sRet = "juc";
916 break;
917 case USCRIPT_MRO:
918 sRet = "cmr";
919 break;
920 case USCRIPT_NUSHU: //no language with an assigned code yet
921 sRet = "mis";
922 break;
923 case USCRIPT_SHARADA:
924 sRet = "sa";
925 break;
926 case USCRIPT_SORA_SOMPENG:
927 sRet = "srb";
928 break;
929 case USCRIPT_TAKRI:
930 sRet = "doi";
931 break;
932 case USCRIPT_TANGUT:
933 sRet = "txg";
934 break;
935 case USCRIPT_WOLEAI:
936 sRet = "woe";
937 break;
938 #endif
939 #if (U_ICU_VERSION_MAJOR_NUM > 4)
940 case USCRIPT_ANATOLIAN_HIEROGLYPHS:
941 sRet = "hlu";
942 break;
943 case USCRIPT_KHOJKI:
944 sRet = "gu";
945 break;
946 case USCRIPT_TIRHUTA:
947 sRet = "mai";
948 break;
949 #endif
951 return sRet;
954 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */