update dev300-m58
[ooovba.git] / i18nutil / source / utility / unicode.cxx
blob82c2753c33f283a48da07921e6c602785e8022b7
1 /*************************************************************************
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * Copyright 2008 by Sun Microsystems, Inc.
7 * OpenOffice.org - a multi-platform office productivity suite
9 * $RCSfile: unicode.cxx,v $
10 * $Revision: 1.6 $
12 * This file is part of OpenOffice.org.
14 * OpenOffice.org is free software: you can redistribute it and/or modify
15 * it under the terms of the GNU Lesser General Public License version 3
16 * only, as published by the Free Software Foundation.
18 * OpenOffice.org is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU Lesser General Public License version 3 for more details
22 * (a copy is included in the LICENSE file that accompanied this code).
24 * You should have received a copy of the GNU Lesser General Public License
25 * version 3 along with OpenOffice.org. If not, see
26 * <http://www.openoffice.org/license.html>
27 * for a copy of the LGPLv3 License.
29 ************************************************************************/
31 #include <com/sun/star/i18n/UnicodeType.hpp>
32 #include <com/sun/star/i18n/KCharacterType.hpp>
33 #include <i18nutil/unicode.hxx>
34 #include "unicode_data.h"
36 // Workaround for glibc braindamage:
37 // glibc 2.4's langinfo.h does "#define CURRENCY_SYMBOL __CURRENCY_SYMBOL"
38 // which (obviously) breaks UnicodeType::CURRENCY_SYMBOL
39 #undef CURRENCY_SYMBOL
41 using namespace ::com::sun::star::i18n;
43 static ScriptTypeList defaultTypeList[] = {
44 { UnicodeScript_kBasicLatin,
45 UnicodeScript_kBasicLatin,
46 UnicodeScript_kBasicLatin }, // 0,
47 { UnicodeScript_kLatin1Supplement,
48 UnicodeScript_kLatin1Supplement,
49 UnicodeScript_kLatin1Supplement },// 1,
50 { UnicodeScript_kLatinExtendedA,
51 UnicodeScript_kLatinExtendedA,
52 UnicodeScript_kLatinExtendedA }, // 2,
53 { UnicodeScript_kLatinExtendedB,
54 UnicodeScript_kLatinExtendedB,
55 UnicodeScript_kLatinExtendedB }, // 3,
56 { UnicodeScript_kIPAExtension,
57 UnicodeScript_kIPAExtension,
58 UnicodeScript_kIPAExtension }, // 4,
59 { UnicodeScript_kSpacingModifier,
60 UnicodeScript_kSpacingModifier,
61 UnicodeScript_kSpacingModifier }, // 5,
62 { UnicodeScript_kCombiningDiacritical,
63 UnicodeScript_kCombiningDiacritical,
64 UnicodeScript_kCombiningDiacritical }, // 6,
65 { UnicodeScript_kGreek,
66 UnicodeScript_kGreek,
67 UnicodeScript_kGreek }, // 7,
68 { UnicodeScript_kCyrillic,
69 UnicodeScript_kCyrillic,
70 UnicodeScript_kCyrillic }, // 8,
71 { UnicodeScript_kArmenian,
72 UnicodeScript_kArmenian,
73 UnicodeScript_kArmenian }, // 9,
74 { UnicodeScript_kHebrew,
75 UnicodeScript_kHebrew,
76 UnicodeScript_kHebrew }, // 10,
77 { UnicodeScript_kArabic,
78 UnicodeScript_kArabic,
79 UnicodeScript_kArabic }, // 11,
80 { UnicodeScript_kSyriac,
81 UnicodeScript_kSyriac,
82 UnicodeScript_kSyriac }, // 12,
83 { UnicodeScript_kThaana,
84 UnicodeScript_kThaana,
85 UnicodeScript_kThaana }, // 13,
86 { UnicodeScript_kDevanagari,
87 UnicodeScript_kDevanagari,
88 UnicodeScript_kDevanagari }, // 14,
89 { UnicodeScript_kBengali,
90 UnicodeScript_kBengali,
91 UnicodeScript_kBengali }, // 15,
92 { UnicodeScript_kGurmukhi,
93 UnicodeScript_kGurmukhi,
94 UnicodeScript_kGurmukhi }, // 16,
95 { UnicodeScript_kGujarati,
96 UnicodeScript_kGujarati,
97 UnicodeScript_kGujarati }, // 17,
98 { UnicodeScript_kOriya,
99 UnicodeScript_kOriya,
100 UnicodeScript_kOriya }, // 18,
101 { UnicodeScript_kTamil,
102 UnicodeScript_kTamil,
103 UnicodeScript_kTamil }, // 19,
104 { UnicodeScript_kTelugu,
105 UnicodeScript_kTelugu,
106 UnicodeScript_kTelugu }, // 20,
107 { UnicodeScript_kKannada,
108 UnicodeScript_kKannada,
109 UnicodeScript_kKannada }, // 21,
110 { UnicodeScript_kMalayalam,
111 UnicodeScript_kMalayalam,
112 UnicodeScript_kMalayalam }, // 22,
113 { UnicodeScript_kSinhala,
114 UnicodeScript_kSinhala,
115 UnicodeScript_kSinhala }, // 23,
116 { UnicodeScript_kThai,
117 UnicodeScript_kThai,
118 UnicodeScript_kThai }, // 24,
119 { UnicodeScript_kLao,
120 UnicodeScript_kLao,
121 UnicodeScript_kLao }, // 25,
122 { UnicodeScript_kTibetan,
123 UnicodeScript_kTibetan,
124 UnicodeScript_kTibetan }, // 26,
125 { UnicodeScript_kMyanmar,
126 UnicodeScript_kMyanmar,
127 UnicodeScript_kMyanmar }, // 27,
128 { UnicodeScript_kGeorgian,
129 UnicodeScript_kGeorgian,
130 UnicodeScript_kGeorgian }, // 28,
131 { UnicodeScript_kHangulJamo,
132 UnicodeScript_kHangulJamo,
133 UnicodeScript_kHangulJamo }, // 29,
134 { UnicodeScript_kEthiopic,
135 UnicodeScript_kEthiopic,
136 UnicodeScript_kEthiopic }, // 30,
137 { UnicodeScript_kCherokee,
138 UnicodeScript_kCherokee,
139 UnicodeScript_kCherokee }, // 31,
140 { UnicodeScript_kUnifiedCanadianAboriginalSyllabics,
141 UnicodeScript_kUnifiedCanadianAboriginalSyllabics,
142 UnicodeScript_kUnifiedCanadianAboriginalSyllabics }, // 32,
143 { UnicodeScript_kOgham,
144 UnicodeScript_kOgham,
145 UnicodeScript_kOgham }, // 33,
146 { UnicodeScript_kRunic,
147 UnicodeScript_kRunic,
148 UnicodeScript_kRunic }, // 34,
149 { UnicodeScript_kKhmer,
150 UnicodeScript_kKhmer,
151 UnicodeScript_kKhmer }, // 35,
152 { UnicodeScript_kMongolian,
153 UnicodeScript_kMongolian,
154 UnicodeScript_kMongolian }, // 36,
155 { UnicodeScript_kLatinExtendedAdditional,
156 UnicodeScript_kLatinExtendedAdditional,
157 UnicodeScript_kLatinExtendedAdditional }, // 37,
158 { UnicodeScript_kGreekExtended,
159 UnicodeScript_kGreekExtended,
160 UnicodeScript_kGreekExtended }, // 38,
161 { UnicodeScript_kGeneralPunctuation,
162 UnicodeScript_kGeneralPunctuation,
163 UnicodeScript_kGeneralPunctuation }, // 39,
164 { UnicodeScript_kSuperSubScript,
165 UnicodeScript_kSuperSubScript,
166 UnicodeScript_kSuperSubScript }, // 40,
167 { UnicodeScript_kCurrencySymbolScript,
168 UnicodeScript_kCurrencySymbolScript,
169 UnicodeScript_kCurrencySymbolScript }, // 41,
170 { UnicodeScript_kSymbolCombiningMark,
171 UnicodeScript_kSymbolCombiningMark,
172 UnicodeScript_kSymbolCombiningMark }, // 42,
173 { UnicodeScript_kLetterlikeSymbol,
174 UnicodeScript_kLetterlikeSymbol,
175 UnicodeScript_kLetterlikeSymbol }, // 43,
176 { UnicodeScript_kNumberForm,
177 UnicodeScript_kNumberForm,
178 UnicodeScript_kNumberForm }, // 44,
179 { UnicodeScript_kArrow,
180 UnicodeScript_kArrow,
181 UnicodeScript_kArrow }, // 45,
182 { UnicodeScript_kMathOperator,
183 UnicodeScript_kMathOperator,
184 UnicodeScript_kMathOperator }, // 46,
185 { UnicodeScript_kMiscTechnical,
186 UnicodeScript_kMiscTechnical,
187 UnicodeScript_kMiscTechnical }, // 47,
188 { UnicodeScript_kControlPicture,
189 UnicodeScript_kControlPicture,
190 UnicodeScript_kControlPicture }, // 48,
191 { UnicodeScript_kOpticalCharacter,
192 UnicodeScript_kOpticalCharacter,
193 UnicodeScript_kOpticalCharacter }, // 49,
194 { UnicodeScript_kEnclosedAlphanumeric,
195 UnicodeScript_kEnclosedAlphanumeric,
196 UnicodeScript_kEnclosedAlphanumeric }, // 50,
197 { UnicodeScript_kBoxDrawing,
198 UnicodeScript_kBoxDrawing,
199 UnicodeScript_kBoxDrawing }, // 51,
200 { UnicodeScript_kBlockElement,
201 UnicodeScript_kBlockElement,
202 UnicodeScript_kBlockElement }, // 52,
203 { UnicodeScript_kGeometricShape,
204 UnicodeScript_kGeometricShape,
205 UnicodeScript_kGeometricShape }, // 53,
206 { UnicodeScript_kMiscSymbol,
207 UnicodeScript_kMiscSymbol,
208 UnicodeScript_kMiscSymbol }, // 54,
209 { UnicodeScript_kDingbat,
210 UnicodeScript_kDingbat,
211 UnicodeScript_kDingbat }, // 55,
212 { UnicodeScript_kBraillePatterns,
213 UnicodeScript_kBraillePatterns,
214 UnicodeScript_kBraillePatterns }, // 56,
215 { UnicodeScript_kCJKRadicalsSupplement,
216 UnicodeScript_kCJKRadicalsSupplement,
217 UnicodeScript_kCJKRadicalsSupplement }, // 57,
218 { UnicodeScript_kKangxiRadicals,
219 UnicodeScript_kKangxiRadicals,
220 UnicodeScript_kKangxiRadicals }, // 58,
221 { UnicodeScript_kIdeographicDescriptionCharacters,
222 UnicodeScript_kIdeographicDescriptionCharacters,
223 UnicodeScript_kIdeographicDescriptionCharacters }, // 59,
224 { UnicodeScript_kCJKSymbolPunctuation,
225 UnicodeScript_kCJKSymbolPunctuation,
226 UnicodeScript_kCJKSymbolPunctuation }, // 60,
227 { UnicodeScript_kHiragana,
228 UnicodeScript_kHiragana,
229 UnicodeScript_kHiragana }, // 61,
230 { UnicodeScript_kKatakana,
231 UnicodeScript_kKatakana,
232 UnicodeScript_kKatakana }, // 62,
233 { UnicodeScript_kBopomofo,
234 UnicodeScript_kBopomofo,
235 UnicodeScript_kBopomofo }, // 63,
236 { UnicodeScript_kHangulCompatibilityJamo,
237 UnicodeScript_kHangulCompatibilityJamo,
238 UnicodeScript_kHangulCompatibilityJamo }, // 64,
239 { UnicodeScript_kKanbun,
240 UnicodeScript_kKanbun,
241 UnicodeScript_kKanbun }, // 65,
242 { UnicodeScript_kBopomofoExtended,
243 UnicodeScript_kBopomofoExtended,
244 UnicodeScript_kBopomofoExtended }, // 66,
245 { UnicodeScript_kEnclosedCJKLetterMonth,
246 UnicodeScript_kEnclosedCJKLetterMonth,
247 UnicodeScript_kEnclosedCJKLetterMonth }, // 67,
248 { UnicodeScript_kCJKCompatibility,
249 UnicodeScript_kCJKCompatibility,
250 UnicodeScript_kCJKCompatibility }, // 68,
251 { UnicodeScript_k_CJKUnifiedIdeographsExtensionA,
252 UnicodeScript_k_CJKUnifiedIdeographsExtensionA,
253 UnicodeScript_k_CJKUnifiedIdeographsExtensionA }, // 69,
254 { UnicodeScript_kCJKUnifiedIdeograph,
255 UnicodeScript_kCJKUnifiedIdeograph,
256 UnicodeScript_kCJKUnifiedIdeograph }, // 70,
257 { UnicodeScript_kYiSyllables,
258 UnicodeScript_kYiSyllables,
259 UnicodeScript_kYiSyllables }, // 71,
260 { UnicodeScript_kYiRadicals,
261 UnicodeScript_kYiRadicals,
262 UnicodeScript_kYiRadicals }, // 72,
263 { UnicodeScript_kHangulSyllable,
264 UnicodeScript_kHangulSyllable,
265 UnicodeScript_kHangulSyllable }, // 73,
266 { UnicodeScript_kHighSurrogate,
267 UnicodeScript_kHighSurrogate,
268 UnicodeScript_kHighSurrogate }, // 74,
269 { UnicodeScript_kHighPrivateUseSurrogate,
270 UnicodeScript_kHighPrivateUseSurrogate,
271 UnicodeScript_kHighPrivateUseSurrogate }, // 75,
272 { UnicodeScript_kLowSurrogate,
273 UnicodeScript_kLowSurrogate,
274 UnicodeScript_kLowSurrogate }, // 76,
275 { UnicodeScript_kPrivateUse,
276 UnicodeScript_kPrivateUse,
277 UnicodeScript_kPrivateUse }, // 77,
278 { UnicodeScript_kCJKCompatibilityIdeograph,
279 UnicodeScript_kCJKCompatibilityIdeograph,
280 UnicodeScript_kCJKCompatibilityIdeograph }, // 78,
281 { UnicodeScript_kAlphabeticPresentation,
282 UnicodeScript_kAlphabeticPresentation,
283 UnicodeScript_kAlphabeticPresentation }, // 79,
284 { UnicodeScript_kArabicPresentationA,
285 UnicodeScript_kArabicPresentationA,
286 UnicodeScript_kArabicPresentationA }, // 80,
287 { UnicodeScript_kCombiningHalfMark,
288 UnicodeScript_kCombiningHalfMark,
289 UnicodeScript_kCombiningHalfMark }, // 81,
290 { UnicodeScript_kCJKCompatibilityForm,
291 UnicodeScript_kCJKCompatibilityForm,
292 UnicodeScript_kCJKCompatibilityForm }, // 82,
293 { UnicodeScript_kSmallFormVariant,
294 UnicodeScript_kSmallFormVariant,
295 UnicodeScript_kSmallFormVariant }, // 83,
296 { UnicodeScript_kArabicPresentationB,
297 UnicodeScript_kArabicPresentationB,
298 UnicodeScript_kArabicPresentationB }, // 84,
299 { UnicodeScript_kNoScript,
300 UnicodeScript_kNoScript,
301 UnicodeScript_kNoScript }, // 85,
302 { UnicodeScript_kHalfwidthFullwidthForm,
303 UnicodeScript_kHalfwidthFullwidthForm,
304 UnicodeScript_kHalfwidthFullwidthForm }, // 86,
305 { UnicodeScript_kScriptCount,
306 UnicodeScript_kScriptCount,
307 UnicodeScript_kNoScript } // 87,
310 sal_Int16 SAL_CALL
311 unicode::getUnicodeScriptType( const sal_Unicode ch, ScriptTypeList* typeList, sal_Int16 unknownType ) {
313 if (!typeList) {
314 typeList = defaultTypeList;
315 unknownType = UnicodeScript_kNoScript;
318 sal_Int16 i = 0, type = typeList[0].to;
319 while (type < UnicodeScript_kScriptCount && ch > UnicodeScriptType[type][UnicodeScriptTypeTo]) {
320 type = typeList[++i].to;
323 return (type < UnicodeScript_kScriptCount &&
324 ch >= UnicodeScriptType[typeList[i].from][UnicodeScriptTypeFrom]) ?
325 typeList[i].value : unknownType;
328 sal_Bool SAL_CALL
329 unicode::isUnicodeScriptType( const sal_Unicode ch, sal_Int16 type) {
330 return ch >= UnicodeScriptType[type][UnicodeScriptTypeFrom] &&
331 ch <= UnicodeScriptType[type][UnicodeScriptTypeTo];
334 sal_Unicode SAL_CALL
335 unicode::getUnicodeScriptStart( UnicodeScript type) {
336 return UnicodeScriptType[type][UnicodeScriptTypeFrom];
339 sal_Unicode SAL_CALL
340 unicode::getUnicodeScriptEnd( UnicodeScript type) {
341 return UnicodeScriptType[type][UnicodeScriptTypeTo];
344 sal_Int16 SAL_CALL
345 unicode::getUnicodeType( const sal_Unicode ch ) {
346 static sal_Unicode c = 0x00;
347 static sal_Int16 r = 0x00;
349 if (ch == c) return r;
350 else c = ch;
352 sal_Int16 address = UnicodeTypeIndex[ch >> 8];
353 return r = (sal_Int16)((address < UnicodeTypeNumberBlock) ? UnicodeTypeBlockValue[address] :
354 UnicodeTypeValue[((address - UnicodeTypeNumberBlock) << 8) + (ch & 0xff)]);
357 sal_uInt8 SAL_CALL
358 unicode::getUnicodeDirection( const sal_Unicode ch ) {
359 static sal_Unicode c = 0x00;
360 static sal_uInt8 r = 0x00;
362 if (ch == c) return r;
363 else c = ch;
365 sal_Int16 address = UnicodeDirectionIndex[ch >> 8];
366 return r = ((address < UnicodeDirectionNumberBlock) ? UnicodeDirectionBlockValue[address] :
367 UnicodeDirectionValue[((address - UnicodeDirectionNumberBlock) << 8) + (ch & 0xff)]);
// Bit-mask helpers for classifying a character by its UnicodeType value.
//
// bit(n) is the single-bit mask for category number n.  The *MASK macros OR
// together the bits of every category belonging to a character class.  The
// macro argument and every mask body are fully parenthesized so that they
// expand correctly inside any surrounding expression (e.g. "x & SOMEMASK",
// "~SOMEMASK", or bit(a | b)).
#define bit(name) (1 << (name))

#define UPPERMASK   (bit(UnicodeType::UPPERCASE_LETTER))

#define LOWERMASK   (bit(UnicodeType::LOWERCASE_LETTER))

#define TITLEMASK   (bit(UnicodeType::TITLECASE_LETTER))

#define DIGITMASK   (bit(UnicodeType::DECIMAL_DIGIT_NUMBER)|\
                     bit(UnicodeType::LETTER_NUMBER)|\
                     bit(UnicodeType::OTHER_NUMBER))

#define ALPHAMASK   (UPPERMASK|LOWERMASK|TITLEMASK|\
                     bit(UnicodeType::MODIFIER_LETTER)|\
                     bit(UnicodeType::OTHER_LETTER))

#define BASEMASK    (DIGITMASK|ALPHAMASK|\
                     bit(UnicodeType::NON_SPACING_MARK)|\
                     bit(UnicodeType::ENCLOSING_MARK)|\
                     bit(UnicodeType::COMBINING_SPACING_MARK))

#define SPACEMASK   (bit(UnicodeType::SPACE_SEPARATOR)|\
                     bit(UnicodeType::LINE_SEPARATOR)|\
                     bit(UnicodeType::PARAGRAPH_SEPARATOR))

#define PUNCTUATIONMASK (bit(UnicodeType::DASH_PUNCTUATION)|\
                     bit(UnicodeType::INITIAL_PUNCTUATION)|\
                     bit(UnicodeType::FINAL_PUNCTUATION)|\
                     bit(UnicodeType::CONNECTOR_PUNCTUATION)|\
                     bit(UnicodeType::OTHER_PUNCTUATION))

#define SYMBOLMASK  (bit(UnicodeType::MATH_SYMBOL)|\
                     bit(UnicodeType::CURRENCY_SYMBOL)|\
                     bit(UnicodeType::MODIFIER_SYMBOL)|\
                     bit(UnicodeType::OTHER_SYMBOL))

#define PRINTMASK   (BASEMASK|SPACEMASK|PUNCTUATIONMASK|SYMBOLMASK)

#define CONTROLMASK (bit(UnicodeType::CONTROL)|\
                     bit(UnicodeType::FORMAT)|\
                     bit(UnicodeType::LINE_SEPARATOR)|\
                     bit(UnicodeType::PARAGRAPH_SEPARATOR))

// Defines a predicate 'func' that returns true when the UnicodeType of its
// argument is one of the categories whose bits are set in 'mask'.
#define IsType(func, mask)  \
sal_Bool SAL_CALL func( const sal_Unicode ch) {\
    return (bit(getUnicodeType(ch)) & (mask)) != 0;\
}
419 IsType(unicode::isUpper, UPPERMASK)
420 IsType(unicode::isLower, LOWERMASK)
421 IsType(unicode::isTitle, DIGITMASK)
422 IsType(unicode::isControl, CONTROLMASK)
423 IsType(unicode::isPrint, PRINTMASK)
424 IsType(unicode::isAlpha, ALPHAMASK)
425 IsType(unicode::isDigit, DIGITMASK)
426 IsType(unicode::isAlphaDigit, ALPHAMASK|DIGITMASK)
427 IsType(unicode::isSpace, SPACEMASK)
428 IsType(unicode::isBase, BASEMASK)
429 IsType(unicode::isPunctuation, PUNCTUATIONMASK)
431 #define CONTROLSPACE bit(0x09)|bit(0x0a)|bit(0x0b)|bit(0x0c)|bit(0x0d)|\
432 bit(0x1c)|bit(0x1d)|bit(0x1e)|bit(0x1f)
434 sal_Bool SAL_CALL unicode::isWhiteSpace( const sal_Unicode ch) {
435 return (ch != 0xa0 && isSpace(ch)) || (ch <= 0x1F && (bit(ch) & (CONTROLSPACE)));
438 sal_Int32 SAL_CALL unicode::getCharType( const sal_Unicode ch )
440 using namespace ::com::sun::star::i18n::KCharacterType;
442 switch ( getUnicodeType( ch ) ) {
443 // Upper
444 case UnicodeType::UPPERCASE_LETTER :
445 return UPPER|LETTER|PRINTABLE|BASE_FORM;
447 // Lower
448 case UnicodeType::LOWERCASE_LETTER :
449 return LOWER|LETTER|PRINTABLE|BASE_FORM;
451 // Title
452 case UnicodeType::TITLECASE_LETTER :
453 return TITLE_CASE|LETTER|PRINTABLE|BASE_FORM;
455 // Letter
456 case UnicodeType::MODIFIER_LETTER :
457 case UnicodeType::OTHER_LETTER :
458 return LETTER|PRINTABLE|BASE_FORM;
460 // Digit
461 case UnicodeType::DECIMAL_DIGIT_NUMBER:
462 case UnicodeType::LETTER_NUMBER:
463 case UnicodeType::OTHER_NUMBER:
464 return DIGIT|PRINTABLE|BASE_FORM;
466 // Base
467 case UnicodeType::NON_SPACING_MARK:
468 case UnicodeType::ENCLOSING_MARK:
469 case UnicodeType::COMBINING_SPACING_MARK:
470 return BASE_FORM|PRINTABLE;
472 // Print
473 case UnicodeType::SPACE_SEPARATOR:
475 case UnicodeType::DASH_PUNCTUATION:
476 case UnicodeType::INITIAL_PUNCTUATION:
477 case UnicodeType::FINAL_PUNCTUATION:
478 case UnicodeType::CONNECTOR_PUNCTUATION:
479 case UnicodeType::OTHER_PUNCTUATION:
481 case UnicodeType::MATH_SYMBOL:
482 case UnicodeType::CURRENCY_SYMBOL:
483 case UnicodeType::MODIFIER_SYMBOL:
484 case UnicodeType::OTHER_SYMBOL:
485 return PRINTABLE;
487 // Control
488 case UnicodeType::CONTROL:
489 case UnicodeType::FORMAT:
490 return CONTROL;
492 case UnicodeType::LINE_SEPARATOR:
493 case UnicodeType::PARAGRAPH_SEPARATOR:
494 return CONTROL|PRINTABLE;
496 // for all others
497 default:
498 return 0;