1 /*************************************************************************
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 * Copyright 2008 by Sun Microsystems, Inc.
7 * OpenOffice.org - a multi-platform office productivity suite
9 * $RCSfile: unicode.cxx,v $
12 * This file is part of OpenOffice.org.
14 * OpenOffice.org is free software: you can redistribute it and/or modify
15 * it under the terms of the GNU Lesser General Public License version 3
16 * only, as published by the Free Software Foundation.
18 * OpenOffice.org is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU Lesser General Public License version 3 for more details
22 * (a copy is included in the LICENSE file that accompanied this code).
24 * You should have received a copy of the GNU Lesser General Public License
25 * version 3 along with OpenOffice.org. If not, see
26 * <http://www.openoffice.org/license.html>
27 * for a copy of the LGPLv3 License.
29 ************************************************************************/
31 #include <com/sun/star/i18n/UnicodeType.hpp>
32 #include <com/sun/star/i18n/KCharacterType.hpp>
33 #include <i18nutil/unicode.hxx>
34 #include "unicode_data.h"
36 // Workaround for glibc braindamage:
37 // glibc 2.4's langinfo.h does "#define CURRENCY_SYMBOL __CURRENCY_SYMBOL"
38 // which (obviously) breaks UnicodeType::CURRENCY_SYMBOL
39 #undef CURRENCY_SYMBOL
41 using namespace ::com::sun::star::i18n
;
43 static ScriptTypeList defaultTypeList
[] = {
44 { UnicodeScript_kBasicLatin
,
45 UnicodeScript_kBasicLatin
,
46 UnicodeScript_kBasicLatin
}, // 0,
47 { UnicodeScript_kLatin1Supplement
,
48 UnicodeScript_kLatin1Supplement
,
49 UnicodeScript_kLatin1Supplement
},// 1,
50 { UnicodeScript_kLatinExtendedA
,
51 UnicodeScript_kLatinExtendedA
,
52 UnicodeScript_kLatinExtendedA
}, // 2,
53 { UnicodeScript_kLatinExtendedB
,
54 UnicodeScript_kLatinExtendedB
,
55 UnicodeScript_kLatinExtendedB
}, // 3,
56 { UnicodeScript_kIPAExtension
,
57 UnicodeScript_kIPAExtension
,
58 UnicodeScript_kIPAExtension
}, // 4,
59 { UnicodeScript_kSpacingModifier
,
60 UnicodeScript_kSpacingModifier
,
61 UnicodeScript_kSpacingModifier
}, // 5,
62 { UnicodeScript_kCombiningDiacritical
,
63 UnicodeScript_kCombiningDiacritical
,
64 UnicodeScript_kCombiningDiacritical
}, // 6,
65 { UnicodeScript_kGreek
,
67 UnicodeScript_kGreek
}, // 7,
68 { UnicodeScript_kCyrillic
,
69 UnicodeScript_kCyrillic
,
70 UnicodeScript_kCyrillic
}, // 8,
71 { UnicodeScript_kArmenian
,
72 UnicodeScript_kArmenian
,
73 UnicodeScript_kArmenian
}, // 9,
74 { UnicodeScript_kHebrew
,
75 UnicodeScript_kHebrew
,
76 UnicodeScript_kHebrew
}, // 10,
77 { UnicodeScript_kArabic
,
78 UnicodeScript_kArabic
,
79 UnicodeScript_kArabic
}, // 11,
80 { UnicodeScript_kSyriac
,
81 UnicodeScript_kSyriac
,
82 UnicodeScript_kSyriac
}, // 12,
83 { UnicodeScript_kThaana
,
84 UnicodeScript_kThaana
,
85 UnicodeScript_kThaana
}, // 13,
86 { UnicodeScript_kDevanagari
,
87 UnicodeScript_kDevanagari
,
88 UnicodeScript_kDevanagari
}, // 14,
89 { UnicodeScript_kBengali
,
90 UnicodeScript_kBengali
,
91 UnicodeScript_kBengali
}, // 15,
92 { UnicodeScript_kGurmukhi
,
93 UnicodeScript_kGurmukhi
,
94 UnicodeScript_kGurmukhi
}, // 16,
95 { UnicodeScript_kGujarati
,
96 UnicodeScript_kGujarati
,
97 UnicodeScript_kGujarati
}, // 17,
98 { UnicodeScript_kOriya
,
100 UnicodeScript_kOriya
}, // 18,
101 { UnicodeScript_kTamil
,
102 UnicodeScript_kTamil
,
103 UnicodeScript_kTamil
}, // 19,
104 { UnicodeScript_kTelugu
,
105 UnicodeScript_kTelugu
,
106 UnicodeScript_kTelugu
}, // 20,
107 { UnicodeScript_kKannada
,
108 UnicodeScript_kKannada
,
109 UnicodeScript_kKannada
}, // 21,
110 { UnicodeScript_kMalayalam
,
111 UnicodeScript_kMalayalam
,
112 UnicodeScript_kMalayalam
}, // 22,
113 { UnicodeScript_kSinhala
,
114 UnicodeScript_kSinhala
,
115 UnicodeScript_kSinhala
}, // 23,
116 { UnicodeScript_kThai
,
118 UnicodeScript_kThai
}, // 24,
119 { UnicodeScript_kLao
,
121 UnicodeScript_kLao
}, // 25,
122 { UnicodeScript_kTibetan
,
123 UnicodeScript_kTibetan
,
124 UnicodeScript_kTibetan
}, // 26,
125 { UnicodeScript_kMyanmar
,
126 UnicodeScript_kMyanmar
,
127 UnicodeScript_kMyanmar
}, // 27,
128 { UnicodeScript_kGeorgian
,
129 UnicodeScript_kGeorgian
,
130 UnicodeScript_kGeorgian
}, // 28,
131 { UnicodeScript_kHangulJamo
,
132 UnicodeScript_kHangulJamo
,
133 UnicodeScript_kHangulJamo
}, // 29,
134 { UnicodeScript_kEthiopic
,
135 UnicodeScript_kEthiopic
,
136 UnicodeScript_kEthiopic
}, // 30,
137 { UnicodeScript_kCherokee
,
138 UnicodeScript_kCherokee
,
139 UnicodeScript_kCherokee
}, // 31,
140 { UnicodeScript_kUnifiedCanadianAboriginalSyllabics
,
141 UnicodeScript_kUnifiedCanadianAboriginalSyllabics
,
142 UnicodeScript_kUnifiedCanadianAboriginalSyllabics
}, // 32,
143 { UnicodeScript_kOgham
,
144 UnicodeScript_kOgham
,
145 UnicodeScript_kOgham
}, // 33,
146 { UnicodeScript_kRunic
,
147 UnicodeScript_kRunic
,
148 UnicodeScript_kRunic
}, // 34,
149 { UnicodeScript_kKhmer
,
150 UnicodeScript_kKhmer
,
151 UnicodeScript_kKhmer
}, // 35,
152 { UnicodeScript_kMongolian
,
153 UnicodeScript_kMongolian
,
154 UnicodeScript_kMongolian
}, // 36,
155 { UnicodeScript_kLatinExtendedAdditional
,
156 UnicodeScript_kLatinExtendedAdditional
,
157 UnicodeScript_kLatinExtendedAdditional
}, // 37,
158 { UnicodeScript_kGreekExtended
,
159 UnicodeScript_kGreekExtended
,
160 UnicodeScript_kGreekExtended
}, // 38,
161 { UnicodeScript_kGeneralPunctuation
,
162 UnicodeScript_kGeneralPunctuation
,
163 UnicodeScript_kGeneralPunctuation
}, // 39,
164 { UnicodeScript_kSuperSubScript
,
165 UnicodeScript_kSuperSubScript
,
166 UnicodeScript_kSuperSubScript
}, // 40,
167 { UnicodeScript_kCurrencySymbolScript
,
168 UnicodeScript_kCurrencySymbolScript
,
169 UnicodeScript_kCurrencySymbolScript
}, // 41,
170 { UnicodeScript_kSymbolCombiningMark
,
171 UnicodeScript_kSymbolCombiningMark
,
172 UnicodeScript_kSymbolCombiningMark
}, // 42,
173 { UnicodeScript_kLetterlikeSymbol
,
174 UnicodeScript_kLetterlikeSymbol
,
175 UnicodeScript_kLetterlikeSymbol
}, // 43,
176 { UnicodeScript_kNumberForm
,
177 UnicodeScript_kNumberForm
,
178 UnicodeScript_kNumberForm
}, // 44,
179 { UnicodeScript_kArrow
,
180 UnicodeScript_kArrow
,
181 UnicodeScript_kArrow
}, // 45,
182 { UnicodeScript_kMathOperator
,
183 UnicodeScript_kMathOperator
,
184 UnicodeScript_kMathOperator
}, // 46,
185 { UnicodeScript_kMiscTechnical
,
186 UnicodeScript_kMiscTechnical
,
187 UnicodeScript_kMiscTechnical
}, // 47,
188 { UnicodeScript_kControlPicture
,
189 UnicodeScript_kControlPicture
,
190 UnicodeScript_kControlPicture
}, // 48,
191 { UnicodeScript_kOpticalCharacter
,
192 UnicodeScript_kOpticalCharacter
,
193 UnicodeScript_kOpticalCharacter
}, // 49,
194 { UnicodeScript_kEnclosedAlphanumeric
,
195 UnicodeScript_kEnclosedAlphanumeric
,
196 UnicodeScript_kEnclosedAlphanumeric
}, // 50,
197 { UnicodeScript_kBoxDrawing
,
198 UnicodeScript_kBoxDrawing
,
199 UnicodeScript_kBoxDrawing
}, // 51,
200 { UnicodeScript_kBlockElement
,
201 UnicodeScript_kBlockElement
,
202 UnicodeScript_kBlockElement
}, // 52,
203 { UnicodeScript_kGeometricShape
,
204 UnicodeScript_kGeometricShape
,
205 UnicodeScript_kGeometricShape
}, // 53,
206 { UnicodeScript_kMiscSymbol
,
207 UnicodeScript_kMiscSymbol
,
208 UnicodeScript_kMiscSymbol
}, // 54,
209 { UnicodeScript_kDingbat
,
210 UnicodeScript_kDingbat
,
211 UnicodeScript_kDingbat
}, // 55,
212 { UnicodeScript_kBraillePatterns
,
213 UnicodeScript_kBraillePatterns
,
214 UnicodeScript_kBraillePatterns
}, // 56,
215 { UnicodeScript_kCJKRadicalsSupplement
,
216 UnicodeScript_kCJKRadicalsSupplement
,
217 UnicodeScript_kCJKRadicalsSupplement
}, // 57,
218 { UnicodeScript_kKangxiRadicals
,
219 UnicodeScript_kKangxiRadicals
,
220 UnicodeScript_kKangxiRadicals
}, // 58,
221 { UnicodeScript_kIdeographicDescriptionCharacters
,
222 UnicodeScript_kIdeographicDescriptionCharacters
,
223 UnicodeScript_kIdeographicDescriptionCharacters
}, // 59,
224 { UnicodeScript_kCJKSymbolPunctuation
,
225 UnicodeScript_kCJKSymbolPunctuation
,
226 UnicodeScript_kCJKSymbolPunctuation
}, // 60,
227 { UnicodeScript_kHiragana
,
228 UnicodeScript_kHiragana
,
229 UnicodeScript_kHiragana
}, // 61,
230 { UnicodeScript_kKatakana
,
231 UnicodeScript_kKatakana
,
232 UnicodeScript_kKatakana
}, // 62,
233 { UnicodeScript_kBopomofo
,
234 UnicodeScript_kBopomofo
,
235 UnicodeScript_kBopomofo
}, // 63,
236 { UnicodeScript_kHangulCompatibilityJamo
,
237 UnicodeScript_kHangulCompatibilityJamo
,
238 UnicodeScript_kHangulCompatibilityJamo
}, // 64,
239 { UnicodeScript_kKanbun
,
240 UnicodeScript_kKanbun
,
241 UnicodeScript_kKanbun
}, // 65,
242 { UnicodeScript_kBopomofoExtended
,
243 UnicodeScript_kBopomofoExtended
,
244 UnicodeScript_kBopomofoExtended
}, // 66,
245 { UnicodeScript_kEnclosedCJKLetterMonth
,
246 UnicodeScript_kEnclosedCJKLetterMonth
,
247 UnicodeScript_kEnclosedCJKLetterMonth
}, // 67,
248 { UnicodeScript_kCJKCompatibility
,
249 UnicodeScript_kCJKCompatibility
,
250 UnicodeScript_kCJKCompatibility
}, // 68,
251 { UnicodeScript_k_CJKUnifiedIdeographsExtensionA
,
252 UnicodeScript_k_CJKUnifiedIdeographsExtensionA
,
253 UnicodeScript_k_CJKUnifiedIdeographsExtensionA
}, // 69,
254 { UnicodeScript_kCJKUnifiedIdeograph
,
255 UnicodeScript_kCJKUnifiedIdeograph
,
256 UnicodeScript_kCJKUnifiedIdeograph
}, // 70,
257 { UnicodeScript_kYiSyllables
,
258 UnicodeScript_kYiSyllables
,
259 UnicodeScript_kYiSyllables
}, // 71,
260 { UnicodeScript_kYiRadicals
,
261 UnicodeScript_kYiRadicals
,
262 UnicodeScript_kYiRadicals
}, // 72,
263 { UnicodeScript_kHangulSyllable
,
264 UnicodeScript_kHangulSyllable
,
265 UnicodeScript_kHangulSyllable
}, // 73,
266 { UnicodeScript_kHighSurrogate
,
267 UnicodeScript_kHighSurrogate
,
268 UnicodeScript_kHighSurrogate
}, // 74,
269 { UnicodeScript_kHighPrivateUseSurrogate
,
270 UnicodeScript_kHighPrivateUseSurrogate
,
271 UnicodeScript_kHighPrivateUseSurrogate
}, // 75,
272 { UnicodeScript_kLowSurrogate
,
273 UnicodeScript_kLowSurrogate
,
274 UnicodeScript_kLowSurrogate
}, // 76,
275 { UnicodeScript_kPrivateUse
,
276 UnicodeScript_kPrivateUse
,
277 UnicodeScript_kPrivateUse
}, // 77,
278 { UnicodeScript_kCJKCompatibilityIdeograph
,
279 UnicodeScript_kCJKCompatibilityIdeograph
,
280 UnicodeScript_kCJKCompatibilityIdeograph
}, // 78,
281 { UnicodeScript_kAlphabeticPresentation
,
282 UnicodeScript_kAlphabeticPresentation
,
283 UnicodeScript_kAlphabeticPresentation
}, // 79,
284 { UnicodeScript_kArabicPresentationA
,
285 UnicodeScript_kArabicPresentationA
,
286 UnicodeScript_kArabicPresentationA
}, // 80,
287 { UnicodeScript_kCombiningHalfMark
,
288 UnicodeScript_kCombiningHalfMark
,
289 UnicodeScript_kCombiningHalfMark
}, // 81,
290 { UnicodeScript_kCJKCompatibilityForm
,
291 UnicodeScript_kCJKCompatibilityForm
,
292 UnicodeScript_kCJKCompatibilityForm
}, // 82,
293 { UnicodeScript_kSmallFormVariant
,
294 UnicodeScript_kSmallFormVariant
,
295 UnicodeScript_kSmallFormVariant
}, // 83,
296 { UnicodeScript_kArabicPresentationB
,
297 UnicodeScript_kArabicPresentationB
,
298 UnicodeScript_kArabicPresentationB
}, // 84,
299 { UnicodeScript_kNoScript
,
300 UnicodeScript_kNoScript
,
301 UnicodeScript_kNoScript
}, // 85,
302 { UnicodeScript_kHalfwidthFullwidthForm
,
303 UnicodeScript_kHalfwidthFullwidthForm
,
304 UnicodeScript_kHalfwidthFullwidthForm
}, // 86,
305 { UnicodeScript_kScriptCount
,
306 UnicodeScript_kScriptCount
,
307 UnicodeScript_kNoScript
} // 87,
311 unicode::getUnicodeScriptType( const sal_Unicode ch
, ScriptTypeList
* typeList
, sal_Int16 unknownType
) {
314 typeList
= defaultTypeList
;
315 unknownType
= UnicodeScript_kNoScript
;
318 sal_Int16 i
= 0, type
= typeList
[0].to
;
319 while (type
< UnicodeScript_kScriptCount
&& ch
> UnicodeScriptType
[type
][UnicodeScriptTypeTo
]) {
320 type
= typeList
[++i
].to
;
323 return (type
< UnicodeScript_kScriptCount
&&
324 ch
>= UnicodeScriptType
[typeList
[i
].from
][UnicodeScriptTypeFrom
]) ?
325 typeList
[i
].value
: unknownType
;
329 unicode::isUnicodeScriptType( const sal_Unicode ch
, sal_Int16 type
) {
330 return ch
>= UnicodeScriptType
[type
][UnicodeScriptTypeFrom
] &&
331 ch
<= UnicodeScriptType
[type
][UnicodeScriptTypeTo
];
335 unicode::getUnicodeScriptStart( UnicodeScript type
) {
336 return UnicodeScriptType
[type
][UnicodeScriptTypeFrom
];
340 unicode::getUnicodeScriptEnd( UnicodeScript type
) {
341 return UnicodeScriptType
[type
][UnicodeScriptTypeTo
];
345 unicode::getUnicodeType( const sal_Unicode ch
) {
346 static sal_Unicode c
= 0x00;
347 static sal_Int16 r
= 0x00;
349 if (ch
== c
) return r
;
352 sal_Int16 address
= UnicodeTypeIndex
[ch
>> 8];
353 return r
= (sal_Int16
)((address
< UnicodeTypeNumberBlock
) ? UnicodeTypeBlockValue
[address
] :
354 UnicodeTypeValue
[((address
- UnicodeTypeNumberBlock
) << 8) + (ch
& 0xff)]);
358 unicode::getUnicodeDirection( const sal_Unicode ch
) {
359 static sal_Unicode c
= 0x00;
360 static sal_uInt8 r
= 0x00;
362 if (ch
== c
) return r
;
365 sal_Int16 address
= UnicodeDirectionIndex
[ch
>> 8];
366 return r
= ((address
< UnicodeDirectionNumberBlock
) ? UnicodeDirectionBlockValue
[address
] :
367 UnicodeDirectionValue
[((address
- UnicodeDirectionNumberBlock
) << 8) + (ch
& 0xff)]);
// Bit masks over UnicodeType category numbers.
//
// bit(n) yields an int with only bit n set. The argument and every
// multi-term mask are parenthesised so that the macros keep their meaning
// when combined with operators that bind tighter than '|' (for example '&',
// '==' or '~').

#define bit(name) (1 << (name))

#define UPPERMASK   bit(UnicodeType::UPPERCASE_LETTER)

#define LOWERMASK   bit(UnicodeType::LOWERCASE_LETTER)

#define TITLEMASK   bit(UnicodeType::TITLECASE_LETTER)

// All three number categories.
#define DIGITMASK   (bit(UnicodeType::DECIMAL_DIGIT_NUMBER)|\
                     bit(UnicodeType::LETTER_NUMBER)|\
                     bit(UnicodeType::OTHER_NUMBER))

// Cased letters plus modifier and other letters.
#define ALPHAMASK   (UPPERMASK|LOWERMASK|TITLEMASK|\
                     bit(UnicodeType::MODIFIER_LETTER)|\
                     bit(UnicodeType::OTHER_LETTER))

// Digits, letters and the three mark categories.
#define BASEMASK    (DIGITMASK|ALPHAMASK|\
                     bit(UnicodeType::NON_SPACING_MARK)|\
                     bit(UnicodeType::ENCLOSING_MARK)|\
                     bit(UnicodeType::COMBINING_SPACING_MARK))

#define SPACEMASK   (bit(UnicodeType::SPACE_SEPARATOR)|\
                     bit(UnicodeType::LINE_SEPARATOR)|\
                     bit(UnicodeType::PARAGRAPH_SEPARATOR))

#define PUNCTUATIONMASK (bit(UnicodeType::DASH_PUNCTUATION)|\
                     bit(UnicodeType::INITIAL_PUNCTUATION)|\
                     bit(UnicodeType::FINAL_PUNCTUATION)|\
                     bit(UnicodeType::CONNECTOR_PUNCTUATION)|\
                     bit(UnicodeType::OTHER_PUNCTUATION))

#define SYMBOLMASK  (bit(UnicodeType::MATH_SYMBOL)|\
                     bit(UnicodeType::CURRENCY_SYMBOL)|\
                     bit(UnicodeType::MODIFIER_SYMBOL)|\
                     bit(UnicodeType::OTHER_SYMBOL))

#define PRINTMASK   (BASEMASK|SPACEMASK|PUNCTUATIONMASK|SYMBOLMASK)

// Note: the line and paragraph separators are part of both SPACEMASK and
// CONTROLMASK.
#define CONTROLMASK (bit(UnicodeType::CONTROL)|\
                     bit(UnicodeType::FORMAT)|\
                     bit(UnicodeType::LINE_SEPARATOR)|\
                     bit(UnicodeType::PARAGRAPH_SEPARATOR))
// Generates the definition of a predicate `func` taking one sal_Unicode and
// returning sal_Bool: the result is non-zero when the bit for
// getUnicodeType(ch) is present in `mask`.
//
// The last line of the macro must not end in a backslash, otherwise the
// source line following the macro would be spliced into it.
#define IsType(func, mask)  \
sal_Bool SAL_CALL func( const sal_Unicode ch) {\
    return (bit(getUnicodeType(ch)) & (mask)) != 0;\
}
419 IsType(unicode::isUpper
, UPPERMASK
)
420 IsType(unicode::isLower
, LOWERMASK
)
421 IsType(unicode::isTitle
, DIGITMASK
)
422 IsType(unicode::isControl
, CONTROLMASK
)
423 IsType(unicode::isPrint
, PRINTMASK
)
424 IsType(unicode::isAlpha
, ALPHAMASK
)
425 IsType(unicode::isDigit
, DIGITMASK
)
426 IsType(unicode::isAlphaDigit
, ALPHAMASK
|DIGITMASK
)
427 IsType(unicode::isSpace
, SPACEMASK
)
428 IsType(unicode::isBase
, BASEMASK
)
429 IsType(unicode::isPunctuation
, PUNCTUATIONMASK
)
// Bit set of the control codes 0x09-0x0d and 0x1c-0x1f, one bit per code
// value. Parenthesised so the mask stays intact when combined with '&' or
// other operators that bind tighter than '|'.
#define CONTROLSPACE (bit(0x09)|bit(0x0a)|bit(0x0b)|bit(0x0c)|bit(0x0d)|\
                      bit(0x1c)|bit(0x1d)|bit(0x1e)|bit(0x1f))
434 sal_Bool SAL_CALL
unicode::isWhiteSpace( const sal_Unicode ch
) {
435 return (ch
!= 0xa0 && isSpace(ch
)) || (ch
<= 0x1F && (bit(ch
) & (CONTROLSPACE
)));
438 sal_Int32 SAL_CALL
unicode::getCharType( const sal_Unicode ch
)
440 using namespace ::com::sun::star::i18n::KCharacterType
;
442 switch ( getUnicodeType( ch
) ) {
444 case UnicodeType::UPPERCASE_LETTER
:
445 return UPPER
|LETTER
|PRINTABLE
|BASE_FORM
;
448 case UnicodeType::LOWERCASE_LETTER
:
449 return LOWER
|LETTER
|PRINTABLE
|BASE_FORM
;
452 case UnicodeType::TITLECASE_LETTER
:
453 return TITLE_CASE
|LETTER
|PRINTABLE
|BASE_FORM
;
456 case UnicodeType::MODIFIER_LETTER
:
457 case UnicodeType::OTHER_LETTER
:
458 return LETTER
|PRINTABLE
|BASE_FORM
;
461 case UnicodeType::DECIMAL_DIGIT_NUMBER
:
462 case UnicodeType::LETTER_NUMBER
:
463 case UnicodeType::OTHER_NUMBER
:
464 return DIGIT
|PRINTABLE
|BASE_FORM
;
467 case UnicodeType::NON_SPACING_MARK
:
468 case UnicodeType::ENCLOSING_MARK
:
469 case UnicodeType::COMBINING_SPACING_MARK
:
470 return BASE_FORM
|PRINTABLE
;
473 case UnicodeType::SPACE_SEPARATOR
:
475 case UnicodeType::DASH_PUNCTUATION
:
476 case UnicodeType::INITIAL_PUNCTUATION
:
477 case UnicodeType::FINAL_PUNCTUATION
:
478 case UnicodeType::CONNECTOR_PUNCTUATION
:
479 case UnicodeType::OTHER_PUNCTUATION
:
481 case UnicodeType::MATH_SYMBOL
:
482 case UnicodeType::CURRENCY_SYMBOL
:
483 case UnicodeType::MODIFIER_SYMBOL
:
484 case UnicodeType::OTHER_SYMBOL
:
488 case UnicodeType::CONTROL
:
489 case UnicodeType::FORMAT
:
492 case UnicodeType::LINE_SEPARATOR
:
493 case UnicodeType::PARAGRAPH_SEPARATOR
:
494 return CONTROL
|PRINTABLE
;