Version 5.2.6.1, tag libreoffice-5.2.6.1
[LibreOffice.git] / i18nutil / source / utility / unicode.cxx
blob6507479807fe5f75d5995a04595ca526f156feed
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <com/sun/star/i18n/UnicodeType.hpp>
21 #include <com/sun/star/i18n/KCharacterType.hpp>
22 #include <com/sun/star/i18n/ScriptType.hpp>
23 #include <i18nlangtag/languagetag.hxx>
24 #include <i18nlangtag/languagetagicu.hxx>
25 #include <i18nutil/unicode.hxx>
26 #include <sal/log.hxx>
27 #include <unicode/numfmt.h>
28 #include "unicode_data.h"
29 #include <com/sun/star/i18n/UnicodeType.hpp>
30 #include <rtl/character.hxx>
31 #include <memory>
33 // Workaround for glibc braindamage:
34 // glibc 2.4's langinfo.h does "#define CURRENCY_SYMBOL __CURRENCY_SYMBOL"
35 // which (obviously) breaks UnicodeType::CURRENCY_SYMBOL
36 #undef CURRENCY_SYMBOL
38 using namespace ::com::sun::star::i18n;
40 static const ScriptTypeList defaultTypeList[] = {
41 { UnicodeScript_kBasicLatin,
42 UnicodeScript_kBasicLatin,
43 UnicodeScript_kBasicLatin }, // 0,
44 { UnicodeScript_kLatin1Supplement,
45 UnicodeScript_kLatin1Supplement,
46 UnicodeScript_kLatin1Supplement },// 1,
47 { UnicodeScript_kLatinExtendedA,
48 UnicodeScript_kLatinExtendedA,
49 UnicodeScript_kLatinExtendedA }, // 2,
50 { UnicodeScript_kLatinExtendedB,
51 UnicodeScript_kLatinExtendedB,
52 UnicodeScript_kLatinExtendedB }, // 3,
53 { UnicodeScript_kIPAExtension,
54 UnicodeScript_kIPAExtension,
55 UnicodeScript_kIPAExtension }, // 4,
56 { UnicodeScript_kSpacingModifier,
57 UnicodeScript_kSpacingModifier,
58 UnicodeScript_kSpacingModifier }, // 5,
59 { UnicodeScript_kCombiningDiacritical,
60 UnicodeScript_kCombiningDiacritical,
61 UnicodeScript_kCombiningDiacritical }, // 6,
62 { UnicodeScript_kGreek,
63 UnicodeScript_kGreek,
64 UnicodeScript_kGreek }, // 7,
65 { UnicodeScript_kCyrillic,
66 UnicodeScript_kCyrillic,
67 UnicodeScript_kCyrillic }, // 8,
68 { UnicodeScript_kArmenian,
69 UnicodeScript_kArmenian,
70 UnicodeScript_kArmenian }, // 9,
71 { UnicodeScript_kHebrew,
72 UnicodeScript_kHebrew,
73 UnicodeScript_kHebrew }, // 10,
74 { UnicodeScript_kArabic,
75 UnicodeScript_kArabic,
76 UnicodeScript_kArabic }, // 11,
77 { UnicodeScript_kSyriac,
78 UnicodeScript_kSyriac,
79 UnicodeScript_kSyriac }, // 12,
80 { UnicodeScript_kThaana,
81 UnicodeScript_kThaana,
82 UnicodeScript_kThaana }, // 13,
83 { UnicodeScript_kDevanagari,
84 UnicodeScript_kDevanagari,
85 UnicodeScript_kDevanagari }, // 14,
86 { UnicodeScript_kBengali,
87 UnicodeScript_kBengali,
88 UnicodeScript_kBengali }, // 15,
89 { UnicodeScript_kGurmukhi,
90 UnicodeScript_kGurmukhi,
91 UnicodeScript_kGurmukhi }, // 16,
92 { UnicodeScript_kGujarati,
93 UnicodeScript_kGujarati,
94 UnicodeScript_kGujarati }, // 17,
95 { UnicodeScript_kOriya,
96 UnicodeScript_kOriya,
97 UnicodeScript_kOriya }, // 18,
98 { UnicodeScript_kTamil,
99 UnicodeScript_kTamil,
100 UnicodeScript_kTamil }, // 19,
101 { UnicodeScript_kTelugu,
102 UnicodeScript_kTelugu,
103 UnicodeScript_kTelugu }, // 20,
104 { UnicodeScript_kKannada,
105 UnicodeScript_kKannada,
106 UnicodeScript_kKannada }, // 21,
107 { UnicodeScript_kMalayalam,
108 UnicodeScript_kMalayalam,
109 UnicodeScript_kMalayalam }, // 22,
110 { UnicodeScript_kSinhala,
111 UnicodeScript_kSinhala,
112 UnicodeScript_kSinhala }, // 23,
113 { UnicodeScript_kThai,
114 UnicodeScript_kThai,
115 UnicodeScript_kThai }, // 24,
116 { UnicodeScript_kLao,
117 UnicodeScript_kLao,
118 UnicodeScript_kLao }, // 25,
119 { UnicodeScript_kTibetan,
120 UnicodeScript_kTibetan,
121 UnicodeScript_kTibetan }, // 26,
122 { UnicodeScript_kMyanmar,
123 UnicodeScript_kMyanmar,
124 UnicodeScript_kMyanmar }, // 27,
125 { UnicodeScript_kGeorgian,
126 UnicodeScript_kGeorgian,
127 UnicodeScript_kGeorgian }, // 28,
128 { UnicodeScript_kHangulJamo,
129 UnicodeScript_kHangulJamo,
130 UnicodeScript_kHangulJamo }, // 29,
131 { UnicodeScript_kEthiopic,
132 UnicodeScript_kEthiopic,
133 UnicodeScript_kEthiopic }, // 30,
134 { UnicodeScript_kCherokee,
135 UnicodeScript_kCherokee,
136 UnicodeScript_kCherokee }, // 31,
137 { UnicodeScript_kUnifiedCanadianAboriginalSyllabics,
138 UnicodeScript_kUnifiedCanadianAboriginalSyllabics,
139 UnicodeScript_kUnifiedCanadianAboriginalSyllabics }, // 32,
140 { UnicodeScript_kOgham,
141 UnicodeScript_kOgham,
142 UnicodeScript_kOgham }, // 33,
143 { UnicodeScript_kRunic,
144 UnicodeScript_kRunic,
145 UnicodeScript_kRunic }, // 34,
146 { UnicodeScript_kKhmer,
147 UnicodeScript_kKhmer,
148 UnicodeScript_kKhmer }, // 35,
149 { UnicodeScript_kMongolian,
150 UnicodeScript_kMongolian,
151 UnicodeScript_kMongolian }, // 36,
152 { UnicodeScript_kLatinExtendedAdditional,
153 UnicodeScript_kLatinExtendedAdditional,
154 UnicodeScript_kLatinExtendedAdditional }, // 37,
155 { UnicodeScript_kGreekExtended,
156 UnicodeScript_kGreekExtended,
157 UnicodeScript_kGreekExtended }, // 38,
158 { UnicodeScript_kGeneralPunctuation,
159 UnicodeScript_kGeneralPunctuation,
160 UnicodeScript_kGeneralPunctuation }, // 39,
161 { UnicodeScript_kSuperSubScript,
162 UnicodeScript_kSuperSubScript,
163 UnicodeScript_kSuperSubScript }, // 40,
164 { UnicodeScript_kCurrencySymbolScript,
165 UnicodeScript_kCurrencySymbolScript,
166 UnicodeScript_kCurrencySymbolScript }, // 41,
167 { UnicodeScript_kSymbolCombiningMark,
168 UnicodeScript_kSymbolCombiningMark,
169 UnicodeScript_kSymbolCombiningMark }, // 42,
170 { UnicodeScript_kLetterlikeSymbol,
171 UnicodeScript_kLetterlikeSymbol,
172 UnicodeScript_kLetterlikeSymbol }, // 43,
173 { UnicodeScript_kNumberForm,
174 UnicodeScript_kNumberForm,
175 UnicodeScript_kNumberForm }, // 44,
176 { UnicodeScript_kArrow,
177 UnicodeScript_kArrow,
178 UnicodeScript_kArrow }, // 45,
179 { UnicodeScript_kMathOperator,
180 UnicodeScript_kMathOperator,
181 UnicodeScript_kMathOperator }, // 46,
182 { UnicodeScript_kMiscTechnical,
183 UnicodeScript_kMiscTechnical,
184 UnicodeScript_kMiscTechnical }, // 47,
185 { UnicodeScript_kControlPicture,
186 UnicodeScript_kControlPicture,
187 UnicodeScript_kControlPicture }, // 48,
188 { UnicodeScript_kOpticalCharacter,
189 UnicodeScript_kOpticalCharacter,
190 UnicodeScript_kOpticalCharacter }, // 49,
191 { UnicodeScript_kEnclosedAlphanumeric,
192 UnicodeScript_kEnclosedAlphanumeric,
193 UnicodeScript_kEnclosedAlphanumeric }, // 50,
194 { UnicodeScript_kBoxDrawing,
195 UnicodeScript_kBoxDrawing,
196 UnicodeScript_kBoxDrawing }, // 51,
197 { UnicodeScript_kBlockElement,
198 UnicodeScript_kBlockElement,
199 UnicodeScript_kBlockElement }, // 52,
200 { UnicodeScript_kGeometricShape,
201 UnicodeScript_kGeometricShape,
202 UnicodeScript_kGeometricShape }, // 53,
203 { UnicodeScript_kMiscSymbol,
204 UnicodeScript_kMiscSymbol,
205 UnicodeScript_kMiscSymbol }, // 54,
206 { UnicodeScript_kDingbat,
207 UnicodeScript_kDingbat,
208 UnicodeScript_kDingbat }, // 55,
209 { UnicodeScript_kBraillePatterns,
210 UnicodeScript_kBraillePatterns,
211 UnicodeScript_kBraillePatterns }, // 56,
212 { UnicodeScript_kCJKRadicalsSupplement,
213 UnicodeScript_kCJKRadicalsSupplement,
214 UnicodeScript_kCJKRadicalsSupplement }, // 57,
215 { UnicodeScript_kKangxiRadicals,
216 UnicodeScript_kKangxiRadicals,
217 UnicodeScript_kKangxiRadicals }, // 58,
218 { UnicodeScript_kIdeographicDescriptionCharacters,
219 UnicodeScript_kIdeographicDescriptionCharacters,
220 UnicodeScript_kIdeographicDescriptionCharacters }, // 59,
221 { UnicodeScript_kCJKSymbolPunctuation,
222 UnicodeScript_kCJKSymbolPunctuation,
223 UnicodeScript_kCJKSymbolPunctuation }, // 60,
224 { UnicodeScript_kHiragana,
225 UnicodeScript_kHiragana,
226 UnicodeScript_kHiragana }, // 61,
227 { UnicodeScript_kKatakana,
228 UnicodeScript_kKatakana,
229 UnicodeScript_kKatakana }, // 62,
230 { UnicodeScript_kBopomofo,
231 UnicodeScript_kBopomofo,
232 UnicodeScript_kBopomofo }, // 63,
233 { UnicodeScript_kHangulCompatibilityJamo,
234 UnicodeScript_kHangulCompatibilityJamo,
235 UnicodeScript_kHangulCompatibilityJamo }, // 64,
236 { UnicodeScript_kKanbun,
237 UnicodeScript_kKanbun,
238 UnicodeScript_kKanbun }, // 65,
239 { UnicodeScript_kBopomofoExtended,
240 UnicodeScript_kBopomofoExtended,
241 UnicodeScript_kBopomofoExtended }, // 66,
242 { UnicodeScript_kEnclosedCJKLetterMonth,
243 UnicodeScript_kEnclosedCJKLetterMonth,
244 UnicodeScript_kEnclosedCJKLetterMonth }, // 67,
245 { UnicodeScript_kCJKCompatibility,
246 UnicodeScript_kCJKCompatibility,
247 UnicodeScript_kCJKCompatibility }, // 68,
248 { UnicodeScript_k_CJKUnifiedIdeographsExtensionA,
249 UnicodeScript_k_CJKUnifiedIdeographsExtensionA,
250 UnicodeScript_k_CJKUnifiedIdeographsExtensionA }, // 69,
251 { UnicodeScript_kCJKUnifiedIdeograph,
252 UnicodeScript_kCJKUnifiedIdeograph,
253 UnicodeScript_kCJKUnifiedIdeograph }, // 70,
254 { UnicodeScript_kYiSyllables,
255 UnicodeScript_kYiSyllables,
256 UnicodeScript_kYiSyllables }, // 71,
257 { UnicodeScript_kYiRadicals,
258 UnicodeScript_kYiRadicals,
259 UnicodeScript_kYiRadicals }, // 72,
260 { UnicodeScript_kHangulSyllable,
261 UnicodeScript_kHangulSyllable,
262 UnicodeScript_kHangulSyllable }, // 73,
263 { UnicodeScript_kHighSurrogate,
264 UnicodeScript_kHighSurrogate,
265 UnicodeScript_kHighSurrogate }, // 74,
266 { UnicodeScript_kHighPrivateUseSurrogate,
267 UnicodeScript_kHighPrivateUseSurrogate,
268 UnicodeScript_kHighPrivateUseSurrogate }, // 75,
269 { UnicodeScript_kLowSurrogate,
270 UnicodeScript_kLowSurrogate,
271 UnicodeScript_kLowSurrogate }, // 76,
272 { UnicodeScript_kPrivateUse,
273 UnicodeScript_kPrivateUse,
274 UnicodeScript_kPrivateUse }, // 77,
275 { UnicodeScript_kCJKCompatibilityIdeograph,
276 UnicodeScript_kCJKCompatibilityIdeograph,
277 UnicodeScript_kCJKCompatibilityIdeograph }, // 78,
278 { UnicodeScript_kAlphabeticPresentation,
279 UnicodeScript_kAlphabeticPresentation,
280 UnicodeScript_kAlphabeticPresentation }, // 79,
281 { UnicodeScript_kArabicPresentationA,
282 UnicodeScript_kArabicPresentationA,
283 UnicodeScript_kArabicPresentationA }, // 80,
284 { UnicodeScript_kCombiningHalfMark,
285 UnicodeScript_kCombiningHalfMark,
286 UnicodeScript_kCombiningHalfMark }, // 81,
287 { UnicodeScript_kCJKCompatibilityForm,
288 UnicodeScript_kCJKCompatibilityForm,
289 UnicodeScript_kCJKCompatibilityForm }, // 82,
290 { UnicodeScript_kSmallFormVariant,
291 UnicodeScript_kSmallFormVariant,
292 UnicodeScript_kSmallFormVariant }, // 83,
293 { UnicodeScript_kArabicPresentationB,
294 UnicodeScript_kArabicPresentationB,
295 UnicodeScript_kArabicPresentationB }, // 84,
296 { UnicodeScript_kNoScript,
297 UnicodeScript_kNoScript,
298 UnicodeScript_kNoScript }, // 85,
299 { UnicodeScript_kHalfwidthFullwidthForm,
300 UnicodeScript_kHalfwidthFullwidthForm,
301 UnicodeScript_kHalfwidthFullwidthForm }, // 86,
302 { UnicodeScript_kScriptCount,
303 UnicodeScript_kScriptCount,
304 UnicodeScript_kNoScript } // 87,
307 sal_Int16 SAL_CALL
308 unicode::getUnicodeScriptType( const sal_Unicode ch, const ScriptTypeList* typeList, sal_Int16 unknownType ) {
310 if (!typeList) {
311 typeList = defaultTypeList;
312 unknownType = UnicodeScript_kNoScript;
315 sal_Int16 i = 0, type = typeList[0].to;
316 while (type < UnicodeScript_kScriptCount && ch > UnicodeScriptType[type][UnicodeScriptTypeTo]) {
317 type = typeList[++i].to;
320 return (type < UnicodeScript_kScriptCount &&
321 ch >= UnicodeScriptType[typeList[i].from][UnicodeScriptTypeFrom]) ?
322 typeList[i].value : unknownType;
325 sal_Unicode SAL_CALL
326 unicode::getUnicodeScriptStart( UnicodeScript type) {
327 return UnicodeScriptType[type][UnicodeScriptTypeFrom];
330 sal_Unicode SAL_CALL
331 unicode::getUnicodeScriptEnd( UnicodeScript type) {
332 return UnicodeScriptType[type][UnicodeScriptTypeTo];
335 sal_Int16 SAL_CALL
336 unicode::getUnicodeType( const sal_Unicode ch ) {
337 static sal_Unicode c = 0x00;
338 static sal_Int16 r = 0x00;
340 if (ch == c) return r;
341 else c = ch;
343 sal_Int16 address = UnicodeTypeIndex[ch >> 8];
344 return r = (sal_Int16)((address < UnicodeTypeNumberBlock) ? UnicodeTypeBlockValue[address] :
345 UnicodeTypeValue[((address - UnicodeTypeNumberBlock) << 8) + (ch & 0xff)]);
348 sal_uInt8 SAL_CALL
349 unicode::getUnicodeDirection( const sal_Unicode ch ) {
350 static sal_Unicode c = 0x00;
351 static sal_uInt8 r = 0x00;
353 if (ch == c) return r;
354 else c = ch;
356 sal_Int16 address = UnicodeDirectionIndex[ch >> 8];
357 return r = ((address < UnicodeDirectionNumberBlock) ? UnicodeDirectionBlockValue[address] :
358 UnicodeDirectionValue[((address - UnicodeDirectionNumberBlock) << 8) + (ch & 0xff)]);
// Helpers for treating a UnicodeType value as a position in a 32-bit set.
// The argument and every multi-term replacement list are parenthesised so
// that the macros expand as a single operand in any surrounding expression
// (e.g. `x & ALPHAMASK` or `bit(a | b)`).
#define bit(name)   (1U << (name))

#define UPPERMASK   bit(UnicodeType::UPPERCASE_LETTER)

#define LOWERMASK   bit(UnicodeType::LOWERCASE_LETTER)

#define TITLEMASK   bit(UnicodeType::TITLECASE_LETTER)

// Any kind of letter.
#define ALPHAMASK   (UPPERMASK|LOWERMASK|TITLEMASK|\
            bit(UnicodeType::MODIFIER_LETTER)|\
            bit(UnicodeType::OTHER_LETTER))

// Space, line and paragraph separators.
#define SPACEMASK   (bit(UnicodeType::SPACE_SEPARATOR)|\
            bit(UnicodeType::LINE_SEPARATOR)|\
            bit(UnicodeType::PARAGRAPH_SEPARATOR))

// Control and format characters plus the line/paragraph separators.
#define CONTROLMASK (bit(UnicodeType::CONTROL)|\
            bit(UnicodeType::FORMAT)|\
            bit(UnicodeType::LINE_SEPARATOR)|\
            bit(UnicodeType::PARAGRAPH_SEPARATOR))
383 #define IsType(func, mask) \
384 bool SAL_CALL func( const sal_Unicode ch) {\
385 return (bit(getUnicodeType(ch)) & (mask)) != 0;\
388 IsType(unicode::isControl, CONTROLMASK)
389 IsType(unicode::isAlpha, ALPHAMASK)
390 IsType(unicode::isSpace, SPACEMASK)
392 #define CONTROLSPACE bit(0x09)|bit(0x0a)|bit(0x0b)|bit(0x0c)|bit(0x0d)|\
393 bit(0x1c)|bit(0x1d)|bit(0x1e)|bit(0x1f)
395 bool SAL_CALL unicode::isWhiteSpace( const sal_Unicode ch) {
396 return (ch != 0xa0 && isSpace(ch)) || (ch <= 0x1F && (bit(ch) & (CONTROLSPACE)));
399 sal_Int16 SAL_CALL unicode::getScriptClassFromUScriptCode(UScriptCode eScript)
401 //See unicode/uscript.h
402 static const sal_Int16 scriptTypes[] =
404 ScriptType::WEAK, ScriptType::WEAK, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX,
405 ScriptType::ASIAN, ScriptType::LATIN, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX,
406 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::LATIN,
407 // 15
408 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::ASIAN, ScriptType::COMPLEX,
409 ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX,
410 ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN,
411 // 30
412 ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX,
413 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
414 ScriptType::LATIN, ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
415 // 45
416 ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX,
417 ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN,
418 ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
419 // 60
420 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
421 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX,
422 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::ASIAN,
423 // 75
424 ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
425 ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
426 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
427 // 90
428 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
429 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
430 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::WEAK, ScriptType::WEAK, ScriptType::COMPLEX,
431 // 105
432 ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
433 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
434 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN,
435 // 120
436 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
437 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::WEAK, ScriptType::WEAK,
438 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
439 // 135
440 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
441 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
442 ScriptType::COMPLEX,
443 ScriptType::WEAK
446 sal_Int16 nRet;
447 if (eScript < USCRIPT_COMMON)
448 nRet = ScriptType::WEAK;
449 else if (static_cast<size_t>(eScript) >= SAL_N_ELEMENTS(scriptTypes))
450 nRet = ScriptType::COMPLEX; // anything new is going to be pretty wild
451 else
452 nRet = scriptTypes[eScript];
453 return nRet;
456 OString SAL_CALL unicode::getExemplarLanguageForUScriptCode(UScriptCode eScript)
458 OString sRet;
459 switch (eScript)
461 case USCRIPT_CODE_LIMIT:
462 case USCRIPT_INVALID_CODE:
463 sRet = "zxx";
464 break;
465 case USCRIPT_COMMON:
466 case USCRIPT_INHERITED:
467 sRet = "und";
468 break;
469 case USCRIPT_MATHEMATICAL_NOTATION:
470 case USCRIPT_SYMBOLS:
471 sRet = "zxx";
472 break;
473 case USCRIPT_UNWRITTEN_LANGUAGES:
474 case USCRIPT_UNKNOWN:
475 sRet = "und";
476 break;
477 case USCRIPT_ARABIC:
478 sRet = "ar";
479 break;
480 case USCRIPT_ARMENIAN:
481 sRet = "hy";
482 break;
483 case USCRIPT_BENGALI:
484 sRet = "bn";
485 break;
486 case USCRIPT_BOPOMOFO:
487 sRet = "zh";
488 break;
489 case USCRIPT_CHEROKEE:
490 sRet = "chr";
491 break;
492 case USCRIPT_COPTIC:
493 sRet = "cop";
494 break;
495 case USCRIPT_CYRILLIC:
496 sRet = "ru";
497 break;
498 case USCRIPT_DESERET:
499 sRet = "en";
500 break;
501 case USCRIPT_DEVANAGARI:
502 sRet = "hi";
503 break;
504 case USCRIPT_ETHIOPIC:
505 sRet = "am";
506 break;
507 case USCRIPT_GEORGIAN:
508 sRet = "ka";
509 break;
510 case USCRIPT_GOTHIC:
511 sRet = "got";
512 break;
513 case USCRIPT_GREEK:
514 sRet = "el";
515 break;
516 case USCRIPT_GUJARATI:
517 sRet = "gu";
518 break;
519 case USCRIPT_GURMUKHI:
520 sRet = "pa";
521 break;
522 case USCRIPT_HAN:
523 sRet = "zh";
524 break;
525 case USCRIPT_HANGUL:
526 sRet = "ko";
527 break;
528 case USCRIPT_HEBREW:
529 sRet = "hr";
530 break;
531 case USCRIPT_HIRAGANA:
532 sRet = "ja";
533 break;
534 case USCRIPT_KANNADA:
535 sRet = "kn";
536 break;
537 case USCRIPT_KATAKANA:
538 sRet = "ja";
539 break;
540 case USCRIPT_KHMER:
541 sRet = "km";
542 break;
543 case USCRIPT_LAO:
544 sRet = "lo";
545 break;
546 case USCRIPT_LATIN:
547 sRet = "en";
548 break;
549 case USCRIPT_MALAYALAM:
550 sRet = "ml";
551 break;
552 case USCRIPT_MONGOLIAN:
553 sRet = "mn";
554 break;
555 case USCRIPT_MYANMAR:
556 sRet = "my";
557 break;
558 case USCRIPT_OGHAM:
559 sRet = "pgl";
560 break;
561 case USCRIPT_OLD_ITALIC:
562 sRet = "osc";
563 break;
564 case USCRIPT_ORIYA:
565 sRet = "or";
566 break;
567 case USCRIPT_RUNIC:
568 sRet = "ang";
569 break;
570 case USCRIPT_SINHALA:
571 sRet = "si";
572 break;
573 case USCRIPT_SYRIAC:
574 sRet = "syr";
575 break;
576 case USCRIPT_TAMIL:
577 sRet = "ta";
578 break;
579 case USCRIPT_TELUGU:
580 sRet = "te";
581 break;
582 case USCRIPT_THAANA:
583 sRet = "dv";
584 break;
585 case USCRIPT_THAI:
586 sRet = "th";
587 break;
588 case USCRIPT_TIBETAN:
589 sRet = "bo";
590 break;
591 case USCRIPT_CANADIAN_ABORIGINAL:
592 sRet = "iu";
593 break;
594 case USCRIPT_YI:
595 sRet = "ii";
596 break;
597 case USCRIPT_TAGALOG:
598 sRet = "tl";
599 break;
600 case USCRIPT_HANUNOO:
601 sRet = "hnn";
602 break;
603 case USCRIPT_BUHID:
604 sRet = "bku";
605 break;
606 case USCRIPT_TAGBANWA:
607 sRet = "tbw";
608 break;
609 case USCRIPT_BRAILLE:
610 sRet = "en";
611 break;
612 case USCRIPT_CYPRIOT:
613 sRet = "ecy";
614 break;
615 case USCRIPT_LIMBU:
616 sRet = "lif";
617 break;
618 case USCRIPT_LINEAR_B:
619 sRet = "gmy";
620 break;
621 case USCRIPT_OSMANYA:
622 sRet = "so";
623 break;
624 case USCRIPT_SHAVIAN:
625 sRet = "en";
626 break;
627 case USCRIPT_TAI_LE:
628 sRet = "tdd";
629 break;
630 case USCRIPT_UGARITIC:
631 sRet = "uga";
632 break;
633 case USCRIPT_KATAKANA_OR_HIRAGANA:
634 sRet = "ja";
635 break;
636 case USCRIPT_BUGINESE:
637 sRet = "bug";
638 break;
639 case USCRIPT_GLAGOLITIC:
640 sRet = "ch";
641 break;
642 case USCRIPT_KHAROSHTHI:
643 sRet = "pra";
644 break;
645 case USCRIPT_SYLOTI_NAGRI:
646 sRet = "syl";
647 break;
648 case USCRIPT_NEW_TAI_LUE:
649 sRet = "khb";
650 break;
651 case USCRIPT_TIFINAGH:
652 sRet = "tmh";
653 break;
654 case USCRIPT_OLD_PERSIAN:
655 sRet = "peo";
656 break;
657 case USCRIPT_BALINESE:
658 sRet = "ban";
659 break;
660 case USCRIPT_BATAK:
661 sRet = "btk";
662 break;
663 case USCRIPT_BLISSYMBOLS:
664 sRet = "en";
665 break;
666 case USCRIPT_BRAHMI:
667 sRet = "pra";
668 break;
669 case USCRIPT_CHAM:
670 sRet = "cja";
671 break;
672 case USCRIPT_CIRTH:
673 sRet = "sjn";
674 break;
675 case USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC:
676 sRet = "cu";
677 break;
678 case USCRIPT_DEMOTIC_EGYPTIAN:
679 case USCRIPT_HIERATIC_EGYPTIAN:
680 case USCRIPT_EGYPTIAN_HIEROGLYPHS:
681 sRet = "egy";
682 break;
683 case USCRIPT_KHUTSURI:
684 sRet = "ka";
685 break;
686 case USCRIPT_SIMPLIFIED_HAN:
687 sRet = "zh";
688 break;
689 case USCRIPT_TRADITIONAL_HAN:
690 sRet = "zh";
691 break;
692 case USCRIPT_PAHAWH_HMONG:
693 sRet = "blu";
694 break;
695 case USCRIPT_OLD_HUNGARIAN:
696 sRet = "ohu";
697 break;
698 case USCRIPT_HARAPPAN_INDUS:
699 sRet = "xiv";
700 break;
701 case USCRIPT_JAVANESE:
702 sRet = "kaw";
703 break;
704 case USCRIPT_KAYAH_LI:
705 sRet = "eky";
706 break;
707 case USCRIPT_LATIN_FRAKTUR:
708 sRet = "de";
709 break;
710 case USCRIPT_LATIN_GAELIC:
711 sRet = "ga";
712 break;
713 case USCRIPT_LEPCHA:
714 sRet = "lep";
715 break;
716 case USCRIPT_LINEAR_A:
717 sRet = "ecr";
718 break;
719 case USCRIPT_MAYAN_HIEROGLYPHS:
720 sRet = "myn";
721 break;
722 case USCRIPT_MEROITIC:
723 sRet = "xmr";
724 break;
725 case USCRIPT_NKO:
726 sRet = "nqo";
727 break;
728 case USCRIPT_ORKHON:
729 sRet = "otk";
730 break;
731 case USCRIPT_OLD_PERMIC:
732 sRet = "kv";
733 break;
734 case USCRIPT_PHAGS_PA:
735 sRet = "xng";
736 break;
737 case USCRIPT_PHOENICIAN:
738 sRet = "phn";
739 break;
740 case USCRIPT_PHONETIC_POLLARD:
741 sRet = "hmd";
742 break;
743 case USCRIPT_RONGORONGO:
744 sRet = "rap";
745 break;
746 case USCRIPT_SARATI:
747 sRet = "qya";
748 break;
749 case USCRIPT_ESTRANGELO_SYRIAC:
750 sRet = "syr";
751 break;
752 case USCRIPT_WESTERN_SYRIAC:
753 sRet = "tru";
754 break;
755 case USCRIPT_EASTERN_SYRIAC:
756 sRet = "aii";
757 break;
758 case USCRIPT_TENGWAR:
759 sRet = "sjn";
760 break;
761 case USCRIPT_VAI:
762 sRet = "vai";
763 break;
764 case USCRIPT_VISIBLE_SPEECH:
765 sRet = "en";
766 break;
767 case USCRIPT_CUNEIFORM:
768 sRet = "akk";
769 break;
770 case USCRIPT_CARIAN:
771 sRet = "xcr";
772 break;
773 case USCRIPT_JAPANESE:
774 sRet = "ja";
775 break;
776 case USCRIPT_LANNA:
777 sRet = "nod";
778 break;
779 case USCRIPT_LYCIAN:
780 sRet = "xlc";
781 break;
782 case USCRIPT_LYDIAN:
783 sRet = "xld";
784 break;
785 case USCRIPT_OL_CHIKI:
786 sRet = "sat";
787 break;
788 case USCRIPT_REJANG:
789 sRet = "rej";
790 break;
791 case USCRIPT_SAURASHTRA:
792 sRet = "saz";
793 break;
794 case USCRIPT_SIGN_WRITING:
795 sRet = "en";
796 break;
797 case USCRIPT_SUNDANESE:
798 sRet = "su";
799 break;
800 case USCRIPT_MOON:
801 sRet = "en";
802 break;
803 case USCRIPT_MEITEI_MAYEK:
804 sRet = "mni";
805 break;
806 case USCRIPT_IMPERIAL_ARAMAIC:
807 sRet = "arc";
808 break;
809 case USCRIPT_AVESTAN:
810 sRet = "ae";
811 break;
812 case USCRIPT_CHAKMA:
813 sRet = "ccp";
814 break;
815 case USCRIPT_KOREAN:
816 sRet = "ko";
817 break;
818 case USCRIPT_KAITHI:
819 sRet = "awa";
820 break;
821 case USCRIPT_MANICHAEAN:
822 sRet = "xmn";
823 break;
824 case USCRIPT_INSCRIPTIONAL_PAHLAVI:
825 case USCRIPT_PSALTER_PAHLAVI:
826 case USCRIPT_BOOK_PAHLAVI:
827 case USCRIPT_INSCRIPTIONAL_PARTHIAN:
828 sRet = "xpr";
829 break;
830 case USCRIPT_SAMARITAN:
831 sRet = "heb";
832 break;
833 case USCRIPT_TAI_VIET:
834 sRet = "blt";
835 break;
836 case USCRIPT_MANDAEAN: /* Aliased to USCRIPT_MANDAIC in icu 4.6. */
837 sRet = "mic";
838 break;
839 #if (U_ICU_VERSION_MAJOR_NUM > 4) || (U_ICU_VERSION_MAJOR_NUM == 4 && U_ICU_VERSION_MINOR_NUM >= 4)
840 case USCRIPT_NABATAEAN: //no language with an assigned code yet
841 sRet = "mis";
842 break;
843 case USCRIPT_PALMYRENE: //no language with an assigned code yet
844 sRet = "mis";
845 break;
846 case USCRIPT_BAMUM:
847 sRet = "bax";
848 break;
849 case USCRIPT_LISU:
850 sRet = "lis";
851 break;
852 case USCRIPT_NAKHI_GEBA:
853 sRet = "nxq";
854 break;
855 case USCRIPT_OLD_SOUTH_ARABIAN:
856 sRet = "xsa";
857 break;
858 case USCRIPT_BASSA_VAH:
859 sRet = "bsq";
860 break;
861 case USCRIPT_DUPLOYAN_SHORTAND:
862 sRet = "fr";
863 break;
864 case USCRIPT_ELBASAN:
865 sRet = "sq";
866 break;
867 case USCRIPT_GRANTHA:
868 sRet = "ta";
869 break;
870 case USCRIPT_KPELLE:
871 sRet = "kpe";
872 break;
873 case USCRIPT_LOMA:
874 sRet = "lom";
875 break;
876 case USCRIPT_MENDE:
877 sRet = "men";
878 break;
879 case USCRIPT_MEROITIC_CURSIVE:
880 sRet = "xmr";
881 break;
882 case USCRIPT_OLD_NORTH_ARABIAN:
883 sRet = "xna";
884 break;
885 case USCRIPT_SINDHI:
886 sRet = "sd";
887 break;
888 case USCRIPT_WARANG_CITI:
889 sRet = "hoc";
890 break;
891 #endif
892 #if (U_ICU_VERSION_MAJOR_NUM > 4) || (U_ICU_VERSION_MAJOR_NUM == 4 && U_ICU_VERSION_MINOR_NUM >= 8)
893 case USCRIPT_AFAKA:
894 sRet = "djk";
895 break;
896 case USCRIPT_JURCHEN:
897 sRet = "juc";
898 break;
899 case USCRIPT_MRO:
900 sRet = "cmr";
901 break;
902 case USCRIPT_NUSHU: //no language with an assigned code yet
903 sRet = "mis";
904 break;
905 case USCRIPT_SHARADA:
906 sRet = "sa";
907 break;
908 case USCRIPT_SORA_SOMPENG:
909 sRet = "srb";
910 break;
911 case USCRIPT_TAKRI:
912 sRet = "doi";
913 break;
914 case USCRIPT_TANGUT:
915 sRet = "txg";
916 break;
917 case USCRIPT_WOLEAI:
918 sRet = "woe";
919 break;
920 #endif
921 #if (U_ICU_VERSION_MAJOR_NUM >= 49)
922 case USCRIPT_ANATOLIAN_HIEROGLYPHS:
923 sRet = "hlu";
924 break;
925 case USCRIPT_KHOJKI:
926 sRet = "gu";
927 break;
928 case USCRIPT_TIRHUTA:
929 sRet = "mai";
930 break;
931 #endif
932 #if (U_ICU_VERSION_MAJOR_NUM >= 52)
933 case USCRIPT_CAUCASIAN_ALBANIAN:
934 sRet = "xag";
935 break;
936 case USCRIPT_MAHAJANI:
937 sRet = "mwr";
938 break;
939 #endif
940 #if (U_ICU_VERSION_MAJOR_NUM >= 54)
941 case USCRIPT_AHOM:
942 sRet = "aho";
943 break;
944 case USCRIPT_HATRAN:
945 sRet = "qly-Hatr";
946 break;
947 case USCRIPT_MODI:
948 sRet = "mr-Modi";
949 break;
950 case USCRIPT_MULTANI:
951 sRet = "skr-Mutl";
952 break;
953 case USCRIPT_PAU_CIN_HAU:
954 sRet = "ctd-Pauc";
955 break;
956 case USCRIPT_SIDDHAM:
957 sRet = "sa-Sidd";
958 break;
959 #endif
961 return sRet;
964 //Format a number as a percentage according to the rules of the given
965 //language, e.g. 100 -> "100%" for en-US vs "100 %" for de-DE
966 OUString SAL_CALL unicode::formatPercent(double dNumber,
967 const LanguageTag &rLangTag)
969 // get a currency formatter for this locale ID
970 UErrorCode errorCode=U_ZERO_ERROR;
972 LanguageTag aLangTag(rLangTag);
974 // As of CLDR Version 24 these languages were not listed as using spacing
975 // between number and % but are reported as such by our l10n groups
976 // http://www.unicode.org/cldr/charts/24/by_type/numbers.number_formatting_patterns.html
977 // so format using French which has the desired rules
978 if (aLangTag.getLanguage() == "es" || aLangTag.getLanguage() == "sl")
979 aLangTag = LanguageTag("fr-FR");
981 icu::Locale aLocale = LanguageTagIcu::getIcuLocale(aLangTag);
983 std::unique_ptr<NumberFormat> xF(
984 NumberFormat::createPercentInstance(aLocale, errorCode));
985 if(U_FAILURE(errorCode))
987 SAL_WARN("i18n", "NumberFormat::createPercentInstance failed");
988 return OUString::number(dNumber) + "%";
991 UnicodeString output;
992 xF->format(dNumber/100, output);
993 OUString aRet(reinterpret_cast<const sal_Unicode *>(output.getBuffer()),
994 output.length());
995 if (rLangTag.getLanguage() == "de")
997 //narrow no-break space instead of (normal) no-break space
998 return aRet.replace(0x00A0, 0x202F);
1000 return aRet;
1003 ToggleUnicodeCodepoint::ToggleUnicodeCodepoint ()
1005 maInput = OUStringBuffer();
1006 maOutput = OUStringBuffer();
1007 maUtf16 = OUStringBuffer();
1008 maCombining = OUStringBuffer();
1011 bool ToggleUnicodeCodepoint::AllowMoreInput(sal_Unicode uChar)
1013 //arbitrarily chosen maximum length allowed - normal max usage would be around 30.
1014 if( maInput.getLength() > 255 )
1015 mbAllowMoreChars = false;
1017 if( !mbAllowMoreChars )
1018 return false;
1020 bool bPreventNonHex = false;
1021 if( maInput.indexOf("U+") != -1 )
1022 bPreventNonHex = true;
1024 switch ( unicode::getUnicodeType(uChar) )
1026 case css::i18n::UnicodeType::SURROGATE:
1027 if( bPreventNonHex )
1029 mbAllowMoreChars = false;
1030 return false;
1033 if( rtl::isLowSurrogate(uChar) && maUtf16.isEmpty() && maInput.isEmpty() )
1035 maUtf16.append(uChar);
1036 return true;
1038 if( rtl::isHighSurrogate(uChar) && maInput.isEmpty() )
1039 maUtf16.insert(0, uChar );
1040 //end of hex strings, or unexpected order of high/low, so don't accept more
1041 if( !maUtf16.isEmpty() )
1042 maInput.append(maUtf16);
1043 if( !maCombining.isEmpty() )
1044 maInput.append(maCombining);
1045 mbAllowMoreChars = false;
1046 break;
1048 case css::i18n::UnicodeType::NON_SPACING_MARK:
1049 case css::i18n::UnicodeType::COMBINING_SPACING_MARK:
1050 if( bPreventNonHex )
1052 mbAllowMoreChars = false;
1053 return false;
1056 //extreme edge case: already invalid high/low surrogates with preceding combining chars, and now an extra combining mark.
1057 if( !maUtf16.isEmpty() )
1059 maInput = maUtf16;
1060 if( !maCombining.isEmpty() )
1061 maInput.append(maCombining);
1062 mbAllowMoreChars = false;
1063 return false;
1065 maCombining.insert(0, uChar);
1066 break;
1068 default:
1069 //extreme edge case: already invalid high/low surrogates with preceding combining chars, and now an extra character.
1070 if( !maUtf16.isEmpty() )
1072 maInput = maUtf16;
1073 if( !maCombining.isEmpty() )
1074 maInput.append(maCombining);
1075 mbAllowMoreChars = false;
1076 return false;
1079 if( !maCombining.isEmpty() )
1081 maCombining.insert(0, uChar);
1082 maInput = maCombining;
1083 mbAllowMoreChars = false;
1084 return false;
1087 // 0 - 1f are control characters. Do not process those.
1088 if( uChar < 0x20 )
1090 mbAllowMoreChars = false;
1091 return false;
1094 switch( uChar )
1096 case 'u':
1097 case 'U':
1098 // U+ notation found. Continue looking for another one.
1099 if( mbRequiresU )
1101 mbRequiresU = false;
1102 maInput.insert(0,"U+");
1104 // treat as a normal character
1105 else
1107 mbAllowMoreChars = false;
1108 if( !bPreventNonHex )
1109 maInput.insertUtf32(0, uChar);
1111 break;
1112 case '+':
1113 // + already found: skip when not U, or edge case of +U+xxxx
1114 if( mbRequiresU || (maInput.indexOf("U+") == 0) )
1115 mbAllowMoreChars = false;
1116 // hex chars followed by '+' - now require a 'U'
1117 else if ( !maInput.isEmpty() )
1118 mbRequiresU = true;
1119 // treat as a normal character
1120 else
1122 mbAllowMoreChars = false;
1123 if( !bPreventNonHex )
1124 maInput.insertUtf32(0, uChar);
1126 break;
1127 default:
1128 // + already found. Since not U, cancel further input
1129 if( mbRequiresU )
1130 mbAllowMoreChars = false;
1131 // maximum digits per notation is 8: only one notation
1132 else if( maInput.indexOf("U+") == -1 && maInput.getLength() == 8 )
1133 mbAllowMoreChars = false;
1134 // maximum digits per notation is 8: previous notation found
1135 else if( maInput.indexOf("U+") == 8 )
1136 mbAllowMoreChars = false;
1137 // a hex character. Add to string.
1138 else if( isxdigit(uChar) )
1140 mbIsHexString = true;
1141 maInput.insertUtf32(0, uChar);
1143 // not a hex character: stop input. keep if it is the first input provided
1144 else
1146 mbAllowMoreChars = false;
1147 if( maInput.isEmpty() )
1148 maInput.insertUtf32(0, uChar);
1152 return mbAllowMoreChars;
1155 OUString ToggleUnicodeCodepoint::StringToReplace()
1157 if( maInput.isEmpty() )
1159 //edge case - input finished with incomplete low surrogate or combining characters without a base
1160 if( mbAllowMoreChars )
1162 if( !maUtf16.isEmpty() )
1163 maInput = maUtf16;
1164 if( !maCombining.isEmpty() )
1165 maInput.append(maCombining);
1167 return maInput.toString();
1170 if( !mbIsHexString )
1171 return maInput.toString();
1173 //this function potentially modifies the input string. Prevent addition of further characters
1174 mbAllowMoreChars = false;
1176 //validate unicode notation.
1177 OUStringBuffer sIn;
1178 sal_uInt32 nUnicode = 0;
1179 sal_Int32 nUPlus = maInput.indexOf("U+");
1180 //if U+ notation used, strip off all extra chars added not in U+ notation
1181 if( nUPlus != -1 )
1183 maInput = maInput.copy(nUPlus);
1184 sIn = maInput.copy(2);
1185 nUPlus = sIn.indexOf("U+");
1187 else
1188 sIn = maInput;
1189 while( nUPlus != -1 )
1191 nUnicode = sIn.copy(0, nUPlus).toString().toUInt32(16);
1192 //prevent creating control characters or invalid Unicode values
1193 if( !rtl::isUnicodeCodePoint(nUnicode) || nUnicode < 0x20 )
1194 maInput = sIn.copy(nUPlus);
1195 sIn = sIn.copy(nUPlus+2);
1196 nUPlus = sIn.indexOf("U+");
1199 nUnicode = sIn.toString().toUInt32(16);
1200 if( !rtl::isUnicodeCodePoint(nUnicode) || nUnicode < 0x20 )
1201 maInput.truncate().append( sIn[sIn.getLength()-1] );
1202 return maInput.toString();
1205 sal_uInt32 ToggleUnicodeCodepoint::CharsToDelete()
1207 OUString sIn = StringToReplace();
1208 sal_Int32 nPos = 0;
1209 sal_uInt32 counter = 0;
1210 while( nPos < sIn.getLength() )
1212 sIn.iterateCodePoints(&nPos);
1213 ++counter;
1215 return counter;
1218 OUString ToggleUnicodeCodepoint::ReplacementString()
1220 OUString sIn = StringToReplace();
1221 maOutput = "";
1222 sal_Int32 nUPlus = sIn.indexOf("U+");
1223 // convert from hex notation to glyph
1224 if( nUPlus != -1 || (sIn.getLength() > 1 && mbIsHexString) )
1226 sal_uInt32 nUnicode = 0;
1227 if( nUPlus == 0)
1229 sIn = sIn.copy(2);
1230 nUPlus = sIn.indexOf("U+");
1232 while( nUPlus > 0 )
1234 nUnicode = sIn.copy(0, nUPlus).toUInt32(16);
1235 maOutput.appendUtf32( nUnicode );
1237 sIn = sIn.copy(nUPlus+2);
1238 nUPlus = sIn.indexOf("U+");
1240 nUnicode = sIn.toUInt32(16);
1241 maOutput.appendUtf32( nUnicode );
1243 // convert from glyph to hex notation
1244 else
1246 sal_Int32 nPos = 0;
1247 while( nPos < sIn.getLength() )
1249 OUStringBuffer aTmp = OUString::number(sIn.iterateCodePoints(&nPos),16);
1250 //pad with zeros - minimum length of 4.
1251 for( sal_Int32 i = 4 - aTmp.getLength(); i > 0; --i )
1252 aTmp.insert( 0,"0" );
1253 maOutput.append( "U+" );
1254 maOutput.append( aTmp );
1257 return maOutput.toString();
1260 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */