Fix GNU C++ version check
[LibreOffice.git] / i18nutil / source / utility / unicode.cxx
blobd2d54e53623c6f99db03593016cffb4b89832987
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <com/sun/star/i18n/UnicodeType.hpp>
21 #include <com/sun/star/i18n/ScriptType.hpp>
22 #include <i18nlangtag/languagetag.hxx>
23 #include <i18nlangtag/languagetagicu.hxx>
24 #include <i18nutil/unicode.hxx>
25 #include <sal/log.hxx>
26 #include <unicode/numfmt.h>
27 #include <unicode/uchar.h>
28 #include "unicode_data.h"
29 #include <rtl/character.hxx>
30 #include <o3tl/string_view.hxx>
31 #include <memory>
33 // Workaround for glibc braindamage:
34 // glibc 2.4's langinfo.h does "#define CURRENCY_SYMBOL __CURRENCY_SYMBOL"
35 // which (obviously) breaks UnicodeType::CURRENCY_SYMBOL
36 #undef CURRENCY_SYMBOL
38 using namespace ::com::sun::star::i18n;
40 template<class L, typename T>
41 static T getScriptType( const sal_Unicode ch, const L* typeList, T unknownType ) {
43 sal_Int16 i = 0;
44 css::i18n::UnicodeScript type = typeList[0].to;
45 while (type < UnicodeScript_kScriptCount && ch > UnicodeScriptType[static_cast<int>(type)][UnicodeScriptTypeTo]) {
46 type = typeList[++i].to;
49 return (type < UnicodeScript_kScriptCount &&
50 ch >= UnicodeScriptType[static_cast<int>(typeList[i].from)][int(UnicodeScriptTypeFrom)]) ?
51 typeList[i].value : unknownType;
54 sal_Int16
55 unicode::getUnicodeScriptType( const sal_Unicode ch, const ScriptTypeList* typeList, sal_Int16 unknownType ) {
56 return getScriptType(ch, typeList, unknownType);
59 sal_Unicode
60 unicode::getUnicodeScriptStart( UnicodeScript type) {
61 return UnicodeScriptType[static_cast<int>(type)][UnicodeScriptTypeFrom];
64 sal_Unicode
65 unicode::getUnicodeScriptEnd( UnicodeScript type) {
66 return UnicodeScriptType[static_cast<int>(type)][UnicodeScriptTypeTo];
69 sal_Int16
70 unicode::getUnicodeType(const sal_uInt32 ch)
72 static sal_uInt32 c = 0x00;
73 static sal_uInt32 r = 0x00;
75 if (ch == c) return r;
76 else c = ch;
78 switch (u_charType(ch))
80 case U_UNASSIGNED:
81 r = css::i18n::UnicodeType::UNASSIGNED;
82 break;
83 case U_UPPERCASE_LETTER:
84 r = css::i18n::UnicodeType::UPPERCASE_LETTER;
85 break;
86 case U_LOWERCASE_LETTER:
87 r = css::i18n::UnicodeType::LOWERCASE_LETTER;
88 break;
89 case U_TITLECASE_LETTER:
90 r = css::i18n::UnicodeType::TITLECASE_LETTER;
91 break;
92 case U_MODIFIER_LETTER:
93 r = css::i18n::UnicodeType::MODIFIER_LETTER;
94 break;
95 case U_OTHER_LETTER:
96 r = css::i18n::UnicodeType::OTHER_LETTER;
97 break;
98 case U_NON_SPACING_MARK:
99 r = css::i18n::UnicodeType::NON_SPACING_MARK;
100 break;
101 case U_ENCLOSING_MARK:
102 r = css::i18n::UnicodeType::ENCLOSING_MARK;
103 break;
104 case U_COMBINING_SPACING_MARK:
105 r = css::i18n::UnicodeType::COMBINING_SPACING_MARK;
106 break;
107 case U_DECIMAL_DIGIT_NUMBER:
108 r = css::i18n::UnicodeType::DECIMAL_DIGIT_NUMBER;
109 break;
110 case U_LETTER_NUMBER:
111 r = css::i18n::UnicodeType::LETTER_NUMBER;
112 break;
113 case U_OTHER_NUMBER:
114 r = css::i18n::UnicodeType::OTHER_NUMBER;
115 break;
116 case U_SPACE_SEPARATOR:
117 r = css::i18n::UnicodeType::SPACE_SEPARATOR;
118 break;
119 case U_LINE_SEPARATOR:
120 r = css::i18n::UnicodeType::LINE_SEPARATOR;
121 break;
122 case U_PARAGRAPH_SEPARATOR:
123 r = css::i18n::UnicodeType::PARAGRAPH_SEPARATOR;
124 break;
125 case U_CONTROL_CHAR:
126 r = css::i18n::UnicodeType::CONTROL;
127 break;
128 case U_FORMAT_CHAR:
129 r = css::i18n::UnicodeType::FORMAT;
130 break;
131 case U_PRIVATE_USE_CHAR:
132 r = css::i18n::UnicodeType::PRIVATE_USE;
133 break;
134 case U_SURROGATE:
135 r = css::i18n::UnicodeType::SURROGATE;
136 break;
137 case U_DASH_PUNCTUATION:
138 r = css::i18n::UnicodeType::DASH_PUNCTUATION;
139 break;
140 case U_INITIAL_PUNCTUATION:
141 r = css::i18n::UnicodeType::INITIAL_PUNCTUATION;
142 break;
143 case U_FINAL_PUNCTUATION:
144 r = css::i18n::UnicodeType::FINAL_PUNCTUATION;
145 break;
146 case U_CONNECTOR_PUNCTUATION:
147 r = css::i18n::UnicodeType::CONNECTOR_PUNCTUATION;
148 break;
149 case U_OTHER_PUNCTUATION:
150 r = css::i18n::UnicodeType::OTHER_PUNCTUATION;
151 break;
152 case U_MATH_SYMBOL:
153 r = css::i18n::UnicodeType::MATH_SYMBOL;
154 break;
155 case U_CURRENCY_SYMBOL:
156 r = css::i18n::UnicodeType::CURRENCY_SYMBOL;
157 break;
158 case U_MODIFIER_SYMBOL:
159 r = css::i18n::UnicodeType::MODIFIER_SYMBOL;
160 break;
161 case U_OTHER_SYMBOL:
162 r = css::i18n::UnicodeType::OTHER_SYMBOL;
163 break;
164 case U_START_PUNCTUATION:
165 r = css::i18n::UnicodeType::START_PUNCTUATION;
166 break;
167 case U_END_PUNCTUATION:
168 r = css::i18n::UnicodeType::END_PUNCTUATION;
169 break;
172 return r;
175 sal_uInt8
176 unicode::getUnicodeDirection( const sal_Unicode ch ) {
177 static sal_Unicode c = 0x00;
178 static sal_uInt8 r = 0x00;
180 if (ch == c) return r;
181 else c = ch;
183 sal_Int16 address = UnicodeDirectionIndex[ch >> 8];
184 r = (address < UnicodeDirectionNumberBlock)
185 ? UnicodeDirectionBlockValue[address]
186 : UnicodeDirectionValue[((address - UnicodeDirectionNumberBlock) << 8) + (ch & 0xff)];
187 return r;
190 sal_uInt32 unicode::GetMirroredChar(sal_uInt32 nChar) {
191 nChar = u_charMirror(nChar);
192 return nChar;
// Single-bit mask with bit number `name` set. The argument is parenthesised
// so that an expression argument (e.g. bit(a | b)) shifts by the value of the
// whole expression.
#define bit(name)   (1U << (name))

// Masks over css::i18n::UnicodeType values, one bit per type. Every mask body
// is wrapped in parentheses so that it can be combined with other operators
// (e.g. `x & ALPHAMASK`) without precedence surprises.
#define UPPERMASK   (bit(UnicodeType::UPPERCASE_LETTER))

#define LOWERMASK   (bit(UnicodeType::LOWERCASE_LETTER))

#define TITLEMASK   (bit(UnicodeType::TITLECASE_LETTER))

#define ALPHAMASK   (UPPERMASK|LOWERMASK|TITLEMASK|\
            bit(UnicodeType::MODIFIER_LETTER)|\
            bit(UnicodeType::OTHER_LETTER))

#define SPACEMASK   (bit(UnicodeType::SPACE_SEPARATOR)|\
            bit(UnicodeType::LINE_SEPARATOR)|\
            bit(UnicodeType::PARAGRAPH_SEPARATOR))

#define CONTROLMASK (bit(UnicodeType::CONTROL)|\
            bit(UnicodeType::FORMAT)|\
            bit(UnicodeType::LINE_SEPARATOR)|\
            bit(UnicodeType::PARAGRAPH_SEPARATOR))
216 #define IsType(func, mask) \
217 bool func( const sal_uInt32 ch) {\
218 return (bit(getUnicodeType(ch)) & (mask)) != 0;\
221 IsType(unicode::isControl, CONTROLMASK)
222 IsType(unicode::isAlpha, ALPHAMASK)
223 IsType(unicode::isSpace, SPACEMASK)
225 #define CONTROLSPACE bit(0x09)|bit(0x0a)|bit(0x0b)|bit(0x0c)|bit(0x0d)|\
226 bit(0x1c)|bit(0x1d)|bit(0x1e)|bit(0x1f)
228 bool unicode::isWhiteSpace(const sal_uInt32 ch)
230 return (ch != 0xa0 && isSpace(ch)) || (ch <= 0x1F && (bit(ch) & (CONTROLSPACE)));
233 sal_Int16 unicode::getScriptClassFromUScriptCode(UScriptCode eScript)
235 //See unicode/uscript.h
236 sal_Int16 nRet;
237 switch (eScript)
239 case USCRIPT_INVALID_CODE:
240 case USCRIPT_COMMON:
241 case USCRIPT_INHERITED:
242 case USCRIPT_UNWRITTEN_LANGUAGES:
243 case USCRIPT_UNKNOWN:
244 case USCRIPT_MATHEMATICAL_NOTATION:
245 case USCRIPT_SYMBOLS:
246 case USCRIPT_CODE_LIMIT:
247 nRet = ScriptType::WEAK;
248 break;
249 case USCRIPT_ARMENIAN:
250 case USCRIPT_CHEROKEE:
251 case USCRIPT_COPTIC:
252 case USCRIPT_CYRILLIC:
253 case USCRIPT_GEORGIAN:
254 case USCRIPT_GOTHIC:
255 case USCRIPT_GREEK:
256 case USCRIPT_LATIN:
257 case USCRIPT_OGHAM:
258 case USCRIPT_OLD_ITALIC:
259 case USCRIPT_RUNIC:
260 case USCRIPT_CANADIAN_ABORIGINAL:
261 case USCRIPT_BRAILLE:
262 case USCRIPT_CYPRIOT:
263 case USCRIPT_OSMANYA:
264 case USCRIPT_SHAVIAN:
265 case USCRIPT_KATAKANA_OR_HIRAGANA:
266 case USCRIPT_GLAGOLITIC:
267 case USCRIPT_CIRTH:
268 case USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC:
269 case USCRIPT_OLD_HUNGARIAN:
270 case USCRIPT_LATIN_FRAKTUR:
271 case USCRIPT_LATIN_GAELIC:
272 nRet = ScriptType::LATIN;
273 break;
274 case USCRIPT_BOPOMOFO:
275 case USCRIPT_HAN:
276 case USCRIPT_HANGUL:
277 case USCRIPT_HIRAGANA:
278 case USCRIPT_KATAKANA:
279 case USCRIPT_YI:
280 case USCRIPT_SIMPLIFIED_HAN:
281 case USCRIPT_TRADITIONAL_HAN:
282 case USCRIPT_JAPANESE:
283 case USCRIPT_KOREAN:
284 case USCRIPT_TANGUT:
285 case USCRIPT_KHITAN_SMALL_SCRIPT:
286 nRet = ScriptType::ASIAN;
287 break;
288 case USCRIPT_ARABIC:
289 case USCRIPT_BENGALI:
290 case USCRIPT_DESERET:
291 case USCRIPT_DEVANAGARI:
292 case USCRIPT_ETHIOPIC:
293 case USCRIPT_GUJARATI:
294 case USCRIPT_GURMUKHI:
295 case USCRIPT_HEBREW:
296 case USCRIPT_KANNADA:
297 case USCRIPT_KHMER:
298 case USCRIPT_LAO:
299 case USCRIPT_MALAYALAM:
300 case USCRIPT_MONGOLIAN:
301 case USCRIPT_MYANMAR:
302 case USCRIPT_ORIYA:
303 case USCRIPT_SINHALA:
304 case USCRIPT_SYRIAC:
305 case USCRIPT_TAMIL:
306 case USCRIPT_TELUGU:
307 case USCRIPT_THAANA:
308 case USCRIPT_THAI:
309 case USCRIPT_TIBETAN:
310 case USCRIPT_TAGALOG:
311 case USCRIPT_HANUNOO:
312 case USCRIPT_BUHID:
313 case USCRIPT_TAGBANWA:
314 case USCRIPT_LIMBU:
315 case USCRIPT_LINEAR_B:
316 case USCRIPT_TAI_LE:
317 case USCRIPT_UGARITIC:
318 case USCRIPT_BUGINESE:
319 case USCRIPT_KHAROSHTHI:
320 case USCRIPT_SYLOTI_NAGRI:
321 case USCRIPT_NEW_TAI_LUE:
322 case USCRIPT_TIFINAGH:
323 case USCRIPT_OLD_PERSIAN:
324 case USCRIPT_BALINESE:
325 case USCRIPT_BATAK:
326 case USCRIPT_BLISSYMBOLS:
327 case USCRIPT_BRAHMI:
328 case USCRIPT_CHAM:
329 case USCRIPT_DEMOTIC_EGYPTIAN:
330 case USCRIPT_HIERATIC_EGYPTIAN:
331 case USCRIPT_EGYPTIAN_HIEROGLYPHS:
332 case USCRIPT_KHUTSURI:
333 case USCRIPT_PAHAWH_HMONG:
334 case USCRIPT_HARAPPAN_INDUS:
335 case USCRIPT_JAVANESE:
336 case USCRIPT_KAYAH_LI:
337 case USCRIPT_LEPCHA:
338 case USCRIPT_LINEAR_A:
339 case USCRIPT_MANDAEAN:
340 case USCRIPT_MAYAN_HIEROGLYPHS:
341 case USCRIPT_MEROITIC:
342 case USCRIPT_NKO:
343 case USCRIPT_ORKHON:
344 case USCRIPT_OLD_PERMIC:
345 case USCRIPT_PHAGS_PA:
346 case USCRIPT_PHOENICIAN:
347 case USCRIPT_PHONETIC_POLLARD:
348 case USCRIPT_RONGORONGO:
349 case USCRIPT_SARATI:
350 case USCRIPT_ESTRANGELO_SYRIAC:
351 case USCRIPT_WESTERN_SYRIAC:
352 case USCRIPT_EASTERN_SYRIAC:
353 case USCRIPT_TENGWAR:
354 case USCRIPT_VAI:
355 case USCRIPT_VISIBLE_SPEECH:
356 case USCRIPT_CUNEIFORM:
357 case USCRIPT_CARIAN:
358 case USCRIPT_LANNA:
359 case USCRIPT_LYCIAN:
360 case USCRIPT_LYDIAN:
361 case USCRIPT_OL_CHIKI:
362 case USCRIPT_REJANG:
363 case USCRIPT_SAURASHTRA:
364 case USCRIPT_SIGN_WRITING:
365 case USCRIPT_SUNDANESE:
366 case USCRIPT_MOON:
367 case USCRIPT_MEITEI_MAYEK:
368 case USCRIPT_IMPERIAL_ARAMAIC:
369 case USCRIPT_AVESTAN:
370 case USCRIPT_CHAKMA:
371 case USCRIPT_KAITHI:
372 case USCRIPT_MANICHAEAN:
373 case USCRIPT_INSCRIPTIONAL_PAHLAVI:
374 case USCRIPT_PSALTER_PAHLAVI:
375 case USCRIPT_BOOK_PAHLAVI:
376 case USCRIPT_INSCRIPTIONAL_PARTHIAN:
377 case USCRIPT_SAMARITAN:
378 case USCRIPT_TAI_VIET:
379 case USCRIPT_BAMUM:
380 case USCRIPT_LISU:
381 case USCRIPT_NAKHI_GEBA:
382 case USCRIPT_OLD_SOUTH_ARABIAN:
383 case USCRIPT_BASSA_VAH:
384 case USCRIPT_DUPLOYAN_SHORTAND:
385 case USCRIPT_ELBASAN:
386 case USCRIPT_GRANTHA:
387 case USCRIPT_KPELLE:
388 case USCRIPT_LOMA:
389 case USCRIPT_MENDE:
390 case USCRIPT_MEROITIC_CURSIVE:
391 case USCRIPT_OLD_NORTH_ARABIAN:
392 case USCRIPT_NABATAEAN:
393 case USCRIPT_PALMYRENE:
394 case USCRIPT_SINDHI:
395 case USCRIPT_WARANG_CITI:
396 default: // anything new is going to be pretty wild
397 nRet = ScriptType::COMPLEX;
398 break;
400 return nRet;
403 sal_Int16 unicode::getScriptClassFromLanguageTag( const LanguageTag& rLanguageTag )
405 constexpr int32_t nBuf = 42;
406 UScriptCode aBuf[nBuf];
407 if (rLanguageTag.hasScript())
409 aBuf[0] = static_cast<UScriptCode>(u_getPropertyValueEnum( UCHAR_SCRIPT,
410 OUStringToOString( rLanguageTag.getScript(), RTL_TEXTENCODING_ASCII_US).getStr()));
412 else
414 OUString aName;
415 if (rLanguageTag.getCountry().isEmpty())
416 aName = rLanguageTag.getLanguage();
417 else
418 aName = rLanguageTag.getLanguage() + "-" + rLanguageTag.getCountry();
419 UErrorCode status = U_ZERO_ERROR;
420 const int32_t nScripts = uscript_getCode(
421 OUStringToOString( aName, RTL_TEXTENCODING_ASCII_US).getStr(),
422 aBuf, nBuf, &status);
423 // U_BUFFER_OVERFLOW_ERROR would be set with too many scripts for buffer
424 // and required capacity returned, but really..
425 if (nScripts == 0 || !U_SUCCESS(status))
426 return css::i18n::ScriptType::LATIN;
428 return getScriptClassFromUScriptCode( aBuf[0]);
431 OString unicode::getExemplarLanguageForUScriptCode(UScriptCode eScript)
433 OString sRet;
434 switch (eScript)
436 case USCRIPT_CODE_LIMIT:
437 case USCRIPT_INVALID_CODE:
438 case USCRIPT_MATHEMATICAL_NOTATION:
439 case USCRIPT_SYMBOLS:
440 sRet = "zxx"_ostr;
441 break;
442 case USCRIPT_COMMON:
443 case USCRIPT_INHERITED:
444 case USCRIPT_UNWRITTEN_LANGUAGES:
445 case USCRIPT_UNKNOWN:
446 sRet = "und"_ostr;
447 break;
448 case USCRIPT_ARABIC:
449 sRet = "ar"_ostr;
450 break;
451 case USCRIPT_ARMENIAN:
452 sRet = "hy"_ostr;
453 break;
454 case USCRIPT_BENGALI:
455 sRet = "bn"_ostr;
456 break;
457 case USCRIPT_BOPOMOFO:
458 sRet = "zh"_ostr;
459 break;
460 case USCRIPT_CHEROKEE:
461 sRet = "chr"_ostr;
462 break;
463 case USCRIPT_COPTIC:
464 sRet = "cop"_ostr;
465 break;
466 case USCRIPT_CYRILLIC:
467 sRet = "ru"_ostr;
468 break;
469 case USCRIPT_DESERET:
470 sRet = "en"_ostr;
471 break;
472 case USCRIPT_DEVANAGARI:
473 sRet = "hi"_ostr;
474 break;
475 case USCRIPT_ETHIOPIC:
476 sRet = "am"_ostr;
477 break;
478 case USCRIPT_GEORGIAN:
479 case USCRIPT_KHUTSURI:
480 sRet = "ka"_ostr;
481 break;
482 case USCRIPT_GOTHIC:
483 sRet = "got"_ostr;
484 break;
485 case USCRIPT_GREEK:
486 sRet = "el"_ostr;
487 break;
488 case USCRIPT_GUJARATI:
489 case USCRIPT_KHOJKI:
490 sRet = "gu"_ostr;
491 break;
492 case USCRIPT_GURMUKHI:
493 sRet = "pa"_ostr;
494 break;
495 case USCRIPT_HAN:
496 sRet = "zh"_ostr;
497 break;
498 case USCRIPT_HANGUL:
499 case USCRIPT_KOREAN:
500 case USCRIPT_JAMO:
501 sRet = "ko"_ostr; // Jamo - elements of Hangul Syllables
502 break;
503 case USCRIPT_HEBREW:
504 sRet = "hr"_ostr;
505 break;
506 case USCRIPT_HIRAGANA:
507 sRet = "ja"_ostr;
508 break;
509 case USCRIPT_KANNADA:
510 sRet = "kn"_ostr;
511 break;
512 case USCRIPT_KATAKANA:
513 sRet = "ja"_ostr;
514 break;
515 case USCRIPT_KHMER:
516 sRet = "km"_ostr;
517 break;
518 case USCRIPT_LAO:
519 sRet = "lo"_ostr;
520 break;
521 case USCRIPT_LATIN:
522 sRet = "en"_ostr;
523 break;
524 case USCRIPT_MALAYALAM:
525 sRet = "ml"_ostr;
526 break;
527 case USCRIPT_MONGOLIAN:
528 sRet = "mn"_ostr;
529 break;
530 case USCRIPT_MYANMAR:
531 sRet = "my"_ostr;
532 break;
533 case USCRIPT_OGHAM:
534 sRet = "pgl"_ostr;
535 break;
536 case USCRIPT_OLD_ITALIC:
537 sRet = "osc"_ostr;
538 break;
539 case USCRIPT_ORIYA:
540 sRet = "or"_ostr;
541 break;
542 case USCRIPT_RUNIC:
543 sRet = "ang"_ostr;
544 break;
545 case USCRIPT_SINHALA:
546 sRet = "si"_ostr;
547 break;
548 case USCRIPT_SYRIAC:
549 case USCRIPT_ESTRANGELO_SYRIAC:
550 sRet = "syr"_ostr;
551 break;
552 case USCRIPT_TAMIL:
553 case USCRIPT_GRANTHA:
554 sRet = "ta"_ostr;
555 break;
556 case USCRIPT_TELUGU:
557 sRet = "te"_ostr;
558 break;
559 case USCRIPT_THAANA:
560 sRet = "dv"_ostr;
561 break;
562 case USCRIPT_THAI:
563 sRet = "th"_ostr;
564 break;
565 case USCRIPT_TIBETAN:
566 sRet = "bo"_ostr;
567 break;
568 case USCRIPT_CANADIAN_ABORIGINAL:
569 sRet = "iu"_ostr;
570 break;
571 case USCRIPT_YI:
572 sRet = "ii"_ostr;
573 break;
574 case USCRIPT_TAGALOG:
575 sRet = "tl"_ostr;
576 break;
577 case USCRIPT_HANUNOO:
578 sRet = "hnn"_ostr;
579 break;
580 case USCRIPT_BUHID:
581 sRet = "bku"_ostr;
582 break;
583 case USCRIPT_TAGBANWA:
584 sRet = "tbw"_ostr;
585 break;
586 case USCRIPT_BRAILLE:
587 sRet = "en"_ostr;
588 break;
589 case USCRIPT_CYPRIOT:
590 sRet = "ecy"_ostr;
591 break;
592 case USCRIPT_LIMBU:
593 sRet = "lif"_ostr;
594 break;
595 case USCRIPT_LINEAR_B:
596 sRet = "gmy"_ostr;
597 break;
598 case USCRIPT_OSMANYA:
599 sRet = "so"_ostr;
600 break;
601 case USCRIPT_SHAVIAN:
602 sRet = "en"_ostr;
603 break;
604 case USCRIPT_TAI_LE:
605 sRet = "tdd"_ostr;
606 break;
607 case USCRIPT_UGARITIC:
608 sRet = "uga"_ostr;
609 break;
610 case USCRIPT_KATAKANA_OR_HIRAGANA:
611 sRet = "ja"_ostr;
612 break;
613 case USCRIPT_BUGINESE:
614 sRet = "bug"_ostr;
615 break;
616 case USCRIPT_GLAGOLITIC:
617 sRet = "ch"_ostr;
618 break;
619 case USCRIPT_KHAROSHTHI:
620 case USCRIPT_BRAHMI:
621 sRet = "pra"_ostr;
622 break;
623 case USCRIPT_SYLOTI_NAGRI:
624 sRet = "syl"_ostr;
625 break;
626 case USCRIPT_NEW_TAI_LUE:
627 sRet = "khb"_ostr;
628 break;
629 case USCRIPT_TIFINAGH:
630 sRet = "tmh"_ostr;
631 break;
632 case USCRIPT_OLD_PERSIAN:
633 sRet = "peo"_ostr;
634 break;
635 case USCRIPT_BALINESE:
636 sRet = "ban"_ostr;
637 break;
638 case USCRIPT_BATAK:
639 sRet = "btk"_ostr;
640 break;
641 case USCRIPT_BLISSYMBOLS:
642 sRet = "en"_ostr;
643 break;
644 case USCRIPT_CHAM:
645 sRet = "cja"_ostr;
646 break;
647 case USCRIPT_CIRTH:
648 case USCRIPT_TENGWAR:
649 sRet = "sjn"_ostr;
650 break;
651 case USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC:
652 sRet = "cu"_ostr;
653 break;
654 case USCRIPT_DEMOTIC_EGYPTIAN:
655 case USCRIPT_HIERATIC_EGYPTIAN:
656 case USCRIPT_EGYPTIAN_HIEROGLYPHS:
657 sRet = "egy"_ostr;
658 break;
659 case USCRIPT_SIMPLIFIED_HAN:
660 sRet = "zh"_ostr;
661 break;
662 case USCRIPT_TRADITIONAL_HAN:
663 sRet = "zh"_ostr;
664 break;
665 case USCRIPT_PAHAWH_HMONG:
666 sRet = "blu"_ostr;
667 break;
668 case USCRIPT_OLD_HUNGARIAN:
669 sRet = "ohu"_ostr;
670 break;
671 case USCRIPT_HARAPPAN_INDUS:
672 sRet = "xiv"_ostr;
673 break;
674 case USCRIPT_JAVANESE:
675 sRet = "kaw"_ostr;
676 break;
677 case USCRIPT_KAYAH_LI:
678 sRet = "eky"_ostr;
679 break;
680 case USCRIPT_LATIN_FRAKTUR:
681 sRet = "de"_ostr;
682 break;
683 case USCRIPT_LATIN_GAELIC:
684 sRet = "ga"_ostr;
685 break;
686 case USCRIPT_LEPCHA:
687 sRet = "lep"_ostr;
688 break;
689 case USCRIPT_LINEAR_A:
690 sRet = "ecr"_ostr;
691 break;
692 case USCRIPT_MAYAN_HIEROGLYPHS:
693 sRet = "myn"_ostr;
694 break;
695 case USCRIPT_MEROITIC_CURSIVE:
696 case USCRIPT_MEROITIC:
697 sRet = "xmr"_ostr;
698 break;
699 case USCRIPT_NKO:
700 sRet = "nqo"_ostr;
701 break;
702 case USCRIPT_ORKHON:
703 sRet = "otk"_ostr;
704 break;
705 case USCRIPT_OLD_PERMIC:
706 sRet = "kv"_ostr;
707 break;
708 case USCRIPT_PHAGS_PA:
709 sRet = "xng"_ostr;
710 break;
711 case USCRIPT_PHOENICIAN:
712 sRet = "phn"_ostr;
713 break;
714 case USCRIPT_PHONETIC_POLLARD:
715 sRet = "hmd"_ostr;
716 break;
717 case USCRIPT_RONGORONGO:
718 sRet = "rap"_ostr;
719 break;
720 case USCRIPT_SARATI:
721 sRet = "qya"_ostr;
722 break;
723 case USCRIPT_WESTERN_SYRIAC:
724 sRet = "tru"_ostr;
725 break;
726 case USCRIPT_EASTERN_SYRIAC:
727 sRet = "aii"_ostr;
728 break;
729 case USCRIPT_VAI:
730 sRet = "vai"_ostr;
731 break;
732 case USCRIPT_VISIBLE_SPEECH:
733 sRet = "en"_ostr;
734 break;
735 case USCRIPT_CUNEIFORM:
736 sRet = "akk"_ostr;
737 break;
738 case USCRIPT_CARIAN:
739 sRet = "xcr"_ostr;
740 break;
741 case USCRIPT_JAPANESE:
742 sRet = "ja"_ostr;
743 break;
744 case USCRIPT_LANNA:
745 sRet = "nod"_ostr;
746 break;
747 case USCRIPT_LYCIAN:
748 sRet = "xlc"_ostr;
749 break;
750 case USCRIPT_LYDIAN:
751 sRet = "xld"_ostr;
752 break;
753 case USCRIPT_OL_CHIKI:
754 sRet = "sat"_ostr;
755 break;
756 case USCRIPT_REJANG:
757 sRet = "rej"_ostr;
758 break;
759 case USCRIPT_SAURASHTRA:
760 sRet = "saz"_ostr;
761 break;
762 case USCRIPT_SIGN_WRITING:
763 sRet = "en"_ostr;
764 break;
765 case USCRIPT_SUNDANESE:
766 sRet = "su"_ostr;
767 break;
768 case USCRIPT_MOON:
769 sRet = "en"_ostr;
770 break;
771 case USCRIPT_MEITEI_MAYEK:
772 sRet = "mni"_ostr;
773 break;
774 case USCRIPT_IMPERIAL_ARAMAIC:
775 sRet = "arc"_ostr;
776 break;
777 case USCRIPT_AVESTAN:
778 sRet = "ae"_ostr;
779 break;
780 case USCRIPT_CHAKMA:
781 sRet = "ccp"_ostr;
782 break;
783 case USCRIPT_KAITHI:
784 sRet = "awa"_ostr;
785 break;
786 case USCRIPT_MANICHAEAN:
787 sRet = "xmn"_ostr;
788 break;
789 case USCRIPT_INSCRIPTIONAL_PAHLAVI:
790 case USCRIPT_PSALTER_PAHLAVI:
791 case USCRIPT_BOOK_PAHLAVI:
792 case USCRIPT_INSCRIPTIONAL_PARTHIAN:
793 sRet = "xpr"_ostr;
794 break;
795 case USCRIPT_SAMARITAN:
796 sRet = "heb"_ostr;
797 break;
798 case USCRIPT_TAI_VIET:
799 sRet = "blt"_ostr;
800 break;
801 case USCRIPT_MANDAEAN: /* Aliased to USCRIPT_MANDAIC in icu 4.6. */
802 sRet = "mic"_ostr;
803 break;
804 case USCRIPT_NABATAEAN:
805 sRet = "mis-Nbat"_ostr; // Uncoded with script
806 break;
807 case USCRIPT_PALMYRENE:
808 sRet = "mis-Palm"_ostr; // Uncoded with script
809 break;
810 case USCRIPT_BAMUM:
811 sRet = "bax"_ostr;
812 break;
813 case USCRIPT_LISU:
814 sRet = "lis"_ostr;
815 break;
816 case USCRIPT_NAKHI_GEBA:
817 sRet = "nxq"_ostr;
818 break;
819 case USCRIPT_OLD_SOUTH_ARABIAN:
820 sRet = "xsa"_ostr;
821 break;
822 case USCRIPT_BASSA_VAH:
823 sRet = "bsq"_ostr;
824 break;
825 case USCRIPT_DUPLOYAN_SHORTAND:
826 sRet = "fr"_ostr;
827 break;
828 case USCRIPT_ELBASAN:
829 sRet = "sq"_ostr;
830 break;
831 case USCRIPT_KPELLE:
832 sRet = "kpe"_ostr;
833 break;
834 case USCRIPT_LOMA:
835 sRet = "lom"_ostr;
836 break;
837 case USCRIPT_MENDE:
838 sRet = "men"_ostr;
839 break;
840 case USCRIPT_OLD_NORTH_ARABIAN:
841 sRet = "xna"_ostr;
842 break;
843 case USCRIPT_SINDHI:
844 sRet = "sd"_ostr;
845 break;
846 case USCRIPT_WARANG_CITI:
847 sRet = "hoc"_ostr;
848 break;
849 case USCRIPT_AFAKA:
850 sRet = "djk"_ostr;
851 break;
852 case USCRIPT_JURCHEN:
853 sRet = "juc"_ostr;
854 break;
855 case USCRIPT_MRO:
856 sRet = "cmr"_ostr;
857 break;
858 case USCRIPT_NUSHU:
859 sRet = "mis-Nshu"_ostr; // Uncoded with script
860 break;
861 case USCRIPT_SHARADA:
862 sRet = "sa"_ostr;
863 break;
864 case USCRIPT_SORA_SOMPENG:
865 sRet = "srb"_ostr;
866 break;
867 case USCRIPT_TAKRI:
868 sRet = "doi"_ostr;
869 break;
870 case USCRIPT_TANGUT:
871 sRet = "txg"_ostr;
872 break;
873 case USCRIPT_WOLEAI:
874 sRet = "woe"_ostr;
875 break;
876 case USCRIPT_ANATOLIAN_HIEROGLYPHS:
877 sRet = "hlu"_ostr;
878 break;
879 case USCRIPT_TIRHUTA:
880 sRet = "mai"_ostr;
881 break;
882 case USCRIPT_CAUCASIAN_ALBANIAN:
883 sRet = "xag"_ostr;
884 break;
885 case USCRIPT_MAHAJANI:
886 sRet = "mwr"_ostr;
887 break;
888 case USCRIPT_AHOM:
889 sRet = "aho"_ostr;
890 break;
891 case USCRIPT_HATRAN:
892 sRet = "qly-Hatr"_ostr;
893 break;
894 case USCRIPT_MODI:
895 sRet = "mr-Modi"_ostr;
896 break;
897 case USCRIPT_MULTANI:
898 sRet = "skr-Mutl"_ostr;
899 break;
900 case USCRIPT_PAU_CIN_HAU:
901 sRet = "ctd-Pauc"_ostr;
902 break;
903 case USCRIPT_SIDDHAM:
904 sRet = "sa-Sidd"_ostr;
905 break;
906 case USCRIPT_ADLAM:
907 sRet = "mis-Adlm"_ostr; // Adlam for Fulani, no language code
908 break;
909 case USCRIPT_BHAIKSUKI:
910 sRet = "mis-Bhks"_ostr; // Bhaiksuki for some Buddhist texts, no language code
911 break;
912 case USCRIPT_MARCHEN:
913 sRet = "bo-Marc"_ostr;
914 break;
915 case USCRIPT_NEWA:
916 sRet = "new-Newa"_ostr;
917 break;
918 case USCRIPT_OSAGE:
919 sRet = "osa-Osge"_ostr;
920 break;
921 case USCRIPT_HAN_WITH_BOPOMOFO:
922 sRet = "mis-Hanb"_ostr; // Han with Bopomofo, zh-Hanb ?
923 break;
924 case USCRIPT_SYMBOLS_EMOJI:
925 sRet = "mis-Zsye"_ostr; // Emoji variant
926 break;
927 case USCRIPT_MASARAM_GONDI:
928 sRet = "gon-Gonm"_ostr; // macro language code, could be wsg,esg,gno
929 break;
930 case USCRIPT_SOYOMBO:
931 sRet = "mn-Soyo"_ostr; // abugida to write Mongolian, also Tibetan and Sanskrit
932 break;
933 case USCRIPT_ZANABAZAR_SQUARE:
934 sRet = "mn-Zanb"_ostr; // abugida to write Mongolian
935 break;
936 case USCRIPT_DOGRA:
937 sRet = "dgo"_ostr; // Dogri proper
938 break;
939 case USCRIPT_GUNJALA_GONDI:
940 sRet = "wsg"_ostr; // Adilabad Gondi
941 break;
942 case USCRIPT_MAKASAR:
943 sRet = "mak"_ostr;
944 break;
945 case USCRIPT_MEDEFAIDRIN:
946 sRet = "dmf-Medf"_ostr;
947 break;
948 case USCRIPT_HANIFI_ROHINGYA:
949 sRet = "rhg"_ostr;
950 break;
951 case USCRIPT_SOGDIAN:
952 case USCRIPT_OLD_SOGDIAN:
953 sRet = "sog"_ostr;
954 break;
955 case USCRIPT_ELYMAIC:
956 sRet = "arc-Elym"_ostr;
957 break;
958 case USCRIPT_NYIAKENG_PUACHUE_HMONG:
959 sRet = "hmn-Hmnp"_ostr; // macrolanguage code
960 break;
961 case USCRIPT_NANDINAGARI:
962 sRet = "sa-Nand"_ostr;
963 break;
964 case USCRIPT_WANCHO:
965 sRet = "nnp-Wcho"_ostr;
966 break;
967 case USCRIPT_CHORASMIAN:
968 sRet = "xco-Chrs"_ostr;
969 break;
970 case USCRIPT_DIVES_AKURU:
971 sRet = "dv-Diak"_ostr;
972 break;
973 case USCRIPT_KHITAN_SMALL_SCRIPT:
974 sRet = "zkt-Kits"_ostr;
975 break;
976 case USCRIPT_YEZIDI:
977 sRet = "kmr-Yezi"_ostr;
978 break;
979 #if (U_ICU_VERSION_MAJOR_NUM >= 70)
980 case USCRIPT_CYPRO_MINOAN:
981 sRet = "mis-Cpmn"_ostr; // Uncoded with script
982 break;
983 case USCRIPT_OLD_UYGHUR:
984 sRet = "oui-Ougr"_ostr;
985 break;
986 case USCRIPT_TANGSA:
987 sRet = "nst-Tnsa"_ostr;
988 break;
989 case USCRIPT_TOTO:
990 sRet = "txo-Toto"_ostr;
991 break;
992 case USCRIPT_VITHKUQI:
993 sRet = "sq-Vith"_ostr; // macrolanguage code
994 break;
995 #endif
996 #if (U_ICU_VERSION_MAJOR_NUM >= 72)
997 case USCRIPT_KAWI:
998 sRet = "mis-Kawi"_ostr; // Uncoded with script
999 break;
1000 case USCRIPT_NAG_MUNDARI:
1001 sRet = "unr-Nagm"_ostr;
1002 break;
1003 #endif
1004 #if (U_ICU_VERSION_MAJOR_NUM >= 75)
1005 case USCRIPT_ARABIC_NASTALIQ:
1006 sRet = "fa-Aran"_ostr;
1007 break;
1008 #endif
1010 return sRet;
1013 //Format a number as a percentage according to the rules of the given
1014 //language, e.g. 100 -> "100%" for en-US vs "100 %" for de-DE
1015 OUString unicode::formatPercent(double dNumber,
1016 const LanguageTag &rLangTag)
1018 // get a currency formatter for this locale ID
1019 UErrorCode errorCode=U_ZERO_ERROR;
1021 LanguageTag aLangTag(rLangTag);
1023 // As of CLDR Version 24 these languages were not listed as using spacing
1024 // between number and % but are reported as such by our l10n groups
1025 // http://www.unicode.org/cldr/charts/24/by_type/numbers.number_formatting_patterns.html
1026 // so format using French which has the desired rules
1027 if (aLangTag.getLanguage() == "es" || aLangTag.getLanguage() == "sl")
1028 aLangTag.reset(u"fr-FR"_ustr);
1030 icu::Locale aLocale = LanguageTagIcu::getIcuLocale(aLangTag);
1032 std::unique_ptr<icu::NumberFormat> xF(
1033 icu::NumberFormat::createPercentInstance(aLocale, errorCode));
1034 if(U_FAILURE(errorCode))
1036 SAL_WARN("i18n", "icu::NumberFormat::createPercentInstance failed");
1037 return OUString::number(dNumber) + "%";
1040 icu::UnicodeString output;
1041 xF->format(dNumber/100, output);
1042 OUString aRet(reinterpret_cast<const sal_Unicode *>(output.getBuffer()),
1043 output.length());
1044 if (rLangTag.getLanguage() == "de")
1046 //narrow no-break space instead of (normal) no-break space
1047 return aRet.replace(0x00A0, 0x202F);
1049 return aRet;
1052 bool ToggleUnicodeCodepoint::AllowMoreInput(sal_uInt32 uChar)
1054 //arbitrarily chosen maximum length allowed - normal max usage would be around 30.
1055 if( maInput.getLength() > 255 )
1056 mbAllowMoreChars = false;
1058 if( !mbAllowMoreChars )
1059 return false;
1061 bool bPreventNonHex = false;
1062 if( maInput.indexOf("U+") != -1 )
1063 bPreventNonHex = true;
1065 switch ( unicode::getUnicodeType(uChar) )
1067 case css::i18n::UnicodeType::SURROGATE:
1068 if( bPreventNonHex )
1070 mbAllowMoreChars = false;
1071 return false;
1074 if( rtl::isLowSurrogate(uChar) && maUtf16.isEmpty() && maInput.isEmpty() )
1076 maUtf16.append(sal_Unicode(uChar));
1077 return true;
1079 if( rtl::isHighSurrogate(uChar) && maInput.isEmpty() )
1080 maUtf16.insert(0, sal_Unicode(uChar));
1081 if (maUtf16.getLength() == 2)
1083 assert(rtl::isHighSurrogate(maUtf16[0]) && rtl::isLowSurrogate(maUtf16[1]));
1084 // The resulting codepoint may itself be combining, so may allow more
1085 sal_uInt32 nUCS4 = rtl::combineSurrogates(maUtf16[0], maUtf16[1]);
1086 maUtf16.setLength(0);
1087 return AllowMoreInput(nUCS4);
1089 // unexpected order of high/low, so don't accept more
1090 if( !maUtf16.isEmpty() )
1091 maInput.append(maUtf16);
1092 if( !maCombining.isEmpty() )
1093 maInput.append(maCombining);
1094 mbAllowMoreChars = false;
1095 break;
1097 case css::i18n::UnicodeType::NON_SPACING_MARK:
1098 case css::i18n::UnicodeType::COMBINING_SPACING_MARK:
1099 if( bPreventNonHex )
1101 mbAllowMoreChars = false;
1102 return false;
1105 //extreme edge case: already invalid high/low surrogates with preceding combining chars, and now an extra combining mark.
1106 if( !maUtf16.isEmpty() )
1108 maInput = maUtf16;
1109 if( !maCombining.isEmpty() )
1110 maInput.append(maCombining);
1111 mbAllowMoreChars = false;
1112 return false;
1114 maCombining.insertUtf32(0, uChar);
1115 break;
1117 default:
1118 //extreme edge case: already invalid high/low surrogates with preceding combining chars, and now an extra character.
1119 if( !maUtf16.isEmpty() )
1121 maInput = maUtf16;
1122 if( !maCombining.isEmpty() )
1123 maInput.append(maCombining);
1124 mbAllowMoreChars = false;
1125 return false;
1128 if( !maCombining.isEmpty() )
1130 maCombining.insertUtf32(0, uChar);
1131 maInput = maCombining;
1132 mbAllowMoreChars = false;
1133 return false;
1136 // 0 - 1f are control characters. Do not process those.
1137 if( uChar < 0x20 )
1139 mbAllowMoreChars = false;
1140 return false;
1143 switch( uChar )
1145 case 'u':
1146 case 'U':
1147 // U+ notation found. Continue looking for another one.
1148 if( mbRequiresU )
1150 mbRequiresU = false;
1151 maInput.insert(0,"U+");
1153 // treat as a normal character
1154 else
1156 mbAllowMoreChars = false;
1157 if( !bPreventNonHex )
1158 maInput.insertUtf32(0, uChar);
1160 break;
1161 case '+':
1162 // + already found: skip when not U, or edge case of +U+xxxx
1163 if( mbRequiresU || (maInput.indexOf("U+") == 0) )
1164 mbAllowMoreChars = false;
1165 // hex chars followed by '+' - now require a 'U'
1166 else if ( !maInput.isEmpty() )
1167 mbRequiresU = true;
1168 // treat as a normal character
1169 else
1171 mbAllowMoreChars = false;
1172 if( !bPreventNonHex )
1173 maInput.insertUtf32(0, uChar);
1175 break;
1176 default:
1177 // + already found. Since not U, cancel further input
1178 if( mbRequiresU )
1179 mbAllowMoreChars = false;
1180 // maximum digits per notation is 8: only one notation
1181 else if( maInput.indexOf("U+") == -1 && maInput.getLength() == 8 )
1182 mbAllowMoreChars = false;
1183 // maximum digits per notation is 8: previous notation found
1184 else if( maInput.indexOf("U+") == 8 )
1185 mbAllowMoreChars = false;
1186 // a hex character. Add to string.
1187 else if( rtl::isAsciiHexDigit(uChar) )
1189 mbIsHexString = true;
1190 maInput.insertUtf32(0, uChar);
1192 // not a hex character: stop input. keep if it is the first input provided
1193 else
1195 mbAllowMoreChars = false;
1196 if( maInput.isEmpty() )
1197 maInput.insertUtf32(0, uChar);
1201 return mbAllowMoreChars;
1204 OUString ToggleUnicodeCodepoint::StringToReplace()
1206 if( maInput.isEmpty() )
1208 //edge case - input finished with incomplete low surrogate or combining characters without a base
1209 if( mbAllowMoreChars )
1211 if( !maUtf16.isEmpty() )
1212 maInput = maUtf16;
1213 if( !maCombining.isEmpty() )
1214 maInput.append(maCombining);
1216 return maInput.toString();
1219 if( !mbIsHexString )
1220 return maInput.toString();
1222 //this function potentially modifies the input string. Prevent addition of further characters
1223 mbAllowMoreChars = false;
1225 //validate unicode notation.
1226 OUString sIn;
1227 sal_uInt32 nUnicode = 0;
1228 sal_Int32 nUPlus = maInput.indexOf("U+");
1229 //if U+ notation used, strip off all extra chars added not in U+ notation
1230 if( nUPlus != -1 )
1232 maInput.remove(0, nUPlus);
1233 sIn = maInput.copy(2).makeStringAndClear();
1234 nUPlus = sIn.indexOf("U+");
1236 else
1237 sIn = maInput.toString();
1238 while( nUPlus != -1 )
1240 nUnicode = o3tl::toUInt32(sIn.subView(0, nUPlus), 16);
1241 //prevent creating control characters or invalid Unicode values
1242 if( !rtl::isUnicodeCodePoint(nUnicode) || nUnicode < 0x20 )
1243 maInput = sIn.subView(nUPlus);
1244 sIn = sIn.copy(nUPlus+2);
1245 nUPlus = sIn.indexOf("U+");
1248 nUnicode = sIn.toUInt32(16);
1249 if( !rtl::isUnicodeCodePoint(nUnicode) || nUnicode < 0x20 )
1250 maInput.truncate().append( sIn[sIn.getLength()-1] );
1251 return maInput.toString();
1254 OUString ToggleUnicodeCodepoint::ReplacementString()
1256 OUString sIn = StringToReplace();
1257 OUStringBuffer output = "";
1258 sal_Int32 nUPlus = sIn.indexOf("U+");
1259 // convert from hex notation to glyph
1260 if( nUPlus != -1 || (sIn.getLength() > 1 && mbIsHexString) )
1262 sal_uInt32 nUnicode = 0;
1263 if( nUPlus == 0)
1265 sIn = sIn.copy(2);
1266 nUPlus = sIn.indexOf("U+");
1268 while( nUPlus > 0 )
1270 nUnicode = o3tl::toUInt32(sIn.subView(0, nUPlus), 16);
1271 output.appendUtf32( nUnicode );
1273 sIn = sIn.copy(nUPlus+2);
1274 nUPlus = sIn.indexOf("U+");
1276 nUnicode = sIn.toUInt32(16);
1277 output.appendUtf32( nUnicode );
1279 // convert from glyph to hex notation
1280 else
1282 sal_Int32 nPos = 0;
1283 while( nPos < sIn.getLength() )
1285 OUStringBuffer aTmp = OUString::number(sIn.iterateCodePoints(&nPos),16);
1286 //pad with zeros - minimum length of 4.
1287 for( sal_Int32 i = 4 - aTmp.getLength(); i > 0; --i )
1288 aTmp.insert( 0,"0" );
1289 output.append( "U+" + aTmp );
1292 return output.makeStringAndClear();
1295 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */