i18nutil/source/utility/unicode.cxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*
   3  * This file is part of the LibreOffice project.
   4  *
   5  * This Source Code Form is subject to the terms of the Mozilla Public
   6  * License, v. 2.0. If a copy of the MPL was not distributed with this
   7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
   8  *
   9  * This file incorporates work covered by the following license notice:
  10  *
  11  *   Licensed to the Apache Software Foundation (ASF) under one or more
  12  *   contributor license agreements. See the NOTICE file distributed
  13  *   with this work for additional information regarding copyright
  14  *   ownership. The ASF licenses this file to you under the Apache
  15  *   License, Version 2.0 (the "License"); you may not use this file
  16  *   except in compliance with the License. You may obtain a copy of
  17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  18  */
  19
  20 #include <com/sun/star/i18n/UnicodeType.hpp>
  21 #include <com/sun/star/i18n/ScriptType.hpp>
  22 #include <i18nlangtag/languagetag.hxx>
  23 #include <i18nlangtag/languagetagicu.hxx>
  24 #include <i18nutil/unicode.hxx>
  25 #include <sal/log.hxx>
  26 #include <unicode/numfmt.h>
  27 #include <unicode/uchar.h>
  28 #include "unicode_data.h"
  29 #include <rtl/character.hxx>
  30 #include <o3tl/string_view.hxx>
  31 #include <memory>
  32
  33 // Workaround for glibc braindamage:
  34 // glibc 2.4's langinfo.h does "#define CURRENCY_SYMBOL __CURRENCY_SYMBOL"
  35 // which (obviously) breaks UnicodeType::CURRENCY_SYMBOL
  36 #undef CURRENCY_SYMBOL
  37
  38 using namespace ::com::sun::star::i18n;
  39
  40 template<class L, typename T>
  41 static T getScriptType( const sal_Unicode ch, const L* typeList, T unknownType ) {
  42
  43     sal_Int16 i = 0;
  44     css::i18n::UnicodeScript type = typeList[0].to;
  45     while (type < UnicodeScript_kScriptCount && ch > UnicodeScriptType[static_cast<int>(type)][UnicodeScriptTypeTo]) {
  46         type = typeList[++i].to;
  47     }
  48
  49     return (type < UnicodeScript_kScriptCount &&
  50             ch >= UnicodeScriptType[static_cast<int>(typeList[i].from)][int(UnicodeScriptTypeFrom)]) ?
  51             typeList[i].value : unknownType;
  52 }
  53
  54 sal_Int16
  55 unicode::getUnicodeScriptType( const sal_Unicode ch, const ScriptTypeList* typeList, sal_Int16 unknownType ) {
  56     return getScriptType(ch, typeList, unknownType);
  57 }
  58
  59 sal_Unicode
  60 unicode::getUnicodeScriptStart( UnicodeScript type) {
  61     return UnicodeScriptType[static_cast<int>(type)][UnicodeScriptTypeFrom];
  62 }
  63
  64 sal_Unicode
  65 unicode::getUnicodeScriptEnd( UnicodeScript type) {
  66     return UnicodeScriptType[static_cast<int>(type)][UnicodeScriptTypeTo];
  67 }
  68
  69 sal_Int16
  70 unicode::getUnicodeType( const sal_Unicode ch ) {
  71     static sal_Unicode c = 0x00;
  72     static sal_Int16 r = 0x00;
  73
  74     if (ch == c) return r;
  75     else c = ch;
  76
  77     sal_Int16 address = UnicodeTypeIndex[ch >> 8];
  78     r = static_cast<sal_Int16>(
  79             (address < UnicodeTypeNumberBlock)
  80             ? UnicodeTypeBlockValue[address]
  81             : UnicodeTypeValue[((address - UnicodeTypeNumberBlock) << 8) + (ch & 0xff)]);
  82     return r;
  83 }
  84
  85 sal_uInt8
  86 unicode::getUnicodeDirection( const sal_Unicode ch ) {
  87     static sal_Unicode c = 0x00;
  88     static sal_uInt8 r = 0x00;
  89
  90     if (ch == c) return r;
  91     else c = ch;
  92
  93     sal_Int16 address = UnicodeDirectionIndex[ch >> 8];
  94     r = (address < UnicodeDirectionNumberBlock)
  95             ? UnicodeDirectionBlockValue[address]
  96             : UnicodeDirectionValue[((address - UnicodeDirectionNumberBlock) << 8) + (ch & 0xff)];
  97     return r;
  98 }
  99
 100 sal_uInt32 unicode::GetMirroredChar(sal_uInt32 nChar) {
 101     nChar = u_charMirror(nChar);
 102     return nChar;
 103 }
 104
 105 #define bit(name)   (1U << name)
 106
 107 #define UPPERMASK   bit(UnicodeType::UPPERCASE_LETTER)
 108
 109 #define LOWERMASK   bit(UnicodeType::LOWERCASE_LETTER)
 110
 111 #define TITLEMASK   bit(UnicodeType::TITLECASE_LETTER)
 112
 113 #define ALPHAMASK   UPPERMASK|LOWERMASK|TITLEMASK|\
 114             bit(UnicodeType::MODIFIER_LETTER)|\
 115             bit(UnicodeType::OTHER_LETTER)
 116
 117 #define SPACEMASK   bit(UnicodeType::SPACE_SEPARATOR)|\
 118             bit(UnicodeType::LINE_SEPARATOR)|\
 119             bit(UnicodeType::PARAGRAPH_SEPARATOR)
 120
 121 #define CONTROLMASK bit(UnicodeType::CONTROL)|\
 122             bit(UnicodeType::FORMAT)|\
 123             bit(UnicodeType::LINE_SEPARATOR)|\
 124             bit(UnicodeType::PARAGRAPH_SEPARATOR)
 125
 126 #define IsType(func, mask)  \
 127 bool func( const sal_Unicode ch) {\
 128     return (bit(getUnicodeType(ch)) & (mask)) != 0;\
 129 }
 130
 131 IsType(unicode::isControl, CONTROLMASK)
 132 IsType(unicode::isAlpha, ALPHAMASK)
 133 IsType(unicode::isSpace, SPACEMASK)
 134
 135 #define CONTROLSPACE    bit(0x09)|bit(0x0a)|bit(0x0b)|bit(0x0c)|bit(0x0d)|\
 136             bit(0x1c)|bit(0x1d)|bit(0x1e)|bit(0x1f)
 137
 138 bool unicode::isWhiteSpace( const sal_Unicode ch) {
 139     return (ch != 0xa0 && isSpace(ch)) || (ch <= 0x1F && (bit(ch) & (CONTROLSPACE)));
 140 }
 141
 142 sal_Int16 unicode::getScriptClassFromUScriptCode(UScriptCode eScript)
 143 {
 144     //See unicode/uscript.h
 145     sal_Int16 nRet;
 146     switch (eScript)
 147     {
 148         case USCRIPT_INVALID_CODE:
 149         case USCRIPT_COMMON:
 150         case USCRIPT_INHERITED:
 151         case USCRIPT_UNWRITTEN_LANGUAGES:
 152         case USCRIPT_UNKNOWN:
 153         case USCRIPT_MATHEMATICAL_NOTATION:
 154         case USCRIPT_SYMBOLS:
 155         case USCRIPT_WARANG_CITI:
 156             nRet = ScriptType::WEAK;
 157             break;
 158         case USCRIPT_ARMENIAN:
 159         case USCRIPT_CHEROKEE:
 160         case USCRIPT_COPTIC:
 161         case USCRIPT_CYRILLIC:
 162         case USCRIPT_GEORGIAN:
 163         case USCRIPT_GOTHIC:
 164         case USCRIPT_GREEK:
 165         case USCRIPT_LATIN:
 166         case USCRIPT_OGHAM:
 167         case USCRIPT_OLD_ITALIC:
 168         case USCRIPT_RUNIC:
 169         case USCRIPT_CANADIAN_ABORIGINAL:
 170         case USCRIPT_BRAILLE:
 171         case USCRIPT_CYPRIOT:
 172         case USCRIPT_OSMANYA:
 173         case USCRIPT_SHAVIAN:
 174         case USCRIPT_KATAKANA_OR_HIRAGANA:
 175         case USCRIPT_GLAGOLITIC:
 176         case USCRIPT_CIRTH:
 177         case USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC:
 178         case USCRIPT_OLD_HUNGARIAN:
 179         case USCRIPT_LATIN_FRAKTUR:
 180         case USCRIPT_LATIN_GAELIC:
 181             nRet = ScriptType::LATIN;
 182             break;
 183         case USCRIPT_BOPOMOFO:
 184         case USCRIPT_HAN:
 185         case USCRIPT_HANGUL:
 186         case USCRIPT_HIRAGANA:
 187         case USCRIPT_KATAKANA:
 188         case USCRIPT_YI:
 189         case USCRIPT_SIMPLIFIED_HAN:
 190         case USCRIPT_TRADITIONAL_HAN:
 191         case USCRIPT_JAPANESE:
 192         case USCRIPT_KOREAN:
 193 #if (U_ICU_VERSION_MAJOR_NUM > 4) || (U_ICU_VERSION_MAJOR_NUM == 4 && U_ICU_VERSION_MINOR_NUM >= 8)
 194         case USCRIPT_TANGUT:
 195 #endif
 196 #if (U_ICU_VERSION_MAJOR_NUM >= 66)
 197         case USCRIPT_KHITAN_SMALL_SCRIPT:
 198 #endif
 199             nRet = ScriptType::ASIAN;
 200             break;
 201         case USCRIPT_ARABIC:
 202         case USCRIPT_BENGALI:
 203         case USCRIPT_DESERET:
 204         case USCRIPT_DEVANAGARI:
 205         case USCRIPT_ETHIOPIC:
 206         case USCRIPT_GUJARATI:
 207         case USCRIPT_GURMUKHI:
 208         case USCRIPT_HEBREW:
 209         case USCRIPT_KANNADA:
 210         case USCRIPT_KHMER:
 211         case USCRIPT_LAO:
 212         case USCRIPT_MALAYALAM:
 213         case USCRIPT_MONGOLIAN:
 214         case USCRIPT_MYANMAR:
 215         case USCRIPT_ORIYA:
 216         case USCRIPT_SINHALA:
 217         case USCRIPT_SYRIAC:
 218         case USCRIPT_TAMIL:
 219         case USCRIPT_TELUGU:
 220         case USCRIPT_THAANA:
 221         case USCRIPT_THAI:
 222         case USCRIPT_TIBETAN:
 223         case USCRIPT_TAGALOG:
 224         case USCRIPT_HANUNOO:
 225         case USCRIPT_BUHID:
 226         case USCRIPT_TAGBANWA:
 227         case USCRIPT_LIMBU:
 228         case USCRIPT_LINEAR_B:
 229         case USCRIPT_TAI_LE:
 230         case USCRIPT_UGARITIC:
 231         case USCRIPT_BUGINESE:
 232         case USCRIPT_KHAROSHTHI:
 233         case USCRIPT_SYLOTI_NAGRI:
 234         case USCRIPT_NEW_TAI_LUE:
 235         case USCRIPT_TIFINAGH:
 236         case USCRIPT_OLD_PERSIAN:
 237         case USCRIPT_BALINESE:
 238         case USCRIPT_BATAK:
 239         case USCRIPT_BLISSYMBOLS:
 240         case USCRIPT_BRAHMI:
 241         case USCRIPT_CHAM:
 242         case USCRIPT_DEMOTIC_EGYPTIAN:
 243         case USCRIPT_HIERATIC_EGYPTIAN:
 244         case USCRIPT_EGYPTIAN_HIEROGLYPHS:
 245         case USCRIPT_KHUTSURI:
 246         case USCRIPT_PAHAWH_HMONG:
 247         case USCRIPT_HARAPPAN_INDUS:
 248         case USCRIPT_JAVANESE:
 249         case USCRIPT_KAYAH_LI:
 250         case USCRIPT_LEPCHA:
 251         case USCRIPT_LINEAR_A:
 252         case USCRIPT_MANDAEAN:
 253         case USCRIPT_MAYAN_HIEROGLYPHS:
 254         case USCRIPT_MEROITIC:
 255         case USCRIPT_NKO:
 256         case USCRIPT_ORKHON:
 257         case USCRIPT_OLD_PERMIC:
 258         case USCRIPT_PHAGS_PA:
 259         case USCRIPT_PHOENICIAN:
 260         case USCRIPT_PHONETIC_POLLARD:
 261         case USCRIPT_RONGORONGO:
 262         case USCRIPT_SARATI:
 263         case USCRIPT_ESTRANGELO_SYRIAC:
 264         case USCRIPT_WESTERN_SYRIAC:
 265         case USCRIPT_EASTERN_SYRIAC:
 266         case USCRIPT_TENGWAR:
 267         case USCRIPT_VAI:
 268         case USCRIPT_VISIBLE_SPEECH:
 269         case USCRIPT_CUNEIFORM:
 270         case USCRIPT_CARIAN:
 271         case USCRIPT_LANNA:
 272         case USCRIPT_LYCIAN:
 273         case USCRIPT_LYDIAN:
 274         case USCRIPT_OL_CHIKI:
 275         case USCRIPT_REJANG:
 276         case USCRIPT_SAURASHTRA:
 277         case USCRIPT_SIGN_WRITING:
 278         case USCRIPT_SUNDANESE:
 279         case USCRIPT_MOON:
 280         case USCRIPT_MEITEI_MAYEK:
 281         case USCRIPT_IMPERIAL_ARAMAIC:
 282         case USCRIPT_AVESTAN:
 283         case USCRIPT_CHAKMA:
 284         case USCRIPT_KAITHI:
 285         case USCRIPT_MANICHAEAN:
 286         case USCRIPT_INSCRIPTIONAL_PAHLAVI:
 287         case USCRIPT_PSALTER_PAHLAVI:
 288         case USCRIPT_BOOK_PAHLAVI:
 289         case USCRIPT_INSCRIPTIONAL_PARTHIAN:
 290         case USCRIPT_SAMARITAN:
 291         case USCRIPT_TAI_VIET:
 292         case USCRIPT_BAMUM:
 293         case USCRIPT_LISU:
 294         case USCRIPT_NAKHI_GEBA:
 295         case USCRIPT_OLD_SOUTH_ARABIAN:
 296         case USCRIPT_BASSA_VAH:
 297         case USCRIPT_DUPLOYAN_SHORTAND:
 298         case USCRIPT_ELBASAN:
 299         case USCRIPT_GRANTHA:
 300         case USCRIPT_KPELLE:
 301         case USCRIPT_LOMA:
 302         case USCRIPT_MENDE:
 303         case USCRIPT_MEROITIC_CURSIVE:
 304         case USCRIPT_OLD_NORTH_ARABIAN:
 305         case USCRIPT_NABATAEAN:
 306         case USCRIPT_PALMYRENE:
 307         case USCRIPT_SINDHI:
 308         default:         // anything new is going to be pretty wild
 309             nRet = ScriptType::COMPLEX;
 310             break;
 311     }
 312     return nRet;
 313 }
 314
 315 sal_Int16 unicode::getScriptClassFromLanguageTag( const LanguageTag& rLanguageTag )
 316 {
 317     constexpr int32_t nBuf = 42;
 318     UScriptCode aBuf[nBuf];
 319     if (rLanguageTag.hasScript())
 320     {
 321         aBuf[0] = static_cast<UScriptCode>(u_getPropertyValueEnum( UCHAR_SCRIPT,
 322                 OUStringToOString( rLanguageTag.getScript(), RTL_TEXTENCODING_ASCII_US).getStr()));
 323     }
 324     else
 325     {
 326         OUString aName;
 327         if (rLanguageTag.getCountry().isEmpty())
 328             aName = rLanguageTag.getLanguage();
 329         else
 330             aName = rLanguageTag.getLanguage() + "-" + rLanguageTag.getCountry();
 331         UErrorCode status = U_ZERO_ERROR;
 332         const int32_t nScripts = uscript_getCode(
 333                 OUStringToOString( aName, RTL_TEXTENCODING_ASCII_US).getStr(),
 334                 aBuf, nBuf, &status);
 335         // U_BUFFER_OVERFLOW_ERROR would be set with too many scripts for buffer
 336         // and required capacity returned, but really..
 337         if (nScripts == 0 || !U_SUCCESS(status))
 338             return css::i18n::ScriptType::LATIN;
 339     }
 340     return getScriptClassFromUScriptCode( aBuf[0]);
 341 }
 342
 343 OString unicode::getExemplarLanguageForUScriptCode(UScriptCode eScript)
 344 {
 345     OString sRet;
 346     switch (eScript)
 347     {
 348         case USCRIPT_CODE_LIMIT:
 349         case USCRIPT_INVALID_CODE:
 350             sRet = "zxx";
 351             break;
 352         case USCRIPT_COMMON:
 353         case USCRIPT_INHERITED:
 354             sRet = "und";
 355             break;
 356         case USCRIPT_MATHEMATICAL_NOTATION:
 357         case USCRIPT_SYMBOLS:
 358             sRet = "zxx";
 359             break;
 360         case USCRIPT_UNWRITTEN_LANGUAGES:
 361         case USCRIPT_UNKNOWN:
 362             sRet = "und";
 363             break;
 364         case USCRIPT_ARABIC:
 365             sRet = "ar";
 366             break;
 367         case USCRIPT_ARMENIAN:
 368             sRet = "hy";
 369             break;
 370         case USCRIPT_BENGALI:
 371             sRet = "bn";
 372             break;
 373         case USCRIPT_BOPOMOFO:
 374             sRet = "zh";
 375             break;
 376         case USCRIPT_CHEROKEE:
 377             sRet = "chr";
 378             break;
 379         case USCRIPT_COPTIC:
 380             sRet = "cop";
 381             break;
 382         case USCRIPT_CYRILLIC:
 383             sRet = "ru";
 384             break;
 385         case USCRIPT_DESERET:
 386             sRet = "en";
 387             break;
 388         case USCRIPT_DEVANAGARI:
 389             sRet = "hi";
 390             break;
 391         case USCRIPT_ETHIOPIC:
 392             sRet = "am";
 393             break;
 394         case USCRIPT_GEORGIAN:
 395             sRet = "ka";
 396             break;
 397         case USCRIPT_GOTHIC:
 398             sRet = "got";
 399             break;
 400         case USCRIPT_GREEK:
 401             sRet = "el";
 402             break;
 403         case USCRIPT_GUJARATI:
 404             sRet = "gu";
 405             break;
 406         case USCRIPT_GURMUKHI:
 407             sRet = "pa";
 408             break;
 409         case USCRIPT_HAN:
 410             sRet = "zh";
 411             break;
 412         case USCRIPT_HANGUL:
 413             sRet = "ko";
 414             break;
 415         case USCRIPT_HEBREW:
 416             sRet = "hr";
 417             break;
 418         case USCRIPT_HIRAGANA:
 419             sRet = "ja";
 420             break;
 421         case USCRIPT_KANNADA:
 422             sRet = "kn";
 423             break;
 424         case USCRIPT_KATAKANA:
 425             sRet = "ja";
 426             break;
 427         case USCRIPT_KHMER:
 428             sRet = "km";
 429             break;
 430         case USCRIPT_LAO:
 431             sRet = "lo";
 432             break;
 433         case USCRIPT_LATIN:
 434             sRet = "en";
 435             break;
 436         case USCRIPT_MALAYALAM:
 437             sRet = "ml";
 438             break;
 439         case USCRIPT_MONGOLIAN:
 440             sRet = "mn";
 441             break;
 442         case USCRIPT_MYANMAR:
 443             sRet = "my";
 444             break;
 445         case USCRIPT_OGHAM:
 446             sRet = "pgl";
 447             break;
 448         case USCRIPT_OLD_ITALIC:
 449             sRet = "osc";
 450             break;
 451         case USCRIPT_ORIYA:
 452             sRet = "or";
 453             break;
 454         case USCRIPT_RUNIC:
 455             sRet = "ang";
 456             break;
 457         case USCRIPT_SINHALA:
 458             sRet = "si";
 459             break;
 460         case USCRIPT_SYRIAC:
 461             sRet = "syr";
 462             break;
 463         case USCRIPT_TAMIL:
 464             sRet = "ta";
 465             break;
 466         case USCRIPT_TELUGU:
 467             sRet = "te";
 468             break;
 469         case USCRIPT_THAANA:
 470             sRet = "dv";
 471             break;
 472         case USCRIPT_THAI:
 473             sRet = "th";
 474             break;
 475         case USCRIPT_TIBETAN:
 476             sRet = "bo";
 477             break;
 478         case USCRIPT_CANADIAN_ABORIGINAL:
 479             sRet = "iu";
 480             break;
 481         case USCRIPT_YI:
 482             sRet = "ii";
 483             break;
 484         case USCRIPT_TAGALOG:
 485             sRet = "tl";
 486             break;
 487         case USCRIPT_HANUNOO:
 488             sRet = "hnn";
 489             break;
 490         case USCRIPT_BUHID:
 491             sRet = "bku";
 492             break;
 493         case USCRIPT_TAGBANWA:
 494             sRet = "tbw";
 495             break;
 496         case USCRIPT_BRAILLE:
 497             sRet = "en";
 498             break;
 499         case USCRIPT_CYPRIOT:
 500             sRet = "ecy";
 501             break;
 502         case USCRIPT_LIMBU:
 503             sRet = "lif";
 504             break;
 505         case USCRIPT_LINEAR_B:
 506             sRet = "gmy";
 507             break;
 508         case USCRIPT_OSMANYA:
 509             sRet = "so";
 510             break;
 511         case USCRIPT_SHAVIAN:
 512             sRet = "en";
 513             break;
 514         case USCRIPT_TAI_LE:
 515             sRet = "tdd";
 516             break;
 517         case USCRIPT_UGARITIC:
 518             sRet = "uga";
 519             break;
 520         case USCRIPT_KATAKANA_OR_HIRAGANA:
 521             sRet = "ja";
 522             break;
 523         case USCRIPT_BUGINESE:
 524             sRet = "bug";
 525             break;
 526         case USCRIPT_GLAGOLITIC:
 527             sRet = "ch";
 528             break;
 529         case USCRIPT_KHAROSHTHI:
 530             sRet = "pra";
 531             break;
 532         case USCRIPT_SYLOTI_NAGRI:
 533             sRet = "syl";
 534             break;
 535         case USCRIPT_NEW_TAI_LUE:
 536             sRet = "khb";
 537             break;
 538         case USCRIPT_TIFINAGH:
 539             sRet = "tmh";
 540             break;
 541         case USCRIPT_OLD_PERSIAN:
 542             sRet = "peo";
 543             break;
 544         case USCRIPT_BALINESE:
 545             sRet = "ban";
 546             break;
 547         case USCRIPT_BATAK:
 548             sRet = "btk";
 549             break;
 550         case USCRIPT_BLISSYMBOLS:
 551             sRet = "en";
 552             break;
 553         case USCRIPT_BRAHMI:
 554             sRet = "pra";
 555             break;
 556         case USCRIPT_CHAM:
 557             sRet = "cja";
 558             break;
 559         case USCRIPT_CIRTH:
 560             sRet = "sjn";
 561             break;
 562         case USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC:
 563             sRet = "cu";
 564             break;
 565         case USCRIPT_DEMOTIC_EGYPTIAN:
 566         case USCRIPT_HIERATIC_EGYPTIAN:
 567         case USCRIPT_EGYPTIAN_HIEROGLYPHS:
 568             sRet = "egy";
 569             break;
 570         case USCRIPT_KHUTSURI:
 571             sRet = "ka";
 572             break;
 573         case USCRIPT_SIMPLIFIED_HAN:
 574             sRet = "zh";
 575             break;
 576         case USCRIPT_TRADITIONAL_HAN:
 577             sRet = "zh";
 578             break;
 579         case USCRIPT_PAHAWH_HMONG:
 580             sRet = "blu";
 581             break;
 582         case USCRIPT_OLD_HUNGARIAN:
 583             sRet = "ohu";
 584             break;
 585         case USCRIPT_HARAPPAN_INDUS:
 586             sRet = "xiv";
 587             break;
 588         case USCRIPT_JAVANESE:
 589             sRet = "kaw";
 590             break;
 591         case USCRIPT_KAYAH_LI:
 592             sRet = "eky";
 593             break;
 594         case USCRIPT_LATIN_FRAKTUR:
 595             sRet = "de";
 596             break;
 597         case USCRIPT_LATIN_GAELIC:
 598             sRet = "ga";
 599             break;
 600         case USCRIPT_LEPCHA:
 601             sRet = "lep";
 602             break;
 603         case USCRIPT_LINEAR_A:
 604             sRet = "ecr";
 605             break;
 606         case USCRIPT_MAYAN_HIEROGLYPHS:
 607             sRet = "myn";
 608             break;
 609         case USCRIPT_MEROITIC:
 610             sRet = "xmr";
 611             break;
 612         case USCRIPT_NKO:
 613             sRet = "nqo";
 614             break;
 615         case USCRIPT_ORKHON:
 616             sRet = "otk";
 617             break;
 618         case USCRIPT_OLD_PERMIC:
 619             sRet = "kv";
 620             break;
 621         case USCRIPT_PHAGS_PA:
 622             sRet = "xng";
 623             break;
 624         case USCRIPT_PHOENICIAN:
 625             sRet = "phn";
 626             break;
 627         case USCRIPT_PHONETIC_POLLARD:
 628             sRet = "hmd";
 629             break;
 630         case USCRIPT_RONGORONGO:
 631             sRet = "rap";
 632             break;
 633         case USCRIPT_SARATI:
 634             sRet = "qya";
 635             break;
 636         case USCRIPT_ESTRANGELO_SYRIAC:
 637             sRet = "syr";
 638             break;
 639         case USCRIPT_WESTERN_SYRIAC:
 640             sRet = "tru";
 641             break;
 642         case USCRIPT_EASTERN_SYRIAC:
 643             sRet = "aii";
 644             break;
 645         case USCRIPT_TENGWAR:
 646             sRet = "sjn";
 647             break;
 648         case USCRIPT_VAI:
 649             sRet = "vai";
 650             break;
 651         case USCRIPT_VISIBLE_SPEECH:
 652             sRet = "en";
 653             break;
 654         case USCRIPT_CUNEIFORM:
 655             sRet = "akk";
 656             break;
 657         case USCRIPT_CARIAN:
 658             sRet = "xcr";
 659             break;
 660         case USCRIPT_JAPANESE:
 661             sRet = "ja";
 662             break;
 663         case USCRIPT_LANNA:
 664             sRet = "nod";
 665             break;
 666         case USCRIPT_LYCIAN:
 667             sRet = "xlc";
 668             break;
 669         case USCRIPT_LYDIAN:
 670             sRet = "xld";
 671             break;
 672         case USCRIPT_OL_CHIKI:
 673             sRet = "sat";
 674             break;
 675         case USCRIPT_REJANG:
 676             sRet = "rej";
 677             break;
 678         case USCRIPT_SAURASHTRA:
 679             sRet = "saz";
 680             break;
 681         case USCRIPT_SIGN_WRITING:
 682             sRet = "en";
 683             break;
 684         case USCRIPT_SUNDANESE:
 685             sRet = "su";
 686             break;
 687         case USCRIPT_MOON:
 688             sRet = "en";
 689             break;
 690         case USCRIPT_MEITEI_MAYEK:
 691             sRet = "mni";
 692             break;
 693         case USCRIPT_IMPERIAL_ARAMAIC:
 694             sRet = "arc";
 695             break;
 696         case USCRIPT_AVESTAN:
 697             sRet = "ae";
 698             break;
 699         case USCRIPT_CHAKMA:
 700             sRet = "ccp";
 701             break;
 702         case USCRIPT_KOREAN:
 703             sRet = "ko";
 704             break;
 705         case USCRIPT_KAITHI:
 706             sRet = "awa";
 707             break;
 708         case USCRIPT_MANICHAEAN:
 709             sRet = "xmn";
 710             break;
 711         case USCRIPT_INSCRIPTIONAL_PAHLAVI:
 712         case USCRIPT_PSALTER_PAHLAVI:
 713         case USCRIPT_BOOK_PAHLAVI:
 714         case USCRIPT_INSCRIPTIONAL_PARTHIAN:
 715             sRet = "xpr";
 716             break;
 717         case USCRIPT_SAMARITAN:
 718             sRet = "heb";
 719             break;
 720         case USCRIPT_TAI_VIET:
 721             sRet = "blt";
 722             break;
 723         case USCRIPT_MANDAEAN: /* Aliased to USCRIPT_MANDAIC in icu 4.6. */
 724             sRet = "mic";
 725             break;
 726         case USCRIPT_NABATAEAN:
 727             sRet = "mis-Nbat";  // Uncoded with script
 728             break;
 729         case USCRIPT_PALMYRENE:
 730             sRet = "mis-Palm";  // Uncoded with script
 731             break;
 732         case USCRIPT_BAMUM:
 733             sRet = "bax";
 734             break;
 735         case USCRIPT_LISU:
 736             sRet = "lis";
 737             break;
 738         case USCRIPT_NAKHI_GEBA:
 739             sRet = "nxq";
 740             break;
 741         case USCRIPT_OLD_SOUTH_ARABIAN:
 742             sRet = "xsa";
 743             break;
 744         case USCRIPT_BASSA_VAH:
 745             sRet = "bsq";
 746             break;
 747         case USCRIPT_DUPLOYAN_SHORTAND:
 748             sRet = "fr";
 749             break;
 750         case USCRIPT_ELBASAN:
 751             sRet = "sq";
 752             break;
 753         case USCRIPT_GRANTHA:
 754             sRet = "ta";
 755             break;
 756         case USCRIPT_KPELLE:
 757             sRet = "kpe";
 758             break;
 759         case USCRIPT_LOMA:
 760             sRet = "lom";
 761             break;
 762         case USCRIPT_MENDE:
 763             sRet = "men";
 764             break;
 765         case USCRIPT_MEROITIC_CURSIVE:
 766             sRet = "xmr";
 767             break;
 768         case USCRIPT_OLD_NORTH_ARABIAN:
 769             sRet = "xna";
 770             break;
 771         case USCRIPT_SINDHI:
 772             sRet = "sd";
 773             break;
 774         case USCRIPT_WARANG_CITI:
 775             sRet = "hoc";
 776             break;
 777 #if (U_ICU_VERSION_MAJOR_NUM > 4) || (U_ICU_VERSION_MAJOR_NUM == 4 && U_ICU_VERSION_MINOR_NUM >= 8)
 778         case USCRIPT_AFAKA:
 779             sRet = "djk";
 780             break;
 781         case USCRIPT_JURCHEN:
 782             sRet = "juc";
 783             break;
 784         case USCRIPT_MRO:
 785             sRet = "cmr";
 786             break;
 787         case USCRIPT_NUSHU:
 788             sRet = "mis-Nshu";  // Uncoded with script
 789             break;
 790         case USCRIPT_SHARADA:
 791             sRet = "sa";
 792             break;
 793         case USCRIPT_SORA_SOMPENG:
 794             sRet = "srb";
 795             break;
 796         case USCRIPT_TAKRI:
 797             sRet = "doi";
 798             break;
 799         case USCRIPT_TANGUT:
 800             sRet = "txg";
 801             break;
 802         case USCRIPT_WOLEAI:
 803             sRet = "woe";
 804             break;
 805 #endif
 806 #if (U_ICU_VERSION_MAJOR_NUM >= 49)
 807         case USCRIPT_ANATOLIAN_HIEROGLYPHS:
 808             sRet = "hlu";
 809             break;
 810         case USCRIPT_KHOJKI:
 811             sRet = "gu";
 812             break;
 813         case USCRIPT_TIRHUTA:
 814             sRet = "mai";
 815             break;
 816 #endif
 817 #if (U_ICU_VERSION_MAJOR_NUM >= 52)
 818         case USCRIPT_CAUCASIAN_ALBANIAN:
 819             sRet = "xag";
 820             break;
 821         case USCRIPT_MAHAJANI:
 822             sRet = "mwr";
 823             break;
 824 #endif
 825 #if (U_ICU_VERSION_MAJOR_NUM >= 54)
 826         case USCRIPT_AHOM:
 827             sRet = "aho";
 828             break;
 829         case USCRIPT_HATRAN:
 830             sRet = "qly-Hatr";
 831             break;
 832         case USCRIPT_MODI:
 833             sRet = "mr-Modi";
 834             break;
 835         case USCRIPT_MULTANI:
 836             sRet = "skr-Mutl";
 837             break;
 838         case USCRIPT_PAU_CIN_HAU:
 839             sRet = "ctd-Pauc";
 840             break;
 841         case USCRIPT_SIDDHAM:
 842             sRet = "sa-Sidd";
 843             break;
 844 #endif
 845 #if (U_ICU_VERSION_MAJOR_NUM >= 58)
 846         case USCRIPT_ADLAM:
 847             sRet = "mis-Adlm";   // Adlam for Fulani, no language code
 848             break;
 849         case USCRIPT_BHAIKSUKI:
 850             sRet = "mis-Bhks";   // Bhaiksuki for some Buddhist texts, no language code
 851             break;
 852         case USCRIPT_MARCHEN:
 853             sRet = "bo-Marc";
 854             break;
 855         case USCRIPT_NEWA:
 856             sRet = "new-Newa";
 857             break;
 858         case USCRIPT_OSAGE:
 859             sRet = "osa-Osge";
 860             break;
 861         case USCRIPT_HAN_WITH_BOPOMOFO:
 862             sRet = "mis-Hanb";   // Han with Bopomofo, zh-Hanb ?
 863             break;
 864         case USCRIPT_JAMO:
 865             sRet = "ko";   // Jamo - elements of Hangul Syllables
 866             break;
 867         case USCRIPT_SYMBOLS_EMOJI:
 868             sRet = "mis-Zsye";   // Emoji variant
 869             break;
 870 #endif
 871 #if (U_ICU_VERSION_MAJOR_NUM >= 60)
 872         case USCRIPT_MASARAM_GONDI:
 873             sRet = "gon-Gonm";  // macro language code, could be wsg,esg,gno
 874             break;
 875         case USCRIPT_SOYOMBO:
 876             sRet = "mn-Soyo";   // abugida to write Mongolian, also Tibetan and Sanskrit
 877             break;
 878         case USCRIPT_ZANABAZAR_SQUARE:
 879             sRet = "mn-Zanb";   // abugida to write Mongolian
 880             break;
 881 #endif
 882 #if (U_ICU_VERSION_MAJOR_NUM >= 62)
 883         case USCRIPT_DOGRA:
 884             sRet = "dgo";       // Dogri proper
 885             break;
 886         case USCRIPT_GUNJALA_GONDI:
 887             sRet = "wsg";       // Adilabad Gondi
 888             break;
 889         case USCRIPT_MAKASAR:
 890             sRet = "mak";
 891             break;
 892         case USCRIPT_MEDEFAIDRIN:
 893             sRet = "dmf-Medf";
 894             break;
 895         case USCRIPT_HANIFI_ROHINGYA:
 896             sRet = "rhg";
 897             break;
 898         case USCRIPT_SOGDIAN:
 899             sRet = "sog";
 900             break;
 901         case USCRIPT_OLD_SOGDIAN:
 902             sRet = "sog";
 903             break;
 904 #endif
 905 #if (U_ICU_VERSION_MAJOR_NUM >= 64)
 906         case USCRIPT_ELYMAIC:
 907             sRet = "arc-Elym";
 908             break;
 909         case USCRIPT_NYIAKENG_PUACHUE_HMONG:
 910             sRet = "hmn-Hmnp";  // macrolanguage code
 911             break;
 912         case USCRIPT_NANDINAGARI:
 913             sRet = "sa-Nand";
 914             break;
 915         case USCRIPT_WANCHO:
 916             sRet = "nnp-Wcho";
 917             break;
 918 #endif
 919 #if (U_ICU_VERSION_MAJOR_NUM >= 66)
 920         case USCRIPT_CHORASMIAN:
 921             sRet = "xco-Chrs";
 922             break;
 923         case USCRIPT_DIVES_AKURU:
 924             sRet = "dv-Diak";
 925             break;
 926         case USCRIPT_KHITAN_SMALL_SCRIPT:
 927             sRet = "zkt-Kits";
 928             break;
 929         case USCRIPT_YEZIDI:
 930             sRet = "kmr-Yezi";
 931             break;
 932 #endif
 933 #if (U_ICU_VERSION_MAJOR_NUM >= 70)
 934         case USCRIPT_CYPRO_MINOAN:
 935             sRet = "mis-Cpmn";  // Uncoded with script
 936             break;
 937         case USCRIPT_OLD_UYGHUR:
 938             sRet = "oui-Ougr";
 939             break;
 940         case USCRIPT_TANGSA:
 941             sRet = "nst-Tnsa";
 942             break;
 943         case USCRIPT_TOTO:
 944             sRet = "txo-Toto";
 945             break;
 946         case USCRIPT_VITHKUQI:
 947             sRet = "sq-Vith";   // macrolanguage code
 948             break;
 949 #endif
 950 #if (U_ICU_VERSION_MAJOR_NUM >= 72)
 951         case USCRIPT_KAWI:
 952             sRet = "mis-Kawi";  // Uncoded with script
 953             break;
 954         case USCRIPT_NAG_MUNDARI:
 955             sRet = "unr-Nagm";
 956             break;
 957 #endif
 958     }
 959     return sRet;
 960 }
 961
 962 //Format a number as a percentage according to the rules of the given
 963 //language, e.g. 100 -> "100%" for en-US vs "100 %" for de-DE
 964 OUString unicode::formatPercent(double dNumber,
 965     const LanguageTag &rLangTag)
 966 {
 967     // get a currency formatter for this locale ID
 968     UErrorCode errorCode=U_ZERO_ERROR;
 969
 970     LanguageTag aLangTag(rLangTag);
 971
 972     // As of CLDR Version 24 these languages were not listed as using spacing
 973     // between number and % but are reported as such by our l10n groups
 974     // http://www.unicode.org/cldr/charts/24/by_type/numbers.number_formatting_patterns.html
 975     // so format using French which has the desired rules
 976     if (aLangTag.getLanguage() == "es" || aLangTag.getLanguage() == "sl")
 977         aLangTag.reset("fr-FR");
 978
 979     icu::Locale aLocale = LanguageTagIcu::getIcuLocale(aLangTag);
 980
 981     std::unique_ptr<icu::NumberFormat> xF(
 982         icu::NumberFormat::createPercentInstance(aLocale, errorCode));
 983     if(U_FAILURE(errorCode))
 984     {
 985         SAL_WARN("i18n", "icu::NumberFormat::createPercentInstance failed");
 986         return OUString::number(dNumber) + "%";
 987     }
 988
 989     icu::UnicodeString output;
 990     xF->format(dNumber/100, output);
 991     OUString aRet(reinterpret_cast<const sal_Unicode *>(output.getBuffer()),
 992         output.length());
 993     if (rLangTag.getLanguage() == "de")
 994     {
 995         //narrow no-break space instead of (normal) no-break space
 996         return aRet.replace(0x00A0, 0x202F);
 997     }
 998     return aRet;
 999 }
1000
// Feed one more UTF-16 code unit into the toggle state machine.
//
// Every accepted character is inserted at the FRONT of the collected buffers
// (insert(0, ...)), so each call places its character before the ones that
// were accepted earlier.
// NOTE(review): presumably callers walk backwards through the text, handing
// over the character nearest the cursor first - confirm against callers.
//
// State touched here:
//   maInput          - the collected text (hex digits / "U+" notation, or a
//                      single glyph sequence)
//   maUtf16          - pending surrogate code units
//   maCombining      - pending combining marks that still need a base char
//   mbRequiresU      - a '+' was accepted after hex digits; only 'u'/'U' may follow
//   mbIsHexString    - at least one hex digit has been accepted
//   mbAllowMoreChars - cleared as soon as the sequence is complete or invalid
//
// Returns true if the caller may offer another character, false once input
// has been terminated (the offered character may or may not have been kept,
// depending on the branch taken).
bool ToggleUnicodeCodepoint::AllowMoreInput(sal_Unicode uChar)
{
    //arbitrarily chosen maximum length allowed - normal max usage would be around 30.
    if( maInput.getLength() > 255 )
        mbAllowMoreChars = false;

    // once input has been stopped, it stays stopped
    if( !mbAllowMoreChars )
        return false;

    // once a complete "U+" notation has been collected, only further
    // notation characters are acceptable in front of it
    bool bPreventNonHex = false;
    if( maInput.indexOf("U+") != -1 )
        bPreventNonHex = true;

    switch ( unicode::getUnicodeType(uChar) )
    {
        case css::i18n::UnicodeType::SURROGATE:
            // surrogates can never be part of a U+ notation
            if( bPreventNonHex )
            {
                mbAllowMoreChars = false;
                return false;
            }

            // a low surrogate as the very first base character: remember it
            // and ask for one more character (the expected high surrogate)
            if( rtl::isLowSurrogate(uChar) && maUtf16.isEmpty() && maInput.isEmpty()  )
            {
                maUtf16.append(uChar);
                return true;
            }
            // a high surrogate is placed in front of whatever is pending in maUtf16
            if( rtl::isHighSurrogate(uChar) && maInput.isEmpty() )
                maUtf16.insert(0, uChar );
            //end of hex strings, or unexpected order of high/low, so don't accept more
            if( !maUtf16.isEmpty() )
                maInput.append(maUtf16);
            if( !maCombining.isEmpty() )
                maInput.append(maCombining);
            mbAllowMoreChars = false;
            break;

        case css::i18n::UnicodeType::NON_SPACING_MARK:
        case css::i18n::UnicodeType::COMBINING_SPACING_MARK:
            // combining marks can never be part of a U+ notation
            if( bPreventNonHex )
            {
                mbAllowMoreChars = false;
                return false;
            }

            //extreme edge case: already invalid high/low surrogates with preceding combining chars, and now an extra combining mark.
            if( !maUtf16.isEmpty() )
            {
                // keep what was collected so far; the new mark is not added
                maInput = maUtf16;
                if( !maCombining.isEmpty() )
                    maInput.append(maCombining);
                mbAllowMoreChars = false;
                return false;
            }
            // queue the mark; falls out of the switch and returns
            // mbAllowMoreChars (still true) so a base character can follow
            maCombining.insert(0, uChar);
            break;

        default:
            //extreme edge case: already invalid high/low surrogates with preceding combining chars, and now an extra character.
            if( !maUtf16.isEmpty() )
            {
                // keep what was collected so far; the new character is not added
                maInput = maUtf16;
                if( !maCombining.isEmpty() )
                    maInput.append(maCombining);
                mbAllowMoreChars = false;
                return false;
            }

            // pending combining marks: this character becomes their base and
            // the resulting sequence is the complete input
            if( !maCombining.isEmpty() )
            {
                maCombining.insert(0, uChar);
                maInput = maCombining;
                mbAllowMoreChars = false;
                return false;
            }

            // 0 - 1f are control characters.  Do not process those.
            if( uChar < 0x20 )
            {
                mbAllowMoreChars = false;
                return false;
            }

            switch( uChar )
            {
                case 'u':
                case 'U':
                    // U+ notation found.  Continue looking for another one.
                    // (the '+' itself was not stored when it was seen, so the
                    // normalized "U+" prefix is inserted here in one go)
                    if( mbRequiresU )
                    {
                        mbRequiresU = false;
                        maInput.insert(0,"U+");
                    }
                    // treat as a normal character
                    else
                    {
                        mbAllowMoreChars = false;
                        // only kept when no U+ notation has been collected yet
                        if( !bPreventNonHex )
                            maInput.insertUtf32(0, uChar);
                    }
                    break;
                case '+':
                    // + already found: skip when not U, or edge case of +U+xxxx
                    if( mbRequiresU || (maInput.indexOf("U+") == 0) )
                        mbAllowMoreChars = false;
                    // hex chars followed by '+' - now require a 'U'
                    else if ( !maInput.isEmpty() )
                        mbRequiresU = true;
                    // treat as a normal character
                    else
                    {
                        mbAllowMoreChars = false;
                        // bPreventNonHex is necessarily false here because
                        // maInput is empty, so the '+' is kept
                        if( !bPreventNonHex )
                            maInput.insertUtf32(0, uChar);
                    }
                    break;
                default:
                    // + already found. Since not U, cancel further input
                    if( mbRequiresU )
                        mbAllowMoreChars = false;
                    // maximum digits per notation is 8: only one notation
                    else if( maInput.indexOf("U+") == -1 && maInput.getLength() == 8 )
                        mbAllowMoreChars = false;
                    // maximum digits per notation is 8: previous notation found
                    else if( maInput.indexOf("U+") == 8 )
                        mbAllowMoreChars = false;
                    // a hex character. Add to string.
                    else if( rtl::isAsciiHexDigit(uChar) )
                    {
                        mbIsHexString = true;
                        maInput.insertUtf32(0, uChar);
                    }
                    // not a hex character: stop input. keep if it is the first input provided
                    else
                    {
                        mbAllowMoreChars = false;
                        if( maInput.isEmpty() )
                            maInput.insertUtf32(0, uChar);
                    }
            }
    }
    // branches that did not return early report the current state
    return mbAllowMoreChars;
}
1144
1145 OUString ToggleUnicodeCodepoint::StringToReplace()
1146 {
1147     if( maInput.isEmpty() )
1148     {
1149         //edge case - input finished with incomplete low surrogate or combining characters without a base
1150         if( mbAllowMoreChars )
1151         {
1152             if( !maUtf16.isEmpty() )
1153                 maInput = maUtf16;
1154             if( !maCombining.isEmpty() )
1155                 maInput.append(maCombining);
1156         }
1157         return maInput.toString();
1158     }
1159
1160     if( !mbIsHexString )
1161         return maInput.toString();
1162
1163     //this function potentially modifies the input string.  Prevent addition of further characters
1164     mbAllowMoreChars = false;
1165
1166     //validate unicode notation.
1167     OUString sIn;
1168     sal_uInt32 nUnicode = 0;
1169     sal_Int32 nUPlus = maInput.indexOf("U+");
1170     //if U+ notation used, strip off all extra chars added not in U+ notation
1171     if( nUPlus != -1 )
1172     {
1173         maInput.remove(0, nUPlus);
1174         sIn = maInput.copy(2).makeStringAndClear();
1175         nUPlus = sIn.indexOf("U+");
1176     }
1177     else
1178         sIn = maInput.toString();
1179     while( nUPlus != -1 )
1180     {
1181         nUnicode = o3tl::toUInt32(sIn.subView(0, nUPlus), 16);
1182         //prevent creating control characters or invalid Unicode values
1183         if( !rtl::isUnicodeCodePoint(nUnicode) || nUnicode < 0x20  )
1184             maInput = sIn.subView(nUPlus);
1185         sIn = sIn.copy(nUPlus+2);
1186         nUPlus =  sIn.indexOf("U+");
1187     }
1188
1189     nUnicode = sIn.toUInt32(16);
1190     if( !rtl::isUnicodeCodePoint(nUnicode) || nUnicode < 0x20 )
1191        maInput.truncate().append( sIn[sIn.getLength()-1] );
1192     return maInput.toString();
1193 }
1194
1195 sal_uInt32 ToggleUnicodeCodepoint::CharsToDelete()
1196 {
1197     OUString sIn = StringToReplace();
1198     sal_Int32 nPos = 0;
1199     sal_uInt32 counter = 0;
1200     while( nPos < sIn.getLength() )
1201     {
1202         sIn.iterateCodePoints(&nPos);
1203         ++counter;
1204     }
1205     return counter;
1206 }
1207
1208 OUString ToggleUnicodeCodepoint::ReplacementString()
1209 {
1210     OUString sIn = StringToReplace();
1211     OUStringBuffer output = "";
1212     sal_Int32 nUPlus = sIn.indexOf("U+");
1213     // convert from hex notation to glyph
1214     if( nUPlus != -1 || (sIn.getLength() > 1 && mbIsHexString) )
1215     {
1216         sal_uInt32 nUnicode = 0;
1217         if( nUPlus == 0)
1218         {
1219             sIn = sIn.copy(2);
1220             nUPlus = sIn.indexOf("U+");
1221         }
1222         while( nUPlus > 0 )
1223         {
1224             nUnicode = o3tl::toUInt32(sIn.subView(0, nUPlus), 16);
1225             output.appendUtf32( nUnicode );
1226
1227             sIn = sIn.copy(nUPlus+2);
1228             nUPlus = sIn.indexOf("U+");
1229         }
1230         nUnicode = sIn.toUInt32(16);
1231         output.appendUtf32( nUnicode );
1232     }
1233     // convert from glyph to hex notation
1234     else
1235     {
1236         sal_Int32 nPos = 0;
1237         while( nPos < sIn.getLength() )
1238         {
1239             OUStringBuffer aTmp = OUString::number(sIn.iterateCodePoints(&nPos),16);
1240             //pad with zeros - minimum length of 4.
1241             for( sal_Int32 i = 4 - aTmp.getLength(); i > 0; --i )
1242                 aTmp.insert( 0,"0" );
1243             output.append( "U+" + aTmp );
1244         }
1245     }
1246     return output.makeStringAndClear();
1247 }
1248
1249 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */