i18npool/source/breakiterator/breakiterator_unicode.cxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*
   3  * This file is part of the LibreOffice project.
   4  *
   5  * This Source Code Form is subject to the terms of the Mozilla Public
   6  * License, v. 2.0. If a copy of the MPL was not distributed with this
   7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
   8  *
   9  * This file incorporates work covered by the following license notice:
  10  *
  11  *   Licensed to the Apache Software Foundation (ASF) under one or more
  12  *   contributor license agreements. See the NOTICE file distributed
  13  *   with this work for additional information regarding copyright
  14  *   ownership. The ASF licenses this file to you under the Apache
  15  *   License, Version 2.0 (the "License"); you may not use this file
  16  *   except in compliance with the License. You may obtain a copy of
  17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  18  */
  19
  20 #include <breakiterator_unicode.hxx>
  21 #include <cppuhelper/supportsservice.hxx>
  22 #include <localedata.hxx>
  23 #include <i18nlangtag/languagetag.hxx>
  24 #include <i18nlangtag/languagetagicu.hxx>
  25 #include <unicode/uchar.h>
  26 #include <unicode/locid.h>
  27 #include <unicode/rbbi.h>
  28 #include <unicode/udata.h>
  29 #include <rtl/strbuf.hxx>
  30 #include <rtl/ustring.hxx>
  31 #include <string.h>
  32
  33 U_CDECL_BEGIN
  34 extern const char OpenOffice_dat[];
  35 U_CDECL_END
  36
  37 using namespace ::com::sun::star;
  38 using namespace ::com::sun::star::i18n;
  39 using namespace ::com::sun::star::lang;
  40
  41 namespace i18npool {
  42
  43 // Cache map of breakiterators, stores state information so has to be
  44 // thread_local.
  45 thread_local static BreakIterator_Unicode::BIMap theBIMap;
  46
  47 BreakIterator_Unicode::BreakIterator_Unicode()
  48     : cBreakIterator( "com.sun.star.i18n.BreakIterator_Unicode" )    // implementation name
  49     , lineRule( "line" )
  50     , icuBI( nullptr )
  51 {
  52 }
  53
  54 BreakIterator_Unicode::~BreakIterator_Unicode()
  55 {
  56 }
  57
  58 /*
  59     Wrapper class to provide public access to the icu::RuleBasedBreakIterator's
  60     setbreakType method.
  61 */
  62 class OOoRuleBasedBreakIterator : public icu::RuleBasedBreakIterator
  63 {
  64     public:
  65 #if (U_ICU_VERSION_MAJOR_NUM < 58)
  66     // icu::RuleBasedBreakIterator::setBreakType() is private as of ICU 58.
  67     void publicSetBreakType(int32_t type)
  68         {
  69             setBreakType(type);
  70         };
  71 #endif
  72     OOoRuleBasedBreakIterator(UDataMemory* image,
  73                               UErrorCode &status)
  74         : icu::RuleBasedBreakIterator(image, status)
  75         { };
  76
  77 };
  78
  79 // loading ICU breakiterator on demand.
  80 void BreakIterator_Unicode::loadICUBreakIterator(const css::lang::Locale& rLocale,
  81         sal_Int16 rBreakType, sal_Int16 nWordType, const sal_Char *rule, const OUString& rText)
  82 {
  83     bool bNewBreak = false;
  84     UErrorCode status = U_ZERO_ERROR;
  85     sal_Int16 breakType = 0;
  86     switch (rBreakType) {
  87         case LOAD_CHARACTER_BREAKITERATOR: icuBI=&character; breakType = 3; break;
  88         case LOAD_WORD_BREAKITERATOR:
  89             assert (nWordType >= 0 && nWordType<= WordType::WORD_COUNT);
  90             icuBI=&words[nWordType];
  91             switch (nWordType) {
  92                 case WordType::ANY_WORD: break; // odd but previous behavior
  93                 case WordType::ANYWORD_IGNOREWHITESPACES:
  94                     breakType = 0; rule = "edit_word"; break;
  95                 case WordType::DICTIONARY_WORD:
  96                     breakType = 1; rule = "dict_word"; break;
  97                 default:
  98                 case WordType::WORD_COUNT:
  99                     breakType = 2; rule = "count_word"; break;
 100             }
 101             break;
 102         case LOAD_SENTENCE_BREAKITERATOR: icuBI=&sentence; breakType = 5; break;
 103         case LOAD_LINE_BREAKITERATOR: icuBI=&line; breakType = 4; break;
 104     }
 105
 106     // Using the cache map prevents accessing the file system for each
 107     // udata_open() where ICU tries first files then data objects. And that for
 108     // two fallbacks worst case.. for each new allocated EditEngine, layout
 109     // cell, ... *ouch*  Also non-rule locale based iterators can be mapped.
 110     // This also speeds up loading iterators for alternating or generally more
 111     // than one language/locale in that iterators are not constructed and
 112     // destroyed en masse.
 113     // Four possible keys, locale rule based with break type, locale rule based
 114     // only, rule based only, locale based with break type. A fifth global key
 115     // for the initial lookup.
 116     // Multiple global keys may map to identical value data.
 117     // All enums used here should be in the range 0..9 so assert that and avoid
 118     // expensive numeric conversion in append() for faster construction of the
 119     // always used global key.
 120     assert( 0 <= breakType && breakType <= 9 && 0 <= rBreakType && rBreakType <= 9 && 0 <= nWordType && nWordType <= 9);
 121     const OString aLangtagStr( LanguageTag::convertToBcp47( rLocale).toUtf8());
 122     OStringBuffer aKeyBuf(64);
 123     aKeyBuf.append( aLangtagStr).append(';');
 124     if (rule)
 125         aKeyBuf.append(rule);
 126     aKeyBuf.append(';').append( static_cast<sal_Char>('0'+breakType)).append(';').
 127         append( static_cast<sal_Char>('0'+rBreakType)).append(';').append( static_cast<sal_Char>('0'+nWordType));
 128     // langtag;rule;breakType;rBreakType;nWordType
 129     const OString aBIMapGlobalKey( aKeyBuf.makeStringAndClear());
 130
 131     if (icuBI->maBIMapKey != aBIMapGlobalKey || !icuBI->mpValue || !icuBI->mpValue->mpBreakIterator)
 132     {
 133
 134         auto aMapIt( theBIMap.find( aBIMapGlobalKey));
 135         bool bInMap = (aMapIt != theBIMap.end());
 136         if (bInMap)
 137             icuBI->mpValue = aMapIt->second;
 138         else
 139             icuBI->mpValue.reset();
 140
 141         if (!bInMap && rule) do {
 142             uno::Sequence< OUString > breakRules = LocaleDataImpl::get()->getBreakIteratorRules(rLocale);
 143
 144             status = U_ZERO_ERROR;
 145             udata_setAppData("OpenOffice", OpenOffice_dat, &status);
 146             if ( !U_SUCCESS(status) ) throw uno::RuntimeException();
 147
 148             OOoRuleBasedBreakIterator *rbi = nullptr;
 149
 150             if (breakRules.getLength() > breakType && !breakRules[breakType].isEmpty())
 151             {
 152                 // langtag;rule;breakType
 153                 const OString aBIMapRuleTypeKey( aLangtagStr + ";" + rule + ";" + OString::number(breakType));
 154                 aMapIt = theBIMap.find( aBIMapRuleTypeKey);
 155                 bInMap = (aMapIt != theBIMap.end());
 156                 if (bInMap)
 157                 {
 158                     icuBI->mpValue = aMapIt->second;
 159                     icuBI->maBIMapKey = aBIMapGlobalKey;
 160                     theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
 161                     break;  // do
 162                 }
 163
 164                 rbi = new OOoRuleBasedBreakIterator(udata_open("OpenOffice", "brk",
 165                     OUStringToOString(breakRules[breakType], RTL_TEXTENCODING_ASCII_US).getStr(), &status), status);
 166
 167                 if (U_SUCCESS(status))
 168                 {
 169                     icuBI->mpValue.reset( new BI_ValueData);
 170                     icuBI->mpValue->mpBreakIterator.reset( rbi);
 171                     theBIMap.insert( std::make_pair( aBIMapRuleTypeKey, icuBI->mpValue));
 172                 }
 173                 else
 174                 {
 175                     delete rbi;
 176                     rbi = nullptr;
 177                 }
 178             }
 179             //use icu's breakiterator for Thai, Tibetan and Dzongkha
 180             else if (rLocale.Language != "th" && rLocale.Language != "lo" && rLocale.Language != "bo" && rLocale.Language != "dz" && rLocale.Language != "km")
 181             {
 182                 // language;rule (not langtag, unless we'd actually load such)
 183                 OString aLanguage( LanguageTag( rLocale).getLanguage().toUtf8());
 184                 const OString aBIMapRuleKey( aLanguage + ";" + rule);
 185                 aMapIt = theBIMap.find( aBIMapRuleKey);
 186                 bInMap = (aMapIt != theBIMap.end());
 187                 if (bInMap)
 188                 {
 189                     icuBI->mpValue = aMapIt->second;
 190                     icuBI->maBIMapKey = aBIMapGlobalKey;
 191                     theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
 192                     break;  // do
 193                 }
 194
 195                 status = U_ZERO_ERROR;
 196                 OStringBuffer aUDName(64);
 197                 aUDName.append(rule);
 198                 aUDName.append('_');
 199                 aUDName.append( aLanguage);
 200                 UDataMemory* pUData = udata_open("OpenOffice", "brk", aUDName.getStr(), &status);
 201                 if( U_SUCCESS(status) )
 202                     rbi = new OOoRuleBasedBreakIterator( pUData, status);
 203                 if ( U_SUCCESS(status) )
 204                 {
 205                     icuBI->mpValue.reset( new BI_ValueData);
 206                     icuBI->mpValue->mpBreakIterator.reset( rbi);
 207                     theBIMap.insert( std::make_pair( aBIMapRuleKey, icuBI->mpValue));
 208                 }
 209                 else
 210                 {
 211                     delete rbi;
 212                     rbi = nullptr;
 213
 214                     // ;rule (only)
 215                     const OString aBIMapRuleOnlyKey( OString(";") + rule);
 216                     aMapIt = theBIMap.find( aBIMapRuleOnlyKey);
 217                     bInMap = (aMapIt != theBIMap.end());
 218                     if (bInMap)
 219                     {
 220                         icuBI->mpValue = aMapIt->second;
 221                         icuBI->maBIMapKey = aBIMapGlobalKey;
 222                         theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
 223                         break;  // do
 224                     }
 225
 226                     status = U_ZERO_ERROR;
 227                     pUData = udata_open("OpenOffice", "brk", rule, &status);
 228                     if( U_SUCCESS(status) )
 229                         rbi = new OOoRuleBasedBreakIterator( pUData, status);
 230                     if ( U_SUCCESS(status) )
 231                     {
 232                         icuBI->mpValue.reset( new BI_ValueData);
 233                         icuBI->mpValue->mpBreakIterator.reset( rbi);
 234                         theBIMap.insert( std::make_pair( aBIMapRuleOnlyKey, icuBI->mpValue));
 235                     }
 236                     else
 237                     {
 238                         delete rbi;
 239                         rbi = nullptr;
 240                     }
 241                 }
 242             }
 243             if (rbi) {
 244 #if (U_ICU_VERSION_MAJOR_NUM < 58)
 245                 // ICU 58 made RuleBasedBreakIterator::setBreakType() private
 246                 // instead of protected, so the old workaround of
 247                 // https://ssl.icu-project.org/trac/ticket/5498
 248                 // doesn't work anymore. However, they also claim to have fixed
 249                 // the cause that an initial fBreakType==-1 would lead to an
 250                 // endless loop under some circumstances.
 251                 // Let's see ...
 252                 switch (rBreakType) {
 253                     case LOAD_CHARACTER_BREAKITERATOR: rbi->publicSetBreakType(UBRK_CHARACTER); break;
 254                     case LOAD_WORD_BREAKITERATOR: rbi->publicSetBreakType(UBRK_WORD); break;
 255                     case LOAD_SENTENCE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_SENTENCE); break;
 256                     case LOAD_LINE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_LINE); break;
 257                 }
 258 #endif
 259             }
 260         } while (false);
 261
 262         if (!icuBI->mpValue || !icuBI->mpValue->mpBreakIterator) do {
 263             // langtag;;;rBreakType (empty rule; empty breakType)
 264             const OString aBIMapLocaleTypeKey( aLangtagStr + ";;;" + OString::number(rBreakType));
 265             aMapIt = theBIMap.find( aBIMapLocaleTypeKey);
 266             bInMap = (aMapIt != theBIMap.end());
 267             if (bInMap)
 268             {
 269                 icuBI->mpValue = aMapIt->second;
 270                 icuBI->maBIMapKey = aBIMapGlobalKey;
 271                 theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
 272                 break;  // do
 273             }
 274
 275             icu::Locale icuLocale( LanguageTagIcu::getIcuLocale( LanguageTag( rLocale)));
 276             std::shared_ptr< icu::BreakIterator > pBI;
 277
 278             status = U_ZERO_ERROR;
 279             switch (rBreakType) {
 280                 case LOAD_CHARACTER_BREAKITERATOR:
 281                     pBI.reset( icu::BreakIterator::createCharacterInstance(icuLocale, status) );
 282                     break;
 283                 case LOAD_WORD_BREAKITERATOR:
 284                     pBI.reset( icu::BreakIterator::createWordInstance(icuLocale, status) );
 285                     break;
 286                 case LOAD_SENTENCE_BREAKITERATOR:
 287                     pBI.reset( icu::BreakIterator::createSentenceInstance(icuLocale, status) );
 288                     break;
 289                 case LOAD_LINE_BREAKITERATOR:
 290                     pBI.reset( icu::BreakIterator::createLineInstance(icuLocale, status) );
 291                     break;
 292             }
 293             if ( !U_SUCCESS(status) || !pBI ) {
 294                 throw uno::RuntimeException();
 295             }
 296             icuBI->mpValue.reset( new BI_ValueData);
 297             icuBI->mpValue->mpBreakIterator = pBI;
 298             theBIMap.insert( std::make_pair( aBIMapLocaleTypeKey, icuBI->mpValue));
 299         } while (false);
 300         if (!icuBI->mpValue || !icuBI->mpValue->mpBreakIterator) {
 301             throw uno::RuntimeException();
 302         }
 303         icuBI->maBIMapKey = aBIMapGlobalKey;
 304         if (!bInMap)
 305             theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
 306         bNewBreak=true;
 307     }
 308
 309     if (bNewBreak || icuBI->mpValue->maICUText.pData != rText.pData)
 310     {
 311         const UChar *pText = reinterpret_cast<const UChar *>(rText.getStr());
 312
 313         status = U_ZERO_ERROR;
 314         icuBI->mpValue->mpUt = utext_openUChars(icuBI->mpValue->mpUt, pText, rText.getLength(), &status);
 315
 316         if (!U_SUCCESS(status))
 317             throw uno::RuntimeException();
 318
 319         icuBI->mpValue->mpBreakIterator->setText(icuBI->mpValue->mpUt, status);
 320
 321         if (!U_SUCCESS(status))
 322             throw uno::RuntimeException();
 323
 324         icuBI->mpValue->maICUText = rText;
 325     }
 326 }
 327
 328 sal_Int32 SAL_CALL BreakIterator_Unicode::nextCharacters( const OUString& Text,
 329         sal_Int32 nStartPos, const lang::Locale &rLocale,
 330         sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
 331 {
 332     if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
 333         loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
 334         icu::BreakIterator* pBI = character.mpValue->mpBreakIterator.get();
 335         for (nDone = 0; nDone < nCount; nDone++) {
 336             nStartPos = pBI->following(nStartPos);
 337             if (nStartPos == icu::BreakIterator::DONE)
 338                 return Text.getLength();
 339         }
 340     } else { // for CHARACTER mode
 341         for (nDone = 0; nDone < nCount && nStartPos < Text.getLength(); nDone++)
 342             Text.iterateCodePoints(&nStartPos);
 343     }
 344     return nStartPos;
 345 }
 346
 347 sal_Int32 SAL_CALL BreakIterator_Unicode::previousCharacters( const OUString& Text,
 348         sal_Int32 nStartPos, const lang::Locale& rLocale,
 349         sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
 350 {
 351     if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
 352         loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
 353         icu::BreakIterator* pBI = character.mpValue->mpBreakIterator.get();
 354         for (nDone = 0; nDone < nCount; nDone++) {
 355             nStartPos = pBI->preceding(nStartPos);
 356             if (nStartPos == icu::BreakIterator::DONE)
 357                 return 0;
 358         }
 359     } else { // for BS to delete one char and CHARACTER mode.
 360         for (nDone = 0; nDone < nCount && nStartPos > 0; nDone++)
 361             Text.iterateCodePoints(&nStartPos, -1);
 362     }
 363     return nStartPos;
 364 }
 365
 366
 367 Boundary SAL_CALL BreakIterator_Unicode::nextWord( const OUString& Text, sal_Int32 nStartPos,
 368     const lang::Locale& rLocale, sal_Int16 rWordType )
 369 {
 370     loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, nullptr, Text);
 371
 372     Boundary rv;
 373     rv.startPos = icuBI->mpValue->mpBreakIterator->following(nStartPos);
 374     if( rv.startPos >= Text.getLength() || rv.startPos == icu::BreakIterator::DONE )
 375         rv.endPos = result.startPos;
 376     else {
 377         if ( (rWordType == WordType::ANYWORD_IGNOREWHITESPACES ||
 378                     rWordType == WordType::DICTIONARY_WORD ) &&
 379                 u_isWhitespace(Text.iterateCodePoints(&rv.startPos, 0)) )
 380             rv.startPos = icuBI->mpValue->mpBreakIterator->following(rv.startPos);
 381
 382         rv.endPos = icuBI->mpValue->mpBreakIterator->following(rv.startPos);
 383         if(rv.endPos == icu::BreakIterator::DONE)
 384             rv.endPos = rv.startPos;
 385     }
 386     return rv;
 387 }
 388
 389
 390 Boundary SAL_CALL BreakIterator_Unicode::previousWord(const OUString& Text, sal_Int32 nStartPos,
 391         const lang::Locale& rLocale, sal_Int16 rWordType)
 392 {
 393     loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, nullptr, Text);
 394
 395     Boundary rv;
 396     rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(nStartPos);
 397     if( rv.startPos < 0 || rv.startPos == icu::BreakIterator::DONE)
 398         rv.endPos = rv.startPos;
 399     else {
 400         if ( (rWordType == WordType::ANYWORD_IGNOREWHITESPACES ||
 401                     rWordType == WordType::DICTIONARY_WORD) &&
 402                 u_isWhitespace(Text.iterateCodePoints(&rv.startPos, 0)) )
 403             rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(rv.startPos);
 404
 405         rv.endPos = icuBI->mpValue->mpBreakIterator->following(rv.startPos);
 406         if(rv.endPos == icu::BreakIterator::DONE)
 407             rv.endPos = rv.startPos;
 408     }
 409     return rv;
 410 }
 411
 412
 413 Boundary SAL_CALL BreakIterator_Unicode::getWordBoundary( const OUString& Text, sal_Int32 nPos, const lang::Locale& rLocale,
 414         sal_Int16 rWordType, sal_Bool bDirection )
 415 {
 416     loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, nullptr, Text);
 417     sal_Int32 len = Text.getLength();
 418
 419     Boundary rv;
 420     if(icuBI->mpValue->mpBreakIterator->isBoundary(nPos)) {
 421         rv.startPos = rv.endPos = nPos;
 422         if((bDirection || nPos == 0) && nPos < len) //forward
 423             rv.endPos = icuBI->mpValue->mpBreakIterator->following(nPos);
 424         else
 425             rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(nPos);
 426     } else {
 427         if(nPos <= 0) {
 428             rv.startPos = 0;
 429             rv.endPos = len ? icuBI->mpValue->mpBreakIterator->following(sal_Int32(0)) : 0;
 430         } else if(nPos >= len) {
 431             rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(len);
 432             rv.endPos = len;
 433         } else {
 434             rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(nPos);
 435             rv.endPos = icuBI->mpValue->mpBreakIterator->following(nPos);
 436         }
 437     }
 438     if (rv.startPos == icu::BreakIterator::DONE)
 439         rv.startPos = rv.endPos;
 440     else if (rv.endPos == icu::BreakIterator::DONE)
 441         rv.endPos = rv.startPos;
 442
 443     return rv;
 444 }
 445
 446
 447 sal_Int32 SAL_CALL BreakIterator_Unicode::beginOfSentence( const OUString& Text, sal_Int32 nStartPos,
 448         const lang::Locale &rLocale )
 449 {
 450     loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
 451
 452     sal_Int32 len = Text.getLength();
 453     if (len > 0 && nStartPos == len)
 454         Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
 455     if (!sentence.mpValue->mpBreakIterator->isBoundary(nStartPos))
 456         nStartPos = sentence.mpValue->mpBreakIterator->preceding(nStartPos);
 457
 458     // skip preceding space.
 459     sal_uInt32 ch = Text.iterateCodePoints(&nStartPos);
 460     while (nStartPos < len && u_isWhitespace(ch)) ch = Text.iterateCodePoints(&nStartPos);
 461     Text.iterateCodePoints(&nStartPos, -1);
 462
 463     return nStartPos;
 464 }
 465
 466 sal_Int32 SAL_CALL BreakIterator_Unicode::endOfSentence( const OUString& Text, sal_Int32 nStartPos,
 467         const lang::Locale &rLocale )
 468 {
 469     loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
 470
 471     sal_Int32 len = Text.getLength();
 472     if (len > 0 && nStartPos == len)
 473         Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
 474     nStartPos = sentence.mpValue->mpBreakIterator->following(nStartPos);
 475
 476     sal_Int32 nPos=nStartPos;
 477     while (nPos > 0 && u_isWhitespace(Text.iterateCodePoints(&nPos, -1))) nStartPos=nPos;
 478
 479     return nStartPos;
 480 }
 481
 482 LineBreakResults SAL_CALL BreakIterator_Unicode::getLineBreak(
 483         const OUString& Text, sal_Int32 nStartPos,
 484         const lang::Locale& rLocale, sal_Int32 nMinBreakPos,
 485         const LineBreakHyphenationOptions& hOptions,
 486         const LineBreakUserOptions& /*rOptions*/ )
 487 {
 488     LineBreakResults lbr;
 489
 490     if (nStartPos >= Text.getLength()) {
 491         lbr.breakIndex = Text.getLength();
 492         lbr.breakType = BreakType::WORDBOUNDARY;
 493         return lbr;
 494     }
 495
 496     loadICUBreakIterator(rLocale, LOAD_LINE_BREAKITERATOR, 0, lineRule, Text);
 497
 498     icu::BreakIterator* pLineBI = line.mpValue->mpBreakIterator.get();
 499     bool GlueSpace=true;
 500     while (GlueSpace) {
 501         if (pLineBI->preceding(nStartPos + 1) == nStartPos) { //Line boundary break
 502             lbr.breakIndex = nStartPos;
 503             lbr.breakType = BreakType::WORDBOUNDARY;
 504         } else if (hOptions.rHyphenator.is()) { //Hyphenation break
 505             sal_Int32 boundary_with_punctuation = (pLineBI->next() != icu::BreakIterator::DONE) ? pLineBI->current() : 0;
 506             pLineBI->preceding(nStartPos + 1); // reset to check correct hyphenation of "word-word"
 507
 508             sal_Int32 nStartPosWordEnd = nStartPos;
 509             while (pLineBI->current() < nStartPosWordEnd && u_ispunct(static_cast<sal_uInt32>(Text[nStartPosWordEnd]))) // starting punctuation
 510                 nStartPosWordEnd --;
 511
 512             Boundary wBoundary = getWordBoundary( Text, nStartPosWordEnd, rLocale,
 513                 WordType::DICTIONARY_WORD, false);
 514
 515             nStartPosWordEnd = wBoundary.endPos;
 516             while (nStartPosWordEnd < Text.getLength() && (u_ispunct(static_cast<sal_uInt32>(Text[nStartPosWordEnd])))) // ending punctuation
 517                 nStartPosWordEnd ++;
 518             nStartPosWordEnd = nStartPosWordEnd - wBoundary.endPos;
 519             if (hOptions.hyphenIndex - wBoundary.startPos < nStartPosWordEnd) nStartPosWordEnd = hOptions.hyphenIndex - wBoundary.startPos;
 520 #define SPACE 0x0020
 521             while (boundary_with_punctuation > wBoundary.endPos && Text[--boundary_with_punctuation] == SPACE);
 522             uno::Reference< linguistic2::XHyphenatedWord > aHyphenatedWord;
 523             aHyphenatedWord = hOptions.rHyphenator->hyphenate(Text.copy(wBoundary.startPos,
 524                         wBoundary.endPos - wBoundary.startPos), rLocale,
 525                     static_cast<sal_Int16>(hOptions.hyphenIndex - wBoundary.startPos - ((hOptions.hyphenIndex == wBoundary.endPos)? nStartPosWordEnd : 0)), hOptions.aHyphenationOptions);
 526             if (aHyphenatedWord.is()) {
 527                 lbr.rHyphenatedWord = aHyphenatedWord;
 528                 if(wBoundary.startPos + aHyphenatedWord->getHyphenationPos() + 1 < nMinBreakPos )
 529                     lbr.breakIndex = -1;
 530                 else
 531                     lbr.breakIndex = wBoundary.startPos; //aHyphenatedWord->getHyphenationPos();
 532                 lbr.breakType = BreakType::HYPHENATION;
 533
 534                 // check not optimal hyphenation of "word-word" (word with hyphens)
 535                 if (lbr.breakIndex > -1 && wBoundary.startPos + aHyphenatedWord->getHyphenationPos() < pLineBI->current()) {
 536                     lbr.breakIndex = pLineBI->current();
 537                     lbr.breakType = BreakType::WORDBOUNDARY;
 538                 }
 539
 540             } else {
 541                 lbr.breakIndex = pLineBI->preceding(nStartPos);
 542                 lbr.breakType = BreakType::WORDBOUNDARY;
 543             }
 544         } else { //word boundary break
 545             lbr.breakIndex = pLineBI->preceding(nStartPos);
 546             lbr.breakType = BreakType::WORDBOUNDARY;
 547
 548             // Special case for Slash U+002F SOLIDUS in URI and path names.
 549             // TR14 defines that as SY: Symbols Allowing Break After (A).
 550             // This is unwanted in paths, see also i#17155
 551             if (lbr.breakIndex > 0 && Text[lbr.breakIndex-1] == '/')
 552             {
 553                 // Look backward and take any whitespace before as a break
 554                 // opportunity. This also glues something like "w/o".
 555                 // Avoid an overly long path and break it as was indicated.
 556                 // Overly long here is arbitrarily defined.
 557                 const sal_Int32 nOverlyLong = 66;
 558                 sal_Int32 nPos = lbr.breakIndex - 1;
 559                 while (nPos > 0 && lbr.breakIndex - nPos < nOverlyLong)
 560                 {
 561                     if (u_isWhitespace(Text.iterateCodePoints( &nPos, -1)))
 562                     {
 563                         lbr.breakIndex = nPos + 1;
 564                         break;
 565                     }
 566                 }
 567             }
 568         }
 569
 570 #define WJ 0x2060   // Word Joiner
 571         GlueSpace=false;
 572         if (lbr.breakType == BreakType::WORDBOUNDARY) {
 573             nStartPos = lbr.breakIndex;
 574             if (nStartPos >= 0 && Text[nStartPos--] == WJ)
 575                 GlueSpace=true;
 576             while (nStartPos >= 0 &&
 577                     (u_isWhitespace(Text.iterateCodePoints(&nStartPos, 0)) || Text[nStartPos] == WJ)) {
 578                 if (Text[nStartPos--] == WJ)
 579                     GlueSpace=true;
 580             }
 581             if (GlueSpace && nStartPos < 0)  {
 582                 lbr.breakIndex = 0;
 583                 break;
 584             }
 585         }
 586     }
 587
 588     return lbr;
 589 }
 590
 591 OUString SAL_CALL
 592 BreakIterator_Unicode::getImplementationName()
 593 {
 594     return OUString::createFromAscii(cBreakIterator);
 595 }
 596
 597 sal_Bool SAL_CALL
 598 BreakIterator_Unicode::supportsService(const OUString& rServiceName)
 599 {
 600     return cppu::supportsService(this, rServiceName);
 601 }
 602
 603 uno::Sequence< OUString > SAL_CALL
 604 BreakIterator_Unicode::getSupportedServiceNames()
 605 {
 606     uno::Sequence< OUString > aRet { OUString::createFromAscii(cBreakIterator) };
 607     return aRet;
 608 }
 609
 610 }
 611
 612 extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface *
 613 com_sun_star_i18n_BreakIterator_Unicode_get_implementation(
 614     css::uno::XComponentContext *,
 615     css::uno::Sequence<css::uno::Any> const &)
 616 {
 617     return cppu::acquire(new i18npool::BreakIterator_Unicode());
 618 }
 619
 620 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */