i18npool/source/breakiterator/breakiterator_unicode.cxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*
   3  * This file is part of the LibreOffice project.
   4  *
   5  * This Source Code Form is subject to the terms of the Mozilla Public
   6  * License, v. 2.0. If a copy of the MPL was not distributed with this
   7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
   8  *
   9  * This file incorporates work covered by the following license notice:
  10  *
  11  *   Licensed to the Apache Software Foundation (ASF) under one or more
  12  *   contributor license agreements. See the NOTICE file distributed
  13  *   with this work for additional information regarding copyright
  14  *   ownership. The ASF licenses this file to you under the Apache
  15  *   License, Version 2.0 (the "License"); you may not use this file
  16  *   except in compliance with the License. You may obtain a copy of
  17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  18  */
  19
  20 #include <breakiterator_unicode.hxx>
  21 #include <cppuhelper/supportsservice.hxx>
  22 #include <localedata.hxx>
  23 #include <i18nlangtag/languagetag.hxx>
  24 #include <i18nlangtag/languagetagicu.hxx>
  25 #include <unicode/uchar.h>
  26 #include <unicode/locid.h>
  27 #include <unicode/rbbi.h>
  28 #include <unicode/udata.h>
  29 #include <rtl/strbuf.hxx>
  30 #include <rtl/ustring.hxx>
  31
  32 #include <com/sun/star/i18n/BreakType.hpp>
  33 #include <com/sun/star/i18n/CharacterIteratorMode.hpp>
  34 #include <com/sun/star/i18n/WordType.hpp>
  35
  36 U_CDECL_BEGIN
  37 extern const char OpenOffice_dat[];
  38 U_CDECL_END
  39
  40 using namespace ::com::sun::star;
  41 using namespace ::com::sun::star::i18n;
  42 using namespace ::com::sun::star::lang;
  43
  44 namespace i18npool {
  45
  46 // Cache map of breakiterators, stores state information so has to be
  47 // thread_local.
  48 thread_local static BreakIterator_Unicode::BIMap theBIMap;
  49
  50 BreakIterator_Unicode::BreakIterator_Unicode()
  51     : cBreakIterator( "com.sun.star.i18n.BreakIterator_Unicode" )    // implementation name
  52     , lineRule( "line" )
  53     , icuBI( nullptr )
  54 {
  55 }
  56
  57 BreakIterator_Unicode::~BreakIterator_Unicode()
  58 {
  59 }
  60
  61 /*
  62     Wrapper class to provide public access to the icu::RuleBasedBreakIterator's
  63     setbreakType method.
  64 */
  65 class OOoRuleBasedBreakIterator : public icu::RuleBasedBreakIterator
  66 {
  67     public:
  68 #if (U_ICU_VERSION_MAJOR_NUM < 58)
  69     // icu::RuleBasedBreakIterator::setBreakType() is private as of ICU 58.
  70     void publicSetBreakType(int32_t type)
  71         {
  72             setBreakType(type);
  73         };
  74 #endif
  75     OOoRuleBasedBreakIterator(UDataMemory* image,
  76                               UErrorCode &status)
  77         : icu::RuleBasedBreakIterator(image, status)
  78         { };
  79
  80 };
  81
  82 // loading ICU breakiterator on demand.
  83 void BreakIterator_Unicode::loadICUBreakIterator(const css::lang::Locale& rLocale,
  84         sal_Int16 rBreakType, sal_Int16 nWordType, const sal_Char *rule, const OUString& rText)
  85 {
  86     bool bNewBreak = false;
  87     UErrorCode status = U_ZERO_ERROR;
  88     sal_Int16 breakType = 0;
  89     switch (rBreakType) {
  90         case LOAD_CHARACTER_BREAKITERATOR: icuBI=&character; breakType = 3; break;
  91         case LOAD_WORD_BREAKITERATOR:
  92             assert (nWordType >= 0 && nWordType<= WordType::WORD_COUNT);
  93             icuBI=&words[nWordType];
  94             switch (nWordType) {
  95                 case WordType::ANY_WORD: break; // odd but previous behavior
  96                 case WordType::ANYWORD_IGNOREWHITESPACES:
  97                     breakType = 0; rule = "edit_word"; break;
  98                 case WordType::DICTIONARY_WORD:
  99                     breakType = 1; rule = "dict_word"; break;
 100                 default:
 101                 case WordType::WORD_COUNT:
 102                     breakType = 2; rule = "count_word"; break;
 103             }
 104             break;
 105         case LOAD_SENTENCE_BREAKITERATOR: icuBI=&sentence; breakType = 5; break;
 106         case LOAD_LINE_BREAKITERATOR: icuBI=&line; breakType = 4; break;
 107     }
 108
 109     // Using the cache map prevents accessing the file system for each
 110     // udata_open() where ICU tries first files then data objects. And that for
 111     // two fallbacks worst case... for each new allocated EditEngine, layout
 112     // cell, ... *ouch*  Also non-rule locale based iterators can be mapped.
 113     // This also speeds up loading iterators for alternating or generally more
 114     // than one language/locale in that iterators are not constructed and
 115     // destroyed en masse.
 116     // Four possible keys, locale rule based with break type, locale rule based
 117     // only, rule based only, locale based with break type. A fifth global key
 118     // for the initial lookup.
 119     // Multiple global keys may map to identical value data.
 120     // All enums used here should be in the range 0..9 so assert that and avoid
 121     // expensive numeric conversion in append() for faster construction of the
 122     // always used global key.
 123     assert( 0 <= breakType && breakType <= 9 && 0 <= rBreakType && rBreakType <= 9 && 0 <= nWordType && nWordType <= 9);
 124     const OString aLangtagStr( LanguageTag::convertToBcp47( rLocale).toUtf8());
 125     OStringBuffer aKeyBuf(64);
 126     aKeyBuf.append( aLangtagStr).append(';');
 127     if (rule)
 128         aKeyBuf.append(rule);
 129     aKeyBuf.append(';').append( static_cast<sal_Char>('0'+breakType)).append(';').
 130         append( static_cast<sal_Char>('0'+rBreakType)).append(';').append( static_cast<sal_Char>('0'+nWordType));
 131     // langtag;rule;breakType;rBreakType;nWordType
 132     const OString aBIMapGlobalKey( aKeyBuf.makeStringAndClear());
 133
 134     if (icuBI->maBIMapKey != aBIMapGlobalKey || !icuBI->mpValue || !icuBI->mpValue->mpBreakIterator)
 135     {
 136
 137         auto aMapIt( theBIMap.find( aBIMapGlobalKey));
 138         bool bInMap = (aMapIt != theBIMap.end());
 139         if (bInMap)
 140             icuBI->mpValue = aMapIt->second;
 141         else
 142             icuBI->mpValue.reset();
 143
 144         if (!bInMap && rule) do {
 145             const uno::Sequence< OUString > breakRules = LocaleDataImpl::get()->getBreakIteratorRules(rLocale);
 146
 147             status = U_ZERO_ERROR;
 148             udata_setAppData("OpenOffice", OpenOffice_dat, &status);
 149             if ( !U_SUCCESS(status) ) throw uno::RuntimeException();
 150
 151             std::unique_ptr<OOoRuleBasedBreakIterator> rbi;
 152
 153             if (breakRules.getLength() > breakType && !breakRules[breakType].isEmpty())
 154             {
 155                 // langtag;rule;breakType
 156                 const OString aBIMapRuleTypeKey( aLangtagStr + ";" + rule + ";" + OString::number(breakType));
 157                 aMapIt = theBIMap.find( aBIMapRuleTypeKey);
 158                 bInMap = (aMapIt != theBIMap.end());
 159                 if (bInMap)
 160                 {
 161                     icuBI->mpValue = aMapIt->second;
 162                     icuBI->maBIMapKey = aBIMapGlobalKey;
 163                     theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
 164                     break;  // do
 165                 }
 166
 167                 rbi.reset(new OOoRuleBasedBreakIterator(udata_open("OpenOffice", "brk",
 168                     OUStringToOString(breakRules[breakType], RTL_TEXTENCODING_ASCII_US).getStr(), &status), status));
 169
 170                 if (U_SUCCESS(status))
 171                 {
 172                     icuBI->mpValue.reset( new BI_ValueData);
 173                     icuBI->mpValue->mpBreakIterator = std::move( rbi);
 174                     theBIMap.insert( std::make_pair( aBIMapRuleTypeKey, icuBI->mpValue));
 175                 }
 176                 else
 177                 {
 178                     rbi.reset();
 179                 }
 180             }
 181             //use icu's breakiterator for Thai, Tibetan and Dzongkha
 182             else if (rLocale.Language != "th" && rLocale.Language != "lo" && rLocale.Language != "bo" && rLocale.Language != "dz" && rLocale.Language != "km")
 183             {
 184                 // language;rule (not langtag, unless we'd actually load such)
 185                 OString aLanguage( LanguageTag( rLocale).getLanguage().toUtf8());
 186                 const OString aBIMapRuleKey( aLanguage + ";" + rule);
 187                 aMapIt = theBIMap.find( aBIMapRuleKey);
 188                 bInMap = (aMapIt != theBIMap.end());
 189                 if (bInMap)
 190                 {
 191                     icuBI->mpValue = aMapIt->second;
 192                     icuBI->maBIMapKey = aBIMapGlobalKey;
 193                     theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
 194                     break;  // do
 195                 }
 196
 197                 status = U_ZERO_ERROR;
 198                 OString aUDName = rtl::OStringView(rule) + "_" + aLanguage;
 199                 UDataMemory* pUData = udata_open("OpenOffice", "brk", aUDName.getStr(), &status);
 200                 if( U_SUCCESS(status) )
 201                     rbi.reset(new OOoRuleBasedBreakIterator( pUData, status));
 202                 if ( U_SUCCESS(status) )
 203                 {
 204                     icuBI->mpValue.reset( new BI_ValueData);
 205                     icuBI->mpValue->mpBreakIterator = std::move( rbi);
 206                     theBIMap.insert( std::make_pair( aBIMapRuleKey, icuBI->mpValue));
 207                 }
 208                 else
 209                 {
 210                     rbi.reset();
 211
 212                     // ;rule (only)
 213                     const OString aBIMapRuleOnlyKey( OStringLiteral(";") + rule);
 214                     aMapIt = theBIMap.find( aBIMapRuleOnlyKey);
 215                     bInMap = (aMapIt != theBIMap.end());
 216                     if (bInMap)
 217                     {
 218                         icuBI->mpValue = aMapIt->second;
 219                         icuBI->maBIMapKey = aBIMapGlobalKey;
 220                         theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
 221                         break;  // do
 222                     }
 223
 224                     status = U_ZERO_ERROR;
 225                     pUData = udata_open("OpenOffice", "brk", rule, &status);
 226                     if( U_SUCCESS(status) )
 227                         rbi.reset(new OOoRuleBasedBreakIterator( pUData, status));
 228                     if ( U_SUCCESS(status) )
 229                     {
 230                         icuBI->mpValue.reset( new BI_ValueData);
 231                         icuBI->mpValue->mpBreakIterator = std::move( rbi);
 232                         theBIMap.insert( std::make_pair( aBIMapRuleOnlyKey, icuBI->mpValue));
 233                     }
 234                     else
 235                     {
 236                         rbi.reset();
 237                     }
 238                 }
 239             }
 240             if (rbi) {
 241 #if (U_ICU_VERSION_MAJOR_NUM < 58)
 242                 // ICU 58 made RuleBasedBreakIterator::setBreakType() private
 243                 // instead of protected, so the old workaround of
 244                 // https://ssl.icu-project.org/trac/ticket/5498
 245                 // doesn't work anymore. However, they also claim to have fixed
 246                 // the cause that an initial fBreakType==-1 would lead to an
 247                 // endless loop under some circumstances.
 248                 // Let's see ...
 249                 switch (rBreakType) {
 250                     case LOAD_CHARACTER_BREAKITERATOR: rbi->publicSetBreakType(UBRK_CHARACTER); break;
 251                     case LOAD_WORD_BREAKITERATOR: rbi->publicSetBreakType(UBRK_WORD); break;
 252                     case LOAD_SENTENCE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_SENTENCE); break;
 253                     case LOAD_LINE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_LINE); break;
 254                 }
 255 #endif
 256             }
 257         } while (false);
 258
 259         if (!icuBI->mpValue || !icuBI->mpValue->mpBreakIterator) do {
 260             // langtag;;;rBreakType (empty rule; empty breakType)
 261             const OString aBIMapLocaleTypeKey( aLangtagStr + ";;;" + OString::number(rBreakType));
 262             aMapIt = theBIMap.find( aBIMapLocaleTypeKey);
 263             bInMap = (aMapIt != theBIMap.end());
 264             if (bInMap)
 265             {
 266                 icuBI->mpValue = aMapIt->second;
 267                 icuBI->maBIMapKey = aBIMapGlobalKey;
 268                 theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
 269                 break;  // do
 270             }
 271
 272             icu::Locale icuLocale( LanguageTagIcu::getIcuLocale( LanguageTag( rLocale)));
 273             std::shared_ptr< icu::BreakIterator > pBI;
 274
 275             status = U_ZERO_ERROR;
 276             switch (rBreakType) {
 277                 case LOAD_CHARACTER_BREAKITERATOR:
 278                     pBI.reset( icu::BreakIterator::createCharacterInstance(icuLocale, status) );
 279                     break;
 280                 case LOAD_WORD_BREAKITERATOR:
 281                     pBI.reset( icu::BreakIterator::createWordInstance(icuLocale, status) );
 282                     break;
 283                 case LOAD_SENTENCE_BREAKITERATOR:
 284                     pBI.reset( icu::BreakIterator::createSentenceInstance(icuLocale, status) );
 285                     break;
 286                 case LOAD_LINE_BREAKITERATOR:
 287                     pBI.reset( icu::BreakIterator::createLineInstance(icuLocale, status) );
 288                     break;
 289             }
 290             if ( !U_SUCCESS(status) || !pBI ) {
 291                 throw uno::RuntimeException();
 292             }
 293             icuBI->mpValue.reset( new BI_ValueData);
 294             icuBI->mpValue->mpBreakIterator = pBI;
 295             theBIMap.insert( std::make_pair( aBIMapLocaleTypeKey, icuBI->mpValue));
 296         } while (false);
 297         if (!icuBI->mpValue || !icuBI->mpValue->mpBreakIterator) {
 298             throw uno::RuntimeException();
 299         }
 300         icuBI->maBIMapKey = aBIMapGlobalKey;
 301         if (!bInMap)
 302             theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
 303         bNewBreak=true;
 304     }
 305
 306     if (bNewBreak || icuBI->mpValue->maICUText.pData != rText.pData)
 307     {
 308         const UChar *pText = reinterpret_cast<const UChar *>(rText.getStr());
 309
 310         status = U_ZERO_ERROR;
 311         icuBI->mpValue->mpUt = utext_openUChars(icuBI->mpValue->mpUt, pText, rText.getLength(), &status);
 312
 313         if (!U_SUCCESS(status))
 314             throw uno::RuntimeException();
 315
 316         icuBI->mpValue->mpBreakIterator->setText(icuBI->mpValue->mpUt, status);
 317
 318         if (!U_SUCCESS(status))
 319             throw uno::RuntimeException();
 320
 321         icuBI->mpValue->maICUText = rText;
 322     }
 323 }
 324
 325 sal_Int32 SAL_CALL BreakIterator_Unicode::nextCharacters( const OUString& Text,
 326         sal_Int32 nStartPos, const lang::Locale &rLocale,
 327         sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
 328 {
 329     if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
 330         loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
 331         icu::BreakIterator* pBI = character.mpValue->mpBreakIterator.get();
 332         for (nDone = 0; nDone < nCount; nDone++) {
 333             nStartPos = pBI->following(nStartPos);
 334             if (nStartPos == icu::BreakIterator::DONE)
 335                 return Text.getLength();
 336         }
 337     } else { // for CHARACTER mode
 338         for (nDone = 0; nDone < nCount && nStartPos < Text.getLength(); nDone++)
 339             Text.iterateCodePoints(&nStartPos);
 340     }
 341     return nStartPos;
 342 }
 343
 344 sal_Int32 SAL_CALL BreakIterator_Unicode::previousCharacters( const OUString& Text,
 345         sal_Int32 nStartPos, const lang::Locale& rLocale,
 346         sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
 347 {
 348     if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
 349         loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
 350         icu::BreakIterator* pBI = character.mpValue->mpBreakIterator.get();
 351         for (nDone = 0; nDone < nCount; nDone++) {
 352             nStartPos = pBI->preceding(nStartPos);
 353             if (nStartPos == icu::BreakIterator::DONE)
 354                 return 0;
 355         }
 356     } else { // for BS to delete one char and CHARACTER mode.
 357         for (nDone = 0; nDone < nCount && nStartPos > 0; nDone++)
 358             Text.iterateCodePoints(&nStartPos, -1);
 359     }
 360     return nStartPos;
 361 }
 362
 363
 364 Boundary SAL_CALL BreakIterator_Unicode::nextWord( const OUString& Text, sal_Int32 nStartPos,
 365     const lang::Locale& rLocale, sal_Int16 rWordType )
 366 {
 367     loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, nullptr, Text);
 368
 369     Boundary rv;
 370     rv.startPos = icuBI->mpValue->mpBreakIterator->following(nStartPos);
 371     if( rv.startPos >= Text.getLength() || rv.startPos == icu::BreakIterator::DONE )
 372         rv.endPos = result.startPos;
 373     else {
 374         if ( (rWordType == WordType::ANYWORD_IGNOREWHITESPACES ||
 375                     rWordType == WordType::DICTIONARY_WORD ) &&
 376                 u_isWhitespace(Text.iterateCodePoints(&rv.startPos, 0)) )
 377             rv.startPos = icuBI->mpValue->mpBreakIterator->following(rv.startPos);
 378
 379         rv.endPos = icuBI->mpValue->mpBreakIterator->following(rv.startPos);
 380         if(rv.endPos == icu::BreakIterator::DONE)
 381             rv.endPos = rv.startPos;
 382     }
 383     return rv;
 384 }
 385
 386
 387 Boundary SAL_CALL BreakIterator_Unicode::previousWord(const OUString& Text, sal_Int32 nStartPos,
 388         const lang::Locale& rLocale, sal_Int16 rWordType)
 389 {
 390     loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, nullptr, Text);
 391
 392     Boundary rv;
 393     rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(nStartPos);
 394     if( rv.startPos < 0)
 395         rv.endPos = rv.startPos;
 396     else {
 397         if ( (rWordType == WordType::ANYWORD_IGNOREWHITESPACES ||
 398                     rWordType == WordType::DICTIONARY_WORD) &&
 399                 u_isWhitespace(Text.iterateCodePoints(&rv.startPos, 0)) )
 400             rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(rv.startPos);
 401
 402         rv.endPos = icuBI->mpValue->mpBreakIterator->following(rv.startPos);
 403         if(rv.endPos == icu::BreakIterator::DONE)
 404             rv.endPos = rv.startPos;
 405     }
 406     return rv;
 407 }
 408
 409
 410 Boundary SAL_CALL BreakIterator_Unicode::getWordBoundary( const OUString& Text, sal_Int32 nPos, const lang::Locale& rLocale,
 411         sal_Int16 rWordType, sal_Bool bDirection )
 412 {
 413     loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, nullptr, Text);
 414     sal_Int32 len = Text.getLength();
 415
 416     Boundary rv;
 417     if(icuBI->mpValue->mpBreakIterator->isBoundary(nPos)) {
 418         rv.startPos = rv.endPos = nPos;
 419         if((bDirection || nPos == 0) && nPos < len) //forward
 420             rv.endPos = icuBI->mpValue->mpBreakIterator->following(nPos);
 421         else
 422             rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(nPos);
 423     } else {
 424         if(nPos <= 0) {
 425             rv.startPos = 0;
 426             rv.endPos = len ? icuBI->mpValue->mpBreakIterator->following(sal_Int32(0)) : 0;
 427         } else if(nPos >= len) {
 428             rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(len);
 429             rv.endPos = len;
 430         } else {
 431             rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(nPos);
 432             rv.endPos = icuBI->mpValue->mpBreakIterator->following(nPos);
 433         }
 434     }
 435     if (rv.startPos == icu::BreakIterator::DONE)
 436         rv.startPos = rv.endPos;
 437     else if (rv.endPos == icu::BreakIterator::DONE)
 438         rv.endPos = rv.startPos;
 439
 440     return rv;
 441 }
 442
 443
 444 sal_Int32 SAL_CALL BreakIterator_Unicode::beginOfSentence( const OUString& Text, sal_Int32 nStartPos,
 445         const lang::Locale &rLocale )
 446 {
 447     loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
 448
 449     sal_Int32 len = Text.getLength();
 450     if (len > 0 && nStartPos == len)
 451         Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
 452     if (!sentence.mpValue->mpBreakIterator->isBoundary(nStartPos))
 453         nStartPos = sentence.mpValue->mpBreakIterator->preceding(nStartPos);
 454
 455     // skip preceding space.
 456     sal_uInt32 ch = Text.iterateCodePoints(&nStartPos);
 457     while (nStartPos < len && u_isWhitespace(ch)) ch = Text.iterateCodePoints(&nStartPos);
 458     Text.iterateCodePoints(&nStartPos, -1);
 459
 460     return nStartPos;
 461 }
 462
 463 sal_Int32 SAL_CALL BreakIterator_Unicode::endOfSentence( const OUString& Text, sal_Int32 nStartPos,
 464         const lang::Locale &rLocale )
 465 {
 466     loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
 467
 468     sal_Int32 len = Text.getLength();
 469     if (len > 0 && nStartPos == len)
 470         Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
 471     nStartPos = sentence.mpValue->mpBreakIterator->following(nStartPos);
 472
 473     sal_Int32 nPos=nStartPos;
 474     while (nPos > 0 && u_isWhitespace(Text.iterateCodePoints(&nPos, -1))) nStartPos=nPos;
 475
 476     return nStartPos;
 477 }
 478
 479 LineBreakResults SAL_CALL BreakIterator_Unicode::getLineBreak(
 480         const OUString& Text, sal_Int32 nStartPos,
 481         const lang::Locale& rLocale, sal_Int32 nMinBreakPos,
 482         const LineBreakHyphenationOptions& hOptions,
 483         const LineBreakUserOptions& /*rOptions*/ )
 484 {
 485     LineBreakResults lbr;
 486
 487     if (nStartPos >= Text.getLength()) {
 488         lbr.breakIndex = Text.getLength();
 489         lbr.breakType = BreakType::WORDBOUNDARY;
 490         return lbr;
 491     }
 492
 493     loadICUBreakIterator(rLocale, LOAD_LINE_BREAKITERATOR, 0, lineRule, Text);
 494
 495     icu::BreakIterator* pLineBI = line.mpValue->mpBreakIterator.get();
 496     bool GlueSpace=true;
 497     while (GlueSpace) {
 498         if (pLineBI->preceding(nStartPos + 1) == nStartPos) { //Line boundary break
 499             lbr.breakIndex = nStartPos;
 500             lbr.breakType = BreakType::WORDBOUNDARY;
 501         } else if (hOptions.rHyphenator.is()) { //Hyphenation break
 502             sal_Int32 boundary_with_punctuation = (pLineBI->next() != icu::BreakIterator::DONE) ? pLineBI->current() : 0;
 503             pLineBI->preceding(nStartPos + 1); // reset to check correct hyphenation of "word-word"
 504
 505             sal_Int32 nStartPosWordEnd = nStartPos;
 506             while (pLineBI->current() < nStartPosWordEnd && u_ispunct(static_cast<sal_uInt32>(Text[nStartPosWordEnd]))) // starting punctuation
 507                 nStartPosWordEnd --;
 508
 509             Boundary wBoundary = getWordBoundary( Text, nStartPosWordEnd, rLocale,
 510                 WordType::DICTIONARY_WORD, false);
 511
 512             nStartPosWordEnd = wBoundary.endPos;
 513             while (nStartPosWordEnd < Text.getLength() && (u_ispunct(static_cast<sal_uInt32>(Text[nStartPosWordEnd])))) // ending punctuation
 514                 nStartPosWordEnd ++;
 515             nStartPosWordEnd = nStartPosWordEnd - wBoundary.endPos;
 516             if (hOptions.hyphenIndex - wBoundary.startPos < nStartPosWordEnd) nStartPosWordEnd = hOptions.hyphenIndex - wBoundary.startPos;
 517 #define SPACE 0x0020
 518             while (boundary_with_punctuation > wBoundary.endPos && Text[--boundary_with_punctuation] == SPACE);
 519             uno::Reference< linguistic2::XHyphenatedWord > aHyphenatedWord = hOptions.rHyphenator->hyphenate(Text.copy(wBoundary.startPos,
 520                         wBoundary.endPos - wBoundary.startPos), rLocale,
 521                     static_cast<sal_Int16>(hOptions.hyphenIndex - wBoundary.startPos - ((hOptions.hyphenIndex == wBoundary.endPos)? nStartPosWordEnd : 0)), hOptions.aHyphenationOptions);
 522             if (aHyphenatedWord.is()) {
 523                 lbr.rHyphenatedWord = aHyphenatedWord;
 524                 if(wBoundary.startPos + aHyphenatedWord->getHyphenationPos() + 1 < nMinBreakPos )
 525                     lbr.breakIndex = -1;
 526                 else
 527                     lbr.breakIndex = wBoundary.startPos; //aHyphenatedWord->getHyphenationPos();
 528                 lbr.breakType = BreakType::HYPHENATION;
 529
 530                 // check not optimal hyphenation of "word-word" (word with hyphens)
 531                 if (lbr.breakIndex > -1 && wBoundary.startPos + aHyphenatedWord->getHyphenationPos() < pLineBI->current()) {
 532                     lbr.breakIndex = pLineBI->current();
 533                     lbr.breakType = BreakType::WORDBOUNDARY;
 534                 }
 535
 536             } else {
 537                 lbr.breakIndex = pLineBI->preceding(nStartPos);
 538                 lbr.breakType = BreakType::WORDBOUNDARY;
 539             }
 540         } else { //word boundary break
 541             lbr.breakIndex = pLineBI->preceding(nStartPos);
 542             lbr.breakType = BreakType::WORDBOUNDARY;
 543
 544             // Special case for Slash U+002F SOLIDUS in URI and path names.
 545             // TR14 defines that as SY: Symbols Allowing Break After (A).
 546             // This is unwanted in paths, see also i#17155
 547             if (lbr.breakIndex > 0 && Text[lbr.breakIndex-1] == '/')
 548             {
 549                 // Look backward and take any whitespace before as a break
 550                 // opportunity. This also glues something like "w/o".
 551                 // Avoid an overly long path and break it as was indicated.
 552                 // Overly long here is arbitrarily defined.
 553                 const sal_Int32 nOverlyLong = 66;
 554                 sal_Int32 nPos = lbr.breakIndex - 1;
 555                 while (nPos > 0 && lbr.breakIndex - nPos < nOverlyLong)
 556                 {
 557                     if (u_isWhitespace(Text.iterateCodePoints( &nPos, -1)))
 558                     {
 559                         lbr.breakIndex = nPos + 1;
 560                         break;
 561                     }
 562                 }
 563             }
 564         }
 565
 566 #define WJ 0x2060   // Word Joiner
 567         GlueSpace=false;
 568         if (lbr.breakType == BreakType::WORDBOUNDARY) {
 569             nStartPos = lbr.breakIndex;
 570             if (nStartPos >= 0 && Text[nStartPos--] == WJ)
 571                 GlueSpace=true;
 572             while (nStartPos >= 0 &&
 573                     (u_isWhitespace(Text.iterateCodePoints(&nStartPos, 0)) || Text[nStartPos] == WJ)) {
 574                 if (Text[nStartPos--] == WJ)
 575                     GlueSpace=true;
 576             }
 577             if (GlueSpace && nStartPos < 0)  {
 578                 lbr.breakIndex = 0;
 579                 break;
 580             }
 581         }
 582     }
 583
 584     return lbr;
 585 }
 586
 587 OUString SAL_CALL
 588 BreakIterator_Unicode::getImplementationName()
 589 {
 590     return OUString::createFromAscii(cBreakIterator);
 591 }
 592
 593 sal_Bool SAL_CALL
 594 BreakIterator_Unicode::supportsService(const OUString& rServiceName)
 595 {
 596     return cppu::supportsService(this, rServiceName);
 597 }
 598
 599 uno::Sequence< OUString > SAL_CALL
 600 BreakIterator_Unicode::getSupportedServiceNames()
 601 {
 602     uno::Sequence< OUString > aRet { OUString::createFromAscii(cBreakIterator) };
 603     return aRet;
 604 }
 605
 606 }
 607
 608 extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface *
 609 com_sun_star_i18n_BreakIterator_Unicode_get_implementation(
 610     css::uno::XComponentContext *,
 611     css::uno::Sequence<css::uno::Any> const &)
 612 {
 613     return cppu::acquire(new i18npool::BreakIterator_Unicode());
 614 }
 615
 616 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */