i18npool/source/breakiterator/breakiterator_unicode.cxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*
   3  * This file is part of the LibreOffice project.
   4  *
   5  * This Source Code Form is subject to the terms of the Mozilla Public
   6  * License, v. 2.0. If a copy of the MPL was not distributed with this
   7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
   8  *
   9  * This file incorporates work covered by the following license notice:
  10  *
  11  *   Licensed to the Apache Software Foundation (ASF) under one or more
  12  *   contributor license agreements. See the NOTICE file distributed
  13  *   with this work for additional information regarding copyright
  14  *   ownership. The ASF licenses this file to you under the Apache
  15  *   License, Version 2.0 (the "License"); you may not use this file
  16  *   except in compliance with the License. You may obtain a copy of
  17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  18  */
  19
  20 #include <breakiterator_unicode.hxx>
  21 #include <cppuhelper/supportsservice.hxx>
  22 #include <localedata.hxx>
  23 #include <i18nlangtag/languagetag.hxx>
  24 #include <i18nlangtag/languagetagicu.hxx>
  25 #include <unicode/uchar.h>
  26 #include <unicode/locid.h>
  27 #include <unicode/rbbi.h>
  28 #include <unicode/udata.h>
  29 #include <rtl/strbuf.hxx>
  30 #include <rtl/ustring.hxx>
  31 #include <string.h>
  32
  33 U_CDECL_BEGIN
  34 extern const char OpenOffice_dat[];
  35 U_CDECL_END
  36
  37 using namespace ::com::sun::star;
  38 using namespace ::com::sun::star::lang;
  39
  40 namespace com { namespace sun { namespace star { namespace i18n {
  41
  42
  43 BreakIterator_Unicode::BreakIterator_Unicode()
  44     : cBreakIterator( "com.sun.star.i18n.BreakIterator_Unicode" )    // implementation name
  45     , wordRule( "word" )
  46     , lineRule( "line" )
  47     , icuBI( NULL )
  48     , aBreakType(0)
  49 {
  50 }
  51
  52 BreakIterator_Unicode::~BreakIterator_Unicode()
  53 {
  54     delete character.aBreakIterator;
  55     delete sentence.aBreakIterator;
  56     delete line.aBreakIterator;
  57     for (size_t i = 0; i < SAL_N_ELEMENTS(words); i++)
  58         delete words[i].aBreakIterator;
  59 }
  60
  61 /*
  62     Wrapper class to provide public access to the RuleBasedBreakIterator's
  63     setbreakType method.
  64 */
  65 class OOoRuleBasedBreakIterator : public RuleBasedBreakIterator
  66 {
  67     public:
  68     inline void publicSetBreakType(int32_t type)
  69         {
  70             setBreakType(type);
  71         };
  72     OOoRuleBasedBreakIterator(UDataMemory* image,
  73                               UErrorCode &status)
  74         : RuleBasedBreakIterator(image, status)
  75         { };
  76
  77 };
  78
  79 // loading ICU breakiterator on demand.
  80 void SAL_CALL BreakIterator_Unicode::loadICUBreakIterator(const com::sun::star::lang::Locale& rLocale,
  81         sal_Int16 rBreakType, sal_Int16 nWordType, const sal_Char *rule, const OUString& rText) throw(uno::RuntimeException)
  82 {
  83     bool newBreak = false;
  84     UErrorCode status = U_ZERO_ERROR;
  85     sal_Int16 breakType = 0;
  86     switch (rBreakType) {
  87         case LOAD_CHARACTER_BREAKITERATOR: icuBI=&character; breakType = 3; break;
  88         case LOAD_WORD_BREAKITERATOR:
  89             assert (nWordType >= 0 && nWordType<= WordType::WORD_COUNT);
  90             icuBI=&words[nWordType];
  91             switch (nWordType) {
  92                 case WordType::ANY_WORD: break; // odd but previous behavior
  93                 case WordType::ANYWORD_IGNOREWHITESPACES:
  94                     breakType = 0; rule = wordRule = "edit_word"; break;
  95                 case WordType::DICTIONARY_WORD:
  96                     breakType = 1; rule = wordRule = "dict_word"; break;
  97                 default:
  98                 case WordType::WORD_COUNT:
  99                     breakType = 2; rule = wordRule = "count_word"; break;
 100             }
 101             break;
 102         case LOAD_SENTENCE_BREAKITERATOR: icuBI=&sentence; breakType = 5; break;
 103         case LOAD_LINE_BREAKITERATOR: icuBI=&line; breakType = 4; break;
 104     }
 105     if (!icuBI->aBreakIterator ||
 106         rLocale.Language != icuBI->maLocale.Language ||
 107         rLocale.Country  != icuBI->maLocale.Country  ||
 108         rLocale.Variant  != icuBI->maLocale.Variant) {
 109         if (icuBI->aBreakIterator) {
 110             delete icuBI->aBreakIterator;
 111             icuBI->aBreakIterator=NULL;
 112         }
 113         if (rule) {
 114             uno::Sequence< OUString > breakRules = LocaleDataImpl().getBreakIteratorRules(rLocale);
 115
 116             status = U_ZERO_ERROR;
 117             udata_setAppData("OpenOffice", OpenOffice_dat, &status);
 118             if ( !U_SUCCESS(status) ) throw uno::RuntimeException();
 119
 120             OOoRuleBasedBreakIterator *rbi = NULL;
 121
 122             if (breakRules.getLength() > breakType && !breakRules[breakType].isEmpty())
 123             {
 124                 rbi = new OOoRuleBasedBreakIterator(udata_open("OpenOffice", "brk",
 125                     OUStringToOString(breakRules[breakType], RTL_TEXTENCODING_ASCII_US).getStr(), &status), status);
 126             }
 127             //use icu's breakiterator for Thai, Khmer, Tibetan and Dzongkha
 128             else if (rLocale.Language != "th" && rLocale.Language != "lo" && rLocale.Language != "km" && rLocale.Language != "bo" && rLocale.Language != "dz")
 129             {
 130                 status = U_ZERO_ERROR;
 131                 OStringBuffer aUDName(64);
 132                 aUDName.append(rule);
 133                 aUDName.append('_');
 134                 aUDName.append( OUStringToOString(rLocale.Language, RTL_TEXTENCODING_ASCII_US));
 135                 UDataMemory* pUData = udata_open("OpenOffice", "brk", aUDName.getStr(), &status);
 136                 if( U_SUCCESS(status) )
 137                     rbi = new OOoRuleBasedBreakIterator( pUData, status);
 138                 if (!U_SUCCESS(status) ) {
 139                     status = U_ZERO_ERROR;
 140                     pUData = udata_open("OpenOffice", "brk", rule, &status);
 141                     if( U_SUCCESS(status) )
 142                         rbi = new OOoRuleBasedBreakIterator( pUData, status);
 143                     if (!U_SUCCESS(status) ) icuBI->aBreakIterator=NULL;
 144                 }
 145             }
 146             if (rbi) {
 147                 switch (rBreakType) {
 148                     case LOAD_CHARACTER_BREAKITERATOR: rbi->publicSetBreakType(UBRK_CHARACTER); break;
 149                     case LOAD_WORD_BREAKITERATOR: rbi->publicSetBreakType(UBRK_WORD); break;
 150                     case LOAD_SENTENCE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_SENTENCE); break;
 151                     case LOAD_LINE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_LINE); break;
 152                 }
 153                 icuBI->aBreakIterator = rbi;
 154             }
 155         }
 156
 157         if (!icuBI->aBreakIterator) {
 158             icu::Locale icuLocale( LanguageTagIcu::getIcuLocale( LanguageTag( rLocale)));
 159
 160             status = U_ZERO_ERROR;
 161             switch (rBreakType) {
 162                 case LOAD_CHARACTER_BREAKITERATOR:
 163                     icuBI->aBreakIterator =  icu::BreakIterator::createCharacterInstance(icuLocale, status);
 164                     break;
 165                 case LOAD_WORD_BREAKITERATOR:
 166                     icuBI->aBreakIterator =  icu::BreakIterator::createWordInstance(icuLocale, status);
 167                     break;
 168                 case LOAD_SENTENCE_BREAKITERATOR:
 169                     icuBI->aBreakIterator = icu::BreakIterator::createSentenceInstance(icuLocale, status);
 170                     break;
 171                 case LOAD_LINE_BREAKITERATOR:
 172                     icuBI->aBreakIterator = icu::BreakIterator::createLineInstance(icuLocale, status);
 173                     break;
 174             }
 175             if ( !U_SUCCESS(status) ) {
 176                 icuBI->aBreakIterator=NULL;
 177                 throw uno::RuntimeException();
 178             }
 179         }
 180         if (icuBI->aBreakIterator) {
 181             icuBI->maLocale=rLocale;
 182             newBreak=true;
 183         } else {
 184             throw uno::RuntimeException();
 185         }
 186     }
 187
 188     if (newBreak || !icuBI->aICUText.equals(rText))
 189     {
 190         // UChar != sal_Unicode in MinGW
 191         const UChar *pText = reinterpret_cast<const UChar *>(rText.getStr());
 192
 193         icuBI->ut = utext_openUChars(icuBI->ut, pText, rText.getLength(), &status);
 194
 195         if (!U_SUCCESS(status))
 196             throw uno::RuntimeException();
 197
 198         icuBI->aBreakIterator->setText(icuBI->ut, status);
 199
 200         if (!U_SUCCESS(status))
 201             throw uno::RuntimeException();
 202
 203         icuBI->aICUText = rText;
 204     }
 205 }
 206
 207 sal_Int32 SAL_CALL BreakIterator_Unicode::nextCharacters( const OUString& Text,
 208         sal_Int32 nStartPos, const lang::Locale &rLocale,
 209         sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
 210         throw(uno::RuntimeException, std::exception)
 211 {
 212     if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
 213         loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
 214         for (nDone = 0; nDone < nCount; nDone++) {
 215             nStartPos = character.aBreakIterator->following(nStartPos);
 216             if (nStartPos == BreakIterator::DONE)
 217                 return Text.getLength();
 218         }
 219     } else { // for CHARACTER mode
 220         for (nDone = 0; nDone < nCount && nStartPos < Text.getLength(); nDone++)
 221             Text.iterateCodePoints(&nStartPos, 1);
 222     }
 223     return nStartPos;
 224 }
 225
 226 sal_Int32 SAL_CALL BreakIterator_Unicode::previousCharacters( const OUString& Text,
 227         sal_Int32 nStartPos, const lang::Locale& rLocale,
 228         sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
 229         throw(uno::RuntimeException, std::exception)
 230 {
 231     if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
 232         loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
 233         for (nDone = 0; nDone < nCount; nDone++) {
 234             nStartPos = character.aBreakIterator->preceding(nStartPos);
 235             if (nStartPos == BreakIterator::DONE)
 236                 return 0;
 237         }
 238     } else { // for BS to delete one char and CHARACTER mode.
 239         for (nDone = 0; nDone < nCount && nStartPos > 0; nDone++)
 240             Text.iterateCodePoints(&nStartPos, -1);
 241     }
 242     return nStartPos;
 243 }
 244
 245
 246 Boundary SAL_CALL BreakIterator_Unicode::nextWord( const OUString& Text, sal_Int32 nStartPos,
 247     const lang::Locale& rLocale, sal_Int16 rWordType ) throw(uno::RuntimeException, std::exception)
 248 {
 249     loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text);
 250
 251     result.startPos = icuBI->aBreakIterator->following(nStartPos);
 252     if( result.startPos >= Text.getLength() || result.startPos == BreakIterator::DONE )
 253         result.endPos = result.startPos;
 254     else {
 255         if ( (rWordType == WordType::ANYWORD_IGNOREWHITESPACES ||
 256                     rWordType == WordType::DICTIONARY_WORD ) &&
 257                 u_isWhitespace(Text.iterateCodePoints(&result.startPos, 0)) )
 258             result.startPos = icuBI->aBreakIterator->following(result.startPos);
 259
 260         result.endPos = icuBI->aBreakIterator->following(result.startPos);
 261         if(result.endPos == BreakIterator::DONE)
 262             result.endPos = result.startPos;
 263     }
 264     return result;
 265 }
 266
 267
 268 Boundary SAL_CALL BreakIterator_Unicode::previousWord(const OUString& Text, sal_Int32 nStartPos,
 269         const lang::Locale& rLocale, sal_Int16 rWordType) throw(uno::RuntimeException, std::exception)
 270 {
 271     loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text);
 272
 273     result.startPos = icuBI->aBreakIterator->preceding(nStartPos);
 274     if( result.startPos < 0 || result.startPos == BreakIterator::DONE)
 275         result.endPos = result.startPos;
 276     else {
 277         if ( (rWordType == WordType::ANYWORD_IGNOREWHITESPACES ||
 278                     rWordType == WordType::DICTIONARY_WORD) &&
 279                 u_isWhitespace(Text.iterateCodePoints(&result.startPos, 0)) )
 280             result.startPos = icuBI->aBreakIterator->preceding(result.startPos);
 281
 282         result.endPos = icuBI->aBreakIterator->following(result.startPos);
 283         if(result.endPos == BreakIterator::DONE)
 284             result.endPos = result.startPos;
 285     }
 286     return result;
 287 }
 288
 289
 290 Boundary SAL_CALL BreakIterator_Unicode::getWordBoundary( const OUString& Text, sal_Int32 nPos, const lang::Locale& rLocale,
 291         sal_Int16 rWordType, sal_Bool bDirection ) throw(uno::RuntimeException, std::exception)
 292 {
 293     loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text);
 294     sal_Int32 len = Text.getLength();
 295
 296     if(icuBI->aBreakIterator->isBoundary(nPos)) {
 297         result.startPos = result.endPos = nPos;
 298         if((bDirection || nPos == 0) && nPos < len) //forward
 299             result.endPos = icuBI->aBreakIterator->following(nPos);
 300         else
 301             result.startPos = icuBI->aBreakIterator->preceding(nPos);
 302     } else {
 303         if(nPos <= 0) {
 304             result.startPos = 0;
 305             result.endPos = len ? icuBI->aBreakIterator->following((sal_Int32)0) : 0;
 306         } else if(nPos >= len) {
 307             result.startPos = icuBI->aBreakIterator->preceding(len);
 308             result.endPos = len;
 309         } else {
 310             result.startPos = icuBI->aBreakIterator->preceding(nPos);
 311             result.endPos = icuBI->aBreakIterator->following(nPos);
 312         }
 313     }
 314     if (result.startPos == BreakIterator::DONE)
 315         result.startPos = result.endPos;
 316     else if (result.endPos == BreakIterator::DONE)
 317         result.endPos = result.startPos;
 318
 319     return result;
 320 }
 321
 322
 323 sal_Int32 SAL_CALL BreakIterator_Unicode::beginOfSentence( const OUString& Text, sal_Int32 nStartPos,
 324         const lang::Locale &rLocale ) throw(uno::RuntimeException, std::exception)
 325 {
 326     loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
 327
 328     sal_Int32 len = Text.getLength();
 329     if (len > 0 && nStartPos == len)
 330         Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
 331     if (!sentence.aBreakIterator->isBoundary(nStartPos))
 332         nStartPos = sentence.aBreakIterator->preceding(nStartPos);
 333
 334     // skip preceding space.
 335     sal_uInt32 ch = Text.iterateCodePoints(&nStartPos, 1);
 336     while (nStartPos < len && u_isWhitespace(ch)) ch = Text.iterateCodePoints(&nStartPos, 1);
 337     Text.iterateCodePoints(&nStartPos, -1);
 338
 339     return nStartPos;
 340 }
 341
 342 sal_Int32 SAL_CALL BreakIterator_Unicode::endOfSentence( const OUString& Text, sal_Int32 nStartPos,
 343         const lang::Locale &rLocale ) throw(uno::RuntimeException, std::exception)
 344 {
 345     loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
 346
 347     sal_Int32 len = Text.getLength();
 348     if (len > 0 && nStartPos == len)
 349         Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
 350     nStartPos = sentence.aBreakIterator->following(nStartPos);
 351
 352     sal_Int32 nPos=nStartPos;
 353     while (nPos > 0 && u_isWhitespace(Text.iterateCodePoints(&nPos, -1))) nStartPos=nPos;
 354
 355     return nStartPos;
 356 }
 357
 358 LineBreakResults SAL_CALL BreakIterator_Unicode::getLineBreak(
 359         const OUString& Text, sal_Int32 nStartPos,
 360         const lang::Locale& rLocale, sal_Int32 nMinBreakPos,
 361         const LineBreakHyphenationOptions& hOptions,
 362         const LineBreakUserOptions& /*rOptions*/ ) throw(uno::RuntimeException, std::exception)
 363 {
 364     LineBreakResults lbr;
 365
 366     if (nStartPos >= Text.getLength()) {
 367         lbr.breakIndex = Text.getLength();
 368         lbr.breakType = BreakType::WORDBOUNDARY;
 369         return lbr;
 370     }
 371
 372     loadICUBreakIterator(rLocale, LOAD_LINE_BREAKITERATOR, 0, lineRule, Text);
 373
 374     bool GlueSpace=true;
 375     while (GlueSpace) {
 376         if (line.aBreakIterator->preceding(nStartPos + 1) == nStartPos) { //Line boundary break
 377             lbr.breakIndex = nStartPos;
 378             lbr.breakType = BreakType::WORDBOUNDARY;
 379         } else if (hOptions.rHyphenator.is()) { //Hyphenation break
 380             sal_Int32 boundary_with_punctuation = (line.aBreakIterator->next() != BreakIterator::DONE) ? line.aBreakIterator->current() : 0;
 381             line.aBreakIterator->preceding(nStartPos + 1); // reset to check correct hyphenation of "word-word"
 382
 383             sal_Int32 nStartPosWordEnd = nStartPos;
 384             while (line.aBreakIterator->current() < nStartPosWordEnd && u_ispunct((sal_uInt32)Text[nStartPosWordEnd])) // starting punctuation
 385                 nStartPosWordEnd --;
 386
 387             Boundary wBoundary = getWordBoundary( Text, nStartPosWordEnd, rLocale,
 388                 WordType::DICTIONARY_WORD, false);
 389
 390             nStartPosWordEnd = wBoundary.endPos;
 391             while (nStartPosWordEnd < Text.getLength() && (u_ispunct((sal_uInt32)Text[nStartPosWordEnd]))) // ending punctuation
 392                 nStartPosWordEnd ++;
 393             nStartPosWordEnd = nStartPosWordEnd - wBoundary.endPos;
 394             if (hOptions.hyphenIndex - wBoundary.startPos < nStartPosWordEnd) nStartPosWordEnd = hOptions.hyphenIndex - wBoundary.startPos;
 395 #define SPACE 0x0020
 396             while (boundary_with_punctuation > wBoundary.endPos && Text[--boundary_with_punctuation] == SPACE);
 397             if (boundary_with_punctuation != 0) boundary_with_punctuation += 1 - wBoundary.endPos;
 398             uno::Reference< linguistic2::XHyphenatedWord > aHyphenatedWord;
 399             aHyphenatedWord = hOptions.rHyphenator->hyphenate(Text.copy(wBoundary.startPos,
 400                         wBoundary.endPos - wBoundary.startPos), rLocale,
 401                     (sal_Int16) (hOptions.hyphenIndex - wBoundary.startPos - ((hOptions.hyphenIndex == wBoundary.endPos)? nStartPosWordEnd : 0)), hOptions.aHyphenationOptions);
 402             if (aHyphenatedWord.is()) {
 403                 lbr.rHyphenatedWord = aHyphenatedWord;
 404                 if(wBoundary.startPos + aHyphenatedWord->getHyphenationPos() + 1 < nMinBreakPos )
 405                     lbr.breakIndex = -1;
 406                 else
 407                     lbr.breakIndex = wBoundary.startPos; //aHyphenatedWord->getHyphenationPos();
 408                 lbr.breakType = BreakType::HYPHENATION;
 409
 410                 // check not optimal hyphenation of "word-word" (word with hyphens)
 411                 if (lbr.breakIndex > -1 && wBoundary.startPos + aHyphenatedWord->getHyphenationPos() < line.aBreakIterator->current()) {
 412                     lbr.breakIndex = line.aBreakIterator->current();
 413                     lbr.breakType = BreakType::WORDBOUNDARY;
 414                 }
 415
 416             } else {
 417                 lbr.breakIndex = line.aBreakIterator->preceding(nStartPos);
 418                 lbr.breakType = BreakType::WORDBOUNDARY;;
 419             }
 420         } else { //word boundary break
 421             lbr.breakIndex = line.aBreakIterator->preceding(nStartPos);
 422             lbr.breakType = BreakType::WORDBOUNDARY;
 423         }
 424
 425 #define WJ 0x2060   // Word Joiner
 426         GlueSpace=false;
 427         if (lbr.breakType == BreakType::WORDBOUNDARY) {
 428             nStartPos = lbr.breakIndex;
 429             if (Text[nStartPos--] == WJ)
 430                 GlueSpace=true;
 431             while (nStartPos >= 0 &&
 432                     (u_isWhitespace(Text.iterateCodePoints(&nStartPos, 0)) || Text[nStartPos] == WJ)) {
 433                 if (Text[nStartPos--] == WJ)
 434                     GlueSpace=true;
 435             }
 436             if (GlueSpace && nStartPos < 0)  {
 437                 lbr.breakIndex = 0;
 438                 break;
 439             }
 440         }
 441     }
 442
 443     return lbr;
 444 }
 445
 446 OUString SAL_CALL
 447 BreakIterator_Unicode::getImplementationName(void) throw( uno::RuntimeException, std::exception )
 448 {
 449     return OUString::createFromAscii(cBreakIterator);
 450 }
 451
 452 sal_Bool SAL_CALL
 453 BreakIterator_Unicode::supportsService(const OUString& rServiceName) throw( uno::RuntimeException, std::exception )
 454 {
 455     return cppu::supportsService(this, rServiceName);
 456 }
 457
 458 uno::Sequence< OUString > SAL_CALL
 459 BreakIterator_Unicode::getSupportedServiceNames(void) throw( uno::RuntimeException, std::exception )
 460 {
 461     uno::Sequence< OUString > aRet(1);
 462     aRet[0] = OUString::createFromAscii(cBreakIterator);
 463     return aRet;
 464 }
 465
 466 } } } }
 467
 468 extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface * SAL_CALL
 469 com_sun_star_i18n_BreakIterator_Unicode_get_implementation(
 470     css::uno::XComponentContext *,
 471     css::uno::Sequence<css::uno::Any> const &)
 472 {
 473     return cppu::acquire(new css::i18n::BreakIterator_Unicode());
 474 }
 475
 476 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */