i18npool/source/breakiterator/breakiterator_unicode.cxx

   1 /*************************************************************************
   2  *
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * Copyright 2000, 2010 Oracle and/or its affiliates.
   6  *
   7  * OpenOffice.org - a multi-platform office productivity suite
   8  *
   9  * This file is part of OpenOffice.org.
  10  *
  11  * OpenOffice.org is free software: you can redistribute it and/or modify
  12  * it under the terms of the GNU Lesser General Public License version 3
  13  * only, as published by the Free Software Foundation.
  14  *
  15  * OpenOffice.org is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU Lesser General Public License version 3 for more details
  19  * (a copy is included in the LICENSE file that accompanied this code).
  20  *
  21  * You should have received a copy of the GNU Lesser General Public License
  22  * version 3 along with OpenOffice.org.  If not, see
  23  * <http://www.openoffice.org/license.html>
  24  * for a copy of the LGPLv3 License.
  25  *
  26  ************************************************************************/
  27
  28 // MARKER(update_precomp.py): autogen include statement, do not remove
  29 #include "precompiled_i18npool.hxx"
  30 #include <breakiterator_unicode.hxx>
  31 #include <localedata.hxx>
  32 #include <unicode/uchar.h>
  33 #include <unicode/locid.h>
  34 #include <unicode/rbbi.h>
  35 #include <unicode/udata.h>
  36 #include <rtl/strbuf.hxx>
  37 #include <rtl/ustring.hxx>
  38
  39 U_CDECL_BEGIN
  40 extern const char OpenOffice_dat[];
  41 U_CDECL_END
  42
  43 using namespace ::com::sun::star;
  44 using namespace ::com::sun::star::lang;
  45 using namespace ::rtl;
  46
  47 namespace com { namespace sun { namespace star { namespace i18n {
  48
  49 #define ERROR ::com::sun::star::uno::RuntimeException()
  50
  51 //#define ImplementName "com.sun.star.i18n.BreakIterator_Unicode";
  52
  53
  54 BreakIterator_Unicode::BreakIterator_Unicode() :
  55     cBreakIterator( "com.sun.star.i18n.BreakIterator_Unicode" ),    // implementation name
  56     wordRule( "word" ),
  57     lineRule( "line" ),
  58     result(),
  59     character(),
  60     word(),
  61     sentence(),
  62     line(),
  63     icuBI( NULL ),
  64     aLocale(),
  65     aBreakType(),
  66     aWordType()
  67 {
  68 }
  69
  70
  71 BreakIterator_Unicode::~BreakIterator_Unicode()
  72 {
  73         if (icuBI && icuBI->aBreakIterator) {
  74             delete icuBI->aBreakIterator;
  75             icuBI->aBreakIterator=NULL;
  76         }
  77         if (character.aBreakIterator) delete character.aBreakIterator;
  78         if (word.aBreakIterator) delete word.aBreakIterator;
  79         if (sentence.aBreakIterator) delete sentence.aBreakIterator;
  80         if (line.aBreakIterator) delete line.aBreakIterator;
  81 }
  82
  83 /*
  84     Wrapper class to provide public access to the RuleBasedBreakIterator's
  85     setbreakType method.
  86 */
  87 class OOoRuleBasedBreakIterator : public RuleBasedBreakIterator {
  88     public:
  89         inline void publicSetBreakType(int32_t type) {
  90             setBreakType(type);
  91         };
  92         OOoRuleBasedBreakIterator(UDataMemory* image,
  93                 UErrorCode &status) :
  94             RuleBasedBreakIterator(image, status) { };
  95
  96 };
  97
  98 // loading ICU breakiterator on demand.
  99 void SAL_CALL BreakIterator_Unicode::loadICUBreakIterator(const com::sun::star::lang::Locale& rLocale,
 100         sal_Int16 rBreakType, sal_Int16 rWordType, const sal_Char *rule, const OUString& rText) throw(uno::RuntimeException)
 101 {
 102     sal_Bool newBreak = sal_False;
 103     UErrorCode status = U_ZERO_ERROR;
 104     sal_Int16 breakType = 0;
 105     switch (rBreakType) {
 106         case LOAD_CHARACTER_BREAKITERATOR: icuBI=&character; breakType = 3; break;
 107         case LOAD_WORD_BREAKITERATOR: icuBI=&word;
 108             switch (rWordType) {
 109                 case WordType::ANYWORD_IGNOREWHITESPACES: breakType = 0; rule=wordRule = "edit_word"; break;
 110                 case WordType::DICTIONARY_WORD: breakType = 1; rule=wordRule = "dict_word"; break;
 111                 case WordType::WORD_COUNT: breakType = 2; rule=wordRule = "count_word"; break;
 112             }
 113             break;
 114         case LOAD_SENTENCE_BREAKITERATOR: icuBI=&sentence; breakType = 5; break;
 115         case LOAD_LINE_BREAKITERATOR: icuBI=&line; breakType = 4; break;
 116     }
 117     if (!icuBI->aBreakIterator || rWordType != aWordType ||
 118             rLocale.Language != aLocale.Language || rLocale.Country != aLocale.Country ||
 119             rLocale.Variant != aLocale.Variant) {
 120         if (icuBI->aBreakIterator) {
 121             delete icuBI->aBreakIterator;
 122             icuBI->aBreakIterator=NULL;
 123         }
 124         if (rule) {
 125             uno::Sequence< OUString > breakRules = LocaleData().getBreakIteratorRules(rLocale);
 126
 127             status = U_ZERO_ERROR;
 128             udata_setAppData("OpenOffice", OpenOffice_dat, &status);
 129             if ( !U_SUCCESS(status) ) throw ERROR;
 130
 131             OOoRuleBasedBreakIterator *rbi = NULL;
 132
 133             if (breakRules.getLength() > breakType && breakRules[breakType].getLength() > 0) {
 134                 rbi = new OOoRuleBasedBreakIterator(udata_open("OpenOffice", "brk",
 135                     OUStringToOString(breakRules[breakType], RTL_TEXTENCODING_ASCII_US).getStr(), &status), status);
 136             } else {
 137                 status = U_ZERO_ERROR;
 138                 OStringBuffer aUDName(64);
 139                 aUDName.append(rule);
 140                 aUDName.append('_');
 141                 aUDName.append( OUStringToOString(rLocale.Language, RTL_TEXTENCODING_ASCII_US));
 142                 UDataMemory* pUData = udata_open("OpenOffice", "brk", aUDName.getStr(), &status);
 143                 if( U_SUCCESS(status) )
 144                     rbi = new OOoRuleBasedBreakIterator( pUData, status);
 145                 if (!U_SUCCESS(status) ) {
 146                     status = U_ZERO_ERROR;
 147                     pUData = udata_open("OpenOffice", "brk", rule, &status);
 148                     if( U_SUCCESS(status) )
 149                         rbi = new OOoRuleBasedBreakIterator( pUData, status);
 150                     if (!U_SUCCESS(status) ) icuBI->aBreakIterator=NULL;
 151                 }
 152             }
 153             if (rbi) {
 154                 switch (rBreakType) {
 155                     case LOAD_CHARACTER_BREAKITERATOR: rbi->publicSetBreakType(UBRK_CHARACTER); break;
 156                     case LOAD_WORD_BREAKITERATOR: rbi->publicSetBreakType(UBRK_WORD); break;
 157                     case LOAD_SENTENCE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_SENTENCE); break;
 158                     case LOAD_LINE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_LINE); break;
 159                 }
 160                 icuBI->aBreakIterator = rbi;
 161             }
 162         }
 163
 164         if (!icuBI->aBreakIterator) {
 165             icu::Locale icuLocale(
 166                     OUStringToOString(rLocale.Language, RTL_TEXTENCODING_ASCII_US).getStr(),
 167                     OUStringToOString(rLocale.Country, RTL_TEXTENCODING_ASCII_US).getStr(),
 168                     OUStringToOString(rLocale.Variant, RTL_TEXTENCODING_ASCII_US).getStr());
 169
 170             status = U_ZERO_ERROR;
 171             switch (rBreakType) {
 172                 case LOAD_CHARACTER_BREAKITERATOR:
 173                     icuBI->aBreakIterator =  icu::BreakIterator::createCharacterInstance(icuLocale, status);
 174                     break;
 175                 case LOAD_WORD_BREAKITERATOR:
 176                     icuBI->aBreakIterator =  icu::BreakIterator::createWordInstance(icuLocale, status);
 177                     break;
 178                 case LOAD_SENTENCE_BREAKITERATOR:
 179                     icuBI->aBreakIterator = icu::BreakIterator::createSentenceInstance(icuLocale, status);
 180                     break;
 181                 case LOAD_LINE_BREAKITERATOR:
 182                     icuBI->aBreakIterator = icu::BreakIterator::createLineInstance(icuLocale, status);
 183                     break;
 184             }
 185             if ( !U_SUCCESS(status) ) {
 186                 icuBI->aBreakIterator=NULL;
 187                 throw ERROR;
 188             }
 189         }
 190         if (icuBI->aBreakIterator) {
 191             aLocale=rLocale;
 192             aWordType=rWordType;
 193             aBreakType=rBreakType;
 194             newBreak=sal_True;
 195         } else {
 196             throw ERROR;
 197         }
 198     }
 199
 200     if (newBreak || icuBI->aICUText.compare(UnicodeString(reinterpret_cast<const UChar *>(rText.getStr()), rText.getLength()))) {   // UChar != sal_Unicode in MinGW
 201         icuBI->aICUText=UnicodeString(reinterpret_cast<const UChar *>(rText.getStr()), rText.getLength());
 202         icuBI->aBreakIterator->setText(icuBI->aICUText);
 203     }
 204 }
 205
 206
 207 sal_Int32 SAL_CALL BreakIterator_Unicode::nextCharacters( const OUString& Text,
 208         sal_Int32 nStartPos, const lang::Locale &rLocale,
 209         sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
 210         throw(uno::RuntimeException)
 211 {
 212         if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
 213             loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
 214             for (nDone = 0; nDone < nCount; nDone++) {
 215                 nStartPos = character.aBreakIterator->following(nStartPos);
 216                 if (nStartPos == BreakIterator::DONE)
 217                     return Text.getLength();
 218             }
 219         } else { // for CHARACTER mode
 220             for (nDone = 0; nDone < nCount && nStartPos < Text.getLength(); nDone++)
 221                 Text.iterateCodePoints(&nStartPos, 1);
 222         }
 223         return nStartPos;
 224 }
 225
 226 sal_Int32 SAL_CALL BreakIterator_Unicode::previousCharacters( const OUString& Text,
 227         sal_Int32 nStartPos, const lang::Locale& rLocale,
 228         sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
 229         throw(uno::RuntimeException)
 230 {
 231         if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
 232             loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
 233             for (nDone = 0; nDone < nCount; nDone++) {
 234                 nStartPos = character.aBreakIterator->preceding(nStartPos);
 235                 if (nStartPos == BreakIterator::DONE)
 236                     return 0;
 237             }
 238         } else { // for BS to delete one char and CHARACTER mode.
 239             for (nDone = 0; nDone < nCount && nStartPos > 0; nDone++)
 240                 Text.iterateCodePoints(&nStartPos, -1);
 241         }
 242         return nStartPos;
 243 }
 244
 245
 246 Boundary SAL_CALL BreakIterator_Unicode::nextWord( const OUString& Text, sal_Int32 nStartPos,
 247     const lang::Locale& rLocale, sal_Int16 rWordType ) throw(uno::RuntimeException)
 248 {
 249         loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text);
 250
 251         result.startPos = word.aBreakIterator->following(nStartPos);
 252         if( result.startPos >= Text.getLength() || result.startPos == BreakIterator::DONE )
 253             result.endPos = result.startPos;
 254         else {
 255             if ( (rWordType == WordType::ANYWORD_IGNOREWHITESPACES ||
 256                     rWordType == WordType::DICTIONARY_WORD ) &&
 257                         u_isWhitespace(Text.iterateCodePoints(&result.startPos, 0)) )
 258                 result.startPos = word.aBreakIterator->following(result.startPos);
 259
 260             result.endPos = word.aBreakIterator->following(result.startPos);
 261             if(result.endPos == BreakIterator::DONE)
 262                 result.endPos = result.startPos;
 263         }
 264         return result;
 265 }
 266
 267
 268 Boundary SAL_CALL BreakIterator_Unicode::previousWord(const OUString& Text, sal_Int32 nStartPos,
 269         const lang::Locale& rLocale, sal_Int16 rWordType) throw(uno::RuntimeException)
 270 {
 271         loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text);
 272
 273         result.startPos = word.aBreakIterator->preceding(nStartPos);
 274         if( result.startPos < 0 || result.startPos == BreakIterator::DONE)
 275             result.endPos = result.startPos;
 276         else {
 277             if ( (rWordType == WordType::ANYWORD_IGNOREWHITESPACES ||
 278                     rWordType == WordType::DICTIONARY_WORD) &&
 279                         u_isWhitespace(Text.iterateCodePoints(&result.startPos, 0)) )
 280                 result.startPos = word.aBreakIterator->preceding(result.startPos);
 281
 282             result.endPos = word.aBreakIterator->following(result.startPos);
 283             if(result.endPos == BreakIterator::DONE)
 284                 result.endPos = result.startPos;
 285         }
 286         return result;
 287 }
 288
 289
 290 Boundary SAL_CALL BreakIterator_Unicode::getWordBoundary( const OUString& Text, sal_Int32 nPos, const lang::Locale& rLocale,
 291         sal_Int16 rWordType, sal_Bool bDirection ) throw(uno::RuntimeException)
 292 {
 293         loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text);
 294         sal_Int32 len = Text.getLength();
 295
 296         if(word.aBreakIterator->isBoundary(nPos)) {
 297             result.startPos = result.endPos = nPos;
 298             if((bDirection || nPos == 0) && nPos < len) //forward
 299                 result.endPos = word.aBreakIterator->following(nPos);
 300             else
 301                 result.startPos = word.aBreakIterator->preceding(nPos);
 302         } else {
 303             if(nPos <= 0) {
 304                 result.startPos = 0;
 305                 result.endPos = len ? word.aBreakIterator->following((sal_Int32)0) : 0;
 306             } else if(nPos >= len) {
 307                 result.startPos = word.aBreakIterator->preceding(len);
 308                 result.endPos = len;
 309             } else {
 310                 result.startPos = word.aBreakIterator->preceding(nPos);
 311                 result.endPos = word.aBreakIterator->following(nPos);
 312             }
 313         }
 314         if (result.startPos == BreakIterator::DONE)
 315             result.startPos = result.endPos;
 316         else if (result.endPos == BreakIterator::DONE)
 317             result.endPos = result.startPos;
 318
 319         return result;
 320 }
 321
 322
 323 sal_Int32 SAL_CALL BreakIterator_Unicode::beginOfSentence( const OUString& Text, sal_Int32 nStartPos,
 324         const lang::Locale &rLocale ) throw(uno::RuntimeException)
 325 {
 326         loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
 327
 328         sal_Int32 len = Text.getLength();
 329         if (len > 0 && nStartPos == len)
 330             Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
 331         if (!sentence.aBreakIterator->isBoundary(nStartPos))
 332             nStartPos = sentence.aBreakIterator->preceding(nStartPos);
 333
 334         // skip preceding space.
 335         sal_uInt32 ch = Text.iterateCodePoints(&nStartPos, 1);
 336         while (nStartPos < len && u_isWhitespace(ch)) ch = Text.iterateCodePoints(&nStartPos, 1);
 337         Text.iterateCodePoints(&nStartPos, -1);
 338
 339         return nStartPos;
 340 }
 341
 342 sal_Int32 SAL_CALL BreakIterator_Unicode::endOfSentence( const OUString& Text, sal_Int32 nStartPos,
 343         const lang::Locale &rLocale ) throw(uno::RuntimeException)
 344 {
 345         loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
 346
 347         sal_Int32 len = Text.getLength();
 348         if (len > 0 && nStartPos == len)
 349             Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
 350         nStartPos = sentence.aBreakIterator->following(nStartPos);
 351
 352         sal_Int32 nPos=nStartPos;
 353         while (nPos > 0 && u_isWhitespace(Text.iterateCodePoints(&nPos, -1))) nStartPos=nPos;
 354
 355         return nStartPos;
 356 }
 357
 358 LineBreakResults SAL_CALL BreakIterator_Unicode::getLineBreak(
 359         const OUString& Text, sal_Int32 nStartPos,
 360         const lang::Locale& rLocale, sal_Int32 nMinBreakPos,
 361         const LineBreakHyphenationOptions& hOptions,
 362         const LineBreakUserOptions& /*rOptions*/ ) throw(uno::RuntimeException)
 363 {
 364         LineBreakResults lbr;
 365
 366         if (nStartPos >= Text.getLength()) {
 367             lbr.breakIndex = Text.getLength();
 368             lbr.breakType = BreakType::WORDBOUNDARY;
 369             return lbr;
 370         }
 371
 372         loadICUBreakIterator(rLocale, LOAD_LINE_BREAKITERATOR, 0, lineRule, Text);
 373
 374         sal_Bool GlueSpace=sal_True;
 375         while (GlueSpace) {
 376             if (line.aBreakIterator->preceding(nStartPos + 1) == nStartPos) { //Line boundary break
 377                 lbr.breakIndex = nStartPos;
 378                 lbr.breakType = BreakType::WORDBOUNDARY;
 379             } else if (hOptions.rHyphenator.is()) { //Hyphenation break
 380                 Boundary wBoundary = getWordBoundary( Text, nStartPos, rLocale,
 381                                                 WordType::DICTIONARY_WORD, false);
 382                 uno::Reference< linguistic2::XHyphenatedWord > aHyphenatedWord;
 383                 aHyphenatedWord = hOptions.rHyphenator->hyphenate(Text.copy(wBoundary.startPos,
 384                     wBoundary.endPos - wBoundary.startPos), rLocale,
 385                     (sal_Int16) (hOptions.hyphenIndex - wBoundary.startPos), hOptions.aHyphenationOptions);
 386                 if (aHyphenatedWord.is()) {
 387                     lbr.rHyphenatedWord = aHyphenatedWord;
 388                     if(wBoundary.startPos + aHyphenatedWord->getHyphenationPos() + 1 < nMinBreakPos )
 389                         lbr.breakIndex = -1;
 390                     else
 391                         lbr.breakIndex = wBoundary.startPos; //aHyphenatedWord->getHyphenationPos();
 392                     lbr.breakType = BreakType::HYPHENATION;
 393                 } else {
 394                     lbr.breakIndex = line.aBreakIterator->preceding(nStartPos);
 395                     lbr.breakType = BreakType::WORDBOUNDARY;;
 396                 }
 397             } else { //word boundary break
 398                 lbr.breakIndex = line.aBreakIterator->preceding(nStartPos);
 399                 lbr.breakType = BreakType::WORDBOUNDARY;
 400             }
 401
 402 #define WJ 0x2060   // Word Joiner
 403             GlueSpace=sal_False;
 404             if (lbr.breakType == BreakType::WORDBOUNDARY) {
 405                 nStartPos = lbr.breakIndex;
 406                 if (Text[nStartPos--] == WJ)
 407                     GlueSpace=sal_True;
 408                 while (nStartPos >= 0 &&
 409                     (u_isWhitespace(Text.iterateCodePoints(&nStartPos, 0)) || Text[nStartPos] == WJ)) {
 410                     if (Text[nStartPos--] == WJ)
 411                         GlueSpace=sal_True;
 412                 }
 413                 if (GlueSpace && nStartPos < 0)  {
 414                     lbr.breakIndex = 0;
 415                     break;
 416                 }
 417             }
 418         }
 419
 420         return lbr;
 421 }
 422
 423
 424
 425 OUString SAL_CALL
 426 BreakIterator_Unicode::getImplementationName(void) throw( uno::RuntimeException )
 427 {
 428         return OUString::createFromAscii(cBreakIterator);
 429 }
 430
 431 sal_Bool SAL_CALL
 432 BreakIterator_Unicode::supportsService(const OUString& rServiceName) throw( uno::RuntimeException )
 433 {
 434         return !rServiceName.compareToAscii(cBreakIterator);
 435 }
 436
 437 uno::Sequence< OUString > SAL_CALL
 438 BreakIterator_Unicode::getSupportedServiceNames(void) throw( uno::RuntimeException )
 439 {
 440         uno::Sequence< OUString > aRet(1);
 441         aRet[0] = OUString::createFromAscii(cBreakIterator);
 442         return aRet;
 443 }
 444
 445 } } } }