i18npool/source/breakiterator/breakiterator_unicode.cxx

   1 /*************************************************************************
   2  *
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * Copyright 2008 by Sun Microsystems, Inc.
   6  *
   7  * OpenOffice.org - a multi-platform office productivity suite
   8  *
   9  * $RCSfile: breakiterator_unicode.cxx,v $
  10  * $Revision: 1.36.2.1 $
  11  *
  12  * This file is part of OpenOffice.org.
  13  *
  14  * OpenOffice.org is free software: you can redistribute it and/or modify
  15  * it under the terms of the GNU Lesser General Public License version 3
  16  * only, as published by the Free Software Foundation.
  17  *
  18  * OpenOffice.org is distributed in the hope that it will be useful,
  19  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  20  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21  * GNU Lesser General Public License version 3 for more details
  22  * (a copy is included in the LICENSE file that accompanied this code).
  23  *
  24  * You should have received a copy of the GNU Lesser General Public License
  25  * version 3 along with OpenOffice.org.  If not, see
  26  * <http://www.openoffice.org/license.html>
  27  * for a copy of the LGPLv3 License.
  28  *
  29  ************************************************************************/
  30
  31 // MARKER(update_precomp.py): autogen include statement, do not remove
  32 #include "precompiled_i18npool.hxx"
  33 #include <breakiterator_unicode.hxx>
  34 #include <localedata.hxx>
  35 #include <unicode/uchar.h>
  36 #include <unicode/locid.h>
  37 #include <unicode/rbbi.h>
  38 #include <unicode/udata.h>
  39 #include <rtl/strbuf.hxx>
  40 #include <rtl/ustring.hxx>
  41
// Data image declared with C linkage; loadICUBreakIterator() registers it with
// ICU under the package name "OpenOffice" via udata_setAppData() before opening
// any "brk" rule data from it.
U_CDECL_BEGIN
extern const char OpenOffice_dat[];
U_CDECL_END
  45
  46 using namespace ::com::sun::star;
  47 using namespace ::com::sun::star::lang;
  48 using namespace ::rtl;
  49
  50 namespace com { namespace sun { namespace star { namespace i18n {
  51
// Exception object thrown (as "throw ERROR;") by loadICUBreakIterator() when the
// ICU data cannot be registered or no break iterator can be created.
#define ERROR ::com::sun::star::uno::RuntimeException()
  53
  54 //#define ImplementName "com.sun.star.i18n.BreakIterator_Unicode";
  55
  56
// Constructs the iterator with its implementation name and the default rule
// names ("word" / "line"). No ICU break iterator is created here: icuBI starts
// out as NULL and loadICUBreakIterator() points it at one of the four slots
// (character, word, sentence, line) on first use.
// NOTE(review): the destructor tests slot.aBreakIterator for each slot, so the
// slot type is presumably initialising that pointer to NULL - confirm in the header.
BreakIterator_Unicode::BreakIterator_Unicode() :
    cBreakIterator( "com.sun.star.i18n.BreakIterator_Unicode" ),    // implementation name
    wordRule( "word" ),
    lineRule( "line" ),
    result(),
    character(),
    word(),
    sentence(),
    line(),
    icuBI( NULL ),
    aLocale(),
    aBreakType(),
    aWordType()
{
}
  72
  73
  74 BreakIterator_Unicode::~BreakIterator_Unicode()
  75 {
  76         if (icuBI && icuBI->aBreakIterator) {
  77             delete icuBI->aBreakIterator;
  78             icuBI->aBreakIterator=NULL;
  79         }
  80         if (character.aBreakIterator) delete character.aBreakIterator;
  81         if (word.aBreakIterator) delete word.aBreakIterator;
  82         if (sentence.aBreakIterator) delete sentence.aBreakIterator;
  83         if (line.aBreakIterator) delete line.aBreakIterator;
  84 }
  85
  86 /*
  87     Wrapper class to provide public access to the RuleBasedBreakIterator's
  88     setbreakType method.
  89 */
  90 class OOoRuleBasedBreakIterator : public RuleBasedBreakIterator {
  91     public:
  92         inline void publicSetBreakType(int32_t type) {
  93             setBreakType(type);
  94         };
  95         OOoRuleBasedBreakIterator(UDataMemory* image,
  96                 UErrorCode &status) :
  97             RuleBasedBreakIterator(image, status) { };
  98
  99 };
 100
 101 // loading ICU breakiterator on demand.
 102 void SAL_CALL BreakIterator_Unicode::loadICUBreakIterator(const com::sun::star::lang::Locale& rLocale,
 103         sal_Int16 rBreakType, sal_Int16 rWordType, const sal_Char *rule, const OUString& rText) throw(uno::RuntimeException)
 104 {
 105     sal_Bool newBreak = sal_False;
 106     UErrorCode status = U_ZERO_ERROR;
 107     sal_Int16 breakType = 0;
 108     switch (rBreakType) {
 109         case LOAD_CHARACTER_BREAKITERATOR: icuBI=&character; breakType = 3; break;
 110         case LOAD_WORD_BREAKITERATOR: icuBI=&word;
 111             switch (rWordType) {
 112                 case WordType::ANYWORD_IGNOREWHITESPACES: breakType = 0; rule=wordRule = "edit_word"; break;
 113                 case WordType::DICTIONARY_WORD: breakType = 1; rule=wordRule = "dict_word"; break;
 114                 case WordType::WORD_COUNT: breakType = 2; rule=wordRule = "count_word"; break;
 115             }
 116             break;
 117         case LOAD_SENTENCE_BREAKITERATOR: icuBI=&sentence; breakType = 5; break;
 118         case LOAD_LINE_BREAKITERATOR: icuBI=&line; breakType = 4; break;
 119     }
 120     if (!icuBI->aBreakIterator || rWordType != aWordType ||
 121             rLocale.Language != aLocale.Language || rLocale.Country != aLocale.Country ||
 122             rLocale.Variant != aLocale.Variant) {
 123         if (icuBI->aBreakIterator) {
 124             delete icuBI->aBreakIterator;
 125             icuBI->aBreakIterator=NULL;
 126         }
 127         if (rule) {
 128             uno::Sequence< OUString > breakRules = LocaleData().getBreakIteratorRules(rLocale);
 129
 130             status = U_ZERO_ERROR;
 131             udata_setAppData("OpenOffice", OpenOffice_dat, &status);
 132             if ( !U_SUCCESS(status) ) throw ERROR;
 133
 134             OOoRuleBasedBreakIterator *rbi = NULL;
 135
 136             if (breakRules.getLength() > breakType && breakRules[breakType].getLength() > 0) {
 137                 rbi = new OOoRuleBasedBreakIterator(udata_open("OpenOffice", "brk",
 138                     OUStringToOString(breakRules[breakType], RTL_TEXTENCODING_ASCII_US).getStr(), &status), status);
 139             } else {
 140                 status = U_ZERO_ERROR;
 141                 OStringBuffer aUDName(64);
 142                 aUDName.append(rule);
 143                 aUDName.append('_');
 144                 aUDName.append( OUStringToOString(rLocale.Language, RTL_TEXTENCODING_ASCII_US));
 145                 UDataMemory* pUData = udata_open("OpenOffice", "brk", aUDName.getStr(), &status);
 146                 if( U_SUCCESS(status) )
 147                     rbi = new OOoRuleBasedBreakIterator( pUData, status);
 148                 if (!U_SUCCESS(status) ) {
 149                     status = U_ZERO_ERROR;
 150                     pUData = udata_open("OpenOffice", "brk", rule, &status);
 151                     if( U_SUCCESS(status) )
 152                         rbi = new OOoRuleBasedBreakIterator( pUData, status);
 153                     if (!U_SUCCESS(status) ) icuBI->aBreakIterator=NULL;
 154                 }
 155             }
 156             if (rbi) {
 157                 switch (rBreakType) {
 158                     case LOAD_CHARACTER_BREAKITERATOR: rbi->publicSetBreakType(UBRK_CHARACTER); break;
 159                     case LOAD_WORD_BREAKITERATOR: rbi->publicSetBreakType(UBRK_WORD); break;
 160                     case LOAD_SENTENCE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_SENTENCE); break;
 161                     case LOAD_LINE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_LINE); break;
 162                 }
 163                 icuBI->aBreakIterator = rbi;
 164             }
 165         }
 166
 167         if (!icuBI->aBreakIterator) {
 168             icu::Locale icuLocale(
 169                     OUStringToOString(rLocale.Language, RTL_TEXTENCODING_ASCII_US).getStr(),
 170                     OUStringToOString(rLocale.Country, RTL_TEXTENCODING_ASCII_US).getStr(),
 171                     OUStringToOString(rLocale.Variant, RTL_TEXTENCODING_ASCII_US).getStr());
 172
 173             status = U_ZERO_ERROR;
 174             switch (rBreakType) {
 175                 case LOAD_CHARACTER_BREAKITERATOR:
 176                     icuBI->aBreakIterator =  icu::BreakIterator::createCharacterInstance(icuLocale, status);
 177                     break;
 178                 case LOAD_WORD_BREAKITERATOR:
 179                     icuBI->aBreakIterator =  icu::BreakIterator::createWordInstance(icuLocale, status);
 180                     break;
 181                 case LOAD_SENTENCE_BREAKITERATOR:
 182                     icuBI->aBreakIterator = icu::BreakIterator::createSentenceInstance(icuLocale, status);
 183                     break;
 184                 case LOAD_LINE_BREAKITERATOR:
 185                     icuBI->aBreakIterator = icu::BreakIterator::createLineInstance(icuLocale, status);
 186                     break;
 187             }
 188             if ( !U_SUCCESS(status) ) {
 189                 icuBI->aBreakIterator=NULL;
 190                 throw ERROR;
 191             }
 192         }
 193         if (icuBI->aBreakIterator) {
 194             aLocale=rLocale;
 195             aWordType=rWordType;
 196             aBreakType=rBreakType;
 197             newBreak=sal_True;
 198         } else {
 199             throw ERROR;
 200         }
 201     }
 202
 203     if (newBreak || icuBI->aICUText.compare(UnicodeString(reinterpret_cast<const UChar *>(rText.getStr()), rText.getLength()))) {       // UChar != sal_Unicode in MinGW
 204         icuBI->aICUText=UnicodeString(reinterpret_cast<const UChar *>(rText.getStr()), rText.getLength());
 205         icuBI->aBreakIterator->setText(icuBI->aICUText);
 206     }
 207 }
 208
 209
 210 sal_Int32 SAL_CALL BreakIterator_Unicode::nextCharacters( const OUString& Text,
 211         sal_Int32 nStartPos, const lang::Locale &rLocale,
 212         sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
 213         throw(uno::RuntimeException)
 214 {
 215         if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
 216             loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
 217             for (nDone = 0; nDone < nCount; nDone++) {
 218                 nStartPos = character.aBreakIterator->following(nStartPos);
 219                 if (nStartPos == BreakIterator::DONE)
 220                     return Text.getLength();
 221             }
 222         } else { // for CHARACTER mode
 223             for (nDone = 0; nDone < nCount && nStartPos < Text.getLength(); nDone++)
 224                 Text.iterateCodePoints(&nStartPos, 1);
 225         }
 226         return nStartPos;
 227 }
 228
 229 sal_Int32 SAL_CALL BreakIterator_Unicode::previousCharacters( const OUString& Text,
 230         sal_Int32 nStartPos, const lang::Locale& rLocale,
 231         sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
 232         throw(uno::RuntimeException)
 233 {
 234         if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
 235             loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
 236             for (nDone = 0; nDone < nCount; nDone++) {
 237                 nStartPos = character.aBreakIterator->preceding(nStartPos);
 238                 if (nStartPos == BreakIterator::DONE)
 239                     return 0;
 240             }
 241         } else { // for BS to delete one char and CHARACTER mode.
 242             for (nDone = 0; nDone < nCount && nStartPos > 0; nDone++)
 243                 Text.iterateCodePoints(&nStartPos, -1);
 244         }
 245         return nStartPos;
 246 }
 247
 248
 249 Boundary SAL_CALL BreakIterator_Unicode::nextWord( const OUString& Text, sal_Int32 nStartPos,
 250     const lang::Locale& rLocale, sal_Int16 rWordType ) throw(uno::RuntimeException)
 251 {
 252         loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text);
 253
 254         result.startPos = word.aBreakIterator->following(nStartPos);
 255         if( result.startPos >= Text.getLength() || result.startPos == BreakIterator::DONE )
 256             result.endPos = result.startPos;
 257         else {
 258             if ( (rWordType == WordType::ANYWORD_IGNOREWHITESPACES ||
 259                     rWordType == WordType::DICTIONARY_WORD ) &&
 260                         u_isWhitespace(Text.iterateCodePoints(&result.startPos, 0)) )
 261                 result.startPos = word.aBreakIterator->following(result.startPos);
 262
 263             result.endPos = word.aBreakIterator->following(result.startPos);
 264             if(result.endPos == BreakIterator::DONE)
 265                 result.endPos = result.startPos;
 266         }
 267         return result;
 268 }
 269
 270
 271 Boundary SAL_CALL BreakIterator_Unicode::previousWord(const OUString& Text, sal_Int32 nStartPos,
 272         const lang::Locale& rLocale, sal_Int16 rWordType) throw(uno::RuntimeException)
 273 {
 274         loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text);
 275
 276         result.startPos = word.aBreakIterator->preceding(nStartPos);
 277         if( result.startPos < 0 || result.startPos == BreakIterator::DONE)
 278             result.endPos = result.startPos;
 279         else {
 280             if ( (rWordType == WordType::ANYWORD_IGNOREWHITESPACES ||
 281                     rWordType == WordType::DICTIONARY_WORD) &&
 282                         u_isWhitespace(Text.iterateCodePoints(&result.startPos, 0)) )
 283                 result.startPos = word.aBreakIterator->preceding(result.startPos);
 284
 285             result.endPos = word.aBreakIterator->following(result.startPos);
 286             if(result.endPos == BreakIterator::DONE)
 287                 result.endPos = result.startPos;
 288         }
 289         return result;
 290 }
 291
 292
 293 Boundary SAL_CALL BreakIterator_Unicode::getWordBoundary( const OUString& Text, sal_Int32 nPos, const lang::Locale& rLocale,
 294         sal_Int16 rWordType, sal_Bool bDirection ) throw(uno::RuntimeException)
 295 {
 296         loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text);
 297         sal_Int32 len = Text.getLength();
 298
 299         if(word.aBreakIterator->isBoundary(nPos)) {
 300             result.startPos = result.endPos = nPos;
 301             if((bDirection || nPos == 0) && nPos < len) //forward
 302                 result.endPos = word.aBreakIterator->following(nPos);
 303             else
 304                 result.startPos = word.aBreakIterator->preceding(nPos);
 305         } else {
 306             if(nPos <= 0) {
 307                 result.startPos = 0;
 308                 result.endPos = len ? word.aBreakIterator->following((sal_Int32)0) : 0;
 309             } else if(nPos >= len) {
 310                 result.startPos = word.aBreakIterator->preceding(len);
 311                 result.endPos = len;
 312             } else {
 313                 result.startPos = word.aBreakIterator->preceding(nPos);
 314                 result.endPos = word.aBreakIterator->following(nPos);
 315             }
 316         }
 317         if (result.startPos == BreakIterator::DONE)
 318             result.startPos = result.endPos;
 319         else if (result.endPos == BreakIterator::DONE)
 320             result.endPos = result.startPos;
 321
 322         return result;
 323 }
 324
 325
 326 sal_Int32 SAL_CALL BreakIterator_Unicode::beginOfSentence( const OUString& Text, sal_Int32 nStartPos,
 327         const lang::Locale &rLocale ) throw(uno::RuntimeException)
 328 {
 329         loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
 330
 331         sal_Int32 len = Text.getLength();
 332         if (len > 0 && nStartPos == len)
 333             Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
 334         if (!sentence.aBreakIterator->isBoundary(nStartPos))
 335             nStartPos = sentence.aBreakIterator->preceding(nStartPos);
 336
 337         // skip preceding space.
 338         sal_uInt32 ch = Text.iterateCodePoints(&nStartPos, 1);
 339         while (nStartPos < len && u_isWhitespace(ch)) ch = Text.iterateCodePoints(&nStartPos, 1);
 340         Text.iterateCodePoints(&nStartPos, -1);
 341
 342         return nStartPos;
 343 }
 344
 345 sal_Int32 SAL_CALL BreakIterator_Unicode::endOfSentence( const OUString& Text, sal_Int32 nStartPos,
 346         const lang::Locale &rLocale ) throw(uno::RuntimeException)
 347 {
 348         loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
 349
 350         sal_Int32 len = Text.getLength();
 351         if (len > 0 && nStartPos == len)
 352             Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
 353         nStartPos = sentence.aBreakIterator->following(nStartPos);
 354
 355         sal_Int32 nPos=nStartPos;
 356         while (nPos > 0 && u_isWhitespace(Text.iterateCodePoints(&nPos, -1))) nStartPos=nPos;
 357
 358         return nStartPos;
 359 }
 360
 361 LineBreakResults SAL_CALL BreakIterator_Unicode::getLineBreak(
 362         const OUString& Text, sal_Int32 nStartPos,
 363         const lang::Locale& rLocale, sal_Int32 nMinBreakPos,
 364         const LineBreakHyphenationOptions& hOptions,
 365         const LineBreakUserOptions& /*rOptions*/ ) throw(uno::RuntimeException)
 366 {
 367         LineBreakResults lbr;
 368
 369         if (nStartPos >= Text.getLength()) {
 370             lbr.breakIndex = Text.getLength();
 371             lbr.breakType = BreakType::WORDBOUNDARY;
 372             return lbr;
 373         }
 374
 375         loadICUBreakIterator(rLocale, LOAD_LINE_BREAKITERATOR, 0, lineRule, Text);
 376
 377         sal_Bool GlueSpace=sal_True;
 378         while (GlueSpace) {
 379             if (line.aBreakIterator->preceding(nStartPos + 1) == nStartPos) { //Line boundary break
 380                 lbr.breakIndex = nStartPos;
 381                 lbr.breakType = BreakType::WORDBOUNDARY;
 382             } else if (hOptions.rHyphenator.is()) { //Hyphenation break
 383                 Boundary wBoundary = getWordBoundary( Text, nStartPos, rLocale,
 384                                                 WordType::DICTIONARY_WORD, false);
 385                 uno::Reference< linguistic2::XHyphenatedWord > aHyphenatedWord;
 386                 aHyphenatedWord = hOptions.rHyphenator->hyphenate(Text.copy(wBoundary.startPos,
 387                     wBoundary.endPos - wBoundary.startPos), rLocale,
 388                     (sal_Int16) (hOptions.hyphenIndex - wBoundary.startPos), hOptions.aHyphenationOptions);
 389                 if (aHyphenatedWord.is()) {
 390                     lbr.rHyphenatedWord = aHyphenatedWord;
 391                     if(wBoundary.startPos + aHyphenatedWord->getHyphenationPos() + 1 < nMinBreakPos )
 392                         lbr.breakIndex = -1;
 393                     else
 394                         lbr.breakIndex = wBoundary.startPos; //aHyphenatedWord->getHyphenationPos();
 395                     lbr.breakType = BreakType::HYPHENATION;
 396                 } else {
 397                     lbr.breakIndex = line.aBreakIterator->preceding(nStartPos);
 398                     lbr.breakType = BreakType::WORDBOUNDARY;;
 399                 }
 400             } else { //word boundary break
 401                 lbr.breakIndex = line.aBreakIterator->preceding(nStartPos);
 402                 lbr.breakType = BreakType::WORDBOUNDARY;
 403             }
 404
 405 #define WJ 0x2060   // Word Joiner
 406             GlueSpace=sal_False;
 407             if (lbr.breakType == BreakType::WORDBOUNDARY) {
 408                 nStartPos = lbr.breakIndex;
 409                 if (Text[nStartPos--] == WJ)
 410                     GlueSpace=sal_True;
 411                 while (nStartPos >= 0 &&
 412                     (u_isWhitespace(Text.iterateCodePoints(&nStartPos, 0)) || Text[nStartPos] == WJ)) {
 413                     if (Text[nStartPos--] == WJ)
 414                         GlueSpace=sal_True;
 415                 }
 416                 if (GlueSpace && nStartPos < 0)  {
 417                     lbr.breakIndex = 0;
 418                     break;
 419                 }
 420             }
 421         }
 422
 423         return lbr;
 424 }
 425
 426
 427
 428 OUString SAL_CALL
 429 BreakIterator_Unicode::getImplementationName(void) throw( uno::RuntimeException )
 430 {
 431         return OUString::createFromAscii(cBreakIterator);
 432 }
 433
 434 sal_Bool SAL_CALL
 435 BreakIterator_Unicode::supportsService(const OUString& rServiceName) throw( uno::RuntimeException )
 436 {
 437         return !rServiceName.compareToAscii(cBreakIterator);
 438 }
 439
 440 uno::Sequence< OUString > SAL_CALL
 441 BreakIterator_Unicode::getSupportedServiceNames(void) throw( uno::RuntimeException )
 442 {
 443         uno::Sequence< OUString > aRet(1);
 444         aRet[0] = OUString::createFromAscii(cBreakIterator);
 445         return aRet;
 446 }
 447
 448 } } } }