i18npool/source/breakiterator/xdictionary.cxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*
   3  * This file is part of the LibreOffice project.
   4  *
   5  * This Source Code Form is subject to the terms of the Mozilla Public
   6  * License, v. 2.0. If a copy of the MPL was not distributed with this
   7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
   8  *
   9  * This file incorporates work covered by the following license notice:
  10  *
  11  *   Licensed to the Apache Software Foundation (ASF) under one or more
  12  *   contributor license agreements. See the NOTICE file distributed
  13  *   with this work for additional information regarding copyright
  14  *   ownership. The ASF licenses this file to you under the Apache
  15  *   License, Version 2.0 (the "License"); you may not use this file
  16  *   except in compliance with the License. You may obtain a copy of
  17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  18  */
  19
  20 #include <config_folders.h>
  21
  22 #include <osl/file.h>
  23 #include <osl/mutex.hxx>
  24 #include <rtl/ustrbuf.hxx>
  25 #include <rtl/bootstrap.hxx>
  26 #include <com/sun/star/i18n/WordType.hpp>
  27 #include <xdictionary.hxx>
  28 #include <unicode/uchar.h>
  29 #include <string.h>
  30 #include <breakiteratorImpl.hxx>
  31
  32 namespace com { namespace sun { namespace star { namespace i18n {
  33
  34 #ifdef DICT_JA_ZH_IN_DATAFILE
  35
  36 #elif !defined DISABLE_DYNLOADING
  37
  38 extern "C" { static void SAL_CALL thisModule() {} }
  39
  40 #else
  41
  42 extern "C" {
  43
  44 sal_uInt8* getExistMark_ja();
  45 sal_Int16* getIndex1_ja();
  46 sal_Int32* getIndex2_ja();
  47 sal_Int32* getLenArray_ja();
  48 sal_Unicode* getDataArea_ja();
  49
  50 sal_uInt8* getExistMark_zh();
  51 sal_Int16* getIndex1_zh();
  52 sal_Int32* getIndex2_zh();
  53 sal_Int32* getLenArray_zh();
  54 sal_Unicode* getDataArea_zh();
  55
  56 }
  57
  58 #endif
  59
  60 xdictionary::xdictionary(const sal_Char *lang) :
  61     boundary(),
  62     japaneseWordBreak( false )
  63 {
  64
  65 #ifdef DICT_JA_ZH_IN_DATAFILE
  66
  67     if( strcmp( lang, "ja" ) == 0 || strcmp( lang, "zh" ) == 0 )
  68     {
  69         OUString sUrl( "$BRAND_BASE_DIR/" LIBO_SHARE_FOLDER "/dict_" );
  70         rtl::Bootstrap::expandMacros(sUrl);
  71
  72         if( strcmp( lang, "ja" ) == 0 )
  73             sUrl += "ja.data";
  74         else if( strcmp( lang, "zh" ) == 0 )
  75             sUrl += "zh.data";
  76
  77         oslFileHandle aFileHandle;
  78         sal_uInt64 nFileSize;
  79         char *pMapping;
  80         if( osl_openFile( sUrl.pData, &aFileHandle, osl_File_OpenFlag_Read ) == osl_File_E_None &&
  81             osl_getFileSize( aFileHandle, &nFileSize) == osl_File_E_None &&
  82             osl_mapFile( aFileHandle, (void **) &pMapping, nFileSize, 0, osl_File_MapFlag_RandomAccess ) == osl_File_E_None )
  83         {
  84             // We have the offsets to the parts of the file at its end, see gendict.cxx
  85             sal_Int64 *pEOF = (sal_Int64*)(pMapping + nFileSize);
  86
  87             data.existMark = (sal_uInt8*) (pMapping + pEOF[-1]);
  88             data.index2 = (sal_Int32*) (pMapping + pEOF[-2]);
  89             data.index1 = (sal_Int16*) (pMapping + pEOF[-3]);
  90             data.lenArray = (sal_Int32*) (pMapping + pEOF[-4]);
  91             data.dataArea = (sal_Unicode*) (pMapping + pEOF[-5]);
  92         }
  93     }
  94
  95 #elif !defined DISABLE_DYNLOADING
  96
  97     initDictionaryData( lang );
  98
  99 #else
 100
 101     if( strcmp( lang, "ja" ) == 0 ) {
 102         data.existMark = getExistMark_ja();
 103         data.index1 = getIndex1_ja();
 104         data.index2 = getIndex2_ja();
 105         data.lenArray = getLenArray_ja();
 106         data.dataArea = getDataArea_ja();
 107     }
 108     else if( strcmp( lang, "zh" ) == 0 ) {
 109         data.existMark = getExistMark_zh();
 110         data.index1 = getIndex1_zh();
 111         data.index2 = getIndex2_zh();
 112         data.lenArray = getLenArray_zh();
 113         data.dataArea = getDataArea_zh();
 114     }
 115
 116 #endif
 117
 118     for (sal_Int32 i = 0; i < CACHE_MAX; i++)
 119         cache[i].size = 0;
 120
 121     japaneseWordBreak = false;
 122 }
 123
 124 xdictionary::~xdictionary()
 125 {
 126     for (sal_Int32 i = 0; i < CACHE_MAX; i++) {
 127         if (cache[i].size > 0) {
 128             delete [] cache[i].contents;
 129             delete [] cache[i].wordboundary;
 130         }
 131     }
 132 }
 133
 134 namespace {
 135     struct datacache {
 136         oslModule       mhModule;
 137         OString         maLang;
 138         xdictionarydata maData;
 139     };
 140 }
 141
 142 #if !defined(DICT_JA_ZH_IN_DATAFILE) && !defined(DISABLE_DYNLOADING)
 143
 144 void xdictionary::initDictionaryData(const sal_Char *pLang)
 145 {
 146     // Global cache, never released for performance
 147     static std::vector< datacache > aLoadedCache;
 148
 149     osl::MutexGuard aGuard( osl::Mutex::getGlobalMutex() );
 150     for( size_t i = 0; i < aLoadedCache.size(); ++i )
 151     {
 152         if( !strcmp( pLang, aLoadedCache[ i ].maLang.getStr() ) )
 153         {
 154             data = aLoadedCache[ i ].maData;
 155             return;
 156         }
 157     }
 158
 159     // otherwise add to the cache, positive or negative.
 160     datacache aEntry;
 161     aEntry.maLang = OString( pLang, strlen( pLang ) );
 162
 163 #ifdef SAL_DLLPREFIX
 164     OUStringBuffer aBuf( sal::static_int_cast<int>(strlen(pLang) + 7 + 6) );    // mostly "lib*.so" (with * == dict_zh)
 165     aBuf.appendAscii( SAL_DLLPREFIX );
 166 #else
 167     OUStringBuffer aBuf( sal::static_int_cast<int>(strlen(pLang) + 7 + 4) );    // mostly "*.dll" (with * == dict_zh)
 168 #endif
 169     aBuf.appendAscii( "dict_" ).appendAscii( pLang ).appendAscii( SAL_DLLEXTENSION );
 170     aEntry.mhModule = osl_loadModuleRelative( &thisModule, aBuf.makeStringAndClear().pData, SAL_LOADMODULE_DEFAULT );
 171     if( aEntry.mhModule ) {
 172         oslGenericFunction func;
 173         func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getExistMark" );
 174         aEntry.maData.existMark = reinterpret_cast<sal_uInt8 const * (*)()>(func)();
 175         func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getIndex1" );
 176         aEntry.maData.index1 = reinterpret_cast<sal_Int16 const * (*)()>(func)();
 177         func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getIndex2" );
 178         aEntry.maData.index2 = reinterpret_cast<sal_Int32 const * (*)()>(func)();
 179         func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getLenArray" );
 180         aEntry.maData.lenArray = reinterpret_cast<sal_Int32 const * (*)()>(func)();
 181         func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getDataArea" );
 182         aEntry.maData.dataArea = reinterpret_cast<sal_Unicode const * (*)()>(func)();
 183     }
 184
 185     data = aEntry.maData;
 186     aLoadedCache.push_back( aEntry );
 187 }
 188
 189 #endif
 190
 191 void xdictionary::setJapaneseWordBreak()
 192 {
 193     japaneseWordBreak = true;
 194 }
 195
 196 bool xdictionary::exists(const sal_uInt32 c)
 197 {
 198     // 0x1FFF is the hardcoded limit in gendict for data.existMarks
 199     bool exist = data.existMark && (c>>3) < 0x1FFF && (data.existMark[c>>3] & (1<<(c&0x07))) != 0;
 200     if (!exist && japaneseWordBreak)
 201         return BreakIteratorImpl::getScriptClass(c) == ScriptType::ASIAN;
 202     else
 203         return exist;
 204 }
 205
 206 sal_Int32 xdictionary::getLongestMatch(const sal_Unicode* str, sal_Int32 sLen)
 207 {
 208     if ( !data.index1 ) return 0;
 209
 210     sal_Int16 idx = data.index1[str[0] >> 8];
 211
 212     if (idx == 0xFF) return 0;
 213
 214     idx = (idx<<8) | (str[0]&0xff);
 215
 216     sal_uInt32 begin = data.index2[idx], end = data.index2[idx+1];
 217
 218     if (begin == 0) return 0;
 219
 220     str++; sLen--; // first character is not stored in the dictionary
 221     for (sal_uInt32 i = end; i > begin; i--) {
 222         sal_Int32 len = data.lenArray[i] - data.lenArray[i - 1];
 223         if (sLen >= len) {
 224             const sal_Unicode *dstr = data.dataArea + data.lenArray[i-1];
 225             sal_Int32 pos = 0;
 226
 227             while (pos < len && dstr[pos] == str[pos]) { pos++; }
 228
 229             if (pos == len)
 230                 return len + 1;
 231         }
 232     }
 233     return 0;
 234 }
 235
 236
 237 /*
 238  * c-tor
 239  */
 240
 241 WordBreakCache::WordBreakCache() :
 242     length( 0 ),
 243     contents( NULL ),
 244     wordboundary( NULL ),
 245     size( 0 )
 246 {
 247 }
 248
 249 /*
 250  * Compare two unicode string,
 251  */
 252
 253 bool WordBreakCache::equals(const sal_Unicode* str, Boundary& boundary)
 254 {
 255     // Different length, different string.
 256     if (length != boundary.endPos - boundary.startPos) return false;
 257
 258     for (sal_Int32 i = 0; i < length; i++)
 259         if (contents[i] != str[i + boundary.startPos]) return false;
 260
 261     return true;
 262 }
 263
 264
 265 /*
 266  * Retrieve the segment containing the character at pos.
 267  * @param pos : Position of the given character.
 268  * @return true if CJK.
 269  */
 270 bool xdictionary::seekSegment(const OUString &rText, sal_Int32 pos,
 271     Boundary& segBoundary)
 272 {
 273     sal_Int32 indexUtf16;
 274
 275     if (segmentCachedString.pData != rText.pData) {
 276         // Cache the passed text so we can avoid regenerating the segment if it's the same
 277         // (pData is refcounted and assigning the OUString references it, which ensures that
 278         // the object is the same if we get the same pointer back later)
 279         segmentCachedString = rText;
 280     } else {
 281         // If pos is within the cached boundary, use that boundary
 282         if (pos >= segmentCachedBoundary.startPos && pos <= segmentCachedBoundary.endPos) {
 283             segBoundary.startPos = segmentCachedBoundary.startPos;
 284             segBoundary.endPos = segmentCachedBoundary.endPos;
 285             indexUtf16 = segmentCachedBoundary.startPos;
 286             rText.iterateCodePoints(&indexUtf16, 1);
 287             return segmentCachedBoundary.endPos > indexUtf16;
 288         }
 289     }
 290
 291     segBoundary.endPos = segBoundary.startPos = pos;
 292
 293     indexUtf16 = pos;
 294     while (indexUtf16 > 0)
 295     {
 296         sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, -1);
 297         if (u_isWhitespace(ch) || exists(ch))
 298             segBoundary.startPos = indexUtf16;
 299         else
 300             break;
 301     }
 302
 303     indexUtf16 = pos;
 304     while (indexUtf16 < rText.getLength())
 305     {
 306         sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, 1);
 307         if (u_isWhitespace(ch) || exists(ch))
 308             segBoundary.endPos = indexUtf16;
 309         else
 310             break;
 311     }
 312
 313     // Cache the calculated boundary
 314     segmentCachedBoundary.startPos = segBoundary.startPos;
 315     segmentCachedBoundary.endPos = segBoundary.endPos;
 316
 317     indexUtf16 = segBoundary.startPos;
 318     rText.iterateCodePoints(&indexUtf16, 1);
 319     return segBoundary.endPos > indexUtf16;
 320 }
 321
 322 #define KANJA       1
 323 #define KATAKANA    2
 324 #define HIRAKANA    3
 325
 326 static sal_Int16 JapaneseCharType(sal_Unicode c)
 327 {
 328     if (0x3041 <= c && c <= 0x309e)
 329         return HIRAKANA;
 330     if ((0x30a1 <= c && c <= 0x30fe) || (0xff65 <= c && c <= 0xff9f))
 331         return KATAKANA;
 332     return KANJA;
 333 }
 334
 335 WordBreakCache& xdictionary::getCache(const sal_Unicode *text, Boundary& wordBoundary)
 336 {
 337     WordBreakCache& rCache = cache[text[0] & 0x1f];
 338
 339     if (rCache.size != 0 && rCache.equals(text, wordBoundary))
 340         return rCache;
 341
 342     sal_Int32 len = wordBoundary.endPos - wordBoundary.startPos;
 343
 344     if (rCache.size == 0 || len > rCache.size) {
 345         if (rCache.size != 0) {
 346             delete [] rCache.contents;
 347             delete [] rCache.wordboundary;
 348             rCache.size = len;
 349         }
 350         else
 351             rCache.size = len > DEFAULT_SIZE ? len : DEFAULT_SIZE;
 352         rCache.contents = new sal_Unicode[rCache.size + 1];
 353         rCache.wordboundary = new sal_Int32[rCache.size + 2];
 354     }
 355     rCache.length  = len;
 356     memcpy(rCache.contents, text + wordBoundary.startPos, len * sizeof(sal_Unicode));
 357     *(rCache.contents + len) = 0x0000;
 358     // reset the wordboundary in cache
 359     memset(rCache.wordboundary, '\0', sizeof(sal_Int32)*(len + 2));
 360
 361     sal_Int32 i = 0;        // loop variable
 362     while (rCache.wordboundary[i] < rCache.length) {
 363         len = 0;
 364         // look the continuous white space as one word and cashe it
 365         while (u_isWhitespace((sal_uInt32)text[wordBoundary.startPos + rCache.wordboundary[i] + len]))
 366             len ++;
 367
 368         if (len == 0) {
 369             const sal_Unicode *str = text + wordBoundary.startPos + rCache.wordboundary[i];
 370             sal_Int32 slen = rCache.length - rCache.wordboundary[i];
 371             sal_Int16 type = 0, count = 0;
 372             for (;len == 0 && slen > 0; str++, slen--) {
 373                 len = getLongestMatch(str, slen);
 374                 if (len == 0) {
 375                     if (!japaneseWordBreak) {
 376                         len = 1;
 377                     } else {
 378                         if (count == 0)
 379                             type = JapaneseCharType(*str);
 380                         else if (type != JapaneseCharType(*str))
 381                             break;
 382                         count++;
 383                     }
 384                 }
 385             }
 386             if (count)
 387             {
 388                 rCache.wordboundary[i+1] = rCache.wordboundary[i] + count;
 389                 i++;
 390             }
 391         }
 392
 393         if (len) {
 394             rCache.wordboundary[i+1] = rCache.wordboundary[i] + len;
 395             i++;
 396         }
 397     }
 398     rCache.wordboundary[i + 1] = rCache.length + 1;
 399
 400     return rCache;
 401 }
 402
 403 Boundary xdictionary::previousWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
 404 {
 405         // looking for the first non-whitespace character from anyPos
 406         sal_uInt32 ch = 0;
 407         if (anyPos > 0)
 408             rText.iterateCodePoints(&anyPos, -1);
 409
 410         while (anyPos > 0 && u_isWhitespace(ch)) ch = rText.iterateCodePoints(&anyPos, -1);
 411
 412         return getWordBoundary(rText, anyPos, wordType, true);
 413 }
 414
 415 Boundary xdictionary::nextWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
 416 {
 417         boundary = getWordBoundary(rText, anyPos, wordType, true);
 418         anyPos = boundary.endPos;
 419         const sal_Int32 nLen = rText.getLength();
 420         if (anyPos < nLen) {
 421             // looknig for the first non-whitespace character from anyPos
 422             sal_uInt32 ch = rText.iterateCodePoints(&anyPos, 1);
 423             while (u_isWhitespace(ch) && (anyPos < nLen)) ch=rText.iterateCodePoints(&anyPos, 1);
 424             if (anyPos > 0)
 425                 rText.iterateCodePoints(&anyPos, -1);
 426         }
 427
 428         return getWordBoundary(rText, anyPos, wordType, true);
 429 }
 430
 431 Boundary xdictionary::getWordBoundary(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType, bool bDirection)
 432 {
 433         const sal_Unicode *text=rText.getStr();
 434         sal_Int32 len=rText.getLength();
 435         if (anyPos >= len || anyPos < 0) {
 436             boundary.startPos = boundary.endPos = anyPos < 0 ? 0 : len;
 437         } else if (seekSegment(rText, anyPos, boundary)) {          // character in dict
 438             WordBreakCache& aCache = getCache(text, boundary);
 439             sal_Int32 i = 0;
 440
 441             while (aCache.wordboundary[i] <= anyPos - boundary.startPos) i++;
 442
 443             sal_Int32 startPos = aCache.wordboundary[i - 1];
 444             // if bDirection is false
 445             if (!bDirection && startPos > 0 && startPos == (anyPos - boundary.startPos))
 446             {
 447                 sal_Int32 indexUtf16 = anyPos-1;
 448                 sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, 1);
 449                 if (u_isWhitespace(ch))
 450                     i--;
 451             }
 452
 453             boundary.endPos = boundary.startPos;
 454             boundary.endPos += aCache.wordboundary[i];
 455             boundary.startPos += aCache.wordboundary[i-1];
 456
 457         } else {
 458             boundary.startPos = anyPos;
 459             if (anyPos < len) rText.iterateCodePoints(&anyPos, 1);
 460             boundary.endPos = anyPos < len ? anyPos : len;
 461         }
 462         if (wordType == WordType::WORD_COUNT) {
 463             // skip punctuation for word count.
 464             while (boundary.endPos < len)
 465             {
 466                 sal_Int32 indexUtf16 = boundary.endPos;
 467                 if (u_ispunct(rText.iterateCodePoints(&indexUtf16, 1)))
 468                     boundary.endPos = indexUtf16;
 469                 else
 470                     break;
 471             }
 472         }
 473
 474         return boundary;
 475 }
 476
 477 } } } }
 478
 479 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */