i18npool/source/breakiterator/xdictionary.cxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*
   3  * This file is part of the LibreOffice project.
   4  *
   5  * This Source Code Form is subject to the terms of the Mozilla Public
   6  * License, v. 2.0. If a copy of the MPL was not distributed with this
   7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
   8  *
   9  * This file incorporates work covered by the following license notice:
  10  *
  11  *   Licensed to the Apache Software Foundation (ASF) under one or more
  12  *   contributor license agreements. See the NOTICE file distributed
  13  *   with this work for additional information regarding copyright
  14  *   ownership. The ASF licenses this file to you under the Apache
  15  *   License, Version 2.0 (the "License"); you may not use this file
  16  *   except in compliance with the License. You may obtain a copy of
  17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  18  */
  19
  20 #include <config_folders.h>
  21
  22 #include <osl/file.h>
  23 #include <osl/module.h>
  24 #include <osl/mutex.hxx>
  25 #include <rtl/ustrbuf.hxx>
  26 #include <rtl/bootstrap.hxx>
  27 #include <com/sun/star/i18n/ScriptType.hpp>
  28 #include <com/sun/star/i18n/WordType.hpp>
  29 #include <xdictionary.hxx>
  30 #include <unicode/uchar.h>
  31 #include <string.h>
  32 #include <breakiteratorImpl.hxx>
  33
  34 using namespace com::sun::star::i18n;
  35
  36 namespace i18npool {
  37
  #if defined(DICT_JA_ZH_IN_DATAFILE)

  // The dictionary tables are read from a data file by the constructor,
  // so no external symbols are needed in this configuration.

  #elif !defined(DISABLE_DYNLOADING)

  // Empty function whose address is handed to osl_loadModuleRelative() when
  // the dictionary modules are loaded in initDictionaryData().
  extern "C" { static void thisModule() {} }

  #else

  // Accessors for the Japanese and Chinese dictionary tables; the constructor
  // calls them directly in this configuration.
  extern "C" {

  sal_uInt8* getExistMark_ja();
  sal_Int16* getIndex1_ja();
  sal_Int32* getIndex2_ja();
  sal_Int32* getLenArray_ja();
  sal_Unicode* getDataArea_ja();

  sal_uInt8* getExistMark_zh();
  sal_Int16* getIndex1_zh();
  sal_Int32* getIndex2_zh();
  sal_Int32* getLenArray_zh();
  sal_Unicode* getDataArea_zh();

  }

  #endif
  63
  64 xdictionary::xdictionary(const sal_Char *lang) :
  65     boundary(),
  66     japaneseWordBreak( false )
  67 {
  68
  69 #ifdef DICT_JA_ZH_IN_DATAFILE
  70
  71     if( strcmp( lang, "ja" ) == 0 || strcmp( lang, "zh" ) == 0 )
  72     {
  73         OUString sUrl( "$BRAND_BASE_DIR/" LIBO_SHARE_FOLDER "/dict_" );
  74         rtl::Bootstrap::expandMacros(sUrl);
  75
  76         if( strcmp( lang, "ja" ) == 0 )
  77             sUrl += "ja.data";
  78         else if( strcmp( lang, "zh" ) == 0 )
  79             sUrl += "zh.data";
  80
  81         oslFileHandle aFileHandle;
  82         sal_uInt64 nFileSize;
  83         char *pMapping;
  84         if( osl_openFile( sUrl.pData, &aFileHandle, osl_File_OpenFlag_Read ) == osl_File_E_None &&
  85             osl_getFileSize( aFileHandle, &nFileSize) == osl_File_E_None &&
  86             osl_mapFile( aFileHandle, (void **) &pMapping, nFileSize, 0, osl_File_MapFlag_RandomAccess ) == osl_File_E_None )
  87         {
  88             // We have the offsets to the parts of the file at its end, see gendict.cxx
  89             sal_Int64 *pEOF = (sal_Int64*)(pMapping + nFileSize);
  90
  91             data.existMark = (sal_uInt8*) (pMapping + pEOF[-1]);
  92             data.index2 = (sal_Int32*) (pMapping + pEOF[-2]);
  93             data.index1 = (sal_Int16*) (pMapping + pEOF[-3]);
  94             data.lenArray = (sal_Int32*) (pMapping + pEOF[-4]);
  95             data.dataArea = (sal_Unicode*) (pMapping + pEOF[-5]);
  96         }
  97     }
  98
  99 #elif !defined DISABLE_DYNLOADING
 100
 101     initDictionaryData( lang );
 102
 103 #else
 104
 105     if( strcmp( lang, "ja" ) == 0 ) {
 106         data.existMark = getExistMark_ja();
 107         data.index1 = getIndex1_ja();
 108         data.index2 = getIndex2_ja();
 109         data.lenArray = getLenArray_ja();
 110         data.dataArea = getDataArea_ja();
 111     }
 112     else if( strcmp( lang, "zh" ) == 0 ) {
 113         data.existMark = getExistMark_zh();
 114         data.index1 = getIndex1_zh();
 115         data.index2 = getIndex2_zh();
 116         data.lenArray = getLenArray_zh();
 117         data.dataArea = getDataArea_zh();
 118     }
 119
 120 #endif
 121
 122     for (WordBreakCache & i : cache)
 123         i.size = 0;
 124
 125     japaneseWordBreak = false;
 126 }
 127
 128 xdictionary::~xdictionary()
 129 {
 130     for (WordBreakCache & i : cache) {
 131         if (i.size > 0) {
 132             delete [] i.contents;
 133             delete [] i.wordboundary;
 134         }
 135     }
 136 }
 137
 138 namespace {
 139     struct datacache {
 140         oslModule       mhModule;
 141         OString         maLang;
 142         xdictionarydata maData;
 143     };
 144 }
 145
 146 #if !defined(DICT_JA_ZH_IN_DATAFILE) && !defined(DISABLE_DYNLOADING)
 147
 148 void xdictionary::initDictionaryData(const sal_Char *pLang)
 149 {
 150     // Global cache, never released for performance
 151     static std::vector< datacache > aLoadedCache;
 152
 153     osl::MutexGuard aGuard( osl::Mutex::getGlobalMutex() );
 154     for(datacache & i : aLoadedCache)
 155     {
 156         if( i.maLang != pLang )
 157         {
 158             data = i.maData;
 159             return;
 160         }
 161     }
 162
 163     // otherwise add to the cache, positive or negative.
 164     datacache aEntry;
 165     aEntry.maLang = OString( pLang, strlen( pLang ) );
 166
 167 #ifdef SAL_DLLPREFIX
 168     OUStringBuffer aBuf( sal::static_int_cast<int>(strlen(pLang) + 7 + 6) );    // mostly "lib*.so" (with * == dict_zh)
 169     aBuf.append( SAL_DLLPREFIX );
 170 #else
 171     OUStringBuffer aBuf( sal::static_int_cast<int>(strlen(pLang) + 7 + 4) );    // mostly "*.dll" (with * == dict_zh)
 172 #endif
 173     aBuf.append( "dict_" ).appendAscii( pLang ).append( SAL_DLLEXTENSION );
 174     aEntry.mhModule = osl_loadModuleRelative( &thisModule, aBuf.makeStringAndClear().pData, SAL_LOADMODULE_DEFAULT );
 175     if( aEntry.mhModule ) {
 176         oslGenericFunction func;
 177         func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getExistMark" );
 178         aEntry.maData.existMark = reinterpret_cast<sal_uInt8 const * (*)()>(func)();
 179         func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getIndex1" );
 180         aEntry.maData.index1 = reinterpret_cast<sal_Int16 const * (*)()>(func)();
 181         func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getIndex2" );
 182         aEntry.maData.index2 = reinterpret_cast<sal_Int32 const * (*)()>(func)();
 183         func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getLenArray" );
 184         aEntry.maData.lenArray = reinterpret_cast<sal_Int32 const * (*)()>(func)();
 185         func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getDataArea" );
 186         aEntry.maData.dataArea = reinterpret_cast<sal_Unicode const * (*)()>(func)();
 187     }
 188
 189     data = aEntry.maData;
 190     aLoadedCache.push_back( aEntry );
 191 }
 192
 193 #endif
 194
 195 void xdictionary::setJapaneseWordBreak()
 196 {
 197     japaneseWordBreak = true;
 198 }
 199
 200 bool xdictionary::exists(const sal_uInt32 c)
 201 {
 202     // 0x1FFF is the hardcoded limit in gendict for data.existMarks
 203     bool exist = data.existMark && (c>>3) < 0x1FFF && (data.existMark[c>>3] & (1<<(c&0x07))) != 0;
 204     if (!exist && japaneseWordBreak)
 205         return BreakIteratorImpl::getScriptClass(c) == css::i18n::ScriptType::ASIAN;
 206     else
 207         return exist;
 208 }
 209
 210 sal_Int32 xdictionary::getLongestMatch(const sal_Unicode* str, sal_Int32 sLen)
 211 {
 212     if ( !data.index1 ) return 0;
 213
 214     sal_Int16 idx = data.index1[str[0] >> 8];
 215
 216     if (idx == 0xFF) return 0;
 217
 218     idx = (idx<<8) | (str[0]&0xff);
 219
 220     sal_uInt32 begin = data.index2[idx], end = data.index2[idx+1];
 221
 222     if (begin == 0) return 0;
 223
 224     str++; sLen--; // first character is not stored in the dictionary
 225     for (sal_uInt32 i = end; i > begin; i--) {
 226         sal_Int32 len = data.lenArray[i] - data.lenArray[i - 1];
 227         if (sLen >= len) {
 228             const sal_Unicode *dstr = data.dataArea + data.lenArray[i-1];
 229             sal_Int32 pos = 0;
 230
 231             while (pos < len && dstr[pos] == str[pos]) { pos++; }
 232
 233             if (pos == len)
 234                 return len + 1;
 235         }
 236     }
 237     return 0;
 238 }
 239
 240
 241 /*
 242  * c-tor
 243  */
 244
 245 WordBreakCache::WordBreakCache() :
 246     length( 0 ),
 247     contents( nullptr ),
 248     wordboundary( nullptr ),
 249     size( 0 )
 250 {
 251 }
 252
 253 /*
 254  * Compare two unicode string,
 255  */
 256
 257 bool WordBreakCache::equals(const sal_Unicode* str, Boundary const & boundary)
 258 {
 259     // Different length, different string.
 260     if (length != boundary.endPos - boundary.startPos) return false;
 261
 262     for (sal_Int32 i = 0; i < length; i++)
 263         if (contents[i] != str[i + boundary.startPos]) return false;
 264
 265     return true;
 266 }
 267
 268
 269 /*
 270  * Retrieve the segment containing the character at pos.
 271  * @param pos : Position of the given character.
 272  * @return true if CJK.
 273  */
 274 bool xdictionary::seekSegment(const OUString &rText, sal_Int32 pos,
 275     Boundary& segBoundary)
 276 {
 277     sal_Int32 indexUtf16;
 278
 279     if (segmentCachedString.pData != rText.pData) {
 280         // Cache the passed text so we can avoid regenerating the segment if it's the same
 281         // (pData is refcounted and assigning the OUString references it, which ensures that
 282         // the object is the same if we get the same pointer back later)
 283         segmentCachedString = rText;
 284     } else {
 285         // If pos is within the cached boundary, use that boundary
 286         if (pos >= segmentCachedBoundary.startPos && pos <= segmentCachedBoundary.endPos) {
 287             segBoundary.startPos = segmentCachedBoundary.startPos;
 288             segBoundary.endPos = segmentCachedBoundary.endPos;
 289             indexUtf16 = segmentCachedBoundary.startPos;
 290             rText.iterateCodePoints(&indexUtf16);
 291             return segmentCachedBoundary.endPos > indexUtf16;
 292         }
 293     }
 294
 295     segBoundary.endPos = segBoundary.startPos = pos;
 296
 297     indexUtf16 = pos;
 298     while (indexUtf16 > 0)
 299     {
 300         sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, -1);
 301         if (u_isWhitespace(ch) || exists(ch))
 302             segBoundary.startPos = indexUtf16;
 303         else
 304             break;
 305     }
 306
 307     indexUtf16 = pos;
 308     while (indexUtf16 < rText.getLength())
 309     {
 310         sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16);
 311         if (u_isWhitespace(ch) || exists(ch))
 312             segBoundary.endPos = indexUtf16;
 313         else
 314             break;
 315     }
 316
 317     // Cache the calculated boundary
 318     segmentCachedBoundary.startPos = segBoundary.startPos;
 319     segmentCachedBoundary.endPos = segBoundary.endPos;
 320
 321     indexUtf16 = segBoundary.startPos;
 322     rText.iterateCodePoints(&indexUtf16);
 323     return segBoundary.endPos > indexUtf16;
 324 }
 325
 326 #define KANJA       1
 327 #define KATAKANA    2
 328 #define HIRAKANA    3
 329
 330 static sal_Int16 JapaneseCharType(sal_Unicode c)
 331 {
 332     if (0x3041 <= c && c <= 0x309e)
 333         return HIRAKANA;
 334     if ((0x30a1 <= c && c <= 0x30fe) || (0xff65 <= c && c <= 0xff9f))
 335         return KATAKANA;
 336     return KANJA;
 337 }
 338
 339 WordBreakCache& xdictionary::getCache(const sal_Unicode *text, Boundary const & wordBoundary)
 340 {
 341     WordBreakCache& rCache = cache[text[0] & 0x1f];
 342
 343     if (rCache.size != 0 && rCache.equals(text, wordBoundary))
 344         return rCache;
 345
 346     sal_Int32 len = wordBoundary.endPos - wordBoundary.startPos;
 347
 348     if (rCache.size == 0 || len > rCache.size) {
 349         if (rCache.size != 0) {
 350             delete [] rCache.contents;
 351             delete [] rCache.wordboundary;
 352             rCache.size = len;
 353         }
 354         else
 355             rCache.size = std::max<sal_Int32>(len, DEFAULT_SIZE);
 356         rCache.contents = new sal_Unicode[rCache.size + 1];
 357         rCache.wordboundary = new sal_Int32[rCache.size + 2];
 358     }
 359     rCache.length  = len;
 360     memcpy(rCache.contents, text + wordBoundary.startPos, len * sizeof(sal_Unicode));
 361     *(rCache.contents + len) = 0x0000;
 362     // reset the wordboundary in cache
 363     memset(rCache.wordboundary, '\0', sizeof(sal_Int32)*(len + 2));
 364
 365     sal_Int32 i = 0;        // loop variable
 366     while (rCache.wordboundary[i] < rCache.length) {
 367         len = 0;
 368         // look the continuous white space as one word and cache it
 369         while (u_isWhitespace(static_cast<sal_uInt32>(text[wordBoundary.startPos + rCache.wordboundary[i] + len])))
 370             len ++;
 371
 372         if (len == 0) {
 373             const sal_Unicode *str = text + wordBoundary.startPos + rCache.wordboundary[i];
 374             sal_Int32 slen = rCache.length - rCache.wordboundary[i];
 375             sal_Int16 type = 0, count = 0;
 376             for (;len == 0 && slen > 0; str++, slen--) {
 377                 len = getLongestMatch(str, slen);
 378                 if (len == 0) {
 379                     if (!japaneseWordBreak) {
 380                         len = 1;
 381                     } else {
 382                         if (count == 0)
 383                             type = JapaneseCharType(*str);
 384                         else if (type != JapaneseCharType(*str))
 385                             break;
 386                         count++;
 387                     }
 388                 }
 389             }
 390             if (count)
 391             {
 392                 rCache.wordboundary[i+1] = rCache.wordboundary[i] + count;
 393                 i++;
 394             }
 395         }
 396
 397         if (len) {
 398             rCache.wordboundary[i+1] = rCache.wordboundary[i] + len;
 399             i++;
 400         }
 401     }
 402     rCache.wordboundary[i + 1] = rCache.length + 1;
 403
 404     return rCache;
 405 }
 406
 407 Boundary xdictionary::previousWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
 408 {
 409         // looking for the first non-whitespace character from anyPos
 410         sal_uInt32 ch = 0;
 411         if (anyPos > 0)
 412             rText.iterateCodePoints(&anyPos, -1);
 413
 414         while (anyPos > 0 && u_isWhitespace(ch)) ch = rText.iterateCodePoints(&anyPos, -1);
 415
 416         return getWordBoundary(rText, anyPos, wordType, true);
 417 }
 418
 419 Boundary xdictionary::nextWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
 420 {
 421         boundary = getWordBoundary(rText, anyPos, wordType, true);
 422         anyPos = boundary.endPos;
 423         const sal_Int32 nLen = rText.getLength();
 424         if (anyPos < nLen) {
 425             // looking for the first non-whitespace character from anyPos
 426             sal_uInt32 ch = rText.iterateCodePoints(&anyPos);
 427             while (u_isWhitespace(ch) && (anyPos < nLen)) ch=rText.iterateCodePoints(&anyPos);
 428             if (anyPos > 0)
 429                 rText.iterateCodePoints(&anyPos, -1);
 430         }
 431
 432         return getWordBoundary(rText, anyPos, wordType, true);
 433 }
 434
 435 Boundary const & xdictionary::getWordBoundary(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType, bool bDirection)
 436 {
 437         const sal_Unicode *text=rText.getStr();
 438         sal_Int32 len=rText.getLength();
 439         if (anyPos >= len || anyPos < 0) {
 440             boundary.startPos = boundary.endPos = anyPos < 0 ? 0 : len;
 441         } else if (seekSegment(rText, anyPos, boundary)) {          // character in dict
 442             WordBreakCache& aCache = getCache(text, boundary);
 443             sal_Int32 i = 0;
 444
 445             while (aCache.wordboundary[i] <= anyPos - boundary.startPos) i++;
 446
 447             sal_Int32 startPos = aCache.wordboundary[i - 1];
 448             // if bDirection is false
 449             if (!bDirection && startPos > 0 && startPos == (anyPos - boundary.startPos))
 450             {
 451                 sal_Int32 indexUtf16 = anyPos-1;
 452                 sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16);
 453                 if (u_isWhitespace(ch))
 454                     i--;
 455             }
 456
 457             boundary.endPos = boundary.startPos;
 458             boundary.endPos += aCache.wordboundary[i];
 459             boundary.startPos += aCache.wordboundary[i-1];
 460
 461         } else {
 462             boundary.startPos = anyPos;
 463             if (anyPos < len) rText.iterateCodePoints(&anyPos);
 464             boundary.endPos = std::min(anyPos, len);
 465         }
 466         if (wordType == WordType::WORD_COUNT) {
 467             // skip punctuation for word count.
 468             while (boundary.endPos < len)
 469             {
 470                 sal_Int32 indexUtf16 = boundary.endPos;
 471                 if (u_ispunct(rText.iterateCodePoints(&indexUtf16)))
 472                     boundary.endPos = indexUtf16;
 473                 else
 474                     break;
 475             }
 476         }
 477
 478         return boundary;
 479 }
 480
 481 }
 482
 483 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */