i18npool/source/breakiterator/xdictionary.cxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*
   3  * This file is part of the LibreOffice project.
   4  *
   5  * This Source Code Form is subject to the terms of the Mozilla Public
   6  * License, v. 2.0. If a copy of the MPL was not distributed with this
   7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
   8  *
   9  * This file incorporates work covered by the following license notice:
  10  *
  11  *   Licensed to the Apache Software Foundation (ASF) under one or more
  12  *   contributor license agreements. See the NOTICE file distributed
  13  *   with this work for additional information regarding copyright
  14  *   ownership. The ASF licenses this file to you under the Apache
  15  *   License, Version 2.0 (the "License"); you may not use this file
  16  *   except in compliance with the License. You may obtain a copy of
  17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  18  */
  19
  20
  21 // xdictionary.cpp: implementation of the xdictionary class.
  22 //
  23 //////////////////////////////////////////////////////////////////////
  24
  25
  26 #include <rtl/ustrbuf.hxx>
  27
  28 #include <com/sun/star/i18n/WordType.hpp>
  29 #include <xdictionary.hxx>
  30 #include <unicode/uchar.h>
  31 #include <string.h>
  32 #include <breakiteratorImpl.hxx>
  33
  34 //////////////////////////////////////////////////////////////////////
  35 // Construction/Destruction
  36 //////////////////////////////////////////////////////////////////////
  37
  38 using ::rtl::OUString;
  39 using ::rtl::OUStringBuffer;
  40
  41 namespace com { namespace sun { namespace star { namespace i18n {
  42
  43 #ifndef DISABLE_DYNLOADING
  44
  45 extern "C" { static void SAL_CALL thisModule() {} }
  46
  47 #else
  48
  49 extern "C" {
  50
  51 sal_uInt8* getExistMark_ja();
  52 sal_Int16* getIndex1_ja();
  53 sal_Int32* getIndex2_ja();
  54 sal_Int32* getLenArray_ja();
  55 sal_Unicode* getDataArea_ja();
  56
  57 sal_uInt8* getExistMark_zh();
  58 sal_Int16* getIndex1_zh();
  59 sal_Int32* getIndex2_zh();
  60 sal_Int32* getLenArray_zh();
  61 sal_Unicode* getDataArea_zh();
  62
  63 }
  64
  65 #endif
  66
  67 xdictionary::xdictionary(const sal_Char *lang) :
  68     existMark( NULL ),
  69     index1( NULL ),
  70     index2( NULL ),
  71     lenArray( NULL ),
  72     dataArea( NULL ),
  73 #ifndef DISABLE_DYNLOADING
  74     hModule( NULL ),
  75 #endif
  76     boundary(),
  77     japaneseWordBreak( sal_False )
  78 {
  79     index1 = 0;
  80 #ifndef DISABLE_DYNLOADING
  81 #ifdef SAL_DLLPREFIX
  82     OUStringBuffer aBuf( strlen(lang) + 7 + 6 );    // mostly "lib*.so" (with * == dict_zh)
  83     aBuf.appendAscii( SAL_DLLPREFIX );
  84 #else
  85     OUStringBuffer aBuf( strlen(lang) + 7 + 4 );    // mostly "*.dll" (with * == dict_zh)
  86 #endif
  87     aBuf.appendAscii( "dict_" ).appendAscii( lang ).appendAscii( SAL_DLLEXTENSION );
  88         hModule = osl_loadModuleRelative( &thisModule, aBuf.makeStringAndClear().pData, SAL_LOADMODULE_DEFAULT );
  89         if( hModule ) {
  90             sal_IntPtr (*func)();
  91             func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString("getExistMark").pData );
  92             existMark = (sal_uInt8*) (*func)();
  93             func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString("getIndex1").pData );
  94             index1 = (sal_Int16*) (*func)();
  95             func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString("getIndex2").pData );
  96             index2 = (sal_Int32*) (*func)();
  97             func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString("getLenArray").pData );
  98             lenArray = (sal_Int32*) (*func)();
  99             func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString("getDataArea").pData );
 100             dataArea = (sal_Unicode*) (*func)();
 101         }
 102         else
 103         {
 104             existMark = NULL;
 105             index1 = NULL;
 106             index2 = NULL;
 107             lenArray = NULL;
 108             dataArea = NULL;
 109         }
 110
 111 #else
 112         if( strcmp( lang, "ja" ) == 0 ) {
 113             existMark = getExistMark_ja();
 114             index1 = getIndex1_ja();
 115             index2 = getIndex2_ja();
 116             lenArray = getLenArray_ja();
 117             dataArea = getDataArea_ja();
 118         }
 119         else if( strcmp( lang, "zh" ) == 0 ) {
 120             existMark = getExistMark_zh();
 121             index1 = getIndex1_zh();
 122             index2 = getIndex2_zh();
 123             lenArray = getLenArray_zh();
 124             dataArea = getDataArea_zh();
 125         }
 126         else
 127         {
 128             existMark = NULL;
 129             index1 = NULL;
 130             index2 = NULL;
 131             lenArray = NULL;
 132             dataArea = NULL;
 133         }
 134 #endif
 135
 136         for (sal_Int32 i = 0; i < CACHE_MAX; i++)
 137             cache[i].size = 0;
 138
 139         japaneseWordBreak = sal_False;
 140 }
 141
 142 xdictionary::~xdictionary() {
 143 #ifndef DISABLE_DYNLOADING
 144         osl_unloadModule(hModule);
 145 #endif
 146         for (sal_Int32 i = 0; i < CACHE_MAX; i++) {
 147             if (cache[i].size > 0) {
 148                 delete [] cache[i].contents;
 149                 delete [] cache[i].wordboundary;
 150             }
 151         }
 152 }
 153
 154 void xdictionary::setJapaneseWordBreak()
 155 {
 156         japaneseWordBreak = sal_True;
 157 }
 158
 159 sal_Bool xdictionary::exists(const sal_uInt32 c) {
 160         // 0x1FFF is the hardcoded limit in gendict for existMarks
 161         sal_Bool exist = (existMark && ((c>>3) < 0x1FFF)) ? sal::static_int_cast<sal_Bool>((existMark[c>>3] & (1<<(c&0x07))) != 0) : sal_False;
 162         if (!exist && japaneseWordBreak)
 163             return BreakIteratorImpl::getScriptClass(c) == ScriptType::ASIAN;
 164         else
 165             return exist;
 166 }
 167
 168 sal_Int32 xdictionary::getLongestMatch(const sal_Unicode* str, sal_Int32 sLen) {
 169
 170         if ( !index1 ) return 0;
 171
 172         sal_Int16 idx = index1[str[0] >> 8];
 173
 174         if (idx == 0xFF) return 0;
 175
 176         idx = (idx<<8) | (str[0]&0xff);
 177
 178         sal_uInt32 begin = index2[idx], end = index2[idx+1];
 179
 180         if (begin == 0) return 0;
 181
 182         str++; sLen--; // first character is not stored in the dictionary
 183         for (sal_uInt32 i = end; i > begin; i--) {
 184             sal_Int32 len = lenArray[i] - lenArray[i - 1];
 185             if (sLen >= len) {
 186                 const sal_Unicode *dstr = dataArea + lenArray[i-1];
 187                 sal_Int32 pos = 0;
 188
 189                 while (pos < len && dstr[pos] == str[pos]) { pos++; }
 190
 191                 if (pos == len)
 192                     return len + 1;
 193             }
 194         }
 195         return 0;
 196 }
 197
 198
 199 /*
 200  * c-tor
 201  */
 202
 203 WordBreakCache::WordBreakCache() :
 204     length( 0 ),
 205     contents( NULL ),
 206     wordboundary( NULL ),
 207     size( 0 )
 208 {
 209 }
 210
 211 /*
 212  * Compare two unicode string,
 213  */
 214
 215 sal_Bool WordBreakCache::equals(const sal_Unicode* str, Boundary& boundary) {
 216         // Different length, different string.
 217         if (length != boundary.endPos - boundary.startPos) return sal_False;
 218
 219         for (sal_Int32 i = 0; i < length; i++)
 220             if (contents[i] != str[i + boundary.startPos]) return sal_False;
 221
 222         return sal_True;
 223 }
 224
 225
 226 /*
 227  * Retrieve the segment containing the character at pos.
 228  * @param pos : Position of the given character.
 229  * @return true if CJK.
 230  */
 231 sal_Bool xdictionary::seekSegment(const rtl::OUString &rText, sal_Int32 pos,
 232     Boundary& segBoundary)
 233 {
 234     sal_Int32 indexUtf16;
 235     segBoundary.endPos = segBoundary.startPos = pos;
 236
 237     indexUtf16 = pos;
 238     while (indexUtf16 > 0)
 239     {
 240         sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, -1);
 241         if (u_isWhitespace(ch) || exists(ch))
 242             segBoundary.startPos = indexUtf16;
 243         else
 244             break;
 245     }
 246
 247     indexUtf16 = pos;
 248     while (indexUtf16 < rText.getLength())
 249     {
 250         sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, 1);
 251         if (u_isWhitespace(ch) || exists(ch))
 252             segBoundary.endPos = indexUtf16;
 253         else
 254             break;
 255     }
 256
 257     indexUtf16 = segBoundary.startPos;
 258     rText.iterateCodePoints(&indexUtf16, 1);
 259     return segBoundary.endPos > indexUtf16;
 260 }
 261
 262 #define KANJA       1
 263 #define KATAKANA    2
 264 #define HIRAKANA    3
 265
 266 static sal_Int16 JapaneseCharType(sal_Unicode c)
 267 {
 268     if (0x3041 <= c && c <= 0x309e)
 269         return HIRAKANA;
 270     if ((0x30a1 <= c && c <= 0x30fe) || (0xff65 <= c && c <= 0xff9f))
 271         return KATAKANA;
 272     return KANJA;
 273 }
 274
 275 WordBreakCache& xdictionary::getCache(const sal_Unicode *text, Boundary& wordBoundary)
 276 {
 277     WordBreakCache& rCache = cache[text[0] & 0x1f];
 278
 279     if (rCache.size != 0 && rCache.equals(text, wordBoundary))
 280         return rCache;
 281
 282     sal_Int32 len = wordBoundary.endPos - wordBoundary.startPos;
 283
 284     if (rCache.size == 0 || len > rCache.size) {
 285         if (rCache.size != 0) {
 286             delete rCache.contents;
 287             delete rCache.wordboundary;
 288             rCache.size = len;
 289         }
 290         else
 291             rCache.size = len > DEFAULT_SIZE ? len : DEFAULT_SIZE;
 292         rCache.contents = new sal_Unicode[rCache.size + 1];
 293         rCache.wordboundary = new sal_Int32[rCache.size + 2];
 294     }
 295     rCache.length  = len;
 296     memcpy(rCache.contents, text + wordBoundary.startPos, len * sizeof(sal_Unicode));
 297     *(rCache.contents + len) = 0x0000;
 298     // reset the wordboundary in cache
 299     memset(rCache.wordboundary, '\0', sizeof(sal_Int32)*(len + 2));
 300
 301     sal_Int32 i = 0;        // loop variable
 302     while (rCache.wordboundary[i] < rCache.length) {
 303         len = 0;
 304         // look the continuous white space as one word and cashe it
 305         while (u_isWhitespace((sal_uInt32)text[wordBoundary.startPos + rCache.wordboundary[i] + len]))
 306             len ++;
 307
 308         if (len == 0) {
 309             const sal_Unicode *str = text + wordBoundary.startPos + rCache.wordboundary[i];
 310             sal_Int32 slen = rCache.length - rCache.wordboundary[i];
 311             sal_Int16 type = 0, count = 0;
 312             for (;len == 0 && slen > 0; str++, slen--) {
 313                 len = getLongestMatch(str, slen);
 314                 if (len == 0) {
 315                     if (!japaneseWordBreak) {
 316                         len = 1;
 317                     } else {
 318                         if (count == 0)
 319                             type = JapaneseCharType(*str);
 320                         else if (type != JapaneseCharType(*str))
 321                             break;
 322                         count++;
 323                     }
 324                 }
 325             }
 326             if (count)
 327             {
 328                 rCache.wordboundary[i+1] = rCache.wordboundary[i] + count;
 329                 i++;
 330             }
 331         }
 332
 333         if (len) {
 334             rCache.wordboundary[i+1] = rCache.wordboundary[i] + len;
 335             i++;
 336         }
 337     }
 338     rCache.wordboundary[i + 1] = rCache.length + 1;
 339
 340     return rCache;
 341 }
 342
 343 Boundary xdictionary::previousWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
 344 {
 345         // looking for the first non-whitespace character from anyPos
 346         sal_uInt32 ch = rText.iterateCodePoints(&anyPos, -1);
 347
 348         while (anyPos > 0 && u_isWhitespace(ch)) ch = rText.iterateCodePoints(&anyPos, -1);
 349
 350         return getWordBoundary(rText, anyPos, wordType, true);
 351 }
 352
 353 Boundary xdictionary::nextWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
 354 {
 355         boundary = getWordBoundary(rText, anyPos, wordType, true);
 356         anyPos = boundary.endPos;
 357         if (anyPos < rText.getLength()) {
 358             // looknig for the first non-whitespace character from anyPos
 359             sal_uInt32 ch = rText.iterateCodePoints(&anyPos, 1);
 360             while (u_isWhitespace(ch)) ch=rText.iterateCodePoints(&anyPos, 1);
 361             rText.iterateCodePoints(&anyPos, -1);
 362         }
 363
 364         return getWordBoundary(rText, anyPos, wordType, true);
 365 }
 366
 367 Boundary xdictionary::getWordBoundary(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType, sal_Bool bDirection)
 368 {
 369         const sal_Unicode *text=rText.getStr();
 370         sal_Int32 len=rText.getLength();
 371         if (anyPos >= len || anyPos < 0) {
 372             boundary.startPos = boundary.endPos = anyPos < 0 ? 0 : len;
 373         } else if (seekSegment(rText, anyPos, boundary)) {          // character in dict
 374             WordBreakCache& aCache = getCache(text, boundary);
 375             sal_Int32 i = 0;
 376
 377             while (aCache.wordboundary[i] <= anyPos - boundary.startPos) i++;
 378
 379             sal_Int32 startPos = aCache.wordboundary[i - 1];
 380             // if bDirection is false
 381             if (!bDirection && startPos > 0 && startPos == (anyPos - boundary.startPos))
 382             {
 383                 sal_Int32 indexUtf16 = anyPos-1;
 384                 sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, 1);
 385                 if (u_isWhitespace(ch))
 386                     i--;
 387             }
 388             boundary.endPos = boundary.startPos;
 389             rText.iterateCodePoints(&boundary.endPos, aCache.wordboundary[i]);
 390             rText.iterateCodePoints(&boundary.startPos, aCache.wordboundary[i-1]);
 391         } else {
 392             boundary.startPos = anyPos;
 393             if (anyPos < len) rText.iterateCodePoints(&anyPos, 1);
 394             boundary.endPos = anyPos < len ? anyPos : len;
 395         }
 396         if (wordType == WordType::WORD_COUNT) {
 397             // skip punctuation for word count.
 398             while (boundary.endPos < len)
 399             {
 400                 sal_Int32 indexUtf16 = boundary.endPos;
 401                 if (u_ispunct(rText.iterateCodePoints(&indexUtf16, 1)))
 402                     boundary.endPos = indexUtf16;
 403                 else
 404                     break;
 405             }
 406         }
 407
 408         return boundary;
 409 }
 410
 411 } } } }
 412
 413 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */