i18npool/source/breakiterator/xdictionary.cxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*
   3  * This file is part of the LibreOffice project.
   4  *
   5  * This Source Code Form is subject to the terms of the Mozilla Public
   6  * License, v. 2.0. If a copy of the MPL was not distributed with this
   7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
   8  *
   9  * This file incorporates work covered by the following license notice:
  10  *
  11  *   Licensed to the Apache Software Foundation (ASF) under one or more
  12  *   contributor license agreements. See the NOTICE file distributed
  13  *   with this work for additional information regarding copyright
  14  *   ownership. The ASF licenses this file to you under the Apache
  15  *   License, Version 2.0 (the "License"); you may not use this file
  16  *   except in compliance with the License. You may obtain a copy of
  17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  18  */
  19
  20
  21 // xdictionary.cpp: implementation of the xdictionary class.
  22 //
  23 //////////////////////////////////////////////////////////////////////
  24
  25
  26 #include <rtl/ustrbuf.hxx>
  27
  28 #include <com/sun/star/i18n/WordType.hpp>
  29 #include <xdictionary.hxx>
  30 #include <unicode/uchar.h>
  31 #include <string.h>
  32 #include <breakiteratorImpl.hxx>
  33
  34 //////////////////////////////////////////////////////////////////////
  35 // Construction/Destruction
  36 //////////////////////////////////////////////////////////////////////
  37
  38
  39 namespace com { namespace sun { namespace star { namespace i18n {
  40
  41 #ifndef DISABLE_DYNLOADING
  42 // Empty function whose address the xdictionary constructor passes to osl_loadModuleRelative().
  43 extern "C" { static void SAL_CALL thisModule() {} }
  44
  45 #else
  46 // Without dynamic loading the xdictionary constructor calls these accessors directly.
  47 extern "C" {
  48 // Accessors used when lang is "ja".
  49 sal_uInt8* getExistMark_ja();
  50 sal_Int16* getIndex1_ja();
  51 sal_Int32* getIndex2_ja();
  52 sal_Int32* getLenArray_ja();
  53 sal_Unicode* getDataArea_ja();
  54 // Accessors used when lang is "zh".
  55 sal_uInt8* getExistMark_zh();
  56 sal_Int16* getIndex1_zh();
  57 sal_Int32* getIndex2_zh();
  58 sal_Int32* getLenArray_zh();
  59 sal_Unicode* getDataArea_zh();
  60
  61 }
  62
  63 #endif
  64
  65 /*
  66  * Binds the dictionary tables (existMark, index1, index2, lenArray, dataArea)
  67  * for one language.
  68  * @param lang : language code used to pick the dictionary, e.g. "ja" or "zh".
  69  *
  70  * With dynamic loading, the library SAL_DLLPREFIX (if defined) + "dict_" + lang +
  71  * SAL_DLLEXTENSION is loaded relative to this module and the tables are fetched
  72  * through its getExistMark, getIndex1, getIndex2, getLenArray and getDataArea
  73  * functions. Otherwise the get*_ja / get*_zh accessors are called directly.
  74  * When no dictionary can be bound, all table pointers stay NULL.
  75  */
  76 xdictionary::xdictionary(const sal_Char *lang) :
  77     existMark( NULL ),
  78     index1( NULL ),
  79     index2( NULL ),
  80     lenArray( NULL ),
  81     dataArea( NULL ),
  82 #ifndef DISABLE_DYNLOADING
  83     hModule( NULL ),
  84 #endif
  85     boundary(),
  86     japaneseWordBreak( sal_False )
  87 {
  88 #ifndef DISABLE_DYNLOADING
  89 #ifdef SAL_DLLPREFIX
  90     OUStringBuffer aBuf( strlen(lang) + 7 + 6 );    // mostly "lib*.so" (with * == dict_zh)
  91     aBuf.appendAscii( SAL_DLLPREFIX );
  92 #else
  93     OUStringBuffer aBuf( strlen(lang) + 7 + 4 );    // mostly "*.dll" (with * == dict_zh)
  94 #endif
  95     aBuf.appendAscii( "dict_" ).appendAscii( lang ).appendAscii( SAL_DLLEXTENSION );
  96     hModule = osl_loadModuleRelative( &thisModule, aBuf.makeStringAndClear().pData, SAL_LOADMODULE_DEFAULT );
  97     if( hModule ) {
  98         typedef sal_IntPtr (*DictFunc)();
  99         DictFunc fnExistMark = (DictFunc) osl_getFunctionSymbol( hModule, OUString("getExistMark").pData );
 100         DictFunc fnIndex1 = (DictFunc) osl_getFunctionSymbol( hModule, OUString("getIndex1").pData );
 101         DictFunc fnIndex2 = (DictFunc) osl_getFunctionSymbol( hModule, OUString("getIndex2").pData );
 102         DictFunc fnLenArray = (DictFunc) osl_getFunctionSymbol( hModule, OUString("getLenArray").pData );
 103         DictFunc fnDataArea = (DictFunc) osl_getFunctionSymbol( hModule, OUString("getDataArea").pData );
 104
 105         // Bind all five tables or none: calling through a missing (NULL) symbol
 106         // would crash, and getLongestMatch() relies on index2, lenArray and
 107         // dataArea being valid whenever index1 is.
 108         if( fnExistMark && fnIndex1 && fnIndex2 && fnLenArray && fnDataArea ) {
 109             existMark = (sal_uInt8*) (*fnExistMark)();
 110             index1 = (sal_Int16*) (*fnIndex1)();
 111             index2 = (sal_Int32*) (*fnIndex2)();
 112             lenArray = (sal_Int32*) (*fnLenArray)();
 113             dataArea = (sal_Unicode*) (*fnDataArea)();
 114         }
 115     }
 116     // else: the module could not be loaded and the tables stay NULL
 117 #else
 118     if( strcmp( lang, "ja" ) == 0 ) {
 119         existMark = getExistMark_ja();
 120         index1 = getIndex1_ja();
 121         index2 = getIndex2_ja();
 122         lenArray = getLenArray_ja();
 123         dataArea = getDataArea_ja();
 124     }
 125     else if( strcmp( lang, "zh" ) == 0 ) {
 126         existMark = getExistMark_zh();
 127         index1 = getIndex1_zh();
 128         index2 = getIndex2_zh();
 129         lenArray = getLenArray_zh();
 130         dataArea = getDataArea_zh();
 131     }
 132     // any other language has no built-in dictionary: the tables stay NULL
 133 #endif
 134
 135     // start with every cache slot marked as empty
 136     for (sal_Int32 i = 0; i < CACHE_MAX; i++)
 137         cache[i].size = 0;
 138 }
 139
 140 xdictionary::~xdictionary()
 141 {
 142 #ifndef DISABLE_DYNLOADING
 143     osl_unloadModule(hModule);          // release the dictionary library loaded by the constructor
 144 #endif
 145     for (sal_Int32 nSlot = 0; nSlot < CACHE_MAX; ++nSlot) {   // free the buffers of every slot in use
 146         if (cache[nSlot].size <= 0) continue;                 // nothing was allocated for this slot
 147         delete [] cache[nSlot].contents;
 148         delete [] cache[nSlot].wordboundary;
 149     }
 150 }
 151
 152 void xdictionary::setJapaneseWordBreak()
 153 {
 154         japaneseWordBreak = sal_True;   // checked by exists() and getCache() to apply the Japanese fallback rules
 155 }
 156
 157 // True if c is flagged in existMark or, in Japanese mode, belongs to the ASIAN script class.
 158 sal_Bool xdictionary::exists(const sal_uInt32 c) {
 159     const sal_uInt32 nByte = c >> 3;        // existMark stores one bit per code point
 160     // 0x1FFF is the hardcoded limit in gendict for existMarks
 161     if (existMark && nByte < 0x1FFF && (existMark[nByte] & (1 << (c & 0x07))) != 0)
 162         return sal_True;
 163     return japaneseWordBreak && BreakIteratorImpl::getScriptClass(c) == ScriptType::ASIAN;
 164 }
 165
 166 /*
 167  * Look up the dictionary word that starts at str[0].
 168  * @param str  : text to match against the dictionary.
 169  * @param sLen : number of UTF-16 units available at str.
 170  * @return length of the matching word including str[0], or 0 if there is
 171  *         no dictionary or no entry matches.
 172  */
 173 sal_Int32 xdictionary::getLongestMatch(const sal_Unicode* str, sal_Int32 sLen) {
 174     if ( !index1 ) return 0;                // no dictionary bound
 175     const sal_Int32 page = index1[str[0] >> 8];
 176     if (page == 0xFF) return 0;             // 0xFF: nothing stored for this high byte
 177
 178     // Build the index2 slot in 32 bits. A sal_Int16 cannot hold (page << 8) for
 179     // page >= 0x80; it would turn negative and index2 would be read out of bounds.
 180     const sal_Int32 idx = (page << 8) | (str[0] & 0xff);
 181     const sal_uInt32 begin = index2[idx], end = index2[idx+1];
 182     if (begin == 0) return 0;               // no entries for this first character
 183
 184     str++; sLen--; // first character is not stored in the dictionary
 185     for (sal_uInt32 i = end; i > begin; i--) {  // entries are tried from the end of the range
 186         const sal_Int32 len = lenArray[i] - lenArray[i - 1];
 187         if (sLen < len) continue;               // entry is longer than the remaining text
 188         const sal_Unicode *dstr = dataArea + lenArray[i-1];
 189         sal_Int32 pos = 0;
 190         while (pos < len && dstr[pos] == str[pos]) { pos++; }
 191         if (pos == len) return len + 1;         // +1 for the first character
 192     }
 193     return 0;
 194 }
 195
 196
 197 /*
 198  * Creates an empty cache slot: no text is cached and no buffers are
 199  * allocated yet (size == 0); xdictionary::getCache() fills the slot on demand.
 200  */
 201 WordBreakCache::WordBreakCache()
 202     : length( 0 )
 203     , contents( NULL )
 204     , wordboundary( NULL )
 205     , size( 0 )
 206 {
 207 }
 208
 209 /*
 210  * Compare the cached text with the segment str[boundary.startPos, boundary.endPos).
 211  * @param str      : text the boundary positions refer to.
 212  * @param boundary : segment of str to compare with the cached contents.
 213  * @return true if both have the same length and the same UTF-16 units.
 214  */
 215 sal_Bool WordBreakCache::equals(const sal_Unicode* str, Boundary& boundary) {
 216     const sal_Unicode* pSeg = str + boundary.startPos;
 217     sal_Bool bSame = (length == boundary.endPos - boundary.startPos);   // different length, different string
 218     for (sal_Int32 n = 0; bSame && n < length; ++n)
 219         bSame = (contents[n] == pSeg[n]);
 220     return bSame;
 221 }
 222
 223
 224 /*
 225  * Retrieve the segment containing the character at pos: the run of
 226  * whitespace and dictionary characters (see exists()) around pos.
 227  * @param rText       : text to scan.
 228  * @param pos         : Position of the given character.
 229  * @param segBoundary : receives the start and end of the segment.
 230  * @return true if the segment extends beyond its first code point.
 231  */
 232 sal_Bool xdictionary::seekSegment(const OUString &rText, sal_Int32 pos,
 233     Boundary& segBoundary)
 234 {
 235     segBoundary.startPos = segBoundary.endPos = pos;
 236
 237     // Grow the segment to the left, one code point at a time.
 238     for (sal_Int32 nLeft = pos; nLeft > 0; segBoundary.startPos = nLeft)
 239     {
 240         const sal_uInt32 cp = rText.iterateCodePoints(&nLeft, -1);
 241         if (!u_isWhitespace(cp) && !exists(cp))
 242             break;
 243     }
 244
 245     // Grow the segment to the right in the same way.
 246     const sal_Int32 nTextLen = rText.getLength();
 247     for (sal_Int32 nRight = pos; nRight < nTextLen; segBoundary.endPos = nRight)
 248     {
 249         const sal_uInt32 cp = rText.iterateCodePoints(&nRight, 1);
 250         if (!u_isWhitespace(cp) && !exists(cp))
 251             break;
 252     }
 253
 254     // Report whether the segment reaches past the code point it starts with.
 255     sal_Int32 nSecond = segBoundary.startPos;
 256     rText.iterateCodePoints(&nSecond, 1);
 257     return segBoundary.endPos > nSecond;
 258 }
 259
 260 #define KANJA       1
 261 #define KATAKANA    2
 262 #define HIRAKANA    3
 263
 264 // Classifies c by code range: 0x3041-0x309e is HIRAKANA, 0x30a1-0x30fe and
 265 // 0xff65-0xff9f are KATAKANA, every other value is KANJA.
 266 static sal_Int16 JapaneseCharType(sal_Unicode c)
 267 {
 268     const bool bHiragana = c >= 0x3041 && c <= 0x309e;
 269     const bool bKatakana = (c >= 0x30a1 && c <= 0x30fe) || (c >= 0xff65 && c <= 0xff9f);
 270     return bHiragana ? HIRAKANA : (bKatakana ? KATAKANA : KANJA);
 271 }
 272
 273 /*
 274  * Return the cache slot for the segment text[wordBoundary.startPos, wordBoundary.endPos),
 275  * computing its word boundaries unless the slot already caches exactly this segment.
 276  * wordboundary[] holds offsets relative to the segment start and ends with length + 1.
 277  */
 278 WordBreakCache& xdictionary::getCache(const sal_Unicode *text, Boundary& wordBoundary)
 279 {
 280     WordBreakCache& rCache = cache[text[0] & 0x1f];
 281     if (rCache.size != 0 && rCache.equals(text, wordBoundary))
 282         return rCache;
 283
 284     sal_Int32 len = wordBoundary.endPos - wordBoundary.startPos;
 285     if (rCache.size == 0 || len > rCache.size) {
 286         if (rCache.size != 0) {
 287             // the buffers were allocated with new[] and must be freed with delete[]
 288             delete [] rCache.contents;
 289             delete [] rCache.wordboundary;
 290             rCache.size = len;
 291         }
 292         else
 293             rCache.size = len > DEFAULT_SIZE ? len : DEFAULT_SIZE;
 294         rCache.contents = new sal_Unicode[rCache.size + 1];
 295         rCache.wordboundary = new sal_Int32[rCache.size + 2];
 296     }
 297     rCache.length  = len;
 298     memcpy(rCache.contents, text + wordBoundary.startPos, len * sizeof(sal_Unicode));
 299     *(rCache.contents + len) = 0x0000;
 300     // reset the wordboundary in cache
 301     memset(rCache.wordboundary, '\0', sizeof(sal_Int32)*(len + 2));
 302
 303     sal_Int32 i = 0;        // index of the last boundary written so far
 304     while (rCache.wordboundary[i] < rCache.length) {
 305         len = 0;
 306         // treat continuous white space as one word and cache it
 307         while (u_isWhitespace((sal_uInt32)text[wordBoundary.startPos + rCache.wordboundary[i] + len]))
 308             len ++;
 309         if (len == 0) {
 310             const sal_Unicode *str = text + wordBoundary.startPos + rCache.wordboundary[i];
 311             sal_Int32 slen = rCache.length - rCache.wordboundary[i];
 312             sal_Int16 type = 0, count = 0;
 313             for (;len == 0 && slen > 0; str++, slen--) {
 314                 len = getLongestMatch(str, slen);
 315                 if (len == 0) {
 316                     if (!japaneseWordBreak) {
 317                         len = 1;            // no dictionary word here: the character is a word of its own
 318                     } else {                // Japanese mode: collect a run of unmatched characters of one type
 319                         if (count == 0)
 320                             type = JapaneseCharType(*str);
 321                         else if (type != JapaneseCharType(*str))
 322                             break;
 323                         count++;
 324                     }
 325                 }
 326             }
 327             if (count) {                    // the run of unmatched characters becomes one word
 328                 rCache.wordboundary[i+1] = rCache.wordboundary[i] + count;
 329                 i++;
 330             }
 331         }
 332         if (len) {                          // whitespace run, dictionary word or single character
 333             rCache.wordboundary[i+1] = rCache.wordboundary[i] + len;
 334             i++;
 335         }
 336     }
 337     rCache.wordboundary[i + 1] = rCache.length + 1;     // terminating entry past the segment end
 338     return rCache;
 339 }
 340
 341 // Boundary of the word preceding anyPos; whitespace directly before anyPos is skipped.
 342 Boundary xdictionary::previousWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
 343 {
 344     // Nothing precedes index 0, and iterateCodePoints() must not step back from there.
 345     sal_uInt32 ch = anyPos > 0 ? rText.iterateCodePoints(&anyPos, -1) : 0;
 346     // looking for the first non-whitespace character from anyPos
 347     while (anyPos > 0 && u_isWhitespace(ch)) ch = rText.iterateCodePoints(&anyPos, -1);
 348     return getWordBoundary(rText, anyPos, wordType, true);
 349 }
 350
 351 // Boundary of the word after the one containing anyPos; whitespace in between is skipped.
 352 Boundary xdictionary::nextWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
 353 {
 354     const sal_Int32 nLen = rText.getLength();
 355     anyPos = getWordBoundary(rText, anyPos, wordType, true).endPos;    // also updates the member boundary
 356     if (anyPos < nLen) {
 357         // looking for the first non-whitespace character from anyPos, never reading past the end
 358         sal_uInt32 ch = rText.iterateCodePoints(&anyPos, 1);
 359         while (u_isWhitespace(ch) && anyPos < nLen) ch = rText.iterateCodePoints(&anyPos, 1);
 360         if (!u_isWhitespace(ch)) rText.iterateCodePoints(&anyPos, -1);  // step back onto that character
 361     }
 362     return getWordBoundary(rText, anyPos, wordType, true);
 363 }
 364
 365 /*
 366  * Find the word around anyPos; the result is also stored in the member boundary.
 367  * @param rText      : text to examine.
 368  * @param anyPos     : UTF-16 index; outside the text it yields an empty boundary at 0 or at the end.
 369  * @param wordType   : with WordType::WORD_COUNT, punctuation directly after the word is included.
 370  * @param bDirection : if false and anyPos is a word start that follows whitespace, the word ending at anyPos is used.
 371  * @return the boundary of the word.
 372  */
 373 Boundary xdictionary::getWordBoundary(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType, sal_Bool bDirection)
 374 {
 375     const sal_Int32 nTextLen = rText.getLength();
 376     if (anyPos < 0 || anyPos >= nTextLen) {
 377         boundary.startPos = boundary.endPos = anyPos < 0 ? 0 : nTextLen;
 378     } else if (!seekSegment(rText, anyPos, boundary)) {
 379         // seekSegment() found no longer segment: the word is the single code point at anyPos
 380         boundary.startPos = anyPos;
 381         rText.iterateCodePoints(&anyPos, 1);
 382         boundary.endPos = anyPos < nTextLen ? anyPos : nTextLen;
 383     } else {          // character in dict
 384         WordBreakCache& rWords = getCache(rText.getStr(), boundary);
 385         const sal_Int32 nOffset = anyPos - boundary.startPos;
 386         sal_Int32 nNext = 0;    // becomes the index of the first boundary beyond nOffset
 387         while (rWords.wordboundary[nNext] <= nOffset) nNext++;
 388
 389         const sal_Int32 nWordStart = rWords.wordboundary[nNext - 1];
 390         if (!bDirection && nWordStart > 0 && nWordStart == nOffset)
 391         {
 392             sal_Int32 nPrev = anyPos-1;
 393             if (u_isWhitespace(rText.iterateCodePoints(&nPrev, 1)))
 394                 nNext--;
 395         }
 396
 397         const sal_Int32 nSegStart = boundary.startPos;
 398         boundary.startPos = nSegStart + rWords.wordboundary[nNext - 1];
 399         boundary.endPos = nSegStart + rWords.wordboundary[nNext];
 400     }
 401
 402     if (wordType == WordType::WORD_COUNT) {
 403         // skip punctuation for word count.
 404         for (sal_Int32 nAfter = boundary.endPos; nAfter < nTextLen; boundary.endPos = nAfter)
 405             if (!u_ispunct(rText.iterateCodePoints(&nAfter, 1)))
 406                 break;
 407     }
 408     return boundary;
 409 }
 410
 411 } } } }
 412
 413 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */