i18npool/source/breakiterator/xdictionary.cxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*
   3  * This file is part of the LibreOffice project.
   4  *
   5  * This Source Code Form is subject to the terms of the Mozilla Public
   6  * License, v. 2.0. If a copy of the MPL was not distributed with this
   7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
   8  *
   9  * This file incorporates work covered by the following license notice:
  10  *
  11  *   Licensed to the Apache Software Foundation (ASF) under one or more
  12  *   contributor license agreements. See the NOTICE file distributed
  13  *   with this work for additional information regarding copyright
  14  *   ownership. The ASF licenses this file to you under the Apache
  15  *   License, Version 2.0 (the "License"); you may not use this file
  16  *   except in compliance with the License. You may obtain a copy of
  17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  18  */
  19
  20
// xdictionary.cxx: implementation of the xdictionary class.
  22 //
  23 //////////////////////////////////////////////////////////////////////
  24
  25
  26 #include <rtl/ustrbuf.hxx>
  27
  28 #include <com/sun/star/i18n/WordType.hpp>
  29 #include <xdictionary.hxx>
  30 #include <unicode/uchar.h>
  31 #include <string.h>
  32 #include <breakiteratorImpl.hxx>
  33
  34 //////////////////////////////////////////////////////////////////////
  35 // Construction/Destruction
  36 //////////////////////////////////////////////////////////////////////
  37
  38
  39 namespace com { namespace sun { namespace star { namespace i18n {
  40
#ifndef DISABLE_DYNLOADING

// Empty function whose address is passed to osl_loadModuleRelative() by the
// xdictionary constructor when it loads the per-language dictionary library.
extern "C" { static void SAL_CALL thisModule() {} }

#else

// Dynamic loading is disabled: the dictionary table accessors are declared
// here and called directly by the xdictionary constructor for the languages
// "ja" and "zh".
extern "C" {

// Accessors for the Japanese dictionary tables.
sal_uInt8* getExistMark_ja();
sal_Int16* getIndex1_ja();
sal_Int32* getIndex2_ja();
sal_Int32* getLenArray_ja();
sal_Unicode* getDataArea_ja();

// Accessors for the Chinese dictionary tables.
sal_uInt8* getExistMark_zh();
sal_Int16* getIndex1_zh();
sal_Int32* getIndex2_zh();
sal_Int32* getLenArray_zh();
sal_Unicode* getDataArea_zh();

}

#endif
  64
  65 xdictionary::xdictionary(const sal_Char *lang) :
  66     existMark( NULL ),
  67     index1( NULL ),
  68     index2( NULL ),
  69     lenArray( NULL ),
  70     dataArea( NULL ),
  71 #ifndef DISABLE_DYNLOADING
  72     hModule( NULL ),
  73 #endif
  74     boundary(),
  75     japaneseWordBreak( sal_False )
  76 {
  77     index1 = 0;
  78 #ifndef DISABLE_DYNLOADING
  79 #ifdef SAL_DLLPREFIX
  80     OUStringBuffer aBuf( strlen(lang) + 7 + 6 );    // mostly "lib*.so" (with * == dict_zh)
  81     aBuf.appendAscii( SAL_DLLPREFIX );
  82 #else
  83     OUStringBuffer aBuf( strlen(lang) + 7 + 4 );    // mostly "*.dll" (with * == dict_zh)
  84 #endif
  85     aBuf.appendAscii( "dict_" ).appendAscii( lang ).appendAscii( SAL_DLLEXTENSION );
  86     hModule = osl_loadModuleRelative( &thisModule, aBuf.makeStringAndClear().pData, SAL_LOADMODULE_DEFAULT );
  87     if( hModule ) {
  88         sal_IntPtr (*func)();
  89         func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString("getExistMark").pData );
  90         existMark = (sal_uInt8*) (*func)();
  91         func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString("getIndex1").pData );
  92         index1 = (sal_Int16*) (*func)();
  93         func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString("getIndex2").pData );
  94         index2 = (sal_Int32*) (*func)();
  95         func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString("getLenArray").pData );
  96         lenArray = (sal_Int32*) (*func)();
  97         func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString("getDataArea").pData );
  98         dataArea = (sal_Unicode*) (*func)();
  99     }
 100     else
 101     {
 102         existMark = NULL;
 103         index1 = NULL;
 104         index2 = NULL;
 105         lenArray = NULL;
 106         dataArea = NULL;
 107     }
 108
 109 #else
 110     if( strcmp( lang, "ja" ) == 0 ) {
 111         existMark = getExistMark_ja();
 112         index1 = getIndex1_ja();
 113         index2 = getIndex2_ja();
 114         lenArray = getLenArray_ja();
 115         dataArea = getDataArea_ja();
 116     }
 117     else if( strcmp( lang, "zh" ) == 0 ) {
 118         existMark = getExistMark_zh();
 119         index1 = getIndex1_zh();
 120         index2 = getIndex2_zh();
 121         lenArray = getLenArray_zh();
 122         dataArea = getDataArea_zh();
 123     }
 124     else
 125     {
 126         existMark = NULL;
 127         index1 = NULL;
 128         index2 = NULL;
 129         lenArray = NULL;
 130         dataArea = NULL;
 131     }
 132 #endif
 133
 134     for (sal_Int32 i = 0; i < CACHE_MAX; i++)
 135         cache[i].size = 0;
 136
 137     japaneseWordBreak = sal_False;
 138 }
 139
 140 xdictionary::~xdictionary()
 141 {
 142 #ifndef DISABLE_DYNLOADING
 143         osl_unloadModule(hModule);
 144 #endif
 145         for (sal_Int32 i = 0; i < CACHE_MAX; i++) {
 146             if (cache[i].size > 0) {
 147                 delete [] cache[i].contents;
 148                 delete [] cache[i].wordboundary;
 149             }
 150         }
 151 }
 152
 153 void xdictionary::setJapaneseWordBreak()
 154 {
 155     japaneseWordBreak = sal_True;
 156 }
 157
 158 sal_Bool xdictionary::exists(const sal_uInt32 c)
 159 {
 160     // 0x1FFF is the hardcoded limit in gendict for existMarks
 161     sal_Bool exist = (existMark && ((c>>3) < 0x1FFF)) ? sal::static_int_cast<sal_Bool>((existMark[c>>3] & (1<<(c&0x07))) != 0) : sal_False;
 162     if (!exist && japaneseWordBreak)
 163         return BreakIteratorImpl::getScriptClass(c) == ScriptType::ASIAN;
 164     else
 165         return exist;
 166 }
 167
 168 sal_Int32 xdictionary::getLongestMatch(const sal_Unicode* str, sal_Int32 sLen)
 169 {
 170
 171     if ( !index1 ) return 0;
 172
 173     sal_Int16 idx = index1[str[0] >> 8];
 174
 175     if (idx == 0xFF) return 0;
 176
 177     idx = (idx<<8) | (str[0]&0xff);
 178
 179     sal_uInt32 begin = index2[idx], end = index2[idx+1];
 180
 181     if (begin == 0) return 0;
 182
 183     str++; sLen--; // first character is not stored in the dictionary
 184     for (sal_uInt32 i = end; i > begin; i--) {
 185         sal_Int32 len = lenArray[i] - lenArray[i - 1];
 186         if (sLen >= len) {
 187             const sal_Unicode *dstr = dataArea + lenArray[i-1];
 188             sal_Int32 pos = 0;
 189
 190             while (pos < len && dstr[pos] == str[pos]) { pos++; }
 191
 192             if (pos == len)
 193                 return len + 1;
 194         }
 195     }
 196     return 0;
 197 }
 198
 199
 200 /*
 201  * c-tor
 202  */
 203
 204 WordBreakCache::WordBreakCache() :
 205     length( 0 ),
 206     contents( NULL ),
 207     wordboundary( NULL ),
 208     size( 0 )
 209 {
 210 }
 211
 212 /*
 213  * Compare two unicode string,
 214  */
 215
 216 sal_Bool WordBreakCache::equals(const sal_Unicode* str, Boundary& boundary)
 217 {
 218     // Different length, different string.
 219     if (length != boundary.endPos - boundary.startPos) return sal_False;
 220
 221     for (sal_Int32 i = 0; i < length; i++)
 222         if (contents[i] != str[i + boundary.startPos]) return sal_False;
 223
 224     return sal_True;
 225 }
 226
 227
 228 /*
 229  * Retrieve the segment containing the character at pos.
 230  * @param pos : Position of the given character.
 231  * @return true if CJK.
 232  */
 233 sal_Bool xdictionary::seekSegment(const OUString &rText, sal_Int32 pos,
 234     Boundary& segBoundary)
 235 {
 236     sal_Int32 indexUtf16;
 237     segBoundary.endPos = segBoundary.startPos = pos;
 238
 239     indexUtf16 = pos;
 240     while (indexUtf16 > 0)
 241     {
 242         sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, -1);
 243         if (u_isWhitespace(ch) || exists(ch))
 244             segBoundary.startPos = indexUtf16;
 245         else
 246             break;
 247     }
 248
 249     indexUtf16 = pos;
 250     while (indexUtf16 < rText.getLength())
 251     {
 252         sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, 1);
 253         if (u_isWhitespace(ch) || exists(ch))
 254             segBoundary.endPos = indexUtf16;
 255         else
 256             break;
 257     }
 258
 259     indexUtf16 = segBoundary.startPos;
 260     rText.iterateCodePoints(&indexUtf16, 1);
 261     return segBoundary.endPos > indexUtf16;
 262 }
 263
 264 #define KANJA       1
 265 #define KATAKANA    2
 266 #define HIRAKANA    3
 267
 268 static sal_Int16 JapaneseCharType(sal_Unicode c)
 269 {
 270     if (0x3041 <= c && c <= 0x309e)
 271         return HIRAKANA;
 272     if ((0x30a1 <= c && c <= 0x30fe) || (0xff65 <= c && c <= 0xff9f))
 273         return KATAKANA;
 274     return KANJA;
 275 }
 276
 277 WordBreakCache& xdictionary::getCache(const sal_Unicode *text, Boundary& wordBoundary)
 278 {
 279     WordBreakCache& rCache = cache[text[0] & 0x1f];
 280
 281     if (rCache.size != 0 && rCache.equals(text, wordBoundary))
 282         return rCache;
 283
 284     sal_Int32 len = wordBoundary.endPos - wordBoundary.startPos;
 285
 286     if (rCache.size == 0 || len > rCache.size) {
 287         if (rCache.size != 0) {
 288             delete [] rCache.contents;
 289             delete [] rCache.wordboundary;
 290             rCache.size = len;
 291         }
 292         else
 293             rCache.size = len > DEFAULT_SIZE ? len : DEFAULT_SIZE;
 294         rCache.contents = new sal_Unicode[rCache.size + 1];
 295         rCache.wordboundary = new sal_Int32[rCache.size + 2];
 296     }
 297     rCache.length  = len;
 298     memcpy(rCache.contents, text + wordBoundary.startPos, len * sizeof(sal_Unicode));
 299     *(rCache.contents + len) = 0x0000;
 300     // reset the wordboundary in cache
 301     memset(rCache.wordboundary, '\0', sizeof(sal_Int32)*(len + 2));
 302
 303     sal_Int32 i = 0;        // loop variable
 304     while (rCache.wordboundary[i] < rCache.length) {
 305         len = 0;
 306         // look the continuous white space as one word and cashe it
 307         while (u_isWhitespace((sal_uInt32)text[wordBoundary.startPos + rCache.wordboundary[i] + len]))
 308             len ++;
 309
 310         if (len == 0) {
 311             const sal_Unicode *str = text + wordBoundary.startPos + rCache.wordboundary[i];
 312             sal_Int32 slen = rCache.length - rCache.wordboundary[i];
 313             sal_Int16 type = 0, count = 0;
 314             for (;len == 0 && slen > 0; str++, slen--) {
 315                 len = getLongestMatch(str, slen);
 316                 if (len == 0) {
 317                     if (!japaneseWordBreak) {
 318                         len = 1;
 319                     } else {
 320                         if (count == 0)
 321                             type = JapaneseCharType(*str);
 322                         else if (type != JapaneseCharType(*str))
 323                             break;
 324                         count++;
 325                     }
 326                 }
 327             }
 328             if (count)
 329             {
 330                 rCache.wordboundary[i+1] = rCache.wordboundary[i] + count;
 331                 i++;
 332             }
 333         }
 334
 335         if (len) {
 336             rCache.wordboundary[i+1] = rCache.wordboundary[i] + len;
 337             i++;
 338         }
 339     }
 340     rCache.wordboundary[i + 1] = rCache.length + 1;
 341
 342     return rCache;
 343 }
 344
 345 Boundary xdictionary::previousWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
 346 {
 347         // looking for the first non-whitespace character from anyPos
 348         sal_uInt32 ch = rText.iterateCodePoints(&anyPos, -1);
 349
 350         while (anyPos > 0 && u_isWhitespace(ch)) ch = rText.iterateCodePoints(&anyPos, -1);
 351
 352         return getWordBoundary(rText, anyPos, wordType, true);
 353 }
 354
 355 Boundary xdictionary::nextWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
 356 {
 357         boundary = getWordBoundary(rText, anyPos, wordType, true);
 358         anyPos = boundary.endPos;
 359         if (anyPos < rText.getLength()) {
 360             // looknig for the first non-whitespace character from anyPos
 361             sal_uInt32 ch = rText.iterateCodePoints(&anyPos, 1);
 362             while (u_isWhitespace(ch)) ch=rText.iterateCodePoints(&anyPos, 1);
 363             rText.iterateCodePoints(&anyPos, -1);
 364         }
 365
 366         return getWordBoundary(rText, anyPos, wordType, true);
 367 }
 368
 369 Boundary xdictionary::getWordBoundary(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType, sal_Bool bDirection)
 370 {
 371         const sal_Unicode *text=rText.getStr();
 372         sal_Int32 len=rText.getLength();
 373         if (anyPos >= len || anyPos < 0) {
 374             boundary.startPos = boundary.endPos = anyPos < 0 ? 0 : len;
 375         } else if (seekSegment(rText, anyPos, boundary)) {          // character in dict
 376             WordBreakCache& aCache = getCache(text, boundary);
 377             sal_Int32 i = 0;
 378
 379             while (aCache.wordboundary[i] <= anyPos - boundary.startPos) i++;
 380
 381             sal_Int32 startPos = aCache.wordboundary[i - 1];
 382             // if bDirection is false
 383             if (!bDirection && startPos > 0 && startPos == (anyPos - boundary.startPos))
 384             {
 385                 sal_Int32 indexUtf16 = anyPos-1;
 386                 sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, 1);
 387                 if (u_isWhitespace(ch))
 388                     i--;
 389             }
 390             boundary.endPos = boundary.startPos;
 391             rText.iterateCodePoints(&boundary.endPos, aCache.wordboundary[i]);
 392             rText.iterateCodePoints(&boundary.startPos, aCache.wordboundary[i-1]);
 393         } else {
 394             boundary.startPos = anyPos;
 395             if (anyPos < len) rText.iterateCodePoints(&anyPos, 1);
 396             boundary.endPos = anyPos < len ? anyPos : len;
 397         }
 398         if (wordType == WordType::WORD_COUNT) {
 399             // skip punctuation for word count.
 400             while (boundary.endPos < len)
 401             {
 402                 sal_Int32 indexUtf16 = boundary.endPos;
 403                 if (u_ispunct(rText.iterateCodePoints(&indexUtf16, 1)))
 404                     boundary.endPos = indexUtf16;
 405                 else
 406                     break;
 407             }
 408         }
 409
 410         return boundary;
 411 }
 412
 413 } } } }
 414
 415 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */