i18npool/source/breakiterator/xdictionary.cxx

   1 /*************************************************************************
   2  *
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * Copyright 2000, 2010 Oracle and/or its affiliates.
   6  *
   7  * OpenOffice.org - a multi-platform office productivity suite
   8  *
   9  * This file is part of OpenOffice.org.
  10  *
  11  * OpenOffice.org is free software: you can redistribute it and/or modify
  12  * it under the terms of the GNU Lesser General Public License version 3
  13  * only, as published by the Free Software Foundation.
  14  *
  15  * OpenOffice.org is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU Lesser General Public License version 3 for more details
  19  * (a copy is included in the LICENSE file that accompanied this code).
  20  *
  21  * You should have received a copy of the GNU Lesser General Public License
  22  * version 3 along with OpenOffice.org.  If not, see
  23  * <http://www.openoffice.org/license.html>
  24  * for a copy of the LGPLv3 License.
  25  *
  26  ************************************************************************/
  27
  28 // MARKER(update_precomp.py): autogen include statement, do not remove
  29 #include "precompiled_i18npool.hxx"
  30
  31 // xdictionary.cpp: implementation of the xdictionary class.
  32 //
  33 //////////////////////////////////////////////////////////////////////
  34
  35
  36 #include <rtl/ustrbuf.hxx>
  37
  38 #include <com/sun/star/i18n/WordType.hpp>
  39 #include <xdictionary.hxx>
  40 #include <unicode/uchar.h>
  41 #include <string.h>
  42 #include <breakiteratorImpl.hxx>
  43
  44 //////////////////////////////////////////////////////////////////////
  45 // Construction/Destruction
  46 //////////////////////////////////////////////////////////////////////
  47
  48 using namespace rtl;
  49
  50 namespace com { namespace sun { namespace star { namespace i18n {
  51
  52 extern "C" { static void SAL_CALL thisModule() {} }
  53
  54 xdictionary::xdictionary(const sal_Char *lang) :
  55     existMark( NULL ),
  56     index1( NULL ),
  57     index2( NULL ),
  58     lenArray( NULL ),
  59     dataArea( NULL ),
  60     hModule( NULL ),
  61     boundary(),
  62     japaneseWordBreak( sal_False )
  63 #if USE_CELL_BOUNDARY_CODE
  64     // For CTL breakiterator, where the word boundary should not be inside cell.
  65     ,
  66     useCellBoundary( sal_False ),
  67     cellBoundary( NULL )
  68 #endif
  69 {
  70     index1 = 0;
  71 #ifdef SAL_DLLPREFIX
  72     OUStringBuffer aBuf( strlen(lang) + 7 + 6 );    // mostly "lib*.so" (with * == dict_zh)
  73     aBuf.appendAscii( SAL_DLLPREFIX );
  74 #else
  75     OUStringBuffer aBuf( strlen(lang) + 7 + 4 );    // mostly "*.dll" (with * == dict_zh)
  76 #endif
  77     aBuf.appendAscii( "dict_" ).appendAscii( lang ).appendAscii( SAL_DLLEXTENSION );
  78         hModule = osl_loadModuleRelative( &thisModule, aBuf.makeStringAndClear().pData, SAL_LOADMODULE_DEFAULT );
  79         if( hModule ) {
  80             sal_IntPtr (*func)();
  81             func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getExistMark").pData );
  82             existMark = (sal_uInt8*) (*func)();
  83             func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getIndex1").pData );
  84             index1 = (sal_Int16*) (*func)();
  85             func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getIndex2").pData );
  86             index2 = (sal_Int32*) (*func)();
  87             func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getLenArray").pData );
  88             lenArray = (sal_Int32*) (*func)();
  89             func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getDataArea").pData );
  90             dataArea = (sal_Unicode*) (*func)();
  91         }
  92         else
  93         {
  94             existMark = NULL;
  95             index1 = NULL;
  96             index2 = NULL;
  97             lenArray = NULL;
  98             dataArea = NULL;
  99         }
 100
 101         for (sal_Int32 i = 0; i < CACHE_MAX; i++)
 102             cache[i].size = 0;
 103
 104 #if USE_CELL_BOUNDARY_CODE
 105         useCellBoundary = sal_False;
 106         cellBoundary = NULL;
 107 #endif
 108         japaneseWordBreak = sal_False;
 109 }
 110
 111 xdictionary::~xdictionary() {
 112         osl_unloadModule(hModule);
 113         for (sal_Int32 i = 0; i < CACHE_MAX; i++) {
 114             if (cache[i].size > 0) {
 115                 delete cache[i].contents;
 116                 delete cache[i].wordboundary;
 117             }
 118         }
 119 }
 120
 121 void xdictionary::setJapaneseWordBreak()
 122 {
 123         japaneseWordBreak = sal_True;
 124 }
 125
 126 sal_Bool xdictionary::exists(const sal_uInt32 c) {
 127         // 0x1FFF is the hardcoded limit in gendict for existMarks
 128         sal_Bool exist = (existMark && ((c>>3) < 0x1FFF)) ? sal::static_int_cast<sal_Bool>((existMark[c>>3] & (1<<(c&0x07))) != 0) : sal_False;
 129         if (!exist && japaneseWordBreak)
 130             return BreakIteratorImpl::getScriptClass(c) == ScriptType::ASIAN;
 131         else
 132             return exist;
 133 }
 134
 135 sal_Int32 xdictionary::getLongestMatch(const sal_Unicode* str, sal_Int32 sLen) {
 136
 137         if ( !index1 ) return 0;
 138
 139         sal_Int16 idx = index1[str[0] >> 8];
 140
 141         if (idx == 0xFF) return 0;
 142
 143         idx = (idx<<8) | (str[0]&0xff);
 144
 145         sal_uInt32 begin = index2[idx], end = index2[idx+1];
 146
 147         if (begin == 0) return 0;
 148
 149         str++; sLen--; // first character is not stored in the dictionary
 150         for (sal_uInt32 i = end; i > begin; i--) {
 151             sal_Int32 len = lenArray[i] - lenArray[i - 1];
 152             if (sLen >= len) {
 153                 const sal_Unicode *dstr = dataArea + lenArray[i-1];
 154                 sal_Int32 pos = 0;
 155
 156                 while (pos < len && dstr[pos] == str[pos]) { pos++; }
 157
 158                 if (pos == len)
 159                     return len + 1;
 160             }
 161         }
 162         return 0;
 163 }
 164
 165
 166 /*
 167  * c-tor
 168  */
 169
 170 WordBreakCache::WordBreakCache() :
 171     length( 0 ),
 172     contents( NULL ),
 173     wordboundary( NULL ),
 174     size( 0 )
 175 {
 176 }
 177
 178 /*
 179  * Compare two unicode string,
 180  */
 181
 182 sal_Bool WordBreakCache::equals(const sal_Unicode* str, Boundary& boundary) {
 183         // Different length, different string.
 184         if (length != boundary.endPos - boundary.startPos) return sal_False;
 185
 186         for (sal_Int32 i = 0; i < length; i++)
 187             if (contents[i] != str[i + boundary.startPos]) return sal_False;
 188
 189         return sal_True;
 190 }
 191
 192
 193 /*
 194  * Retrieve the segment containing the character at pos.
 195  * @param pos : Position of the given character.
 196  * @return true if CJK.
 197  */
 198 sal_Bool xdictionary::seekSegment(const rtl::OUString &rText, sal_Int32 pos,
 199     Boundary& segBoundary)
 200 {
 201     sal_Int32 indexUtf16;
 202     segBoundary.endPos = segBoundary.startPos = pos;
 203
 204     indexUtf16 = pos;
 205     while (indexUtf16 > 0)
 206     {
 207         sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, -1);
 208         if (u_isWhitespace(ch) || exists(ch))
 209             segBoundary.startPos = indexUtf16;
 210         else
 211             break;
 212     }
 213
 214     indexUtf16 = pos;
 215     while (indexUtf16 < rText.getLength())
 216     {
 217         sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, 1);
 218         if (u_isWhitespace(ch) || exists(ch))
 219             segBoundary.endPos = indexUtf16;
 220         else
 221             break;
 222     }
 223
 224     indexUtf16 = segBoundary.startPos;
 225     rText.iterateCodePoints(&indexUtf16, 1);
 226     return segBoundary.endPos > indexUtf16;
 227 }
 228
 229 #define KANJA       1
 230 #define KATAKANA    2
 231 #define HIRAKANA    3
 232
 233 static sal_Int16 JapaneseCharType(sal_Unicode c)
 234 {
 235     if (0x3041 <= c && c <= 0x309e)
 236         return HIRAKANA;
 237     if ((0x30a1 <= c && c <= 0x30fe) || (0xff65 <= c && c <= 0xff9f))
 238         return KATAKANA;
 239     return KANJA;
 240 }
 241
 242 WordBreakCache& xdictionary::getCache(const sal_Unicode *text, Boundary& wordBoundary)
 243 {
 244
 245         WordBreakCache& aCache = cache[text[0] & 0x1f];
 246
 247         if (aCache.size != 0 && aCache.equals(text, wordBoundary))
 248             return aCache;
 249
 250         sal_Int32 len = wordBoundary.endPos - wordBoundary.startPos;
 251
 252         if (aCache.size == 0 || len > aCache.size) {
 253             if (aCache.size != 0) {
 254                 delete aCache.contents;
 255                 delete aCache.wordboundary;
 256                 aCache.size = len;
 257             }
 258             else
 259                 aCache.size = len > DEFAULT_SIZE ? len : DEFAULT_SIZE;
 260             aCache.contents = new sal_Unicode[aCache.size + 1];
 261             aCache.wordboundary = new sal_Int32[aCache.size + 2];
 262         }
 263         aCache.length  = len;
 264         memcpy(aCache.contents, text + wordBoundary.startPos, len * sizeof(sal_Unicode));
 265         *(aCache.contents + len) = 0x0000;
 266         // reset the wordboundary in cache
 267         memset(aCache.wordboundary, '\0', sizeof(sal_Int32)*(len + 2));
 268
 269         sal_Int32 i = 0;        // loop variable
 270         while (aCache.wordboundary[i] < aCache.length) {
 271             len = 0;
 272             // look the continuous white space as one word and cashe it
 273             while (u_isWhitespace((sal_uInt32)text[wordBoundary.startPos + aCache.wordboundary[i] + len]))
 274                 len ++;
 275
 276             if (len == 0) {
 277                 const sal_Unicode *str = text + wordBoundary.startPos + aCache.wordboundary[i];
 278                 sal_Int32 slen = aCache.length - aCache.wordboundary[i];
 279                 sal_Int16 type = 0, count = 0;
 280                 for (;len == 0 && slen > 0; str++, slen--) {
 281                     len = getLongestMatch(str, slen);
 282                     if (len == 0) {
 283                         if (!japaneseWordBreak) {
 284                             len = 1;
 285                         } else {
 286                             if (count == 0)
 287                                 type = JapaneseCharType(*str);
 288                             else if (type != JapaneseCharType(*str))
 289                                 break;
 290                             count++;
 291                         }
 292                     }
 293                 }
 294                 if (count) {
 295                     aCache.wordboundary[i+1] = aCache.wordboundary[i] + count;
 296                     i++;
 297
 298 #if USE_CELL_BOUNDARY_CODE
 299                     if (useCellBoundary) {
 300                         sal_Int32 cBoundary = cellBoundary[aCache.wordboundary[i] + wordBoundary.startPos - 1];
 301                         if (cBoundary > 0)
 302                             aCache.wordboundary[i] = cBoundary - wordBoundary.startPos;
 303                     }
 304 #endif
 305                 }
 306             }
 307
 308             if (len) {
 309                 aCache.wordboundary[i+1] = aCache.wordboundary[i] + len;
 310                 i++;
 311
 312 #if USE_CELL_BOUNDARY_CODE
 313                 if (useCellBoundary) {
 314                     sal_Int32 cBoundary = cellBoundary[aCache.wordboundary[i] + wordBoundary.startPos - 1];
 315                     if (cBoundary > 0)
 316                         aCache.wordboundary[i] = cBoundary - wordBoundary.startPos;
 317                 }
 318 #endif
 319             }
 320         }
 321         aCache.wordboundary[i + 1] = aCache.length + 1;
 322
 323         return aCache;
 324 }
 325
 326 Boundary xdictionary::previousWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
 327 {
 328         // looking for the first non-whitespace character from anyPos
 329         sal_uInt32 ch = rText.iterateCodePoints(&anyPos, -1);
 330
 331         while (anyPos > 0 && u_isWhitespace(ch)) ch = rText.iterateCodePoints(&anyPos, -1);
 332
 333         return getWordBoundary(rText, anyPos, wordType, true);
 334 }
 335
 336 Boundary xdictionary::nextWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
 337 {
 338         boundary = getWordBoundary(rText, anyPos, wordType, true);
 339         anyPos = boundary.endPos;
 340         if (anyPos < rText.getLength()) {
 341             // looknig for the first non-whitespace character from anyPos
 342             sal_uInt32 ch = rText.iterateCodePoints(&anyPos, 1);
 343             while (u_isWhitespace(ch)) ch=rText.iterateCodePoints(&anyPos, 1);
 344             rText.iterateCodePoints(&anyPos, -1);
 345         }
 346
 347         return getWordBoundary(rText, anyPos, wordType, true);
 348 }
 349
 350 Boundary xdictionary::getWordBoundary(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType, sal_Bool bDirection)
 351 {
 352         const sal_Unicode *text=rText.getStr();
 353         sal_Int32 len=rText.getLength();
 354         if (anyPos >= len || anyPos < 0) {
 355             boundary.startPos = boundary.endPos = anyPos < 0 ? 0 : len;
 356         } else if (seekSegment(rText, anyPos, boundary)) {          // character in dict
 357             WordBreakCache& aCache = getCache(text, boundary);
 358             sal_Int32 i = 0;
 359
 360             while (aCache.wordboundary[i] <= anyPos - boundary.startPos) i++;
 361
 362             sal_Int32 startPos = aCache.wordboundary[i - 1];
 363             // if bDirection is false
 364             if (!bDirection && startPos > 0 && startPos == (anyPos - boundary.startPos))
 365             {
 366                 sal_Int32 indexUtf16 = anyPos-1;
 367                 sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, 1);
 368                 if (u_isWhitespace(ch))
 369                     i--;
 370             }
 371             boundary.endPos = boundary.startPos;
 372             rText.iterateCodePoints(&boundary.endPos, aCache.wordboundary[i]);
 373             rText.iterateCodePoints(&boundary.startPos, aCache.wordboundary[i-1]);
 374         } else {
 375             boundary.startPos = anyPos;
 376             if (anyPos < len) rText.iterateCodePoints(&anyPos, 1);
 377             boundary.endPos = anyPos < len ? anyPos : len;
 378         }
 379         if (wordType == WordType::WORD_COUNT) {
 380             // skip punctuation for word count.
 381             while (boundary.endPos < len)
 382             {
 383                 sal_Int32 indexUtf16 = boundary.endPos;
 384                 if (u_ispunct(rText.iterateCodePoints(&indexUtf16, 1)))
 385                     boundary.endPos = indexUtf16;
 386                 else
 387                     break;
 388             }
 389         }
 390
 391         return boundary;
 392 }
 393
 394 #if USE_CELL_BOUNDARY_CODE
 395 void xdictionary::setCellBoundary(sal_Int32* cellArray)
 396 {
 397         useCellBoundary = sal_True;
 398         cellBoundary = cellArray;
 399 }
 400 #endif
 401
 402 } } } }