i18npool/source/breakiterator/xdictionary.cxx

   1 /*************************************************************************
   2  *
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * Copyright 2008 by Sun Microsystems, Inc.
   6  *
   7  * OpenOffice.org - a multi-platform office productivity suite
   8  *
   9  * $RCSfile: xdictionary.cxx,v $
  10  * $Revision: 1.18.24.1 $
  11  *
  12  * This file is part of OpenOffice.org.
  13  *
  14  * OpenOffice.org is free software: you can redistribute it and/or modify
  15  * it under the terms of the GNU Lesser General Public License version 3
  16  * only, as published by the Free Software Foundation.
  17  *
  18  * OpenOffice.org is distributed in the hope that it will be useful,
  19  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  20  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21  * GNU Lesser General Public License version 3 for more details
  22  * (a copy is included in the LICENSE file that accompanied this code).
  23  *
  24  * You should have received a copy of the GNU Lesser General Public License
  25  * version 3 along with OpenOffice.org.  If not, see
  26  * <http://www.openoffice.org/license.html>
  27  * for a copy of the LGPLv3 License.
  28  *
  29  ************************************************************************/
  30
  31 // MARKER(update_precomp.py): autogen include statement, do not remove
  32 #include "precompiled_i18npool.hxx"
  33
  34 // xdictionary.cpp: implementation of the xdictionary class.
  35 //
  36 //////////////////////////////////////////////////////////////////////
  37
  38
  39 #include <rtl/ustrbuf.hxx>
  40
  41 #include <com/sun/star/i18n/WordType.hpp>
  42 #include <xdictionary.hxx>
  43 #include <unicode/uchar.h>
  44 #include <string.h>
  45 #include <breakiteratorImpl.hxx>
  46
  47 //////////////////////////////////////////////////////////////////////
  48 // Construction/Destruction
  49 //////////////////////////////////////////////////////////////////////
  50
  51 using namespace rtl;
  52
  53 namespace com { namespace sun { namespace star { namespace i18n {
  54
  55 extern "C" { static void SAL_CALL thisModule() {} }
  56
  57 xdictionary::xdictionary(const sal_Char *lang) :
  58     existMark( NULL ),
  59     index1( NULL ),
  60     index2( NULL ),
  61     lenArray( NULL ),
  62     dataArea( NULL ),
  63     hModule( NULL ),
  64     boundary(),
  65     japaneseWordBreak( sal_False )
  66 #if USE_CELL_BOUNDARY_CODE
  67     // For CTL breakiterator, where the word boundary should not be inside cell.
  68     ,
  69     useCellBoundary( sal_False ),
  70     cellBoundary( NULL )
  71 #endif
  72 {
  73     index1 = 0;
  74 #ifdef SAL_DLLPREFIX
  75     OUStringBuffer aBuf( strlen(lang) + 7 + 6 );    // mostly "lib*.so" (with * == dict_zh)
  76     aBuf.appendAscii( SAL_DLLPREFIX );
  77 #else
  78     OUStringBuffer aBuf( strlen(lang) + 7 + 4 );    // mostly "*.dll" (with * == dict_zh)
  79 #endif
  80     aBuf.appendAscii( "dict_" ).appendAscii( lang ).appendAscii( SAL_DLLEXTENSION );
  81         hModule = osl_loadModuleRelative( &thisModule, aBuf.makeStringAndClear().pData, SAL_LOADMODULE_DEFAULT );
  82         if( hModule ) {
  83             sal_IntPtr (*func)();
  84             func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getExistMark").pData );
  85             existMark = (sal_uInt8*) (*func)();
  86             func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getIndex1").pData );
  87             index1 = (sal_Int16*) (*func)();
  88             func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getIndex2").pData );
  89             index2 = (sal_Int32*) (*func)();
  90             func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getLenArray").pData );
  91             lenArray = (sal_Int32*) (*func)();
  92             func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getDataArea").pData );
  93             dataArea = (sal_Unicode*) (*func)();
  94         }
  95         else
  96         {
  97             existMark = NULL;
  98             index1 = NULL;
  99             index2 = NULL;
 100             lenArray = NULL;
 101             dataArea = NULL;
 102         }
 103
 104         for (sal_Int32 i = 0; i < CACHE_MAX; i++)
 105             cache[i].size = 0;
 106
 107 #if USE_CELL_BOUNDARY_CODE
 108         useCellBoundary = sal_False;
 109         cellBoundary = NULL;
 110 #endif
 111         japaneseWordBreak = sal_False;
 112 }
 113
 114 xdictionary::~xdictionary() {
 115         osl_unloadModule(hModule);
 116         for (sal_Int32 i = 0; i < CACHE_MAX; i++) {
 117             if (cache[i].size > 0) {
 118                 delete cache[i].contents;
 119                 delete cache[i].wordboundary;
 120             }
 121         }
 122 }
 123
 124 void xdictionary::setJapaneseWordBreak()
 125 {
 126         japaneseWordBreak = sal_True;
 127 }
 128
 129 sal_Bool xdictionary::exists(const sal_uInt32 c) {
 130         // 0x1FFF is the hardcoded limit in gendict for existMarks
 131         sal_Bool exist = (existMark && ((c>>3) < 0x1FFF)) ? sal::static_int_cast<sal_Bool>((existMark[c>>3] & (1<<(c&0x07))) != 0) : sal_False;
 132         if (!exist && japaneseWordBreak)
 133             return BreakIteratorImpl::getScriptClass(c) == ScriptType::ASIAN;
 134         else
 135             return exist;
 136 }
 137
 138 sal_Int32 xdictionary::getLongestMatch(const sal_Unicode* str, sal_Int32 sLen) {
 139
 140         if ( !index1 ) return 0;
 141
 142         sal_Int16 idx = index1[str[0] >> 8];
 143
 144         if (idx == 0xFF) return 0;
 145
 146         idx = (idx<<8) | (str[0]&0xff);
 147
 148         sal_uInt32 begin = index2[idx], end = index2[idx+1];
 149
 150         if (begin == 0) return 0;
 151
 152         str++; sLen--; // first character is not stored in the dictionary
 153         for (sal_uInt32 i = end; i > begin; i--) {
 154             sal_Int32 len = lenArray[i] - lenArray[i - 1];
 155             if (sLen >= len) {
 156                 const sal_Unicode *dstr = dataArea + lenArray[i-1];
 157                 sal_Int32 pos = 0;
 158
 159                 while (pos < len && dstr[pos] == str[pos]) { pos++; }
 160
 161                 if (pos == len)
 162                     return len + 1;
 163             }
 164         }
 165         return 0;
 166 }
 167
 168
 169 /*
 170  * c-tor
 171  */
 172
 173 WordBreakCache::WordBreakCache() :
 174     length( 0 ),
 175     contents( NULL ),
 176     wordboundary( NULL ),
 177     size( 0 )
 178 {
 179 }
 180
 181 /*
 182  * Compare two unicode string,
 183  */
 184
 185 sal_Bool WordBreakCache::equals(const sal_Unicode* str, Boundary& boundary) {
 186         // Different length, different string.
 187         if (length != boundary.endPos - boundary.startPos) return sal_False;
 188
 189         for (sal_Int32 i = 0; i < length; i++)
 190             if (contents[i] != str[i + boundary.startPos]) return sal_False;
 191
 192         return sal_True;
 193 }
 194
 195
 196 /*
 197  * Retrieve the segment containing the character at pos.
 198  * @param pos : Position of the given character.
 199  * @return true if CJK.
 200  */
 201 sal_Bool xdictionary::seekSegment(const rtl::OUString &rText, sal_Int32 pos,
 202     Boundary& segBoundary)
 203 {
 204     sal_Int32 indexUtf16;
 205     segBoundary.endPos = segBoundary.startPos = pos;
 206
 207     indexUtf16 = pos;
 208     while (indexUtf16 > 0)
 209     {
 210         sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, -1);
 211         if (u_isWhitespace(ch) || exists(ch))
 212             segBoundary.startPos = indexUtf16;
 213         else
 214             break;
 215     }
 216
 217     indexUtf16 = pos;
 218     while (indexUtf16 < rText.getLength())
 219     {
 220         sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, 1);
 221         if (u_isWhitespace(ch) || exists(ch))
 222             segBoundary.endPos = indexUtf16;
 223         else
 224             break;
 225     }
 226
 227     indexUtf16 = segBoundary.startPos;
 228     rText.iterateCodePoints(&indexUtf16, 1);
 229     return segBoundary.endPos > indexUtf16;
 230 }
 231
 232 #define KANJA       1
 233 #define KATAKANA    2
 234 #define HIRAKANA    3
 235
 236 static sal_Int16 JapaneseCharType(sal_Unicode c)
 237 {
 238     if (0x3041 <= c && c <= 0x309e)
 239         return HIRAKANA;
 240     if ((0x30a1 <= c && c <= 0x30fe) || (0xff65 <= c && c <= 0xff9f))
 241         return KATAKANA;
 242     return KANJA;
 243 }
 244
 245 WordBreakCache& xdictionary::getCache(const sal_Unicode *text, Boundary& wordBoundary)
 246 {
 247
 248         WordBreakCache& aCache = cache[text[0] & 0x1f];
 249
 250         if (aCache.size != 0 && aCache.equals(text, wordBoundary))
 251             return aCache;
 252
 253         sal_Int32 len = wordBoundary.endPos - wordBoundary.startPos;
 254
 255         if (aCache.size == 0 || len > aCache.size) {
 256             if (aCache.size != 0) {
 257                 delete aCache.contents;
 258                 delete aCache.wordboundary;
 259                 aCache.size = len;
 260             }
 261             else
 262                 aCache.size = len > DEFAULT_SIZE ? len : DEFAULT_SIZE;
 263             aCache.contents = new sal_Unicode[aCache.size + 1];
 264             aCache.wordboundary = new sal_Int32[aCache.size + 2];
 265         }
 266         aCache.length  = len;
 267         memcpy(aCache.contents, text + wordBoundary.startPos, len * sizeof(sal_Unicode));
 268         *(aCache.contents + len) = 0x0000;
 269         // reset the wordboundary in cache
 270         memset(aCache.wordboundary, '\0', sizeof(sal_Int32)*(len + 2));
 271
 272         sal_Int32 i = 0;        // loop variable
 273         while (aCache.wordboundary[i] < aCache.length) {
 274             len = 0;
 275             // look the continuous white space as one word and cashe it
 276             while (u_isWhitespace((sal_uInt32)text[wordBoundary.startPos + aCache.wordboundary[i] + len]))
 277                 len ++;
 278
 279             if (len == 0) {
 280                 const sal_Unicode *str = text + wordBoundary.startPos + aCache.wordboundary[i];
 281                 sal_Int32 slen = aCache.length - aCache.wordboundary[i];
 282                 sal_Int16 type = 0, count = 0;
 283                 for (;len == 0 && slen > 0; str++, slen--) {
 284                     len = getLongestMatch(str, slen);
 285                     if (len == 0) {
 286                         if (!japaneseWordBreak) {
 287                             len = 1;
 288                         } else {
 289                             if (count == 0)
 290                                 type = JapaneseCharType(*str);
 291                             else if (type != JapaneseCharType(*str))
 292                                 break;
 293                             count++;
 294                         }
 295                     }
 296                 }
 297                 if (count) {
 298                     aCache.wordboundary[i+1] = aCache.wordboundary[i] + count;
 299                     i++;
 300
 301 #if USE_CELL_BOUNDARY_CODE
 302                     if (useCellBoundary) {
 303                         sal_Int32 cBoundary = cellBoundary[aCache.wordboundary[i] + wordBoundary.startPos - 1];
 304                         if (cBoundary > 0)
 305                             aCache.wordboundary[i] = cBoundary - wordBoundary.startPos;
 306                     }
 307 #endif
 308                 }
 309             }
 310
 311             if (len) {
 312                 aCache.wordboundary[i+1] = aCache.wordboundary[i] + len;
 313                 i++;
 314
 315 #if USE_CELL_BOUNDARY_CODE
 316                 if (useCellBoundary) {
 317                     sal_Int32 cBoundary = cellBoundary[aCache.wordboundary[i] + wordBoundary.startPos - 1];
 318                     if (cBoundary > 0)
 319                         aCache.wordboundary[i] = cBoundary - wordBoundary.startPos;
 320                 }
 321 #endif
 322             }
 323         }
 324         aCache.wordboundary[i + 1] = aCache.length + 1;
 325
 326         return aCache;
 327 }
 328
 329 Boundary xdictionary::previousWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
 330 {
 331         // looking for the first non-whitespace character from anyPos
 332         sal_uInt32 ch = rText.iterateCodePoints(&anyPos, -1);
 333
 334         while (anyPos > 0 && u_isWhitespace(ch)) ch = rText.iterateCodePoints(&anyPos, -1);
 335
 336         return getWordBoundary(rText, anyPos, wordType, true);
 337 }
 338
 339 Boundary xdictionary::nextWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
 340 {
 341         boundary = getWordBoundary(rText, anyPos, wordType, true);
 342         anyPos = boundary.endPos;
 343         if (anyPos < rText.getLength()) {
 344             // looknig for the first non-whitespace character from anyPos
 345             sal_uInt32 ch = rText.iterateCodePoints(&anyPos, 1);
 346             while (u_isWhitespace(ch)) ch=rText.iterateCodePoints(&anyPos, 1);
 347             rText.iterateCodePoints(&anyPos, -1);
 348         }
 349
 350         return getWordBoundary(rText, anyPos, wordType, true);
 351 }
 352
 353 Boundary xdictionary::getWordBoundary(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType, sal_Bool bDirection)
 354 {
 355         const sal_Unicode *text=rText.getStr();
 356         sal_Int32 len=rText.getLength();
 357         if (anyPos >= len || anyPos < 0) {
 358             boundary.startPos = boundary.endPos = anyPos < 0 ? 0 : len;
 359         } else if (seekSegment(rText, anyPos, boundary)) {          // character in dict
 360             WordBreakCache& aCache = getCache(text, boundary);
 361             sal_Int32 i = 0;
 362
 363             while (aCache.wordboundary[i] <= anyPos - boundary.startPos) i++;
 364
 365             sal_Int32 startPos = aCache.wordboundary[i - 1];
 366             // if bDirection is false
 367             if (!bDirection && startPos > 0 && startPos == (anyPos - boundary.startPos))
 368             {
 369                 sal_Int32 indexUtf16 = anyPos-1;
 370                 sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, 1);
 371                 if (u_isWhitespace(ch))
 372                     i--;
 373             }
 374             boundary.endPos = boundary.startPos;
 375             rText.iterateCodePoints(&boundary.endPos, aCache.wordboundary[i]);
 376             rText.iterateCodePoints(&boundary.startPos, aCache.wordboundary[i-1]);
 377         } else {
 378             boundary.startPos = anyPos;
 379             if (anyPos < len) rText.iterateCodePoints(&anyPos, 1);
 380             boundary.endPos = anyPos < len ? anyPos : len;
 381         }
 382         if (wordType == WordType::WORD_COUNT) {
 383             // skip punctuation for word count.
 384             while (boundary.endPos < len)
 385             {
 386                 sal_Int32 indexUtf16 = boundary.endPos;
 387                 if (u_ispunct(rText.iterateCodePoints(&indexUtf16, 1)))
 388                     boundary.endPos = indexUtf16;
 389                 else
 390                     break;
 391             }
 392         }
 393
 394         return boundary;
 395 }
 396
 397 #if USE_CELL_BOUNDARY_CODE
 398 void xdictionary::setCellBoundary(sal_Int32* cellArray)
 399 {
 400         useCellBoundary = sal_True;
 401         cellBoundary = cellArray;
 402 }
 403 #endif
 404
 405 } } } }