i18npool/source/breakiterator/xdictionary.cxx

   1 /*************************************************************************
   2  *
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * Copyright 2008 by Sun Microsystems, Inc.
   6  *
   7  * OpenOffice.org - a multi-platform office productivity suite
   8  *
   9  * $RCSfile: xdictionary.cxx,v $
  10  * $Revision: 1.18.24.1 $
  11  *
  12  * This file is part of OpenOffice.org.
  13  *
  14  * OpenOffice.org is free software: you can redistribute it and/or modify
  15  * it under the terms of the GNU Lesser General Public License version 3
  16  * only, as published by the Free Software Foundation.
  17  *
  18  * OpenOffice.org is distributed in the hope that it will be useful,
  19  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  20  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21  * GNU Lesser General Public License version 3 for more details
  22  * (a copy is included in the LICENSE file that accompanied this code).
  23  *
  24  * You should have received a copy of the GNU Lesser General Public License
  25  * version 3 along with OpenOffice.org.  If not, see
  26  * <http://www.openoffice.org/license.html>
  27  * for a copy of the LGPLv3 License.
  28  *
  29  ************************************************************************/
  30
  31 // MARKER(update_precomp.py): autogen include statement, do not remove
  32 #include "precompiled_i18npool.hxx"
  33
  34 // xdictionary.cpp: implementation of the xdictionary class.
  35 //
  36 //////////////////////////////////////////////////////////////////////
  37
  38
  39 #include <rtl/ustrbuf.hxx>
  40
  41 #include <com/sun/star/i18n/WordType.hpp>
  42 #include <xdictionary.hxx>
  43 #include <unicode/uchar.h>
  44 #include <string.h>
  45 #include <breakiteratorImpl.hxx>
  46
  47 //////////////////////////////////////////////////////////////////////
  48 // Construction/Destruction
  49 //////////////////////////////////////////////////////////////////////
  50
  51 using namespace rtl;
  52
  53 namespace com { namespace sun { namespace star { namespace i18n {
  54
  55 extern "C" { static void SAL_CALL thisModule() {} }
  56
  57 xdictionary::xdictionary(const sal_Char *lang) :
  58     existMark( NULL ),
  59     index1( NULL ),
  60     index2( NULL ),
  61     lenArray( NULL ),
  62     dataArea( NULL ),
  63     hModule( NULL ),
  64     boundary(),
  65     japaneseWordBreak( sal_False )
  66 #if USE_CELL_BOUNDARY_CODE
  67     // For CTL breakiterator, where the word boundary should not be inside cell.
  68     ,
  69     useCellBoundary( sal_False ),
  70     cellBoundary( NULL )
  71 #endif
  72 {
  73     index1 = 0;
  74 #ifdef SAL_DLLPREFIX
  75     OUStringBuffer aBuf( strlen(lang) + 7 + 6 );    // mostly "lib*.so" (with * == dict_zh)
  76     aBuf.appendAscii( SAL_DLLPREFIX );
  77 #else
  78     OUStringBuffer aBuf( strlen(lang) + 7 + 4 );    // mostly "*.dll" (with * == dict_zh)
  79 #endif
  80     aBuf.appendAscii( "dict_" ).appendAscii( lang ).appendAscii( SAL_DLLEXTENSION );
  81         hModule = osl_loadModuleRelative( &thisModule, aBuf.makeStringAndClear().pData, SAL_LOADMODULE_DEFAULT );
  82         if( hModule ) {
  83             sal_IntPtr (*func)();
  84             func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getExistMark").pData );
  85             existMark = (sal_uInt8*) (*func)();
  86             func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getIndex1").pData );
  87             index1 = (sal_Int16*) (*func)();
  88             func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getIndex2").pData );
  89             index2 = (sal_Int32*) (*func)();
  90             func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getLenArray").pData );
  91             lenArray = (sal_Int32*) (*func)();
  92             func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getDataArea").pData );
  93             dataArea = (sal_Unicode*) (*func)();
  94         }
  95         else
  96         {
  97             existMark = NULL;
  98             index1 = NULL;
  99             index2 = NULL;
 100             lenArray = NULL;
 101             dataArea = NULL;
 102         }
 103
 104         for (sal_Int32 i = 0; i < CACHE_MAX; i++)
 105             cache[i].size = 0;
 106
 107 #if USE_CELL_BOUNDARY_CODE
 108         useCellBoundary = sal_False;
 109         cellBoundary = NULL;
 110 #endif
 111         japaneseWordBreak = sal_False;
 112 }
 113
 114 xdictionary::~xdictionary() {
 115         osl_unloadModule(hModule);
 116         for (sal_Int32 i = 0; i < CACHE_MAX; i++) {
 117             if (cache[i].size > 0) {
 118                 delete cache[i].contents;
 119                 delete cache[i].wordboundary;
 120             }
 121         }
 122 }
 123
 124 void xdictionary::setJapaneseWordBreak()
 125 {
 126         japaneseWordBreak = sal_True;
 127 }
 128
 129 sal_Bool xdictionary::exists(const sal_Unicode c) {
 130         sal_Bool exist = existMark ? sal::static_int_cast<sal_Bool>((existMark[c>>3] & (1<<(c&0x07))) != 0) : sal_False;
 131         if (!exist && japaneseWordBreak)
 132             return BreakIteratorImpl::getScriptClass(c) == ScriptType::ASIAN;
 133         else
 134             return exist;
 135 }
 136
 137 sal_Int32 xdictionary::getLongestMatch(const sal_Unicode* str, sal_Int32 sLen) {
 138
 139         if ( !index1 ) return 0;
 140
 141         sal_Int16 idx = index1[str[0] >> 8];
 142
 143         if (idx == 0xFF) return 0;
 144
 145         idx = (idx<<8) | (str[0]&0xff);
 146
 147         sal_uInt32 begin = index2[idx], end = index2[idx+1];
 148
 149         if (begin == 0) return 0;
 150
 151         str++; sLen--; // first character is not stored in the dictionary
 152         for (sal_uInt32 i = end; i > begin; i--) {
 153             sal_Int32 len = lenArray[i] - lenArray[i - 1];
 154             if (sLen >= len) {
 155                 const sal_Unicode *dstr = dataArea + lenArray[i-1];
 156                 sal_Int32 pos = 0;
 157
 158                 while (pos < len && dstr[pos] == str[pos]) { pos++; }
 159
 160                 if (pos == len)
 161                     return len + 1;
 162             }
 163         }
 164         return 0;
 165 }
 166
 167
 168 /*
 169  * c-tor
 170  */
 171
 172 WordBreakCache::WordBreakCache() :
 173     length( 0 ),
 174     contents( NULL ),
 175     wordboundary( NULL ),
 176     size( 0 )
 177 {
 178 }
 179
 180 /*
 181  * Compare two unicode string,
 182  */
 183
 184 sal_Bool WordBreakCache::equals(const sal_Unicode* str, Boundary& boundary) {
 185         // Different length, different string.
 186         if (length != boundary.endPos - boundary.startPos) return sal_False;
 187
 188         for (sal_Int32 i = 0; i < length; i++)
 189             if (contents[i] != str[i + boundary.startPos]) return sal_False;
 190
 191         return sal_True;
 192 }
 193
 194
 195 /*
 196  * Retrieve the segment containing the character at pos.
 197  * @param pos : Position of the given character.
 198  * @return true if CJK.
 199  */
 200 sal_Bool xdictionary::seekSegment(const sal_Unicode *text, sal_Int32 pos,
 201                 sal_Int32 len, Boundary& segBoundary) {
 202         for (segBoundary.startPos = pos - 1;
 203             segBoundary.startPos >= 0 &&
 204                 (u_isWhitespace((sal_uInt32)text[segBoundary.startPos]) || exists(text[segBoundary.startPos]));
 205             segBoundary.startPos--) ;
 206         segBoundary.startPos++;
 207
 208         for (segBoundary.endPos = pos;
 209             segBoundary.endPos < len &&
 210                     (u_isWhitespace((sal_uInt32)text[segBoundary.endPos]) || exists(text[segBoundary.endPos]));
 211             segBoundary.endPos++) ;
 212
 213         return segBoundary.endPos > segBoundary.startPos + 1;
 214 }
 215
 216 #define KANJA       1
 217 #define KATAKANA    2
 218 #define HIRAKANA    3
 219
 220 static sal_Int16 JapaneseCharType(sal_Unicode c)
 221 {
 222     if (0x3041 <= c && c <= 0x309e)
 223         return HIRAKANA;
 224     if ((0x30a1 <= c && c <= 0x30fe) || (0xff65 <= c && c <= 0xff9f))
 225         return KATAKANA;
 226     return KANJA;
 227 }
 228
 229 WordBreakCache& xdictionary::getCache(const sal_Unicode *text, Boundary& wordBoundary)
 230 {
 231
 232         WordBreakCache& aCache = cache[text[0] & 0x1f];
 233
 234         if (aCache.size != 0 && aCache.equals(text, wordBoundary))
 235             return aCache;
 236
 237         sal_Int32 len = wordBoundary.endPos - wordBoundary.startPos;
 238
 239         if (aCache.size == 0 || len > aCache.size) {
 240             if (aCache.size != 0) {
 241                 delete aCache.contents;
 242                 delete aCache.wordboundary;
 243                 aCache.size = len;
 244             }
 245             else
 246                 aCache.size = len > DEFAULT_SIZE ? len : DEFAULT_SIZE;
 247             aCache.contents = new sal_Unicode[aCache.size + 1];
 248             aCache.wordboundary = new sal_Int32[aCache.size + 2];
 249         }
 250         aCache.length  = len;
 251         memcpy(aCache.contents, text + wordBoundary.startPos, len * sizeof(sal_Unicode));
 252         *(aCache.contents + len) = 0x0000;
 253         // reset the wordboundary in cache
 254         memset(aCache.wordboundary, '\0', sizeof(sal_Int32)*(len + 2));
 255
 256         sal_Int32 i = 0;        // loop variable
 257         while (aCache.wordboundary[i] < aCache.length) {
 258             len = 0;
 259             // look the continuous white space as one word and cashe it
 260             while (u_isWhitespace((sal_uInt32)text[wordBoundary.startPos + aCache.wordboundary[i] + len]))
 261                 len ++;
 262
 263             if (len == 0) {
 264                 const sal_Unicode *str = text + wordBoundary.startPos + aCache.wordboundary[i];
 265                 sal_Int32 slen = aCache.length - aCache.wordboundary[i];
 266                 sal_Int16 type = 0, count = 0;
 267                 for (;len == 0 && slen > 0; str++, slen--) {
 268                     len = getLongestMatch(str, slen);
 269                     if (len == 0) {
 270                         if (!japaneseWordBreak) {
 271                             len = 1;
 272                         } else {
 273                             if (count == 0)
 274                                 type = JapaneseCharType(*str);
 275                             else if (type != JapaneseCharType(*str))
 276                                 break;
 277                             count++;
 278                         }
 279                     }
 280                 }
 281                 if (count) {
 282                     aCache.wordboundary[i+1] = aCache.wordboundary[i] + count;
 283                     i++;
 284
 285 #if USE_CELL_BOUNDARY_CODE
 286                     if (useCellBoundary) {
 287                         sal_Int32 cBoundary = cellBoundary[aCache.wordboundary[i] + wordBoundary.startPos - 1];
 288                         if (cBoundary > 0)
 289                             aCache.wordboundary[i] = cBoundary - wordBoundary.startPos;
 290                     }
 291 #endif
 292                 }
 293             }
 294
 295             if (len) {
 296                 aCache.wordboundary[i+1] = aCache.wordboundary[i] + len;
 297                 i++;
 298
 299 #if USE_CELL_BOUNDARY_CODE
 300                 if (useCellBoundary) {
 301                     sal_Int32 cBoundary = cellBoundary[aCache.wordboundary[i] + wordBoundary.startPos - 1];
 302                     if (cBoundary > 0)
 303                         aCache.wordboundary[i] = cBoundary - wordBoundary.startPos;
 304                 }
 305 #endif
 306             }
 307         }
 308         aCache.wordboundary[i + 1] = aCache.length + 1;
 309
 310         return aCache;
 311 }
 312
 313 Boundary xdictionary::previousWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
 314 {
 315         // looking for the first non-whitespace character from anyPos
 316         sal_uInt32 ch = rText.iterateCodePoints(&anyPos, -1);
 317
 318         while (anyPos > 0 && u_isWhitespace(ch)) ch = rText.iterateCodePoints(&anyPos, -1);
 319
 320         return getWordBoundary(rText, anyPos, wordType, true);
 321 }
 322
 323 Boundary xdictionary::nextWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
 324 {
 325         boundary = getWordBoundary(rText, anyPos, wordType, true);
 326         anyPos = boundary.endPos;
 327         if (anyPos < rText.getLength()) {
 328             // looknig for the first non-whitespace character from anyPos
 329             sal_uInt32 ch = rText.iterateCodePoints(&anyPos, 1);
 330             while (u_isWhitespace(ch)) ch=rText.iterateCodePoints(&anyPos, 1);
 331             rText.iterateCodePoints(&anyPos, -1);
 332         }
 333
 334         return getWordBoundary(rText, anyPos, wordType, true);
 335 }
 336
 337 Boundary xdictionary::getWordBoundary(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType, sal_Bool bDirection)
 338 {
 339         const sal_Unicode *text=rText.getStr();
 340         sal_Int32 len=rText.getLength();
 341         if (anyPos >= len || anyPos < 0) {
 342             boundary.startPos = boundary.endPos = anyPos < 0 ? 0 : len;
 343         } else if (seekSegment(text, anyPos, len, boundary)) {          // character in dict
 344             WordBreakCache& aCache = getCache(text, boundary);
 345             sal_Int32 i = 0;
 346
 347             while (aCache.wordboundary[i] <= (sal_Int32)anyPos - boundary.startPos) i++;
 348
 349             sal_Int32 startPos = aCache.wordboundary[i - 1];
 350             // if bDirection is false
 351             if (!bDirection && startPos > 0 && startPos == (anyPos - boundary.startPos) &&
 352                                                 u_isWhitespace((sal_uInt32) text[anyPos - 1]))
 353                 i--;
 354             boundary.endPos = aCache.wordboundary[i] + boundary.startPos;
 355             boundary.startPos += aCache.wordboundary[i - 1];
 356         } else {
 357             boundary.startPos = anyPos;
 358             if (anyPos < len) rText.iterateCodePoints(&anyPos, 1);
 359             boundary.endPos = anyPos < len ? anyPos : len;
 360         }
 361         if (wordType == WordType::WORD_COUNT) {
 362             // skip punctuation for word count.
 363             while (boundary.endPos < len && u_ispunct((sal_uInt32)text[boundary.endPos]))
 364                 boundary.endPos++;
 365         }
 366
 367         return boundary;
 368 }
 369
 370 #if USE_CELL_BOUNDARY_CODE
 371 void xdictionary::setCellBoundary(sal_Int32* cellArray)
 372 {
 373         useCellBoundary = sal_True;
 374         cellBoundary = cellArray;
 375 }
 376 #endif
 377
 378 } } } }