update dev300-m58
[ooovba.git] / i18npool / source / breakiterator / xdictionary.cxx
blobcd35372f41e373ba2af29280843e525517e85d3f
1 /*************************************************************************
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * Copyright 2008 by Sun Microsystems, Inc.
7 * OpenOffice.org - a multi-platform office productivity suite
9 * $RCSfile: xdictionary.cxx,v $
10 * $Revision: 1.18.24.1 $
12 * This file is part of OpenOffice.org.
14 * OpenOffice.org is free software: you can redistribute it and/or modify
15 * it under the terms of the GNU Lesser General Public License version 3
16 * only, as published by the Free Software Foundation.
18 * OpenOffice.org is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU Lesser General Public License version 3 for more details
22 * (a copy is included in the LICENSE file that accompanied this code).
24 * You should have received a copy of the GNU Lesser General Public License
25 * version 3 along with OpenOffice.org. If not, see
26 * <http://www.openoffice.org/license.html>
27 * for a copy of the LGPLv3 License.
29 ************************************************************************/
31 // MARKER(update_precomp.py): autogen include statement, do not remove
32 #include "precompiled_i18npool.hxx"
34 // xdictionary.cpp: implementation of the xdictionary class.
36 //////////////////////////////////////////////////////////////////////
39 #include <rtl/ustrbuf.hxx>
41 #include <com/sun/star/i18n/WordType.hpp>
42 #include <xdictionary.hxx>
43 #include <unicode/uchar.h>
44 #include <string.h>
45 #include <breakiteratorImpl.hxx>
47 //////////////////////////////////////////////////////////////////////
48 // Construction/Destruction
49 //////////////////////////////////////////////////////////////////////
51 using namespace rtl;
53 namespace com { namespace sun { namespace star { namespace i18n {
55 extern "C" { static void SAL_CALL thisModule() {} }
57 xdictionary::xdictionary(const sal_Char *lang) :
58 existMark( NULL ),
59 index1( NULL ),
60 index2( NULL ),
61 lenArray( NULL ),
62 dataArea( NULL ),
63 hModule( NULL ),
64 boundary(),
65 japaneseWordBreak( sal_False )
66 #if USE_CELL_BOUNDARY_CODE
67 // For CTL breakiterator, where the word boundary should not be inside cell.
69 useCellBoundary( sal_False ),
70 cellBoundary( NULL )
71 #endif
73 index1 = 0;
74 #ifdef SAL_DLLPREFIX
75 OUStringBuffer aBuf( strlen(lang) + 7 + 6 ); // mostly "lib*.so" (with * == dict_zh)
76 aBuf.appendAscii( SAL_DLLPREFIX );
77 #else
78 OUStringBuffer aBuf( strlen(lang) + 7 + 4 ); // mostly "*.dll" (with * == dict_zh)
79 #endif
80 aBuf.appendAscii( "dict_" ).appendAscii( lang ).appendAscii( SAL_DLLEXTENSION );
81 hModule = osl_loadModuleRelative( &thisModule, aBuf.makeStringAndClear().pData, SAL_LOADMODULE_DEFAULT );
82 if( hModule ) {
83 sal_IntPtr (*func)();
84 func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getExistMark").pData );
85 existMark = (sal_uInt8*) (*func)();
86 func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getIndex1").pData );
87 index1 = (sal_Int16*) (*func)();
88 func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getIndex2").pData );
89 index2 = (sal_Int32*) (*func)();
90 func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getLenArray").pData );
91 lenArray = (sal_Int32*) (*func)();
92 func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getDataArea").pData );
93 dataArea = (sal_Unicode*) (*func)();
95 else
97 existMark = NULL;
98 index1 = NULL;
99 index2 = NULL;
100 lenArray = NULL;
101 dataArea = NULL;
104 for (sal_Int32 i = 0; i < CACHE_MAX; i++)
105 cache[i].size = 0;
107 #if USE_CELL_BOUNDARY_CODE
108 useCellBoundary = sal_False;
109 cellBoundary = NULL;
110 #endif
111 japaneseWordBreak = sal_False;
114 xdictionary::~xdictionary() {
115 osl_unloadModule(hModule);
116 for (sal_Int32 i = 0; i < CACHE_MAX; i++) {
117 if (cache[i].size > 0) {
118 delete cache[i].contents;
119 delete cache[i].wordboundary;
124 void xdictionary::setJapaneseWordBreak()
126 japaneseWordBreak = sal_True;
129 sal_Bool xdictionary::exists(const sal_Unicode c) {
130 sal_Bool exist = existMark ? sal::static_int_cast<sal_Bool>((existMark[c>>3] & (1<<(c&0x07))) != 0) : sal_False;
131 if (!exist && japaneseWordBreak)
132 return BreakIteratorImpl::getScriptClass(c) == ScriptType::ASIAN;
133 else
134 return exist;
137 sal_Int32 xdictionary::getLongestMatch(const sal_Unicode* str, sal_Int32 sLen) {
139 if ( !index1 ) return 0;
141 sal_Int16 idx = index1[str[0] >> 8];
143 if (idx == 0xFF) return 0;
145 idx = (idx<<8) | (str[0]&0xff);
147 sal_uInt32 begin = index2[idx], end = index2[idx+1];
149 if (begin == 0) return 0;
151 str++; sLen--; // first character is not stored in the dictionary
152 for (sal_uInt32 i = end; i > begin; i--) {
153 sal_Int32 len = lenArray[i] - lenArray[i - 1];
154 if (sLen >= len) {
155 const sal_Unicode *dstr = dataArea + lenArray[i-1];
156 sal_Int32 pos = 0;
158 while (pos < len && dstr[pos] == str[pos]) { pos++; }
160 if (pos == len)
161 return len + 1;
164 return 0;
169 * c-tor
172 WordBreakCache::WordBreakCache() :
173 length( 0 ),
174 contents( NULL ),
175 wordboundary( NULL ),
176 size( 0 )
181 * Compare two unicode string,
184 sal_Bool WordBreakCache::equals(const sal_Unicode* str, Boundary& boundary) {
185 // Different length, different string.
186 if (length != boundary.endPos - boundary.startPos) return sal_False;
188 for (sal_Int32 i = 0; i < length; i++)
189 if (contents[i] != str[i + boundary.startPos]) return sal_False;
191 return sal_True;
196 * Retrieve the segment containing the character at pos.
197 * @param pos : Position of the given character.
198 * @return true if CJK.
200 sal_Bool xdictionary::seekSegment(const sal_Unicode *text, sal_Int32 pos,
201 sal_Int32 len, Boundary& segBoundary) {
202 for (segBoundary.startPos = pos - 1;
203 segBoundary.startPos >= 0 &&
204 (u_isWhitespace((sal_uInt32)text[segBoundary.startPos]) || exists(text[segBoundary.startPos]));
205 segBoundary.startPos--) ;
206 segBoundary.startPos++;
208 for (segBoundary.endPos = pos;
209 segBoundary.endPos < len &&
210 (u_isWhitespace((sal_uInt32)text[segBoundary.endPos]) || exists(text[segBoundary.endPos]));
211 segBoundary.endPos++) ;
213 return segBoundary.endPos > segBoundary.startPos + 1;
216 #define KANJA 1
217 #define KATAKANA 2
218 #define HIRAKANA 3
220 static sal_Int16 JapaneseCharType(sal_Unicode c)
222 if (0x3041 <= c && c <= 0x309e)
223 return HIRAKANA;
224 if ((0x30a1 <= c && c <= 0x30fe) || (0xff65 <= c && c <= 0xff9f))
225 return KATAKANA;
226 return KANJA;
229 WordBreakCache& xdictionary::getCache(const sal_Unicode *text, Boundary& wordBoundary)
232 WordBreakCache& aCache = cache[text[0] & 0x1f];
234 if (aCache.size != 0 && aCache.equals(text, wordBoundary))
235 return aCache;
237 sal_Int32 len = wordBoundary.endPos - wordBoundary.startPos;
239 if (aCache.size == 0 || len > aCache.size) {
240 if (aCache.size != 0) {
241 delete aCache.contents;
242 delete aCache.wordboundary;
243 aCache.size = len;
245 else
246 aCache.size = len > DEFAULT_SIZE ? len : DEFAULT_SIZE;
247 aCache.contents = new sal_Unicode[aCache.size + 1];
248 aCache.wordboundary = new sal_Int32[aCache.size + 2];
250 aCache.length = len;
251 memcpy(aCache.contents, text + wordBoundary.startPos, len * sizeof(sal_Unicode));
252 *(aCache.contents + len) = 0x0000;
253 // reset the wordboundary in cache
254 memset(aCache.wordboundary, '\0', sizeof(sal_Int32)*(len + 2));
256 sal_Int32 i = 0; // loop variable
257 while (aCache.wordboundary[i] < aCache.length) {
258 len = 0;
259 // look the continuous white space as one word and cashe it
260 while (u_isWhitespace((sal_uInt32)text[wordBoundary.startPos + aCache.wordboundary[i] + len]))
261 len ++;
263 if (len == 0) {
264 const sal_Unicode *str = text + wordBoundary.startPos + aCache.wordboundary[i];
265 sal_Int32 slen = aCache.length - aCache.wordboundary[i];
266 sal_Int16 type = 0, count = 0;
267 for (;len == 0 && slen > 0; str++, slen--) {
268 len = getLongestMatch(str, slen);
269 if (len == 0) {
270 if (!japaneseWordBreak) {
271 len = 1;
272 } else {
273 if (count == 0)
274 type = JapaneseCharType(*str);
275 else if (type != JapaneseCharType(*str))
276 break;
277 count++;
281 if (count) {
282 aCache.wordboundary[i+1] = aCache.wordboundary[i] + count;
283 i++;
285 #if USE_CELL_BOUNDARY_CODE
286 if (useCellBoundary) {
287 sal_Int32 cBoundary = cellBoundary[aCache.wordboundary[i] + wordBoundary.startPos - 1];
288 if (cBoundary > 0)
289 aCache.wordboundary[i] = cBoundary - wordBoundary.startPos;
291 #endif
295 if (len) {
296 aCache.wordboundary[i+1] = aCache.wordboundary[i] + len;
297 i++;
299 #if USE_CELL_BOUNDARY_CODE
300 if (useCellBoundary) {
301 sal_Int32 cBoundary = cellBoundary[aCache.wordboundary[i] + wordBoundary.startPos - 1];
302 if (cBoundary > 0)
303 aCache.wordboundary[i] = cBoundary - wordBoundary.startPos;
305 #endif
308 aCache.wordboundary[i + 1] = aCache.length + 1;
310 return aCache;
313 Boundary xdictionary::previousWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
315 // looking for the first non-whitespace character from anyPos
316 sal_uInt32 ch = rText.iterateCodePoints(&anyPos, -1);
318 while (anyPos > 0 && u_isWhitespace(ch)) ch = rText.iterateCodePoints(&anyPos, -1);
320 return getWordBoundary(rText, anyPos, wordType, true);
323 Boundary xdictionary::nextWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
325 boundary = getWordBoundary(rText, anyPos, wordType, true);
326 anyPos = boundary.endPos;
327 if (anyPos < rText.getLength()) {
328 // looknig for the first non-whitespace character from anyPos
329 sal_uInt32 ch = rText.iterateCodePoints(&anyPos, 1);
330 while (u_isWhitespace(ch)) ch=rText.iterateCodePoints(&anyPos, 1);
331 rText.iterateCodePoints(&anyPos, -1);
334 return getWordBoundary(rText, anyPos, wordType, true);
337 Boundary xdictionary::getWordBoundary(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType, sal_Bool bDirection)
339 const sal_Unicode *text=rText.getStr();
340 sal_Int32 len=rText.getLength();
341 if (anyPos >= len || anyPos < 0) {
342 boundary.startPos = boundary.endPos = anyPos < 0 ? 0 : len;
343 } else if (seekSegment(text, anyPos, len, boundary)) { // character in dict
344 WordBreakCache& aCache = getCache(text, boundary);
345 sal_Int32 i = 0;
347 while (aCache.wordboundary[i] <= (sal_Int32)anyPos - boundary.startPos) i++;
349 sal_Int32 startPos = aCache.wordboundary[i - 1];
350 // if bDirection is false
351 if (!bDirection && startPos > 0 && startPos == (anyPos - boundary.startPos) &&
352 u_isWhitespace((sal_uInt32) text[anyPos - 1]))
353 i--;
354 boundary.endPos = aCache.wordboundary[i] + boundary.startPos;
355 boundary.startPos += aCache.wordboundary[i - 1];
356 } else {
357 boundary.startPos = anyPos;
358 if (anyPos < len) rText.iterateCodePoints(&anyPos, 1);
359 boundary.endPos = anyPos < len ? anyPos : len;
361 if (wordType == WordType::WORD_COUNT) {
362 // skip punctuation for word count.
363 while (boundary.endPos < len && u_ispunct((sal_uInt32)text[boundary.endPos]))
364 boundary.endPos++;
367 return boundary;
#if USE_CELL_BOUNDARY_CODE
/**
 * Enable cell-boundary adjustment and remember the cell table to use.
 * The array is stored by pointer, not copied; getCache() indexes it by
 * absolute text position.
 *
 * @param cellArray  cell boundary table supplied by the caller.
 */
void xdictionary::setCellBoundary(sal_Int32* cellArray)
{
    cellBoundary = cellArray;
    useCellBoundary = sal_True;
}
#endif
378 } } } }