merged tag ooo/OOO330_m14
[LibreOffice.git] / i18npool / source / breakiterator / xdictionary.cxx
blobaba69b5e9a2166d62fe77cc21d23cb9be973922b
1 /*************************************************************************
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 * Copyright 2000, 2010 Oracle and/or its affiliates.
7 * OpenOffice.org - a multi-platform office productivity suite
9 * This file is part of OpenOffice.org.
11 * OpenOffice.org is free software: you can redistribute it and/or modify
12 * it under the terms of the GNU Lesser General Public License version 3
13 * only, as published by the Free Software Foundation.
15 * OpenOffice.org is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU Lesser General Public License version 3 for more details
19 * (a copy is included in the LICENSE file that accompanied this code).
21 * You should have received a copy of the GNU Lesser General Public License
22 * version 3 along with OpenOffice.org. If not, see
23 * <http://www.openoffice.org/license.html>
24 * for a copy of the LGPLv3 License.
26 ************************************************************************/
28 // MARKER(update_precomp.py): autogen include statement, do not remove
29 #include "precompiled_i18npool.hxx"
31 // xdictionary.cpp: implementation of the xdictionary class.
33 //////////////////////////////////////////////////////////////////////
36 #include <rtl/ustrbuf.hxx>
38 #include <com/sun/star/i18n/WordType.hpp>
39 #include <xdictionary.hxx>
40 #include <unicode/uchar.h>
41 #include <string.h>
42 #include <breakiteratorImpl.hxx>
44 //////////////////////////////////////////////////////////////////////
45 // Construction/Destruction
46 //////////////////////////////////////////////////////////////////////
48 using namespace rtl;
50 namespace com { namespace sun { namespace star { namespace i18n {
52 extern "C" { static void SAL_CALL thisModule() {} }
54 xdictionary::xdictionary(const sal_Char *lang) :
55 existMark( NULL ),
56 index1( NULL ),
57 index2( NULL ),
58 lenArray( NULL ),
59 dataArea( NULL ),
60 hModule( NULL ),
61 boundary(),
62 japaneseWordBreak( sal_False )
63 #if USE_CELL_BOUNDARY_CODE
64 // For CTL breakiterator, where the word boundary should not be inside cell.
66 useCellBoundary( sal_False ),
67 cellBoundary( NULL )
68 #endif
70 index1 = 0;
71 #ifdef SAL_DLLPREFIX
72 OUStringBuffer aBuf( strlen(lang) + 7 + 6 ); // mostly "lib*.so" (with * == dict_zh)
73 aBuf.appendAscii( SAL_DLLPREFIX );
74 #else
75 OUStringBuffer aBuf( strlen(lang) + 7 + 4 ); // mostly "*.dll" (with * == dict_zh)
76 #endif
77 aBuf.appendAscii( "dict_" ).appendAscii( lang ).appendAscii( SAL_DLLEXTENSION );
78 hModule = osl_loadModuleRelative( &thisModule, aBuf.makeStringAndClear().pData, SAL_LOADMODULE_DEFAULT );
79 if( hModule ) {
80 sal_IntPtr (*func)();
81 func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getExistMark").pData );
82 existMark = (sal_uInt8*) (*func)();
83 func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getIndex1").pData );
84 index1 = (sal_Int16*) (*func)();
85 func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getIndex2").pData );
86 index2 = (sal_Int32*) (*func)();
87 func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getLenArray").pData );
88 lenArray = (sal_Int32*) (*func)();
89 func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getDataArea").pData );
90 dataArea = (sal_Unicode*) (*func)();
92 else
94 existMark = NULL;
95 index1 = NULL;
96 index2 = NULL;
97 lenArray = NULL;
98 dataArea = NULL;
101 for (sal_Int32 i = 0; i < CACHE_MAX; i++)
102 cache[i].size = 0;
104 #if USE_CELL_BOUNDARY_CODE
105 useCellBoundary = sal_False;
106 cellBoundary = NULL;
107 #endif
108 japaneseWordBreak = sal_False;
111 xdictionary::~xdictionary() {
112 osl_unloadModule(hModule);
113 for (sal_Int32 i = 0; i < CACHE_MAX; i++) {
114 if (cache[i].size > 0) {
115 delete cache[i].contents;
116 delete cache[i].wordboundary;
121 void xdictionary::setJapaneseWordBreak()
123 japaneseWordBreak = sal_True;
126 sal_Bool xdictionary::exists(const sal_uInt32 c) {
127 // 0x1FFF is the hardcoded limit in gendict for existMarks
128 sal_Bool exist = (existMark && ((c>>3) < 0x1FFF)) ? sal::static_int_cast<sal_Bool>((existMark[c>>3] & (1<<(c&0x07))) != 0) : sal_False;
129 if (!exist && japaneseWordBreak)
130 return BreakIteratorImpl::getScriptClass(c) == ScriptType::ASIAN;
131 else
132 return exist;
135 sal_Int32 xdictionary::getLongestMatch(const sal_Unicode* str, sal_Int32 sLen) {
137 if ( !index1 ) return 0;
139 sal_Int16 idx = index1[str[0] >> 8];
141 if (idx == 0xFF) return 0;
143 idx = (idx<<8) | (str[0]&0xff);
145 sal_uInt32 begin = index2[idx], end = index2[idx+1];
147 if (begin == 0) return 0;
149 str++; sLen--; // first character is not stored in the dictionary
150 for (sal_uInt32 i = end; i > begin; i--) {
151 sal_Int32 len = lenArray[i] - lenArray[i - 1];
152 if (sLen >= len) {
153 const sal_Unicode *dstr = dataArea + lenArray[i-1];
154 sal_Int32 pos = 0;
156 while (pos < len && dstr[pos] == str[pos]) { pos++; }
158 if (pos == len)
159 return len + 1;
162 return 0;
167 * c-tor
170 WordBreakCache::WordBreakCache() :
171 length( 0 ),
172 contents( NULL ),
173 wordboundary( NULL ),
174 size( 0 )
179 * Compare two unicode string,
182 sal_Bool WordBreakCache::equals(const sal_Unicode* str, Boundary& boundary) {
183 // Different length, different string.
184 if (length != boundary.endPos - boundary.startPos) return sal_False;
186 for (sal_Int32 i = 0; i < length; i++)
187 if (contents[i] != str[i + boundary.startPos]) return sal_False;
189 return sal_True;
194 * Retrieve the segment containing the character at pos.
195 * @param pos : Position of the given character.
196 * @return true if CJK.
198 sal_Bool xdictionary::seekSegment(const rtl::OUString &rText, sal_Int32 pos,
199 Boundary& segBoundary)
201 sal_Int32 indexUtf16;
202 segBoundary.endPos = segBoundary.startPos = pos;
204 indexUtf16 = pos;
205 while (indexUtf16 > 0)
207 sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, -1);
208 if (u_isWhitespace(ch) || exists(ch))
209 segBoundary.startPos = indexUtf16;
210 else
211 break;
214 indexUtf16 = pos;
215 while (indexUtf16 < rText.getLength())
217 sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, 1);
218 if (u_isWhitespace(ch) || exists(ch))
219 segBoundary.endPos = indexUtf16;
220 else
221 break;
224 indexUtf16 = segBoundary.startPos;
225 rText.iterateCodePoints(&indexUtf16, 1);
226 return segBoundary.endPos > indexUtf16;
229 #define KANJA 1
230 #define KATAKANA 2
231 #define HIRAKANA 3
233 static sal_Int16 JapaneseCharType(sal_Unicode c)
235 if (0x3041 <= c && c <= 0x309e)
236 return HIRAKANA;
237 if ((0x30a1 <= c && c <= 0x30fe) || (0xff65 <= c && c <= 0xff9f))
238 return KATAKANA;
239 return KANJA;
242 WordBreakCache& xdictionary::getCache(const sal_Unicode *text, Boundary& wordBoundary)
245 WordBreakCache& aCache = cache[text[0] & 0x1f];
247 if (aCache.size != 0 && aCache.equals(text, wordBoundary))
248 return aCache;
250 sal_Int32 len = wordBoundary.endPos - wordBoundary.startPos;
252 if (aCache.size == 0 || len > aCache.size) {
253 if (aCache.size != 0) {
254 delete aCache.contents;
255 delete aCache.wordboundary;
256 aCache.size = len;
258 else
259 aCache.size = len > DEFAULT_SIZE ? len : DEFAULT_SIZE;
260 aCache.contents = new sal_Unicode[aCache.size + 1];
261 aCache.wordboundary = new sal_Int32[aCache.size + 2];
263 aCache.length = len;
264 memcpy(aCache.contents, text + wordBoundary.startPos, len * sizeof(sal_Unicode));
265 *(aCache.contents + len) = 0x0000;
266 // reset the wordboundary in cache
267 memset(aCache.wordboundary, '\0', sizeof(sal_Int32)*(len + 2));
269 sal_Int32 i = 0; // loop variable
270 while (aCache.wordboundary[i] < aCache.length) {
271 len = 0;
272 // look the continuous white space as one word and cashe it
273 while (u_isWhitespace((sal_uInt32)text[wordBoundary.startPos + aCache.wordboundary[i] + len]))
274 len ++;
276 if (len == 0) {
277 const sal_Unicode *str = text + wordBoundary.startPos + aCache.wordboundary[i];
278 sal_Int32 slen = aCache.length - aCache.wordboundary[i];
279 sal_Int16 type = 0, count = 0;
280 for (;len == 0 && slen > 0; str++, slen--) {
281 len = getLongestMatch(str, slen);
282 if (len == 0) {
283 if (!japaneseWordBreak) {
284 len = 1;
285 } else {
286 if (count == 0)
287 type = JapaneseCharType(*str);
288 else if (type != JapaneseCharType(*str))
289 break;
290 count++;
294 if (count) {
295 aCache.wordboundary[i+1] = aCache.wordboundary[i] + count;
296 i++;
298 #if USE_CELL_BOUNDARY_CODE
299 if (useCellBoundary) {
300 sal_Int32 cBoundary = cellBoundary[aCache.wordboundary[i] + wordBoundary.startPos - 1];
301 if (cBoundary > 0)
302 aCache.wordboundary[i] = cBoundary - wordBoundary.startPos;
304 #endif
308 if (len) {
309 aCache.wordboundary[i+1] = aCache.wordboundary[i] + len;
310 i++;
312 #if USE_CELL_BOUNDARY_CODE
313 if (useCellBoundary) {
314 sal_Int32 cBoundary = cellBoundary[aCache.wordboundary[i] + wordBoundary.startPos - 1];
315 if (cBoundary > 0)
316 aCache.wordboundary[i] = cBoundary - wordBoundary.startPos;
318 #endif
321 aCache.wordboundary[i + 1] = aCache.length + 1;
323 return aCache;
326 Boundary xdictionary::previousWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
328 // looking for the first non-whitespace character from anyPos
329 sal_uInt32 ch = rText.iterateCodePoints(&anyPos, -1);
331 while (anyPos > 0 && u_isWhitespace(ch)) ch = rText.iterateCodePoints(&anyPos, -1);
333 return getWordBoundary(rText, anyPos, wordType, true);
336 Boundary xdictionary::nextWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
338 boundary = getWordBoundary(rText, anyPos, wordType, true);
339 anyPos = boundary.endPos;
340 if (anyPos < rText.getLength()) {
341 // looknig for the first non-whitespace character from anyPos
342 sal_uInt32 ch = rText.iterateCodePoints(&anyPos, 1);
343 while (u_isWhitespace(ch)) ch=rText.iterateCodePoints(&anyPos, 1);
344 rText.iterateCodePoints(&anyPos, -1);
347 return getWordBoundary(rText, anyPos, wordType, true);
350 Boundary xdictionary::getWordBoundary(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType, sal_Bool bDirection)
352 const sal_Unicode *text=rText.getStr();
353 sal_Int32 len=rText.getLength();
354 if (anyPos >= len || anyPos < 0) {
355 boundary.startPos = boundary.endPos = anyPos < 0 ? 0 : len;
356 } else if (seekSegment(rText, anyPos, boundary)) { // character in dict
357 WordBreakCache& aCache = getCache(text, boundary);
358 sal_Int32 i = 0;
360 while (aCache.wordboundary[i] <= anyPos - boundary.startPos) i++;
362 sal_Int32 startPos = aCache.wordboundary[i - 1];
363 // if bDirection is false
364 if (!bDirection && startPos > 0 && startPos == (anyPos - boundary.startPos))
366 sal_Int32 indexUtf16 = anyPos-1;
367 sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, 1);
368 if (u_isWhitespace(ch))
369 i--;
371 boundary.endPos = boundary.startPos;
372 rText.iterateCodePoints(&boundary.endPos, aCache.wordboundary[i]);
373 rText.iterateCodePoints(&boundary.startPos, aCache.wordboundary[i-1]);
374 } else {
375 boundary.startPos = anyPos;
376 if (anyPos < len) rText.iterateCodePoints(&anyPos, 1);
377 boundary.endPos = anyPos < len ? anyPos : len;
379 if (wordType == WordType::WORD_COUNT) {
380 // skip punctuation for word count.
381 while (boundary.endPos < len)
383 sal_Int32 indexUtf16 = boundary.endPos;
384 if (u_ispunct(rText.iterateCodePoints(&indexUtf16, 1)))
385 boundary.endPos = indexUtf16;
386 else
387 break;
391 return boundary;
#if USE_CELL_BOUNDARY_CODE
/*
 * Store the cell boundary array and switch on its use in getCache().
 * The pointer is kept as given; the array is not copied.
 * @param cellArray : array indexed by text position, read by getCache().
 */
void xdictionary::setCellBoundary(sal_Int32* cellArray)
{
    cellBoundary = cellArray;
    useCellBoundary = sal_True;
}
#endif
402 } } } }