Version 4.0.0.1, tag libreoffice-4.0.0.1
[LibreOffice.git] / i18npool / source / breakiterator / xdictionary.cxx
blob2bf2a03e3de34d4f1c2a571cd79ac7781bcd72bd
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
21 // xdictionary.cpp: implementation of the xdictionary class.
23 //////////////////////////////////////////////////////////////////////
26 #include <rtl/ustrbuf.hxx>
28 #include <com/sun/star/i18n/WordType.hpp>
29 #include <xdictionary.hxx>
30 #include <unicode/uchar.h>
31 #include <string.h>
32 #include <breakiteratorImpl.hxx>
34 //////////////////////////////////////////////////////////////////////
35 // Construction/Destruction
36 //////////////////////////////////////////////////////////////////////
38 using ::rtl::OUString;
39 using ::rtl::OUStringBuffer;
41 namespace com { namespace sun { namespace star { namespace i18n {
43 #ifndef DISABLE_DYNLOADING
45 extern "C" { static void SAL_CALL thisModule() {} }
47 #else
49 extern "C" {
51 sal_uInt8* getExistMark_ja();
52 sal_Int16* getIndex1_ja();
53 sal_Int32* getIndex2_ja();
54 sal_Int32* getLenArray_ja();
55 sal_Unicode* getDataArea_ja();
57 sal_uInt8* getExistMark_zh();
58 sal_Int16* getIndex1_zh();
59 sal_Int32* getIndex2_zh();
60 sal_Int32* getLenArray_zh();
61 sal_Unicode* getDataArea_zh();
65 #endif
67 xdictionary::xdictionary(const sal_Char *lang) :
68 existMark( NULL ),
69 index1( NULL ),
70 index2( NULL ),
71 lenArray( NULL ),
72 dataArea( NULL ),
73 #ifndef DISABLE_DYNLOADING
74 hModule( NULL ),
75 #endif
76 boundary(),
77 japaneseWordBreak( sal_False )
79 index1 = 0;
80 #ifndef DISABLE_DYNLOADING
81 #ifdef SAL_DLLPREFIX
82 OUStringBuffer aBuf( strlen(lang) + 7 + 6 ); // mostly "lib*.so" (with * == dict_zh)
83 aBuf.appendAscii( SAL_DLLPREFIX );
84 #else
85 OUStringBuffer aBuf( strlen(lang) + 7 + 4 ); // mostly "*.dll" (with * == dict_zh)
86 #endif
87 aBuf.appendAscii( "dict_" ).appendAscii( lang ).appendAscii( SAL_DLLEXTENSION );
88 hModule = osl_loadModuleRelative( &thisModule, aBuf.makeStringAndClear().pData, SAL_LOADMODULE_DEFAULT );
89 if( hModule ) {
90 sal_IntPtr (*func)();
91 func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString("getExistMark").pData );
92 existMark = (sal_uInt8*) (*func)();
93 func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString("getIndex1").pData );
94 index1 = (sal_Int16*) (*func)();
95 func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString("getIndex2").pData );
96 index2 = (sal_Int32*) (*func)();
97 func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString("getLenArray").pData );
98 lenArray = (sal_Int32*) (*func)();
99 func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString("getDataArea").pData );
100 dataArea = (sal_Unicode*) (*func)();
102 else
104 existMark = NULL;
105 index1 = NULL;
106 index2 = NULL;
107 lenArray = NULL;
108 dataArea = NULL;
111 #else
112 if( strcmp( lang, "ja" ) == 0 ) {
113 existMark = getExistMark_ja();
114 index1 = getIndex1_ja();
115 index2 = getIndex2_ja();
116 lenArray = getLenArray_ja();
117 dataArea = getDataArea_ja();
119 else if( strcmp( lang, "zh" ) == 0 ) {
120 existMark = getExistMark_zh();
121 index1 = getIndex1_zh();
122 index2 = getIndex2_zh();
123 lenArray = getLenArray_zh();
124 dataArea = getDataArea_zh();
126 else
128 existMark = NULL;
129 index1 = NULL;
130 index2 = NULL;
131 lenArray = NULL;
132 dataArea = NULL;
134 #endif
136 for (sal_Int32 i = 0; i < CACHE_MAX; i++)
137 cache[i].size = 0;
139 japaneseWordBreak = sal_False;
142 xdictionary::~xdictionary() {
143 #ifndef DISABLE_DYNLOADING
144 osl_unloadModule(hModule);
145 #endif
146 for (sal_Int32 i = 0; i < CACHE_MAX; i++) {
147 if (cache[i].size > 0) {
148 delete [] cache[i].contents;
149 delete [] cache[i].wordboundary;
154 void xdictionary::setJapaneseWordBreak()
156 japaneseWordBreak = sal_True;
159 sal_Bool xdictionary::exists(const sal_uInt32 c) {
160 // 0x1FFF is the hardcoded limit in gendict for existMarks
161 sal_Bool exist = (existMark && ((c>>3) < 0x1FFF)) ? sal::static_int_cast<sal_Bool>((existMark[c>>3] & (1<<(c&0x07))) != 0) : sal_False;
162 if (!exist && japaneseWordBreak)
163 return BreakIteratorImpl::getScriptClass(c) == ScriptType::ASIAN;
164 else
165 return exist;
168 sal_Int32 xdictionary::getLongestMatch(const sal_Unicode* str, sal_Int32 sLen) {
170 if ( !index1 ) return 0;
172 sal_Int16 idx = index1[str[0] >> 8];
174 if (idx == 0xFF) return 0;
176 idx = (idx<<8) | (str[0]&0xff);
178 sal_uInt32 begin = index2[idx], end = index2[idx+1];
180 if (begin == 0) return 0;
182 str++; sLen--; // first character is not stored in the dictionary
183 for (sal_uInt32 i = end; i > begin; i--) {
184 sal_Int32 len = lenArray[i] - lenArray[i - 1];
185 if (sLen >= len) {
186 const sal_Unicode *dstr = dataArea + lenArray[i-1];
187 sal_Int32 pos = 0;
189 while (pos < len && dstr[pos] == str[pos]) { pos++; }
191 if (pos == len)
192 return len + 1;
195 return 0;
200 * c-tor
203 WordBreakCache::WordBreakCache() :
204 length( 0 ),
205 contents( NULL ),
206 wordboundary( NULL ),
207 size( 0 )
212 * Compare two unicode string,
215 sal_Bool WordBreakCache::equals(const sal_Unicode* str, Boundary& boundary) {
216 // Different length, different string.
217 if (length != boundary.endPos - boundary.startPos) return sal_False;
219 for (sal_Int32 i = 0; i < length; i++)
220 if (contents[i] != str[i + boundary.startPos]) return sal_False;
222 return sal_True;
227 * Retrieve the segment containing the character at pos.
228 * @param pos : Position of the given character.
229 * @return true if CJK.
231 sal_Bool xdictionary::seekSegment(const rtl::OUString &rText, sal_Int32 pos,
232 Boundary& segBoundary)
234 sal_Int32 indexUtf16;
235 segBoundary.endPos = segBoundary.startPos = pos;
237 indexUtf16 = pos;
238 while (indexUtf16 > 0)
240 sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, -1);
241 if (u_isWhitespace(ch) || exists(ch))
242 segBoundary.startPos = indexUtf16;
243 else
244 break;
247 indexUtf16 = pos;
248 while (indexUtf16 < rText.getLength())
250 sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, 1);
251 if (u_isWhitespace(ch) || exists(ch))
252 segBoundary.endPos = indexUtf16;
253 else
254 break;
257 indexUtf16 = segBoundary.startPos;
258 rText.iterateCodePoints(&indexUtf16, 1);
259 return segBoundary.endPos > indexUtf16;
262 #define KANJA 1
263 #define KATAKANA 2
264 #define HIRAKANA 3
266 static sal_Int16 JapaneseCharType(sal_Unicode c)
268 if (0x3041 <= c && c <= 0x309e)
269 return HIRAKANA;
270 if ((0x30a1 <= c && c <= 0x30fe) || (0xff65 <= c && c <= 0xff9f))
271 return KATAKANA;
272 return KANJA;
275 WordBreakCache& xdictionary::getCache(const sal_Unicode *text, Boundary& wordBoundary)
277 WordBreakCache& rCache = cache[text[0] & 0x1f];
279 if (rCache.size != 0 && rCache.equals(text, wordBoundary))
280 return rCache;
282 sal_Int32 len = wordBoundary.endPos - wordBoundary.startPos;
284 if (rCache.size == 0 || len > rCache.size) {
285 if (rCache.size != 0) {
286 delete rCache.contents;
287 delete rCache.wordboundary;
288 rCache.size = len;
290 else
291 rCache.size = len > DEFAULT_SIZE ? len : DEFAULT_SIZE;
292 rCache.contents = new sal_Unicode[rCache.size + 1];
293 rCache.wordboundary = new sal_Int32[rCache.size + 2];
295 rCache.length = len;
296 memcpy(rCache.contents, text + wordBoundary.startPos, len * sizeof(sal_Unicode));
297 *(rCache.contents + len) = 0x0000;
298 // reset the wordboundary in cache
299 memset(rCache.wordboundary, '\0', sizeof(sal_Int32)*(len + 2));
301 sal_Int32 i = 0; // loop variable
302 while (rCache.wordboundary[i] < rCache.length) {
303 len = 0;
304 // look the continuous white space as one word and cashe it
305 while (u_isWhitespace((sal_uInt32)text[wordBoundary.startPos + rCache.wordboundary[i] + len]))
306 len ++;
308 if (len == 0) {
309 const sal_Unicode *str = text + wordBoundary.startPos + rCache.wordboundary[i];
310 sal_Int32 slen = rCache.length - rCache.wordboundary[i];
311 sal_Int16 type = 0, count = 0;
312 for (;len == 0 && slen > 0; str++, slen--) {
313 len = getLongestMatch(str, slen);
314 if (len == 0) {
315 if (!japaneseWordBreak) {
316 len = 1;
317 } else {
318 if (count == 0)
319 type = JapaneseCharType(*str);
320 else if (type != JapaneseCharType(*str))
321 break;
322 count++;
326 if (count)
328 rCache.wordboundary[i+1] = rCache.wordboundary[i] + count;
329 i++;
333 if (len) {
334 rCache.wordboundary[i+1] = rCache.wordboundary[i] + len;
335 i++;
338 rCache.wordboundary[i + 1] = rCache.length + 1;
340 return rCache;
343 Boundary xdictionary::previousWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
345 // looking for the first non-whitespace character from anyPos
346 sal_uInt32 ch = rText.iterateCodePoints(&anyPos, -1);
348 while (anyPos > 0 && u_isWhitespace(ch)) ch = rText.iterateCodePoints(&anyPos, -1);
350 return getWordBoundary(rText, anyPos, wordType, true);
353 Boundary xdictionary::nextWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
355 boundary = getWordBoundary(rText, anyPos, wordType, true);
356 anyPos = boundary.endPos;
357 if (anyPos < rText.getLength()) {
358 // looknig for the first non-whitespace character from anyPos
359 sal_uInt32 ch = rText.iterateCodePoints(&anyPos, 1);
360 while (u_isWhitespace(ch)) ch=rText.iterateCodePoints(&anyPos, 1);
361 rText.iterateCodePoints(&anyPos, -1);
364 return getWordBoundary(rText, anyPos, wordType, true);
367 Boundary xdictionary::getWordBoundary(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType, sal_Bool bDirection)
369 const sal_Unicode *text=rText.getStr();
370 sal_Int32 len=rText.getLength();
371 if (anyPos >= len || anyPos < 0) {
372 boundary.startPos = boundary.endPos = anyPos < 0 ? 0 : len;
373 } else if (seekSegment(rText, anyPos, boundary)) { // character in dict
374 WordBreakCache& aCache = getCache(text, boundary);
375 sal_Int32 i = 0;
377 while (aCache.wordboundary[i] <= anyPos - boundary.startPos) i++;
379 sal_Int32 startPos = aCache.wordboundary[i - 1];
380 // if bDirection is false
381 if (!bDirection && startPos > 0 && startPos == (anyPos - boundary.startPos))
383 sal_Int32 indexUtf16 = anyPos-1;
384 sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, 1);
385 if (u_isWhitespace(ch))
386 i--;
388 boundary.endPos = boundary.startPos;
389 rText.iterateCodePoints(&boundary.endPos, aCache.wordboundary[i]);
390 rText.iterateCodePoints(&boundary.startPos, aCache.wordboundary[i-1]);
391 } else {
392 boundary.startPos = anyPos;
393 if (anyPos < len) rText.iterateCodePoints(&anyPos, 1);
394 boundary.endPos = anyPos < len ? anyPos : len;
396 if (wordType == WordType::WORD_COUNT) {
397 // skip punctuation for word count.
398 while (boundary.endPos < len)
400 sal_Int32 indexUtf16 = boundary.endPos;
401 if (u_ispunct(rText.iterateCodePoints(&indexUtf16, 1)))
402 boundary.endPos = indexUtf16;
403 else
404 break;
408 return boundary;
411 } } } }
413 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */