1 /*************************************************************************
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 * Copyright 2008 by Sun Microsystems, Inc.
7 * OpenOffice.org - a multi-platform office productivity suite
9 * $RCSfile: xdictionary.cxx,v $
10 * $Revision: 1.18.24.1 $
12 * This file is part of OpenOffice.org.
14 * OpenOffice.org is free software: you can redistribute it and/or modify
15 * it under the terms of the GNU Lesser General Public License version 3
16 * only, as published by the Free Software Foundation.
18 * OpenOffice.org is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU Lesser General Public License version 3 for more details
22 * (a copy is included in the LICENSE file that accompanied this code).
24 * You should have received a copy of the GNU Lesser General Public License
25 * version 3 along with OpenOffice.org. If not, see
26 * <http://www.openoffice.org/license.html>
27 * for a copy of the LGPLv3 License.
29 ************************************************************************/
31 // MARKER(update_precomp.py): autogen include statement, do not remove
32 #include "precompiled_i18npool.hxx"
34 // xdictionary.cpp: implementation of the xdictionary class.
36 //////////////////////////////////////////////////////////////////////
39 #include <rtl/ustrbuf.hxx>
41 #include <com/sun/star/i18n/WordType.hpp>
42 #include <xdictionary.hxx>
43 #include <unicode/uchar.h>
45 #include <breakiteratorImpl.hxx>
47 //////////////////////////////////////////////////////////////////////
48 // Construction/Destruction
49 //////////////////////////////////////////////////////////////////////
53 namespace com
{ namespace sun
{ namespace star
{ namespace i18n
{
55 extern "C" { static void SAL_CALL
thisModule() {} }
// Constructs the dictionary for the language code `lang`.
//
// The visible statements
//   - build a library name from SAL_DLLPREFIX (first aBuf variant only),
//     "dict_", `lang` and SAL_DLLEXTENSION,
//   - load that library with osl_loadModuleRelative(), anchored at thisModule,
//   - call the exported functions getExistMark, getIndex1, getIndex2,
//     getLenArray and getDataArea and store their results in existMark,
//     index1, index2, lenArray and dataArea,
//   - reset useCellBoundary (when USE_CELL_BOUNDARY_CODE) and
//     japaneseWordBreak to sal_False.
//
// NOTE(review): the embedded original line numbers skip in several places
// (58-64, 70-74, 77, 79, 82-83, 94-103, 105-106, 109-110), so part of this
// definition is not visible in this listing. The two aBuf declarations are
// presumably alternatives selected by a preprocessor conditional, and the
// check that hModule is non-null before the symbols are called is presumably
// in the missing lines - confirm against the full file.
57 xdictionary::xdictionary(const sal_Char
*lang
) :
65 japaneseWordBreak( sal_False
)
66 #if USE_CELL_BOUNDARY_CODE
67 // For CTL breakiterator, where the word boundary should not be inside cell.
69 useCellBoundary( sal_False
),
// Buffer sized for prefix + "dict_" + lang + extension.
75 OUStringBuffer
aBuf( strlen(lang
) + 7 + 6 ); // mostly "lib*.so" (with * == dict_zh)
76 aBuf
.appendAscii( SAL_DLLPREFIX
);
78 OUStringBuffer
aBuf( strlen(lang
) + 7 + 4 ); // mostly "*.dll" (with * == dict_zh)
80 aBuf
.appendAscii( "dict_" ).appendAscii( lang
).appendAscii( SAL_DLLEXTENSION
);
// Load the dictionary library; thisModule serves as the reference address.
81 hModule
= osl_loadModuleRelative( &thisModule
, aBuf
.makeStringAndClear().pData
, SAL_LOADMODULE_DEFAULT
);
// Each table is obtained by resolving an exported getter by name and
// calling it; the returned sal_IntPtr is cast to the table's pointer type.
84 func
= (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule
, OUString::createFromAscii("getExistMark").pData
);
85 existMark
= (sal_uInt8
*) (*func
)();
86 func
= (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule
, OUString::createFromAscii("getIndex1").pData
);
87 index1
= (sal_Int16
*) (*func
)();
88 func
= (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule
, OUString::createFromAscii("getIndex2").pData
);
89 index2
= (sal_Int32
*) (*func
)();
90 func
= (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule
, OUString::createFromAscii("getLenArray").pData
);
91 lenArray
= (sal_Int32
*) (*func
)();
92 func
= (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule
, OUString::createFromAscii("getDataArea").pData
);
93 dataArea
= (sal_Unicode
*) (*func
)();
// Visits every one of the CACHE_MAX cache slots (loop body not visible here).
104 for (sal_Int32 i
= 0; i
< CACHE_MAX
; i
++)
107 #if USE_CELL_BOUNDARY_CODE
108 useCellBoundary
= sal_False
;
111 japaneseWordBreak
= sal_False
;
114 xdictionary::~xdictionary() {
115 osl_unloadModule(hModule
);
116 for (sal_Int32 i
= 0; i
< CACHE_MAX
; i
++) {
117 if (cache
[i
].size
> 0) {
118 delete cache
[i
].contents
;
119 delete cache
[i
].wordboundary
;
124 void xdictionary::setJapaneseWordBreak()
126 japaneseWordBreak
= sal_True
;
// Reports whether the code point `c` is known to the dictionary.
//
// existMark is read as a bitmap: byte (c >> 3), bit (c & 0x07). The bitmap is
// consulted only when existMark is non-null and (c >> 3) is below 0x1FFF;
// otherwise the bitmap result is sal_False.
// When the bitmap result is false and japaneseWordBreak is set, the answer is
// instead whether BreakIteratorImpl::getScriptClass(c) is ScriptType::ASIAN.
// NOTE(review): the remaining return path (original lines 134-136) is not
// visible in this listing; presumably it returns `exist` - confirm.
129 sal_Bool
xdictionary::exists(const sal_uInt32 c
) {
130 // 0x1FFF is the hardcoded limit in gendict for existMarks
131 sal_Bool exist
= (existMark
&& ((c
>>3) < 0x1FFF)) ? sal::static_int_cast
<sal_Bool
>((existMark
[c
>>3] & (1<<(c
&0x07))) != 0) : sal_False
;
132 if (!exist
&& japaneseWordBreak
)
133 return BreakIteratorImpl::getScriptClass(c
) == ScriptType::ASIAN
;
// Looks up the dictionary entries that start with str[0] and compares them
// against the characters that follow.
//
// @param str   text to match; str[0] selects the group of entries.
// @param sLen  number of UTF-16 units available at str.
// @return 0 when no dictionary is loaded (index1 is null) or when there is no
//         entry group for str[0]. NOTE(review): the return statements for the
//         matching case (original lines 160 ff.) are not visible in this
//         listing; presumably the length of the longest matching word.
138 sal_Int32
xdictionary::getLongestMatch(const sal_Unicode
* str
, sal_Int32 sLen
) {
140 if ( !index1
) return 0;
// First-level lookup by the high byte of the first character.
142 sal_Int16 idx
= index1
[str
[0] >> 8];
// 0xFF in index1 means there are no entries for this high byte.
144 if (idx
== 0xFF) return 0;
// Second-level index: first-level value combined with the low byte.
// NOTE(review): idx is a sal_Int16, so a first-level value >= 0x80 would not
// fit after the shift - confirm the generated tables stay below that.
146 idx
= (idx
<<8) | (str
[0]&0xff);
// index2[idx] and index2[idx+1] delimit the entries for this first character.
148 sal_uInt32 begin
= index2
[idx
], end
= index2
[idx
+1];
150 if (begin
== 0) return 0;
152 str
++; sLen
--; // first character is not stored in the dictionary
// Walk the entries from the last one down to begin+1.
153 for (sal_uInt32 i
= end
; i
> begin
; i
--) {
// lenArray holds offsets into dataArea; the difference of two neighbouring
// offsets is the length of entry i.
154 sal_Int32 len
= lenArray
[i
] - lenArray
[i
- 1];
156 const sal_Unicode
*dstr
= dataArea
+ lenArray
[i
-1];
// Count how many leading units of the entry agree with the text.
// (The declaration of pos and any sLen guard are not visible in this listing.)
159 while (pos
< len
&& dstr
[pos
] == str
[pos
]) { pos
++; }
// Default constructor. The only initializer visible in this listing sets
// wordboundary to NULL; the trailing comma shows that further initializers
// follow in lines that are not visible here.
173 WordBreakCache::WordBreakCache() :
176 wordboundary( NULL
),
182 * Compare two Unicode strings,
// Compares the cached text with the part of `str` described by `boundary`.
//
// @param str       text buffer; the compared range starts at boundary.startPos.
// @param boundary  range of `str` to compare against the cached contents.
// @return sal_False when the range length differs from `length` or when any
//         of the first `length` units differ. NOTE(review): the final return
//         is not visible in this listing; presumably sal_True.
185 sal_Bool
WordBreakCache::equals(const sal_Unicode
* str
, Boundary
& boundary
) {
186 // Different length, different string.
187 if (length
!= boundary
.endPos
- boundary
.startPos
) return sal_False
;
// Unit-by-unit comparison of the cached contents with the text range.
189 for (sal_Int32 i
= 0; i
< length
; i
++)
190 if (contents
[i
] != str
[i
+ boundary
.startPos
]) return sal_False
;
197 * Retrieve the segment containing the character at pos.
198 * @param pos : Position of the given character.
199 * @return true if CJK.
// Determines the segment around `pos` whose code points are either
// whitespace or known to the dictionary (see exists()).
//
// @param rText        text to examine.
// @param pos          UTF-16 index the segment is grown from.
// @param segBoundary  out: receives the start and end of the segment; both
//                     are first set to pos.
// @return true when the segment ends beyond the first code point that starts
//         at segBoundary.startPos, i.e. it covers more than one code point.
// NOTE(review): original lines 203, 206-207, 209, 213-217, 219 and 223-226
// are not visible in this listing; the initialisation of indexUtf16 and the
// loop exits are presumably there - confirm.
201 sal_Bool
xdictionary::seekSegment(const rtl::OUString
&rText
, sal_Int32 pos
,
202 Boundary
& segBoundary
)
204 sal_Int32 indexUtf16
;
205 segBoundary
.endPos
= segBoundary
.startPos
= pos
;
// Backward scan: step one code point back and move the start there while the
// code point is whitespace or in the dictionary.
208 while (indexUtf16
> 0)
210 sal_uInt32 ch
= rText
.iterateCodePoints(&indexUtf16
, -1);
211 if (u_isWhitespace(ch
) || exists(ch
))
212 segBoundary
.startPos
= indexUtf16
;
// Forward scan: step one code point ahead and move the end there while the
// code point is whitespace or in the dictionary.
218 while (indexUtf16
< rText
.getLength())
220 sal_uInt32 ch
= rText
.iterateCodePoints(&indexUtf16
, 1);
221 if (u_isWhitespace(ch
) || exists(ch
))
222 segBoundary
.endPos
= indexUtf16
;
// Advance one code point from the start; the segment counts only if it
// reaches past that point.
227 indexUtf16
= segBoundary
.startPos
;
228 rText
.iterateCodePoints(&indexUtf16
, 1);
229 return segBoundary
.endPos
> indexUtf16
;
// Classifies `c` by code range. The first test covers U+3041..U+309E (the
// Hiragana letters in Unicode), the second U+30A1..U+30FE and U+FF65..U+FF9F
// (Katakana and halfwidth Katakana).
// NOTE(review): the values returned for each range and for all other
// characters are not visible in this listing.
236 static sal_Int16
JapaneseCharType(sal_Unicode c
)
238 if (0x3041 <= c
&& c
<= 0x309e)
240 if ((0x30a1 <= c
&& c
<= 0x30fe) || (0xff65 <= c
&& c
<= 0xff9f))
// Returns the cache slot holding the word boundaries of the segment
// `wordBoundary` of `text`, computing them when the slot does not already
// hold that segment.
//
// @param text          start of the whole text buffer.
// @param wordBoundary  segment of `text` (startPos..endPos) to break into words.
// @return reference to the cache slot; its wordboundary array holds offsets
//         relative to wordBoundary.startPos.
// NOTE(review): a number of original lines (e.g. 251-252, 259-261, 265-266,
// 277-279, 285, 287-289, 292-297, 299-300, 304, 306-311, 313-314, 318,
// 320-323, 325-327) are not visible in this listing, so the comments below
// describe the visible statements only.
245 WordBreakCache
& xdictionary::getCache(const sal_Unicode
*text
, Boundary
& wordBoundary
)
// The slot is selected by the low five bits of text[0], i.e. the first unit
// of the whole buffer rather than of the segment.
248 WordBreakCache
& aCache
= cache
[text
[0] & 0x1f];
// Cache hit test: slot in use and holding exactly this segment.
250 if (aCache
.size
!= 0 && aCache
.equals(text
, wordBoundary
))
253 sal_Int32 len
= wordBoundary
.endPos
- wordBoundary
.startPos
;
// (Re)allocate the slot when it is unused or too small for this segment.
255 if (aCache
.size
== 0 || len
> aCache
.size
) {
256 if (aCache
.size
!= 0) {
// NOTE(review): both buffers are allocated with new[] a few lines below but
// released here with scalar delete; this should be delete[].
257 delete aCache
.contents
;
258 delete aCache
.wordboundary
;
262 aCache
.size
= len
> DEFAULT_SIZE
? len
: DEFAULT_SIZE
;
// One extra unit for the terminating zero, two extra boundary entries.
263 aCache
.contents
= new sal_Unicode
[aCache
.size
+ 1];
264 aCache
.wordboundary
= new sal_Int32
[aCache
.size
+ 2];
// Copy the segment into the slot and zero-terminate it.
267 memcpy(aCache
.contents
, text
+ wordBoundary
.startPos
, len
* sizeof(sal_Unicode
));
268 *(aCache
.contents
+ len
) = 0x0000;
269 // reset the wordboundary in cache
270 memset(aCache
.wordboundary
, '\0', sizeof(sal_Int32
)*(len
+ 2));
272 sal_Int32 i
= 0; // loop variable
// Produce boundaries until the last one reaches the end of the cached text.
273 while (aCache
.wordboundary
[i
] < aCache
.length
) {
275 // treat a run of consecutive white space as one word and cache it
276 while (u_isWhitespace((sal_uInt32
)text
[wordBoundary
.startPos
+ aCache
.wordboundary
[i
] + len
]))
// str / slen describe the text remaining after the current boundary.
280 const sal_Unicode
*str
= text
+ wordBoundary
.startPos
+ aCache
.wordboundary
[i
];
281 sal_Int32 slen
= aCache
.length
- aCache
.wordboundary
[i
];
282 sal_Int16 type
= 0, count
= 0;
// Advance through the remaining text until the dictionary yields a match
// (len != 0) or the text is exhausted.
283 for (;len
== 0 && slen
> 0; str
++, slen
--) {
284 len
= getLongestMatch(str
, slen
);
286 if (!japaneseWordBreak
) {
// Japanese mode: compare the character type with that of the current run.
290 type
= JapaneseCharType(*str
);
291 else if (type
!= JapaneseCharType(*str
))
// Close the run of `count` units that were skipped without a match.
298 aCache
.wordboundary
[i
+1] = aCache
.wordboundary
[i
] + count
;
301 #if USE_CELL_BOUNDARY_CODE
302 if (useCellBoundary
) {
// Look up the cell boundary for the unit just before the word boundary and
// store it back relative to the segment start.
303 sal_Int32 cBoundary
= cellBoundary
[aCache
.wordboundary
[i
] + wordBoundary
.startPos
- 1];
305 aCache
.wordboundary
[i
] = cBoundary
- wordBoundary
.startPos
;
// Record the end of the matched dictionary word.
312 aCache
.wordboundary
[i
+1] = aCache
.wordboundary
[i
] + len
;
315 #if USE_CELL_BOUNDARY_CODE
316 if (useCellBoundary
) {
317 sal_Int32 cBoundary
= cellBoundary
[aCache
.wordboundary
[i
] + wordBoundary
.startPos
- 1];
319 aCache
.wordboundary
[i
] = cBoundary
- wordBoundary
.startPos
;
// Final entry lies one past the cached length, so scans over the array with
// "<= position" (as in getWordBoundary) always terminate.
324 aCache
.wordboundary
[i
+ 1] = aCache
.length
+ 1;
329 Boundary
xdictionary::previousWord(const OUString
& rText
, sal_Int32 anyPos
, sal_Int16 wordType
)
331 // looking for the first non-whitespace character from anyPos
332 sal_uInt32 ch
= rText
.iterateCodePoints(&anyPos
, -1);
334 while (anyPos
> 0 && u_isWhitespace(ch
)) ch
= rText
.iterateCodePoints(&anyPos
, -1);
336 return getWordBoundary(rText
, anyPos
, wordType
, true);
339 Boundary
xdictionary::nextWord(const OUString
& rText
, sal_Int32 anyPos
, sal_Int16 wordType
)
341 boundary
= getWordBoundary(rText
, anyPos
, wordType
, true);
342 anyPos
= boundary
.endPos
;
343 if (anyPos
< rText
.getLength()) {
344 // looknig for the first non-whitespace character from anyPos
345 sal_uInt32 ch
= rText
.iterateCodePoints(&anyPos
, 1);
346 while (u_isWhitespace(ch
)) ch
=rText
.iterateCodePoints(&anyPos
, 1);
347 rText
.iterateCodePoints(&anyPos
, -1);
350 return getWordBoundary(rText
, anyPos
, wordType
, true);
// Computes the boundary of the word at anyPos and stores it in `boundary`.
//
// @param rText       text to examine.
// @param anyPos      UTF-16 index inside the word of interest.
// @param wordType    WordType constant; WordType::WORD_COUNT additionally
//                    extends the end over following punctuation.
// @param bDirection  when false, a position that sits exactly on a word start
//                    (other than the segment start) is treated specially -
//                    the statements of that branch are only partly visible.
// NOTE(review): original lines 354, 361-362, 364, 368, 372-373, 377, 381,
// 385 and 389-396 are not visible in this listing (declaration of i, several
// branch bodies, loop exit and the return statement).
353 Boundary
xdictionary::getWordBoundary(const OUString
& rText
, sal_Int32 anyPos
, sal_Int16 wordType
, sal_Bool bDirection
)
355 const sal_Unicode
*text
=rText
.getStr();
356 sal_Int32 len
=rText
.getLength();
// Out-of-range position: empty boundary at the start or the end of the text.
357 if (anyPos
>= len
|| anyPos
< 0) {
358 boundary
.startPos
= boundary
.endPos
= anyPos
< 0 ? 0 : len
;
// seekSegment() stores the surrounding segment in `boundary`; when it
// reports true the word breaks come from the cache.
359 } else if (seekSegment(rText
, anyPos
, boundary
)) { // character in dict
360 WordBreakCache
& aCache
= getCache(text
, boundary
);
// Find the first cached boundary that lies beyond anyPos (segment-relative).
363 while (aCache
.wordboundary
[i
] <= anyPos
- boundary
.startPos
) i
++;
365 sal_Int32 startPos
= aCache
.wordboundary
[i
- 1];
366 // if bDirection is false
367 if (!bDirection
&& startPos
> 0 && startPos
== (anyPos
- boundary
.startPos
))
// Read the code point that starts one UTF-16 unit before anyPos.
369 sal_Int32 indexUtf16
= anyPos
-1;
370 sal_uInt32 ch
= rText
.iterateCodePoints(&indexUtf16
, 1);
371 if (u_isWhitespace(ch
))
// Starting from the segment start, advance the end by wordboundary[i] and
// the start by wordboundary[i-1] code points.
374 boundary
.endPos
= boundary
.startPos
;
375 rText
.iterateCodePoints(&boundary
.endPos
, aCache
.wordboundary
[i
]);
376 rText
.iterateCodePoints(&boundary
.startPos
, aCache
.wordboundary
[i
-1]);
// Boundary covering just the single code point at anyPos (the branch this
// belongs to is not visible; presumably the non-dictionary case).
378 boundary
.startPos
= anyPos
;
379 if (anyPos
< len
) rText
.iterateCodePoints(&anyPos
, 1);
380 boundary
.endPos
= anyPos
< len
? anyPos
: len
;
382 if (wordType
== WordType::WORD_COUNT
) {
383 // skip punctuation for word count.
384 while (boundary
.endPos
< len
)
// Peek at the next code point; when it is punctuation the end moves past it.
386 sal_Int32 indexUtf16
= boundary
.endPos
;
387 if (u_ispunct(rText
.iterateCodePoints(&indexUtf16
, 1)))
388 boundary
.endPos
= indexUtf16
;
397 #if USE_CELL_BOUNDARY_CODE
398 void xdictionary::setCellBoundary(sal_Int32
* cellArray
)
400 useCellBoundary
= sal_True
;
401 cellBoundary
= cellArray
;