1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
21 // xdictionary.cpp: implementation of the xdictionary class.
23 //////////////////////////////////////////////////////////////////////
26 #include <rtl/ustrbuf.hxx>
28 #include <com/sun/star/i18n/WordType.hpp>
29 #include <xdictionary.hxx>
30 #include <unicode/uchar.h>
32 #include <breakiteratorImpl.hxx>
34 //////////////////////////////////////////////////////////////////////
35 // Construction/Destruction
36 //////////////////////////////////////////////////////////////////////
38 using ::rtl::OUString
;
39 using ::rtl::OUStringBuffer
;
41 namespace com
{ namespace sun
{ namespace star
{ namespace i18n
{
43 #ifndef DISABLE_DYNLOADING
// Empty function with C linkage. Its address is passed to
// osl_loadModuleRelative() in the xdictionary constructor when the
// dictionary module is loaded.
45 extern "C" { static void SAL_CALL
thisModule() {} }
// Accessors for the "ja" dictionary tables. The xdictionary constructor calls
// them directly when lang compares equal to "ja".
// NOTE(review): presumably these belong to the build variant without dynamic
// loading; the preprocessor line separating the variants is not visible here.
51 sal_uInt8
* getExistMark_ja();
52 sal_Int16
* getIndex1_ja();
53 sal_Int32
* getIndex2_ja();
54 sal_Int32
* getLenArray_ja();
55 sal_Unicode
* getDataArea_ja();
// Accessors for the "zh" dictionary tables, used the same way when lang
// compares equal to "zh".
57 sal_uInt8
* getExistMark_zh();
58 sal_Int16
* getIndex1_zh();
59 sal_Int32
* getIndex2_zh();
60 sal_Int32
* getLenArray_zh();
61 sal_Unicode
* getDataArea_zh();
// Builds a dictionary for the language tag `lang`.
//
// The five lookup tables (existMark, index1, index2, lenArray, dataArea) are
// obtained in one of two ways, both visible below:
//   - by loading a module named <SAL_DLLPREFIX>dict_<lang><SAL_DLLEXTENSION>
//     and calling its getExistMark/getIndex1/getIndex2/getLenArray/getDataArea
//     symbols, or
//   - by calling the getXxx_ja()/getXxx_zh() functions directly when lang is
//     "ja" or "zh".
// Afterwards the cache slots are walked and japaneseWordBreak is cleared.
//
// @param lang  ASCII language tag; appended to "dict_" to form the module
//              name, and compared against "ja" / "zh" in the direct-call path.
67 xdictionary::xdictionary(const sal_Char
*lang
) :
73 #ifndef DISABLE_DYNLOADING
77 japaneseWordBreak( sal_False
)
80 #ifndef DISABLE_DYNLOADING
// Two alternative declarations of aBuf follow; they differ only in the
// reserved capacity and in whether SAL_DLLPREFIX is prepended. The
// preprocessor lines that select between them are not visible here.
82 OUStringBuffer
aBuf( strlen(lang
) + 7 + 6 ); // mostly "lib*.so" (with * == dict_zh)
83 aBuf
.appendAscii( SAL_DLLPREFIX
);
85 OUStringBuffer
aBuf( strlen(lang
) + 7 + 4 ); // mostly "*.dll" (with * == dict_zh)
87 aBuf
.appendAscii( "dict_" ).appendAscii( lang
).appendAscii( SAL_DLLEXTENSION
);
// Load the module relative to the location of thisModule.
88 hModule
= osl_loadModuleRelative( &thisModule
, aBuf
.makeStringAndClear().pData
, SAL_LOADMODULE_DEFAULT
);
// Each table is fetched the same way: look up the accessor symbol in the
// module, call it, and cast the returned sal_IntPtr to the table's type.
// NOTE(review): the result of osl_getFunctionSymbol() is called without a
// null check in the visible lines - confirm that a guard on hModule (and a
// fallback that clears the table pointers) exists on the lines not shown.
91 func
= (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule
, OUString("getExistMark").pData
);
92 existMark
= (sal_uInt8
*) (*func
)();
93 func
= (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule
, OUString("getIndex1").pData
);
94 index1
= (sal_Int16
*) (*func
)();
95 func
= (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule
, OUString("getIndex2").pData
);
96 index2
= (sal_Int32
*) (*func
)();
97 func
= (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule
, OUString("getLenArray").pData
);
98 lenArray
= (sal_Int32
*) (*func
)();
99 func
= (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule
, OUString("getDataArea").pData
);
100 dataArea
= (sal_Unicode
*) (*func
)();
// Direct-call path: pick the table accessors by language tag.
112 if( strcmp( lang
, "ja" ) == 0 ) {
113 existMark
= getExistMark_ja();
114 index1
= getIndex1_ja();
115 index2
= getIndex2_ja();
116 lenArray
= getLenArray_ja();
117 dataArea
= getDataArea_ja();
119 else if( strcmp( lang
, "zh" ) == 0 ) {
120 existMark
= getExistMark_zh();
121 index1
= getIndex1_zh();
122 index2
= getIndex2_zh();
123 lenArray
= getLenArray_zh();
124 dataArea
= getDataArea_zh();
// Visit every cache slot (the loop body is not visible here; presumably it
// marks each slot as empty - the destructor and getCache() test size against 0).
136 for (sal_Int32 i
= 0; i
< CACHE_MAX
; i
++)
// Japanese word-break mode is off until setJapaneseWordBreak() is called.
139 japaneseWordBreak
= sal_False
;
// Releases what the dictionary owns: unloads the dictionary module (only
// compiled when DISABLE_DYNLOADING is not defined) and frees the contents and
// wordboundary arrays of every cache slot whose size is greater than zero.
142 xdictionary::~xdictionary() {
143 #ifndef DISABLE_DYNLOADING
144 osl_unloadModule(hModule
);
146 for (sal_Int32 i
= 0; i
< CACHE_MAX
; i
++) {
// Slots with size 0 are skipped, so their pointers are never deleted.
147 if (cache
[i
].size
> 0) {
148 delete [] cache
[i
].contents
;
149 delete [] cache
[i
].wordboundary
;
// Switches the dictionary into Japanese word-break mode by setting the
// japaneseWordBreak flag. The flag is read by exists() (fallback to the
// script-class test) and by getCache() (character-type based segmentation).
154 void xdictionary::setJapaneseWordBreak()
156 japaneseWordBreak
= sal_True
;
// Tests whether code point `c` is marked in the existMark bit table.
//
// existMark is treated as a bit array: byte c>>3, bit c&7. The lookup is only
// performed when existMark is non-null and c>>3 is below 0x1FFF; otherwise the
// character counts as not marked.
//
// When the character is not marked and Japanese word-break mode is on, the
// result is instead whether BreakIteratorImpl classifies c as ScriptType::ASIAN.
// (The return statement for the remaining case is not visible here.)
//
// @param c  code point to test
159 sal_Bool
xdictionary::exists(const sal_uInt32 c
) {
160 // 0x1FFF is the hardcoded limit in gendict for existMarks
161 sal_Bool exist
= (existMark
&& ((c
>>3) < 0x1FFF)) ? sal::static_int_cast
<sal_Bool
>((existMark
[c
>>3] & (1<<(c
&0x07))) != 0) : sal_False
;
162 if (!exist
&& japaneseWordBreak
)
163 return BreakIteratorImpl::getScriptClass(c
) == ScriptType::ASIAN
;
// Looks up dictionary entries that start with str[0] and compares them
// against the text following it.
//
// Lookup is two-level: index1 is indexed by the high byte of str[0]; its
// value, combined with the low byte of str[0], indexes index2, which yields
// the range (begin, end] of candidate entries in lenArray / dataArea.
// Returns 0 when index1 is null, when the index1 entry is 0xFF, or when
// index2[idx] is 0. The handling of a successful comparison is on lines not
// visible here.
//
// @param str   text to match; str[0] selects the candidate range
// @param sLen  number of characters available at str
168 sal_Int32
xdictionary::getLongestMatch(const sal_Unicode
* str
, sal_Int32 sLen
) {
// No dictionary tables available.
170 if ( !index1
) return 0;
172 sal_Int16 idx
= index1
[str
[0] >> 8];
// 0xFF in index1 means there is no second-level entry for this high byte.
174 if (idx
== 0xFF) return 0;
// NOTE(review): idx is a signed 16-bit value; if index1 can hold values of
// 0x80 or above, the shifted result no longer fits and idx would become
// negative - confirm the value range produced by the dictionary generator.
176 idx
= (idx
<<8) | (str
[0]&0xff);
178 sal_uInt32 begin
= index2
[idx
], end
= index2
[idx
+1];
180 if (begin
== 0) return 0;
182 str
++; sLen
--; // first character is not stored in the dictionary
// Walk the candidates from the last one (end) down to begin+1.
183 for (sal_uInt32 i
= end
; i
> begin
; i
--) {
// Entry i occupies dataArea[lenArray[i-1] .. lenArray[i]).
184 sal_Int32 len
= lenArray
[i
] - lenArray
[i
- 1];
186 const sal_Unicode
*dstr
= dataArea
+ lenArray
[i
-1];
// Count how many leading characters of the entry agree with the text
// (the declaration of pos is not visible here).
189 while (pos
< len
&& dstr
[pos
] == str
[pos
]) { pos
++; }
// Constructs a cache slot. Only the wordboundary initializer (set to NULL) is
// visible here; the remaining member initializers are on lines not shown.
203 WordBreakCache::WordBreakCache() :
206 wordboundary( NULL
),
212 * Compare two unicode strings,
// Checks whether the cached text equals the slice of `str` described by
// `boundary`, i.e. str[boundary.startPos .. boundary.endPos).
// Returns sal_False as soon as the lengths or any character differ; the
// return for the matching case is on a line not visible here.
//
// @param str       full text buffer the boundary refers to
// @param boundary  start/end offsets of the segment inside str
215 sal_Bool
WordBreakCache::equals(const sal_Unicode
* str
, Boundary
& boundary
) {
216 // Different length, different string.
217 if (length
!= boundary
.endPos
- boundary
.startPos
) return sal_False
;
// Character-by-character comparison; contents is indexed from 0 while str
// is indexed relative to boundary.startPos.
219 for (sal_Int32 i
= 0; i
< length
; i
++)
220 if (contents
[i
] != str
[i
+ boundary
.startPos
]) return sal_False
;
227 * Retrieve the segment containing the character at pos.
228 * @param pos : Position of the given character.
229 * @return true if CJK.
// Determines the segment around `pos` and stores it in segBoundary.
//
// Both ends start at pos. The text is then scanned backwards and forwards one
// code point at a time; while the code point is whitespace or is found by
// exists(), the corresponding end of the segment is moved. (The statements
// that initialise indexUtf16 before each scan and that leave the loops are on
// lines not visible here.)
//
// @param rText        text being segmented
// @param pos          UTF-16 index of the character of interest
// @param segBoundary  receives the segment's start and end offsets
// @return true when the segment extends past the first code point at
//         segBoundary.startPos, i.e. it holds more than one code point.
231 sal_Bool
xdictionary::seekSegment(const rtl::OUString
&rText
, sal_Int32 pos
,
232 Boundary
& segBoundary
)
234 sal_Int32 indexUtf16
;
235 segBoundary
.endPos
= segBoundary
.startPos
= pos
;
// Backward scan: iterateCodePoints(-1) steps indexUtf16 back by one code
// point and yields that code point.
238 while (indexUtf16
> 0)
240 sal_uInt32 ch
= rText
.iterateCodePoints(&indexUtf16
, -1);
241 if (u_isWhitespace(ch
) || exists(ch
))
242 segBoundary
.startPos
= indexUtf16
;
// Forward scan: iterateCodePoints(1) yields the code point at indexUtf16
// and advances past it.
248 while (indexUtf16
< rText
.getLength())
250 sal_uInt32 ch
= rText
.iterateCodePoints(&indexUtf16
, 1);
251 if (u_isWhitespace(ch
) || exists(ch
))
252 segBoundary
.endPos
= indexUtf16
;
// Step over the first code point of the segment and report whether
// anything lies beyond it.
257 indexUtf16
= segBoundary
.startPos
;
258 rText
.iterateCodePoints(&indexUtf16
, 1);
259 return segBoundary
.endPos
> indexUtf16
;
// Classifies a UTF-16 unit by code range for Japanese word breaking. Two
// ranges are tested; the values returned for each case are on lines not
// visible here.
//
// @param c  character to classify
266 static sal_Int16
JapaneseCharType(sal_Unicode c
)
// U+3041..U+309E lies in the Unicode Hiragana block.
268 if (0x3041 <= c
&& c
<= 0x309e)
// U+30A1..U+30FE lies in the Katakana block; U+FF65..U+FF9F are the
// halfwidth Katakana forms.
270 if ((0x30a1 <= c
&& c
<= 0x30fe) || (0xff65 <= c
&& c
<= 0xff9f))
// Returns the cache slot holding the word boundaries for the segment
// text[wordBoundary.startPos .. wordBoundary.endPos), computing them if the
// slot does not already hold this segment.
//
// The slot is chosen from the low five bits of text[0]. On a miss, the
// segment is copied into the slot and wordboundary[] is filled with
// successive offsets (relative to the segment start) at which words begin,
// using getLongestMatch() and, in Japanese mode, JapaneseCharType().
// Several statements (the cache-hit return, the assignment of rCache.length,
// resets of len, loop increments and the final return) are on lines not
// visible here.
//
// @param text          full text buffer
// @param wordBoundary  segment of text to be split into words
275 WordBreakCache
& xdictionary::getCache(const sal_Unicode
*text
, Boundary
& wordBoundary
)
// Slot index comes from the first character of the whole buffer (text[0]),
// not from the first character of the segment.
277 WordBreakCache
& rCache
= cache
[text
[0] & 0x1f];
// Cache hit: the slot is populated and holds exactly this segment.
279 if (rCache
.size
!= 0 && rCache
.equals(text
, wordBoundary
))
282 sal_Int32 len
= wordBoundary
.endPos
- wordBoundary
.startPos
;
// (Re)allocate when the slot is empty or too small for this segment.
284 if (rCache
.size
== 0 || len
> rCache
.size
) {
285 if (rCache
.size
!= 0) {
// NOTE(review): both buffers are allocated with new[] a few lines below
// and the destructor releases them with delete[]; the scalar delete
// used here does not match and should be delete [].
286 delete rCache
.contents
;
287 delete rCache
.wordboundary
;
// Capacity is at least DEFAULT_SIZE; contents gets one extra element for
// the terminator, wordboundary gets two extra entries.
291 rCache
.size
= len
> DEFAULT_SIZE
? len
: DEFAULT_SIZE
;
292 rCache
.contents
= new sal_Unicode
[rCache
.size
+ 1];
293 rCache
.wordboundary
= new sal_Int32
[rCache
.size
+ 2];
// Copy the segment into the slot and zero-terminate it.
296 memcpy(rCache
.contents
, text
+ wordBoundary
.startPos
, len
* sizeof(sal_Unicode
));
297 *(rCache
.contents
+ len
) = 0x0000;
298 // reset the wordboundary in cache
299 memset(rCache
.wordboundary
, '\0', sizeof(sal_Int32
)*(len
+ 2));
// Fill wordboundary[i+1] from wordboundary[i] until the whole segment is
// covered.
301 sal_Int32 i
= 0; // loop variable
302 while (rCache
.wordboundary
[i
] < rCache
.length
) {
304 // treat a run of continuous white space as one word and cache it
// NOTE(review): len is used as the running offset here, so it is
// presumably reset to 0 on a line not visible above - confirm.
305 while (u_isWhitespace((sal_uInt32
)text
[wordBoundary
.startPos
+ rCache
.wordboundary
[i
] + len
]))
// str/slen describe the not yet segmented remainder of the segment.
309 const sal_Unicode
*str
= text
+ wordBoundary
.startPos
+ rCache
.wordboundary
[i
];
310 sal_Int32 slen
= rCache
.length
- rCache
.wordboundary
[i
];
311 sal_Int16 type
= 0, count
= 0;
// Advance one character at a time until the dictionary reports a match
// (len != 0) or the remainder is exhausted.
312 for (;len
== 0 && slen
> 0; str
++, slen
--) {
313 len
= getLongestMatch(str
, slen
);
315 if (!japaneseWordBreak
) {
// Japanese mode: compare the type of each character with the type
// recorded in `type`.
319 type
= JapaneseCharType(*str
);
320 else if (type
!= JapaneseCharType(*str
))
// Next boundary after `count` unmatched characters ...
328 rCache
.wordboundary
[i
+1] = rCache
.wordboundary
[i
] + count
;
// ... or after a word of `len` characters.
334 rCache
.wordboundary
[i
+1] = rCache
.wordboundary
[i
] + len
;
// Final entry lies beyond the segment length.
338 rCache
.wordboundary
[i
+ 1] = rCache
.length
+ 1;
// Returns the boundary of the word preceding anyPos.
//
// Steps back one code point from anyPos, keeps stepping back while the code
// point read is whitespace and anyPos is still above 0, then delegates to
// getWordBoundary() with bDirection set to true.
//
// @param rText     text to search
// @param anyPos    UTF-16 index to start from (local copy is modified)
// @param wordType  WordType constant forwarded to getWordBoundary()
343 Boundary
xdictionary::previousWord(const OUString
& rText
, sal_Int32 anyPos
, sal_Int16 wordType
)
345 // looking for the first non-whitespace character from anyPos
346 sal_uInt32 ch
= rText
.iterateCodePoints(&anyPos
, -1);
348 while (anyPos
> 0 && u_isWhitespace(ch
)) ch
= rText
.iterateCodePoints(&anyPos
, -1);
350 return getWordBoundary(rText
, anyPos
, wordType
, true);
// Returns the boundary of the word following the one at anyPos.
//
// First finds the word containing anyPos and moves to its end. If that is
// not the end of the text, whitespace is skipped forward and the word found
// at the resulting position is returned. The result of the first lookup is
// stored in the member `boundary`.
//
// @param rText     text to search
// @param anyPos    UTF-16 index to start from (local copy is modified)
// @param wordType  WordType constant forwarded to getWordBoundary()
353 Boundary
xdictionary::nextWord(const OUString
& rText
, sal_Int32 anyPos
, sal_Int16 wordType
)
355 boundary
= getWordBoundary(rText
, anyPos
, wordType
, true);
356 anyPos
= boundary
.endPos
;
357 if (anyPos
< rText
.getLength()) {
358 // looking for the first non-whitespace character from anyPos
359 sal_uInt32 ch
= rText
.iterateCodePoints(&anyPos
, 1);
// NOTE(review): this loop has no explicit check against
// rText.getLength() - confirm the behaviour when the text ends in
// whitespace.
360 while (u_isWhitespace(ch
)) ch
=rText
.iterateCodePoints(&anyPos
, 1);
// iterateCodePoints(1) has already advanced past the non-whitespace
// code point; step back so anyPos points at it again.
361 rText
.iterateCodePoints(&anyPos
, -1);
364 return getWordBoundary(rText
, anyPos
, wordType
, true);
// Computes the boundary of the word at anyPos and stores it in the member
// `boundary`.
//
// Three cases are visible:
//   - anyPos outside the text: an empty boundary at 0 (anyPos < 0) or at the
//     text length;
//   - seekSegment() reports a dictionary segment: the word offsets cached by
//     getCache() are searched for the word containing anyPos;
//   - otherwise: the boundary covers the single code point at anyPos.
// For WordType::WORD_COUNT the end is then extended over punctuation.
// The declaration of i, a few branch bodies and the final return are on lines
// not visible here.
//
// @param rText       text to search
// @param anyPos      UTF-16 index of interest
// @param wordType    WordType constant; only WORD_COUNT is tested here
// @param bDirection  when false, a position at the start of a word that is
//                    preceded by whitespace gets special handling
367 Boundary
xdictionary::getWordBoundary(const OUString
& rText
, sal_Int32 anyPos
, sal_Int16 wordType
, sal_Bool bDirection
)
369 const sal_Unicode
*text
=rText
.getStr();
370 sal_Int32 len
=rText
.getLength();
// Out-of-range position: clamp to an empty boundary at the nearest end.
371 if (anyPos
>= len
|| anyPos
< 0) {
372 boundary
.startPos
= boundary
.endPos
= anyPos
< 0 ? 0 : len
;
373 } else if (seekSegment(rText
, anyPos
, boundary
)) { // character in dict
374 WordBreakCache
& aCache
= getCache(text
, boundary
);
// Find the first cached offset that lies beyond anyPos (offsets are
// relative to the segment start); word i-1 .. i then contains anyPos.
377 while (aCache
.wordboundary
[i
] <= anyPos
- boundary
.startPos
) i
++;
379 sal_Int32 startPos
= aCache
.wordboundary
[i
- 1];
380 // if bDirection is false
381 if (!bDirection
&& startPos
> 0 && startPos
== (anyPos
- boundary
.startPos
))
// Read the code point just before anyPos.
383 sal_Int32 indexUtf16
= anyPos
-1;
384 sal_uInt32 ch
= rText
.iterateCodePoints(&indexUtf16
, 1);
385 if (u_isWhitespace(ch
))
// endPos must be derived from the segment start before startPos itself is
// advanced; both are moved by the cached offsets via iterateCodePoints.
388 boundary
.endPos
= boundary
.startPos
;
389 rText
.iterateCodePoints(&boundary
.endPos
, aCache
.wordboundary
[i
]);
390 rText
.iterateCodePoints(&boundary
.startPos
, aCache
.wordboundary
[i
-1]);
// Fallback: the boundary spans the single code point at anyPos, clamped
// to the text length.
392 boundary
.startPos
= anyPos
;
393 if (anyPos
< len
) rText
.iterateCodePoints(&anyPos
, 1);
394 boundary
.endPos
= anyPos
< len
? anyPos
: len
;
396 if (wordType
== WordType::WORD_COUNT
) {
397 // skip punctuation for word count.
398 while (boundary
.endPos
< len
)
// Peek at the next code point; extend the boundary over it when it is
// punctuation.
400 sal_Int32 indexUtf16
= boundary
.endPos
;
401 if (u_ispunct(rText
.iterateCodePoints(&indexUtf16
, 1)))
402 boundary
.endPos
= indexUtf16
;
413 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */