1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <config_folders.h>
23 #include <osl/mutex.hxx>
24 #include <rtl/ustrbuf.hxx>
25 #include <rtl/bootstrap.hxx>
26 #include <com/sun/star/i18n/WordType.hpp>
27 #include <xdictionary.hxx>
28 #include <unicode/uchar.h>
30 #include <breakiteratorImpl.hxx>
32 namespace com
{ namespace sun
{ namespace star
{ namespace i18n
{
34 #ifdef DICT_JA_ZH_IN_DATAFILE
36 #elif !defined DISABLE_DYNLOADING
// Empty function with C linkage. Its address is passed as the first argument
// to osl_loadModuleRelative() in initDictionaryData() - presumably so that the
// dictionary library is located relative to the module containing this code.
38 extern "C" { static void SAL_CALL
thisModule() {} }
// Accessors for the "ja" and "zh" dictionary tables. The xdictionary
// constructor assigns their results to data.existMark, data.index1,
// data.index2, data.lenArray and data.dataArea.
// NOTE(review): the preprocessor branch these declarations belong to is not
// visible in this listing - presumably the build without dynamic loading.
44 sal_uInt8
* getExistMark_ja();
45 sal_Int16
* getIndex1_ja();
46 sal_Int32
* getIndex2_ja();
47 sal_Int32
* getLenArray_ja();
48 sal_Unicode
* getDataArea_ja();
50 sal_uInt8
* getExistMark_zh();
51 sal_Int16
* getIndex1_zh();
52 sal_Int32
* getIndex2_zh();
53 sal_Int32
* getLenArray_zh();
54 sal_Unicode
* getDataArea_zh();
// Constructs the dictionary for the language code `lang`.
//
// The table pointers in `data` (existMark, index1, index2, lenArray, dataArea)
// are obtained in a way selected at compile time:
//  - DICT_JA_ZH_IN_DATAFILE: for "ja" or "zh", a data file whose URL starts
//    with "$BRAND_BASE_DIR/" LIBO_SHARE_FOLDER "/dict_" is opened and mapped,
//    and the table pointers are computed from offsets stored at its end;
//  - dynamic loading enabled: initDictionaryData(lang) is called;
//  - the get*_ja() / get*_zh() accessors are called directly (presumably in
//    the remaining #else branch, whose directive is not shown).
// japaneseWordBreak is initialised to false.
//
// NOTE(review): this listing omits some lines of the constructor, so the
// comments here cover only the statements that are shown.
60 xdictionary::xdictionary(const sal_Char
*lang
) :
62 japaneseWordBreak( false )
65 #ifdef DICT_JA_ZH_IN_DATAFILE
67 if( strcmp( lang
, "ja" ) == 0 || strcmp( lang
, "zh" ) == 0 )
// Common URL prefix of the data file; expandMacros() is applied to it before
// the language is examined (the language-specific suffix is presumably
// appended on lines not shown here).
69 OUString
sUrl( "$BRAND_BASE_DIR/" LIBO_SHARE_FOLDER
"/dict_" );
70 rtl::Bootstrap::expandMacros(sUrl
);
72 if( strcmp( lang
, "ja" ) == 0 )
74 else if( strcmp( lang
, "zh" ) == 0 )
77 oslFileHandle aFileHandle
;
// Open the file, query its size and map it; the table pointers below are set
// only when all three calls return osl_File_E_None.
// NOTE(review): no unmap/close of the mapping or handle is visible here.
80 if( osl_openFile( sUrl
.pData
, &aFileHandle
, osl_File_OpenFlag_Read
) == osl_File_E_None
&&
81 osl_getFileSize( aFileHandle
, &nFileSize
) == osl_File_E_None
&&
82 osl_mapFile( aFileHandle
, (void **) &pMapping
, nFileSize
, 0, osl_File_MapFlag_RandomAccess
) == osl_File_E_None
)
84 // We have the offsets to the parts of the file at its end, see gendict.cxx
// pEOF points just past the mapping; pEOF[-1] .. pEOF[-5] are the last five
// sal_Int64 values of the file, each used as a byte offset from pMapping.
// NOTE(review): the offsets are used as read; no check against nFileSize is
// visible here.
85 sal_Int64
*pEOF
= (sal_Int64
*)(pMapping
+ nFileSize
);
87 data
.existMark
= (sal_uInt8
*) (pMapping
+ pEOF
[-1]);
88 data
.index2
= (sal_Int32
*) (pMapping
+ pEOF
[-2]);
89 data
.index1
= (sal_Int16
*) (pMapping
+ pEOF
[-3]);
90 data
.lenArray
= (sal_Int32
*) (pMapping
+ pEOF
[-4]);
91 data
.dataArea
= (sal_Unicode
*) (pMapping
+ pEOF
[-5]);
95 #elif !defined DISABLE_DYNLOADING
97 initDictionaryData( lang
);
// Tables taken directly from the get*_ja() / get*_zh() accessors.
101 if( strcmp( lang
, "ja" ) == 0 ) {
102 data
.existMark
= getExistMark_ja();
103 data
.index1
= getIndex1_ja();
104 data
.index2
= getIndex2_ja();
105 data
.lenArray
= getLenArray_ja();
106 data
.dataArea
= getDataArea_ja();
108 else if( strcmp( lang
, "zh" ) == 0 ) {
109 data
.existMark
= getExistMark_zh();
110 data
.index1
= getIndex1_zh();
111 data
.index2
= getIndex2_zh();
112 data
.lenArray
= getLenArray_zh();
113 data
.dataArea
= getDataArea_zh();
// Visits every one of the CACHE_MAX cache slots (loop body not shown in this
// listing).
118 for (sal_Int32 i
= 0; i
< CACHE_MAX
; i
++)
// Same value as the member initializer above.
121 japaneseWordBreak
= false;
// Destructor: for each of the CACHE_MAX cache slots whose size is greater
// than zero, releases the slot's contents and wordboundary arrays with
// delete[]. Slots with size 0 are left untouched.
124 xdictionary::~xdictionary()
126 for (sal_Int32 i
= 0; i
< CACHE_MAX
; i
++) {
127 if (cache
[i
].size
> 0) {
128 delete [] cache
[i
].contents
;
129 delete [] cache
[i
].wordboundary
;
138 xdictionarydata maData
;
142 #if !defined(DICT_JA_ZH_IN_DATAFILE) && !defined(DISABLE_DYNLOADING)
// Fills `data` with the dictionary tables for the language code pLang.
//
// Entries are kept in a function-local static vector (aLoadedCache) that is
// accessed while holding the global mutex. If an entry with the same language
// string already exists, its tables are copied into `data`. Otherwise a new
// entry is built: a library named from "dict_" + pLang + SAL_DLLEXTENSION
// (with SAL_DLLPREFIX prepended in one build variant) is loaded relative to
// thisModule, its five accessor symbols are resolved and called, and the
// entry is appended to the cache.
//
// NOTE(review): some lines of this function are missing from the listing
// (e.g. the declaration of aEntry and the return after a cache hit), so the
// comments describe only the visible statements.
144 void xdictionary::initDictionaryData(const sal_Char
*pLang
)
146 // Global cache, never released for performance
147 static std::vector
< datacache
> aLoadedCache
;
// The guard holds the global mutex for the rest of the scope it is declared in.
149 osl::MutexGuard
aGuard( osl::Mutex::getGlobalMutex() );
// Linear search for an entry previously created for this language.
150 for( size_t i
= 0; i
< aLoadedCache
.size(); ++i
)
152 if( !strcmp( pLang
, aLoadedCache
[ i
].maLang
.getStr() ) )
154 data
= aLoadedCache
[ i
].maData
;
159 // otherwise add to the cache, positive or negative.
161 aEntry
.maLang
= OString( pLang
, strlen( pLang
) );
// Two alternative declarations of aBuf follow; presumably they sit in
// different preprocessor branches whose directives are not shown.
164 OUStringBuffer
aBuf( sal::static_int_cast
<int>(strlen(pLang
) + 7 + 6) ); // mostly "lib*.so" (with * == dict_zh)
165 aBuf
.appendAscii( SAL_DLLPREFIX
);
167 OUStringBuffer
aBuf( sal::static_int_cast
<int>(strlen(pLang
) + 7 + 4) ); // mostly "*.dll" (with * == dict_zh)
169 aBuf
.appendAscii( "dict_" ).appendAscii( pLang
).appendAscii( SAL_DLLEXTENSION
);
170 aEntry
.mhModule
= osl_loadModuleRelative( &thisModule
, aBuf
.makeStringAndClear().pData
, SAL_LOADMODULE_DEFAULT
);
// Only when the module handle is non-null: resolve each accessor by name and
// call it immediately to obtain the table pointer.
// NOTE(review): the results of osl_getAsciiFunctionSymbol() are called
// without a visible null check.
171 if( aEntry
.mhModule
) {
172 oslGenericFunction func
;
173 func
= osl_getAsciiFunctionSymbol( aEntry
.mhModule
, "getExistMark" );
174 aEntry
.maData
.existMark
= reinterpret_cast<sal_uInt8
const * (*)()>(func
)();
175 func
= osl_getAsciiFunctionSymbol( aEntry
.mhModule
, "getIndex1" );
176 aEntry
.maData
.index1
= reinterpret_cast<sal_Int16
const * (*)()>(func
)();
177 func
= osl_getAsciiFunctionSymbol( aEntry
.mhModule
, "getIndex2" );
178 aEntry
.maData
.index2
= reinterpret_cast<sal_Int32
const * (*)()>(func
)();
179 func
= osl_getAsciiFunctionSymbol( aEntry
.mhModule
, "getLenArray" );
180 aEntry
.maData
.lenArray
= reinterpret_cast<sal_Int32
const * (*)()>(func
)();
181 func
= osl_getAsciiFunctionSymbol( aEntry
.mhModule
, "getDataArea" );
182 aEntry
.maData
.dataArea
= reinterpret_cast<sal_Unicode
const * (*)()>(func
)();
// Publish the tables to this instance and remember the entry for later calls.
185 data
= aEntry
.maData
;
186 aLoadedCache
.push_back( aEntry
);
// Sets japaneseWordBreak to true. The flag is consulted by exists() and
// getCache().
191 void xdictionary::setJapaneseWordBreak()
193 japaneseWordBreak
= true;
// Tests whether code point c is marked in the data.existMark bit table.
//
// The bit examined is bit (c & 0x07) of byte (c >> 3); the lookup is made only
// when the table pointer is non-null and (c >> 3) is below 0x1FFF. When the
// code point is not marked and japaneseWordBreak is set, the result is whether
// BreakIteratorImpl::getScriptClass(c) equals ScriptType::ASIAN.
// NOTE(review): the final return is not shown in this listing - presumably it
// returns `exist`.
196 bool xdictionary::exists(const sal_uInt32 c
)
198 // 0x1FFF is the hardcoded limit in gendict for data.existMarks
199 bool exist
= data
.existMark
&& (c
>>3) < 0x1FFF && (data
.existMark
[c
>>3] & (1<<(c
&0x07))) != 0;
200 if (!exist
&& japaneseWordBreak
)
201 return BreakIteratorImpl::getScriptClass(c
) == ScriptType::ASIAN
;
// Looks up the text starting at str in the dictionary tables.
//
// @param str   first character of the text to match
// @param sLen  number of characters available at str
// @return 0 when data.index1 is null, when the index1 entry for the high byte
//         of str[0] is 0xFF, or when the index2 start value is 0. The value
//         returned on a match is on lines not shown in this listing.
//
// The first character selects a range (begin, end] in data.lenArray via
// index1/index2; candidates are visited from `end` downwards, and each
// candidate's stored characters are compared with the text after str[0].
206 sal_Int32
xdictionary::getLongestMatch(const sal_Unicode
* str
, sal_Int32 sLen
)
208 if ( !data
.index1
) return 0;
// First-level index, keyed by the high byte of the first character.
210 sal_Int16 idx
= data
.index1
[str
[0] >> 8];
212 if (idx
== 0xFF) return 0;
// Second-level key: first-level value in the high byte, low byte of the
// character in the low byte.
// NOTE(review): idx is a signed 16-bit value; this presumably relies on
// index1 entries being below 0x80 - confirm.
214 idx
= (idx
<<8) | (str
[0]&0xff);
216 sal_uInt32 begin
= data
.index2
[idx
], end
= data
.index2
[idx
+1];
218 if (begin
== 0) return 0;
220 str
++; sLen
--; // first character is not stored in the dictionary
221 for (sal_uInt32 i
= end
; i
> begin
; i
--) {
// Length of candidate i is the difference of adjacent lenArray entries; its
// characters start at dataArea + lenArray[i-1].
222 sal_Int32 len
= data
.lenArray
[i
] - data
.lenArray
[i
- 1];
224 const sal_Unicode
*dstr
= data
.dataArea
+ data
.lenArray
[i
-1];
// Advance pos while the candidate and the text agree.
227 while (pos
< len
&& dstr
[pos
] == str
[pos
]) { pos
++; }
// Default constructor. The visible initializer sets wordboundary to NULL; the
// other member initializers are not shown in this listing.
241 WordBreakCache::WordBreakCache() :
244 wordboundary( NULL
),
250 * Compare two Unicode strings.
// Compares the cached text with the part of str delimited by boundary.
//
// @param str       text buffer; characters are read from
//                  str[boundary.startPos] onwards
// @param boundary  start/end positions of the segment within str
// @return false when the cached length differs from
//         boundary.endPos - boundary.startPos, or when any cached character
//         differs from the corresponding character of str.
// NOTE(review): the final return is not shown in this listing - presumably
// true.
253 bool WordBreakCache::equals(const sal_Unicode
* str
, Boundary
& boundary
)
255 // Different length, different string.
256 if (length
!= boundary
.endPos
- boundary
.startPos
) return false;
258 for (sal_Int32 i
= 0; i
< length
; i
++)
259 if (contents
[i
] != str
[i
+ boundary
.startPos
]) return false;
266 * Retrieve the segment containing the character at pos.
267 * @param pos : Position of the given character.
268 * @return true if CJK.
// Determines the segment of rText around pos and stores it in segBoundary.
//
// @param rText        text to examine
// @param pos          UTF-16 index of the character of interest
// @param segBoundary  receives the start/end of the segment
// @return whether segBoundary.endPos lies beyond the index reached by
//         advancing one code point from the segment start, i.e. whether the
//         segment is longer than one code point.
//
// The last text and boundary are remembered in segmentCachedString /
// segmentCachedBoundary; a position inside the remembered boundary is
// answered from it without rescanning.
// NOTE(review): several lines (else/break statements, the initialisation of
// indexUtf16 before each scan) are missing from this listing.
270 bool xdictionary::seekSegment(const OUString
&rText
, sal_Int32 pos
,
271 Boundary
& segBoundary
)
273 sal_Int32 indexUtf16
;
275 if (segmentCachedString
.pData
!= rText
.pData
) {
276 // Cache the passed text so we can avoid regenerating the segment if it's the same
277 // (pData is refcounted and assigning the OUString references it, which ensures that
278 // the object is the same if we get the same pointer back later)
279 segmentCachedString
= rText
;
281 // If pos is within the cached boundary, use that boundary
282 if (pos
>= segmentCachedBoundary
.startPos
&& pos
<= segmentCachedBoundary
.endPos
) {
283 segBoundary
.startPos
= segmentCachedBoundary
.startPos
;
284 segBoundary
.endPos
= segmentCachedBoundary
.endPos
;
// Same result computation as at the end of the function, from cached values.
285 indexUtf16
= segmentCachedBoundary
.startPos
;
286 rText
.iterateCodePoints(&indexUtf16
, 1);
287 return segmentCachedBoundary
.endPos
> indexUtf16
;
// Start from an empty segment at pos and grow it in both directions.
291 segBoundary
.endPos
= segBoundary
.startPos
= pos
;
// Backward scan, one code point per step: a whitespace or dictionary
// character moves the segment start to its index (the handling of other
// characters is on lines not shown - presumably the scan stops).
294 while (indexUtf16
> 0)
296 sal_uInt32 ch
= rText
.iterateCodePoints(&indexUtf16
, -1);
297 if (u_isWhitespace(ch
) || exists(ch
))
298 segBoundary
.startPos
= indexUtf16
;
// Forward scan, mirroring the backward one for the segment end.
304 while (indexUtf16
< rText
.getLength())
306 sal_uInt32 ch
= rText
.iterateCodePoints(&indexUtf16
, 1);
307 if (u_isWhitespace(ch
) || exists(ch
))
308 segBoundary
.endPos
= indexUtf16
;
313 // Cache the calculated boundary
314 segmentCachedBoundary
.startPos
= segBoundary
.startPos
;
315 segmentCachedBoundary
.endPos
= segBoundary
.endPos
;
317 indexUtf16
= segBoundary
.startPos
;
318 rText
.iterateCodePoints(&indexUtf16
, 1);
319 return segBoundary
.endPos
> indexUtf16
;
// Classifies c by code-point range. Two ranges are tested:
//  - 0x3041..0x309e (within the Unicode Hiragana block);
//  - 0x30a1..0x30fe (within the Katakana block) or 0xff65..0xff9f
//    (halfwidth Katakana forms).
// NOTE(review): the values returned for each case are on lines not shown in
// this listing.
326 static sal_Int16
JapaneseCharType(sal_Unicode c
)
328 if (0x3041 <= c
&& c
<= 0x309e)
330 if ((0x30a1 <= c
&& c
<= 0x30fe) || (0xff65 <= c
&& c
<= 0xff9f))
// Returns the word-break cache slot for the segment wordBoundary of text,
// (re)building the slot when it does not already hold that segment.
//
// @param text          start of the whole text buffer
// @param wordBoundary  start/end of the segment within text
// @return reference to the slot of `cache` selected by text[0] & 0x1f
//
// When rebuilding, the slot's buffers are reallocated if the slot is empty or
// too small (new size is the larger of the segment length and DEFAULT_SIZE),
// the segment is copied into contents and zero-terminated, wordboundary is
// zeroed, and the word boundaries are then computed with getLongestMatch()
// and, when japaneseWordBreak is set, JapaneseCharType().
//
// NOTE(review): the slot index uses text[0], the first character of the whole
// buffer, not text[wordBoundary.startPos] - confirm this is intended.
// NOTE(review): a number of lines (including the assignment of rCache.length
// and several braces/branches) are missing from this listing.
335 WordBreakCache
& xdictionary::getCache(const sal_Unicode
*text
, Boundary
& wordBoundary
)
337 WordBreakCache
& rCache
= cache
[text
[0] & 0x1f];
// Cache hit test: slot in use and holding exactly this segment.
339 if (rCache
.size
!= 0 && rCache
.equals(text
, wordBoundary
))
342 sal_Int32 len
= wordBoundary
.endPos
- wordBoundary
.startPos
;
// Reallocate when the slot is empty or smaller than the segment; existing
// buffers are freed first.
344 if (rCache
.size
== 0 || len
> rCache
.size
) {
345 if (rCache
.size
!= 0) {
346 delete [] rCache
.contents
;
347 delete [] rCache
.wordboundary
;
// contents gets one extra element for the terminator, wordboundary two extra.
351 rCache
.size
= len
> DEFAULT_SIZE
? len
: DEFAULT_SIZE
;
352 rCache
.contents
= new sal_Unicode
[rCache
.size
+ 1];
353 rCache
.wordboundary
= new sal_Int32
[rCache
.size
+ 2];
356 memcpy(rCache
.contents
, text
+ wordBoundary
.startPos
, len
* sizeof(sal_Unicode
));
357 *(rCache
.contents
+ len
) = 0x0000;
358 // reset the wordboundary in cache
359 memset(rCache
.wordboundary
, '\0', sizeof(sal_Int32
)*(len
+ 2));
361 sal_Int32 i
= 0; // loop variable
// Each iteration determines wordboundary[i+1] from wordboundary[i] until the
// end of the cached text is reached.
362 while (rCache
.wordboundary
[i
] < rCache
.length
) {
364 // treat a run of continuous white space as one word and cache it
365 while (u_isWhitespace((sal_uInt32
)text
[wordBoundary
.startPos
+ rCache
.wordboundary
[i
] + len
]))
// str/slen: remaining text of the segment starting at the current boundary.
369 const sal_Unicode
*str
= text
+ wordBoundary
.startPos
+ rCache
.wordboundary
[i
];
370 sal_Int32 slen
= rCache
.length
- rCache
.wordboundary
[i
];
371 sal_Int16 type
= 0, count
= 0;
// Advance one character at a time until the dictionary yields a match or the
// text is exhausted.
372 for (;len
== 0 && slen
> 0; str
++, slen
--) {
373 len
= getLongestMatch(str
, slen
);
375 if (!japaneseWordBreak
) {
379 type
= JapaneseCharType(*str
);
380 else if (type
!= JapaneseCharType(*str
))
388 rCache
.wordboundary
[i
+1] = rCache
.wordboundary
[i
] + count
;
394 rCache
.wordboundary
[i
+1] = rCache
.wordboundary
[i
] + len
;
// Entry after the last boundary is set past the end of the text - presumably
// a terminator for the scan in getWordBoundary().
398 rCache
.wordboundary
[i
+ 1] = rCache
.length
+ 1;
// Returns the boundary of the word preceding anyPos in rText.
//
// anyPos is moved backwards by code points while the character read is
// whitespace and the start of the text has not been reached; the result is
// getWordBoundary() at the resulting position with bDirection = true.
//
// @param rText     text to examine
// @param anyPos    UTF-16 index to start from
// @param wordType  passed through to getWordBoundary()
// NOTE(review): the declaration of ch and any condition around the first
// step back are on lines not shown in this listing.
403 Boundary
xdictionary::previousWord(const OUString
& rText
, sal_Int32 anyPos
, sal_Int16 wordType
)
405 // looking for the first non-whitespace character from anyPos
408 rText
.iterateCodePoints(&anyPos
, -1);
410 while (anyPos
> 0 && u_isWhitespace(ch
)) ch
= rText
.iterateCodePoints(&anyPos
, -1);
412 return getWordBoundary(rText
, anyPos
, wordType
, true);
// Returns the boundary of the word following the one at anyPos in rText.
//
// The word containing anyPos is located first and anyPos is moved to its end.
// Whitespace code points are then skipped forwards (bounded by the text
// length), the index is stepped back by one code point, and the result is
// getWordBoundary() at that position with bDirection = true.
//
// @param rText     text to examine
// @param anyPos    UTF-16 index to start from
// @param wordType  passed through to getWordBoundary()
// NOTE(review): some lines are missing from this listing; the step back may
// be conditional in the full source.
415 Boundary
xdictionary::nextWord(const OUString
& rText
, sal_Int32 anyPos
, sal_Int16 wordType
)
417 boundary
= getWordBoundary(rText
, anyPos
, wordType
, true);
418 anyPos
= boundary
.endPos
;
419 const sal_Int32 nLen
= rText
.getLength();
421 // looking for the first non-whitespace character from anyPos
422 sal_uInt32 ch
= rText
.iterateCodePoints(&anyPos
, 1);
423 while (u_isWhitespace(ch
) && (anyPos
< nLen
)) ch
=rText
.iterateCodePoints(&anyPos
, 1);
// iterateCodePoints() has advanced past the character just read; step back
// onto it.
425 rText
.iterateCodePoints(&anyPos
, -1);
428 return getWordBoundary(rText
, anyPos
, wordType
, true);
// Computes the boundary of the word at anyPos in rText; the result is written
// to `boundary` (not declared in the visible lines - presumably a member).
//
// @param rText       text to examine
// @param anyPos      UTF-16 index of the character of interest
// @param wordType    when equal to WordType::WORD_COUNT, the end of the
//                    boundary is extended over following punctuation
// @param bDirection  when false and anyPos falls exactly on the start of a
//                    word other than the first, the code point before anyPos
//                    is tested for whitespace (the action taken is on lines
//                    not shown)
//
// Cases:
//  - anyPos outside [0, len): empty boundary at 0 (negative anyPos) or len;
//  - seekSegment() returns true: the word is looked up in the word-break
//    cache of the segment;
//  - otherwise: the boundary is the single code point at anyPos, with the end
//    clamped to len.
// NOTE(review): the return statement and several braces/branches are missing
// from this listing.
431 Boundary
xdictionary::getWordBoundary(const OUString
& rText
, sal_Int32 anyPos
, sal_Int16 wordType
, bool bDirection
)
433 const sal_Unicode
*text
=rText
.getStr();
434 sal_Int32 len
=rText
.getLength();
435 if (anyPos
>= len
|| anyPos
< 0) {
436 boundary
.startPos
= boundary
.endPos
= anyPos
< 0 ? 0 : len
;
437 } else if (seekSegment(rText
, anyPos
, boundary
)) { // character in dict
438 WordBreakCache
& aCache
= getCache(text
, boundary
);
// Find the first cached boundary beyond anyPos, measured relative to the
// segment start; the word is then [wordboundary[i-1], wordboundary[i]).
441 while (aCache
.wordboundary
[i
] <= anyPos
- boundary
.startPos
) i
++;
443 sal_Int32 startPos
= aCache
.wordboundary
[i
- 1];
444 // if bDirection is false
445 if (!bDirection
&& startPos
> 0 && startPos
== (anyPos
- boundary
.startPos
))
// Reads the code point at anyPos-1 without modifying anyPos.
447 sal_Int32 indexUtf16
= anyPos
-1;
448 sal_uInt32 ch
= rText
.iterateCodePoints(&indexUtf16
, 1);
449 if (u_isWhitespace(ch
))
// Convert the segment-relative boundaries to positions in rText. endPos is
// derived first because startPos still holds the segment start at that point.
453 boundary
.endPos
= boundary
.startPos
;
454 boundary
.endPos
+= aCache
.wordboundary
[i
];
455 boundary
.startPos
+= aCache
.wordboundary
[i
-1];
// Single code point at anyPos.
458 boundary
.startPos
= anyPos
;
459 if (anyPos
< len
) rText
.iterateCodePoints(&anyPos
, 1);
460 boundary
.endPos
= anyPos
< len
? anyPos
: len
;
462 if (wordType
== WordType::WORD_COUNT
) {
463 // skip punctuation for word count.
464 while (boundary
.endPos
< len
)
466 sal_Int32 indexUtf16
= boundary
.endPos
;
467 if (u_ispunct(rText
.iterateCodePoints(&indexUtf16
, 1)))
468 boundary
.endPos
= indexUtf16
;
479 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */