update emoji autocorrect entries from po-files
[LibreOffice.git] / i18npool / source / breakiterator / xdictionary.cxx
blobec9f63097b78cbae0184ed692ebeba9158aa2bb0
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <config_folders.h>
22 #include <osl/file.h>
23 #include <osl/mutex.hxx>
24 #include <rtl/ustrbuf.hxx>
25 #include <rtl/bootstrap.hxx>
26 #include <com/sun/star/i18n/WordType.hpp>
27 #include <xdictionary.hxx>
28 #include <unicode/uchar.h>
29 #include <string.h>
30 #include <breakiteratorImpl.hxx>
32 namespace com { namespace sun { namespace star { namespace i18n {
34 #ifdef DICT_JA_ZH_IN_DATAFILE
36 #elif !defined DISABLE_DYNLOADING
38 extern "C" { static void SAL_CALL thisModule() {} }
40 #else
42 extern "C" {
44 sal_uInt8* getExistMark_ja();
45 sal_Int16* getIndex1_ja();
46 sal_Int32* getIndex2_ja();
47 sal_Int32* getLenArray_ja();
48 sal_Unicode* getDataArea_ja();
50 sal_uInt8* getExistMark_zh();
51 sal_Int16* getIndex1_zh();
52 sal_Int32* getIndex2_zh();
53 sal_Int32* getLenArray_zh();
54 sal_Unicode* getDataArea_zh();
58 #endif
60 xdictionary::xdictionary(const sal_Char *lang) :
61 boundary(),
62 japaneseWordBreak( false )
65 #ifdef DICT_JA_ZH_IN_DATAFILE
67 if( strcmp( lang, "ja" ) == 0 || strcmp( lang, "zh" ) == 0 )
69 OUString sUrl( "$BRAND_BASE_DIR/" LIBO_SHARE_FOLDER "/dict_" );
70 rtl::Bootstrap::expandMacros(sUrl);
72 if( strcmp( lang, "ja" ) == 0 )
73 sUrl += "ja.data";
74 else if( strcmp( lang, "zh" ) == 0 )
75 sUrl += "zh.data";
77 oslFileHandle aFileHandle;
78 sal_uInt64 nFileSize;
79 char *pMapping;
80 if( osl_openFile( sUrl.pData, &aFileHandle, osl_File_OpenFlag_Read ) == osl_File_E_None &&
81 osl_getFileSize( aFileHandle, &nFileSize) == osl_File_E_None &&
82 osl_mapFile( aFileHandle, (void **) &pMapping, nFileSize, 0, osl_File_MapFlag_RandomAccess ) == osl_File_E_None )
84 // We have the offsets to the parts of the file at its end, see gendict.cxx
85 sal_Int64 *pEOF = (sal_Int64*)(pMapping + nFileSize);
87 data.existMark = (sal_uInt8*) (pMapping + pEOF[-1]);
88 data.index2 = (sal_Int32*) (pMapping + pEOF[-2]);
89 data.index1 = (sal_Int16*) (pMapping + pEOF[-3]);
90 data.lenArray = (sal_Int32*) (pMapping + pEOF[-4]);
91 data.dataArea = (sal_Unicode*) (pMapping + pEOF[-5]);
95 #elif !defined DISABLE_DYNLOADING
97 initDictionaryData( lang );
99 #else
101 if( strcmp( lang, "ja" ) == 0 ) {
102 data.existMark = getExistMark_ja();
103 data.index1 = getIndex1_ja();
104 data.index2 = getIndex2_ja();
105 data.lenArray = getLenArray_ja();
106 data.dataArea = getDataArea_ja();
108 else if( strcmp( lang, "zh" ) == 0 ) {
109 data.existMark = getExistMark_zh();
110 data.index1 = getIndex1_zh();
111 data.index2 = getIndex2_zh();
112 data.lenArray = getLenArray_zh();
113 data.dataArea = getDataArea_zh();
116 #endif
118 for (sal_Int32 i = 0; i < CACHE_MAX; i++)
119 cache[i].size = 0;
121 japaneseWordBreak = false;
124 xdictionary::~xdictionary()
126 for (sal_Int32 i = 0; i < CACHE_MAX; i++) {
127 if (cache[i].size > 0) {
128 delete [] cache[i].contents;
129 delete [] cache[i].wordboundary;
134 namespace {
135 struct datacache {
136 oslModule mhModule;
137 OString maLang;
138 xdictionarydata maData;
142 #if !defined(DICT_JA_ZH_IN_DATAFILE) && !defined(DISABLE_DYNLOADING)
144 void xdictionary::initDictionaryData(const sal_Char *pLang)
146 // Global cache, never released for performance
147 static std::vector< datacache > aLoadedCache;
149 osl::MutexGuard aGuard( osl::Mutex::getGlobalMutex() );
150 for( size_t i = 0; i < aLoadedCache.size(); ++i )
152 if( !strcmp( pLang, aLoadedCache[ i ].maLang.getStr() ) )
154 data = aLoadedCache[ i ].maData;
155 return;
159 // otherwise add to the cache, positive or negative.
160 datacache aEntry;
161 aEntry.maLang = OString( pLang, strlen( pLang ) );
163 #ifdef SAL_DLLPREFIX
164 OUStringBuffer aBuf( sal::static_int_cast<int>(strlen(pLang) + 7 + 6) ); // mostly "lib*.so" (with * == dict_zh)
165 aBuf.appendAscii( SAL_DLLPREFIX );
166 #else
167 OUStringBuffer aBuf( sal::static_int_cast<int>(strlen(pLang) + 7 + 4) ); // mostly "*.dll" (with * == dict_zh)
168 #endif
169 aBuf.appendAscii( "dict_" ).appendAscii( pLang ).appendAscii( SAL_DLLEXTENSION );
170 aEntry.mhModule = osl_loadModuleRelative( &thisModule, aBuf.makeStringAndClear().pData, SAL_LOADMODULE_DEFAULT );
171 if( aEntry.mhModule ) {
172 oslGenericFunction func;
173 func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getExistMark" );
174 aEntry.maData.existMark = reinterpret_cast<sal_uInt8 const * (*)()>(func)();
175 func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getIndex1" );
176 aEntry.maData.index1 = reinterpret_cast<sal_Int16 const * (*)()>(func)();
177 func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getIndex2" );
178 aEntry.maData.index2 = reinterpret_cast<sal_Int32 const * (*)()>(func)();
179 func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getLenArray" );
180 aEntry.maData.lenArray = reinterpret_cast<sal_Int32 const * (*)()>(func)();
181 func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getDataArea" );
182 aEntry.maData.dataArea = reinterpret_cast<sal_Unicode const * (*)()>(func)();
185 data = aEntry.maData;
186 aLoadedCache.push_back( aEntry );
189 #endif
191 void xdictionary::setJapaneseWordBreak()
193 japaneseWordBreak = true;
196 bool xdictionary::exists(const sal_uInt32 c)
198 // 0x1FFF is the hardcoded limit in gendict for data.existMarks
199 bool exist = data.existMark && (c>>3) < 0x1FFF && (data.existMark[c>>3] & (1<<(c&0x07))) != 0;
200 if (!exist && japaneseWordBreak)
201 return BreakIteratorImpl::getScriptClass(c) == ScriptType::ASIAN;
202 else
203 return exist;
206 sal_Int32 xdictionary::getLongestMatch(const sal_Unicode* str, sal_Int32 sLen)
208 if ( !data.index1 ) return 0;
210 sal_Int16 idx = data.index1[str[0] >> 8];
212 if (idx == 0xFF) return 0;
214 idx = (idx<<8) | (str[0]&0xff);
216 sal_uInt32 begin = data.index2[idx], end = data.index2[idx+1];
218 if (begin == 0) return 0;
220 str++; sLen--; // first character is not stored in the dictionary
221 for (sal_uInt32 i = end; i > begin; i--) {
222 sal_Int32 len = data.lenArray[i] - data.lenArray[i - 1];
223 if (sLen >= len) {
224 const sal_Unicode *dstr = data.dataArea + data.lenArray[i-1];
225 sal_Int32 pos = 0;
227 while (pos < len && dstr[pos] == str[pos]) { pos++; }
229 if (pos == len)
230 return len + 1;
233 return 0;
238 * c-tor
241 WordBreakCache::WordBreakCache() :
242 length( 0 ),
243 contents( NULL ),
244 wordboundary( NULL ),
245 size( 0 )
250 * Compare two unicode string,
253 bool WordBreakCache::equals(const sal_Unicode* str, Boundary& boundary)
255 // Different length, different string.
256 if (length != boundary.endPos - boundary.startPos) return false;
258 for (sal_Int32 i = 0; i < length; i++)
259 if (contents[i] != str[i + boundary.startPos]) return false;
261 return true;
266 * Retrieve the segment containing the character at pos.
267 * @param pos : Position of the given character.
268 * @return true if CJK.
270 bool xdictionary::seekSegment(const OUString &rText, sal_Int32 pos,
271 Boundary& segBoundary)
273 sal_Int32 indexUtf16;
275 if (segmentCachedString.pData != rText.pData) {
276 // Cache the passed text so we can avoid regenerating the segment if it's the same
277 // (pData is refcounted and assigning the OUString references it, which ensures that
278 // the object is the same if we get the same pointer back later)
279 segmentCachedString = rText;
280 } else {
281 // If pos is within the cached boundary, use that boundary
282 if (pos >= segmentCachedBoundary.startPos && pos <= segmentCachedBoundary.endPos) {
283 segBoundary.startPos = segmentCachedBoundary.startPos;
284 segBoundary.endPos = segmentCachedBoundary.endPos;
285 indexUtf16 = segmentCachedBoundary.startPos;
286 rText.iterateCodePoints(&indexUtf16, 1);
287 return segmentCachedBoundary.endPos > indexUtf16;
291 segBoundary.endPos = segBoundary.startPos = pos;
293 indexUtf16 = pos;
294 while (indexUtf16 > 0)
296 sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, -1);
297 if (u_isWhitespace(ch) || exists(ch))
298 segBoundary.startPos = indexUtf16;
299 else
300 break;
303 indexUtf16 = pos;
304 while (indexUtf16 < rText.getLength())
306 sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, 1);
307 if (u_isWhitespace(ch) || exists(ch))
308 segBoundary.endPos = indexUtf16;
309 else
310 break;
313 // Cache the calculated boundary
314 segmentCachedBoundary.startPos = segBoundary.startPos;
315 segmentCachedBoundary.endPos = segBoundary.endPos;
317 indexUtf16 = segBoundary.startPos;
318 rText.iterateCodePoints(&indexUtf16, 1);
319 return segBoundary.endPos > indexUtf16;
322 #define KANJA 1
323 #define KATAKANA 2
324 #define HIRAKANA 3
326 static sal_Int16 JapaneseCharType(sal_Unicode c)
328 if (0x3041 <= c && c <= 0x309e)
329 return HIRAKANA;
330 if ((0x30a1 <= c && c <= 0x30fe) || (0xff65 <= c && c <= 0xff9f))
331 return KATAKANA;
332 return KANJA;
335 WordBreakCache& xdictionary::getCache(const sal_Unicode *text, Boundary& wordBoundary)
337 WordBreakCache& rCache = cache[text[0] & 0x1f];
339 if (rCache.size != 0 && rCache.equals(text, wordBoundary))
340 return rCache;
342 sal_Int32 len = wordBoundary.endPos - wordBoundary.startPos;
344 if (rCache.size == 0 || len > rCache.size) {
345 if (rCache.size != 0) {
346 delete [] rCache.contents;
347 delete [] rCache.wordboundary;
348 rCache.size = len;
350 else
351 rCache.size = len > DEFAULT_SIZE ? len : DEFAULT_SIZE;
352 rCache.contents = new sal_Unicode[rCache.size + 1];
353 rCache.wordboundary = new sal_Int32[rCache.size + 2];
355 rCache.length = len;
356 memcpy(rCache.contents, text + wordBoundary.startPos, len * sizeof(sal_Unicode));
357 *(rCache.contents + len) = 0x0000;
358 // reset the wordboundary in cache
359 memset(rCache.wordboundary, '\0', sizeof(sal_Int32)*(len + 2));
361 sal_Int32 i = 0; // loop variable
362 while (rCache.wordboundary[i] < rCache.length) {
363 len = 0;
364 // look the continuous white space as one word and cashe it
365 while (u_isWhitespace((sal_uInt32)text[wordBoundary.startPos + rCache.wordboundary[i] + len]))
366 len ++;
368 if (len == 0) {
369 const sal_Unicode *str = text + wordBoundary.startPos + rCache.wordboundary[i];
370 sal_Int32 slen = rCache.length - rCache.wordboundary[i];
371 sal_Int16 type = 0, count = 0;
372 for (;len == 0 && slen > 0; str++, slen--) {
373 len = getLongestMatch(str, slen);
374 if (len == 0) {
375 if (!japaneseWordBreak) {
376 len = 1;
377 } else {
378 if (count == 0)
379 type = JapaneseCharType(*str);
380 else if (type != JapaneseCharType(*str))
381 break;
382 count++;
386 if (count)
388 rCache.wordboundary[i+1] = rCache.wordboundary[i] + count;
389 i++;
393 if (len) {
394 rCache.wordboundary[i+1] = rCache.wordboundary[i] + len;
395 i++;
398 rCache.wordboundary[i + 1] = rCache.length + 1;
400 return rCache;
403 Boundary xdictionary::previousWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
405 // looking for the first non-whitespace character from anyPos
406 sal_uInt32 ch = 0;
407 if (anyPos > 0)
408 rText.iterateCodePoints(&anyPos, -1);
410 while (anyPos > 0 && u_isWhitespace(ch)) ch = rText.iterateCodePoints(&anyPos, -1);
412 return getWordBoundary(rText, anyPos, wordType, true);
415 Boundary xdictionary::nextWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
417 boundary = getWordBoundary(rText, anyPos, wordType, true);
418 anyPos = boundary.endPos;
419 const sal_Int32 nLen = rText.getLength();
420 if (anyPos < nLen) {
421 // looknig for the first non-whitespace character from anyPos
422 sal_uInt32 ch = rText.iterateCodePoints(&anyPos, 1);
423 while (u_isWhitespace(ch) && (anyPos < nLen)) ch=rText.iterateCodePoints(&anyPos, 1);
424 if (anyPos > 0)
425 rText.iterateCodePoints(&anyPos, -1);
428 return getWordBoundary(rText, anyPos, wordType, true);
431 Boundary xdictionary::getWordBoundary(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType, bool bDirection)
433 const sal_Unicode *text=rText.getStr();
434 sal_Int32 len=rText.getLength();
435 if (anyPos >= len || anyPos < 0) {
436 boundary.startPos = boundary.endPos = anyPos < 0 ? 0 : len;
437 } else if (seekSegment(rText, anyPos, boundary)) { // character in dict
438 WordBreakCache& aCache = getCache(text, boundary);
439 sal_Int32 i = 0;
441 while (aCache.wordboundary[i] <= anyPos - boundary.startPos) i++;
443 sal_Int32 startPos = aCache.wordboundary[i - 1];
444 // if bDirection is false
445 if (!bDirection && startPos > 0 && startPos == (anyPos - boundary.startPos))
447 sal_Int32 indexUtf16 = anyPos-1;
448 sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, 1);
449 if (u_isWhitespace(ch))
450 i--;
453 boundary.endPos = boundary.startPos;
454 boundary.endPos += aCache.wordboundary[i];
455 boundary.startPos += aCache.wordboundary[i-1];
457 } else {
458 boundary.startPos = anyPos;
459 if (anyPos < len) rText.iterateCodePoints(&anyPos, 1);
460 boundary.endPos = anyPos < len ? anyPos : len;
462 if (wordType == WordType::WORD_COUNT) {
463 // skip punctuation for word count.
464 while (boundary.endPos < len)
466 sal_Int32 indexUtf16 = boundary.endPos;
467 if (u_ispunct(rText.iterateCodePoints(&indexUtf16, 1)))
468 boundary.endPos = indexUtf16;
469 else
470 break;
474 return boundary;
477 } } } }
479 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */