Bump version to 24.04.3.4
[LibreOffice.git] / lingucomponent / source / thesaurus / libnth / nthesimp.cxx
blobea3e3af8ddc19f6dcc4ef5813a61b4425fc2fbb0
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <com/sun/star/uno/Reference.h>
21 #include <cppuhelper/factory.hxx>
22 #include <cppuhelper/supportsservice.hxx>
23 #include <cppuhelper/weak.hxx>
24 #include <com/sun/star/linguistic2/LinguServiceManager.hpp>
25 #include <com/sun/star/linguistic2/XLinguProperties.hpp>
26 #include <com/sun/star/linguistic2/XSpellChecker1.hpp>
27 #include <i18nlangtag/languagetag.hxx>
28 #include <tools/debug.hxx>
29 #include <comphelper/lok.hxx>
30 #include <comphelper/processfactory.hxx>
31 #include <comphelper/sequence.hxx>
32 #include <osl/mutex.hxx>
33 #include <osl/thread.h>
34 #include <unotools/lingucfg.hxx>
35 #include <unotools/resmgr.hxx>
37 #include <rtl/string.hxx>
38 #include <rtl/textenc.h>
40 #include <svtools/strings.hrc>
42 #include "nthesimp.hxx"
43 #include <linguistic/misc.hxx>
44 #include "nthesdta.hxx"
46 #include <vector>
47 #include <numeric>
48 #include <set>
49 #include <string.h>
51 // XML-header to query SPELLML support
// Passed to XSpellChecker1::isValid() in queryMeanings() before any of the
// "<?xml?><query ...>" requests (stem / analyze / generate) are sent through
// XSpellChecker1::spell(); stemming is skipped when isValid() rejects it.
52 constexpr OUStringLiteral SPELLML_SUPPORT = u"<?xml?>";
54 using namespace osl;
55 using namespace com::sun::star;
56 using namespace com::sun::star::beans;
57 using namespace com::sun::star::lang;
58 using namespace com::sun::star::uno;
59 using namespace com::sun::star::linguistic2;
60 using namespace linguistic;
62 static uno::Reference< XLinguServiceManager2 > GetLngSvcMgr_Impl()
64 uno::Reference< XComponentContext > xContext( comphelper::getProcessComponentContext() );
65 uno::Reference< XLinguServiceManager2 > xRes = LinguServiceManager::create( xContext ) ;
66 return xRes;
69 Thesaurus::Thesaurus() :
70 aEvtListeners ( GetLinguMutex() ), pPropHelper(nullptr), bDisposing(false),
71 prevLocale(LANGUAGE_DONTKNOW)
75 Thesaurus::~Thesaurus()
77 mvThesInfo.clear();
78 if (pPropHelper)
80 pPropHelper->RemoveAsPropListener();
84 PropertyHelper_Thesaurus& Thesaurus::GetPropHelper_Impl()
86 if (!pPropHelper)
88 Reference< XLinguProperties > xPropSet = GetLinguProperties();
90 pPropHelper = new PropertyHelper_Thesaurus( static_cast<XThesaurus *>(this), xPropSet );
91 pPropHelper->AddAsPropListener(); //! after a reference is established
93 return *pPropHelper;
96 Sequence< Locale > SAL_CALL Thesaurus::getLocales()
98 MutexGuard aGuard( GetLinguMutex() );
100 // this routine should return the locales supported by the installed
101 // dictionaries.
102 if (mvThesInfo.empty())
104 SvtLinguConfig aLinguCfg;
106 // get list of dictionaries-to-use
107 std::vector< SvtLinguConfigDictionaryEntry > aDics;
108 uno::Sequence< OUString > aFormatList;
109 aLinguCfg.GetSupportedDictionaryFormatsFor( "Thesauri",
110 "org.openoffice.lingu.new.Thesaurus", aFormatList );
111 for (const auto& rFormat : std::as_const(aFormatList))
113 std::vector< SvtLinguConfigDictionaryEntry > aTmpDic(
114 aLinguCfg.GetActiveDictionariesByFormat( rFormat ) );
115 aDics.insert( aDics.end(), aTmpDic.begin(), aTmpDic.end() );
118 //!! for compatibility with old dictionaries (the ones not using extensions
119 //!! or new configuration entries, but still using the dictionary.lst file)
120 //!! Get the list of old style spell checking dictionaries to use...
121 std::vector< SvtLinguConfigDictionaryEntry > aOldStyleDics(
122 GetOldStyleDics( "THES" ) );
124 // to prefer dictionaries with configuration entries we will only
125 // use those old style dictionaries that add a language that
126 // is not yet supported by the list of new style dictionaries
127 MergeNewStyleDicsAndOldStyleDics( aDics, aOldStyleDics );
129 if (!aDics.empty())
131 // get supported locales from the dictionaries-to-use...
132 std::set<OUString> aLocaleNamesSet;
133 for (auto const& dict : aDics)
135 for (const auto& rLocaleName : dict.aLocaleNames)
137 if (!comphelper::LibreOfficeKit::isAllowlistedLanguage(rLocaleName))
138 continue;
140 aLocaleNamesSet.insert( rLocaleName );
143 // ... and add them to the resulting sequence
144 std::vector<Locale> aLocalesVec;
145 aLocalesVec.reserve(aLocaleNamesSet.size());
147 std::transform(aLocaleNamesSet.begin(), aLocaleNamesSet.end(), std::back_inserter(aLocalesVec),
148 [](const OUString& localeName) -> Locale { return LanguageTag::convertToLocale(localeName); });
150 aSuppLocales = comphelper::containerToSequence(aLocalesVec);
152 //! For each dictionary and each locale we need a separate entry.
153 //! If this results in more than one dictionary per locale than (for now)
154 //! it is undefined which dictionary gets used.
155 //! In the future the implementation should support using several dictionaries
156 //! for one locale.
157 sal_Int32 numthes = std::accumulate(aDics.begin(), aDics.end(), 0,
158 [](const sal_Int32 nSum, const SvtLinguConfigDictionaryEntry& dict) {
159 return nSum + dict.aLocaleNames.getLength(); });
161 // add dictionary information
162 mvThesInfo.resize(numthes);
164 sal_Int32 k = 0;
165 for (auto const& dict : aDics)
167 if (dict.aLocaleNames.hasElements() &&
168 dict.aLocations.hasElements())
170 // currently only one language per dictionary is supported in the actual implementation...
171 // Thus here we work-around this by adding the same dictionary several times.
172 // Once for each of its supported locales.
173 for (const auto& rLocaleName : dict.aLocaleNames)
175 LanguageTag aLanguageTag(rLocaleName);
176 mvThesInfo[k].aEncoding = RTL_TEXTENCODING_DONTKNOW;
177 mvThesInfo[k].aLocale = aLanguageTag.getLocale();
178 mvThesInfo[k].aCharSetInfo.reset( new CharClass( std::move(aLanguageTag) ) );
179 // also both files have to be in the same directory and the
180 // file names must only differ in the extension (.aff/.dic).
181 // Thus we use the first location only and strip the extension part.
182 OUString aLocation = dict.aLocations[0];
183 sal_Int32 nPos = aLocation.lastIndexOf( '.' );
184 aLocation = aLocation.copy( 0, nPos );
185 mvThesInfo[k].aName = aLocation;
187 ++k;
191 DBG_ASSERT( k == numthes, "index mismatch?" );
193 else
195 /* no dictionary found so register no dictionaries */
196 mvThesInfo.clear();
197 aSuppLocales.realloc(0);
201 return aSuppLocales;
204 sal_Bool SAL_CALL Thesaurus::hasLocale(const Locale& rLocale)
206 MutexGuard aGuard( GetLinguMutex() );
208 if (!aSuppLocales.hasElements())
209 getLocales();
211 return comphelper::findValue(aSuppLocales, rLocale) != -1;
// Looks up qTerm in the first thesaurus registered for rLocale and returns one
// XMeaning (definition text plus synonyms) per entry found.
//
// An empty sequence is returned when the language is unspecified, the term is
// empty, the locale is not supported (IllegalArgumentException is thrown
// instead if LINGU_EXCEPTIONS is defined), the thesaurus text encoding is
// unknown, or nothing is found.
//
// A successful result is remembered in prevTerm / prevLocale / prevMeanings and
// handed back directly when the same term and language are queried again.
//
// If the plain lookup finds nothing, the term is stemmed through the spell
// checker's "<?xml?>" queries and looked up a second time; synonyms found that
// way are re-inflected with a 'generate' query. Synonyms are re-cased to follow
// the capitalization of the query term.
214 Sequence < Reference < css::linguistic2::XMeaning > > SAL_CALL Thesaurus::queryMeanings(
215 const OUString& qTerm, const Locale& rLocale,
216 const css::uno::Sequence< css::beans::PropertyValue >& rProperties)
218 MutexGuard aGuard( GetLinguMutex() );
220 uno::Sequence< Reference< XMeaning > > aMeanings( 1 );
221 uno::Sequence< Reference< XMeaning > > noMeanings( 0 );
222 uno::Reference< XLinguServiceManager2 > xLngSvcMgr( GetLngSvcMgr_Impl() );
223 uno::Reference< XSpellChecker1 > xSpell;
// aRTerm: term actually looked up (replaced by its stem on the second pass)
// aPTerm: word handed to the 'analyze' query on the second pass
225 OUString aRTerm(qTerm);
226 OUString aPTerm(qTerm);
227 CapType ct = CapType::UNKNOWN;
// stem:  set to 1 once the direct lookup failed and stemming is attempted
// stem2: set to 1 once the 'analyze' query returned a result (codeTerm)
228 sal_Int32 stem = 0;
229 sal_Int32 stem2 = 0;
231 LanguageType nLanguage = LinguLocaleToLanguage( rLocale );
233 if (LinguIsUnspecified( nLanguage) || aRTerm.isEmpty())
234 return noMeanings;
236 if (!hasLocale( rLocale ))
237 #ifdef LINGU_EXCEPTIONS
238 throw( IllegalArgumentException() );
239 #else
240 return noMeanings;
241 #endif
// same query as last time: reuse the cached result
243 if (prevTerm == qTerm && prevLocale == nLanguage)
244 return prevMeanings;
246 mentry * pmean = nullptr;
247 sal_Int32 nmean = 0;
249 PropertyHelper_Thesaurus &rHelper = GetPropHelper();
250 rHelper.SetTmpPropVals( rProperties );
252 MyThes * pTH = nullptr;
253 rtl_TextEncoding eEnc = RTL_TEXTENCODING_DONTKNOW;
254 CharClass * pCC = nullptr;
256 // find the first thesaurus that matches the locale
257 for (size_t i =0; i < mvThesInfo.size(); i++)
259 if (rLocale == mvThesInfo[i].aLocale)
261 // open up and initialize this thesaurus if need be
262 if (!mvThesInfo[i].aThes)
// aName is the dictionary location without extension (see getLocales())
264 OUString datpath = mvThesInfo[i].aName + ".dat";
265 OUString idxpath = mvThesInfo[i].aName + ".idx";
266 OUString ndat;
267 OUString nidx;
// NOTE(review): the return values of getSystemPathFromFileURL are not checked here
268 osl::FileBase::getSystemPathFromFileURL(datpath,ndat);
269 osl::FileBase::getSystemPathFromFileURL(idxpath,nidx);
271 #if defined(_WIN32)
272 // MyThes waits UTF-8 encoded paths with \\?\ long path prefix.
273 OString aTmpidx = Win_AddLongPathPrefix(OUStringToOString(nidx, RTL_TEXTENCODING_UTF8));
274 OString aTmpdat = Win_AddLongPathPrefix(OUStringToOString(ndat, RTL_TEXTENCODING_UTF8));
275 #else
276 OString aTmpidx(OU2ENC(nidx,osl_getThreadTextEncoding()));
277 OString aTmpdat(OU2ENC(ndat,osl_getThreadTextEncoding()));
278 #endif
// the MyThes instance and its encoding are kept in mvThesInfo for later queries
280 mvThesInfo[i].aThes.reset( new MyThes(aTmpidx.getStr(),aTmpdat.getStr()) );
281 mvThesInfo[i].aEncoding = getTextEncodingFromCharset(mvThesInfo[i].aThes->get_th_encoding());
283 pTH = mvThesInfo[i].aThes.get();
284 eEnc = mvThesInfo[i].aEncoding;
285 pCC = mvThesInfo[i].aCharSetInfo.get();
287 if (pTH)
288 break;
292 // we don't want to work with a default text encoding since following incorrect
293 // results may occur only for specific text and thus may be hard to notice.
294 // Thus better always make a clean exit here if the text encoding is in question.
295 // Hopefully something not working at all will raise proper attention quickly. ;-)
296 DBG_ASSERT( eEnc != RTL_TEXTENCODING_DONTKNOW, "failed to get text encoding! (maybe incorrect encoding string in file)" );
297 if (eEnc == RTL_TEXTENCODING_DONTKNOW)
298 return noMeanings;
// The loop body runs at most twice: once for the term as given and, if that
// finds nothing, once more for the stemmed term (stem != 0). A second miss
// returns noMeanings via the "if (stem || ...)" check below.
300 while (pTH)
302 // convert word to all lower case for searching
// the capitalization of the original term is determined on the first pass only
303 if (!stem)
304 ct = capitalType(aRTerm, pCC);
305 OUString nTerm(makeLowerCase(aRTerm, pCC));
306 OString aTmp( OU2ENC(nTerm, eEnc) );
307 nmean = pTH->Lookup(aTmp.getStr(),aTmp.getLength(),&pmean);
309 if (nmean)
310 aMeanings.realloc( nmean );
312 mentry * pe = pmean;
313 OUString codeTerm = qTerm;
314 Reference< XSpellAlternatives > xTmpRes2;
// second pass: ask the spell checker to analyze the original word; the first
// alternative is later inserted into the 'generate' queries
316 if (stem)
318 xTmpRes2 = xSpell->spell( "<?xml?><query type='analyze'><word>" +
319 aPTerm + "</word></query>", static_cast<sal_uInt16>(nLanguage), rProperties );
320 if (xTmpRes2.is())
322 Sequence<OUString>seq = xTmpRes2->getAlternatives();
323 if (seq.hasElements())
325 codeTerm = seq[0];
326 stem2 = 1;
// one XMeaning per thesaurus entry; entries with count == 0 leave their slot
// in aMeanings unassigned
331 for (int j = 0; j < nmean; j++)
333 int count = pe->count;
334 if (count)
336 Sequence< OUString > aStr( count );
337 OUString *pStr = aStr.getArray();
339 for (int i=0; i < count; i++)
341 OUString sTerm(pe->psyns[i],strlen(pe->psyns[i]),eEnc );
342 sal_Int32 catpos = sTerm.indexOf('(');
343 OUString catst;
344 if (catpos > 2)
346 // remove category name for affixation and casing
// the "(...)" part is kept in catst and appended again after casing
347 catst = OUString::Concat(" ") + sTerm.subView(catpos);
348 sTerm = sTerm.copy(0, catpos);
349 sTerm = sTerm.trim();
351 // generate synonyms with affixes
352 if (stem && stem2)
354 Reference< XSpellAlternatives > xTmpRes = xSpell->spell( "<?xml?><query type='generate'><word>" +
355 sTerm + "</word>" + codeTerm + "</query>", static_cast<sal_uInt16>(nLanguage), rProperties );
356 if (xTmpRes.is())
358 Sequence<OUString>seq = xTmpRes->getAlternatives();
359 if (seq.hasElements())
360 sTerm = seq[0];
// a mixed-case synonym overrides ct, so it is emitted unchanged (default
// branch below); note that ct stays overridden for the following synonyms
364 CapType ct1 = capitalType(sTerm, pCC);
365 if (CapType::MIXED == ct1)
366 ct = ct1;
367 OUString cTerm;
368 switch (ct)
370 case CapType::ALLCAP:
371 cTerm = makeUpperCase(sTerm, pCC);
372 break;
373 case CapType::INITCAP:
374 cTerm = makeInitCap(sTerm, pCC);
375 break;
376 default:
377 cTerm = sTerm;
378 break;
380 OUString aAlt( cTerm + catst);
381 pStr[i] = aAlt;
383 rtl::Reference<Meaning> pMn = new Meaning(aRTerm);
384 OUString dTerm(pe->defn,strlen(pe->defn),eEnc );
385 pMn->SetMeaning(dTerm);
386 pMn->SetSynonyms(aStr);
387 Reference<XMeaning>* pMeaning = aMeanings.getArray();
388 pMeaning[j] = pMn;
390 pe++;
// hand the lookup result back to MyThes now that everything has been copied
392 pTH->CleanUpAfterLookup(&pmean,nmean);
394 if (nmean)
// remember the successful query for the cache check at the top
396 prevTerm = qTerm;
397 prevMeanings = aMeanings;
398 prevLocale = nLanguage;
399 return aMeanings;
// nothing found: give up if stemming was already tried or is not possible
402 if (stem || !xLngSvcMgr.is())
403 return noMeanings;
404 stem = 1;
406 xSpell.set( xLngSvcMgr->getSpellChecker(), UNO_QUERY );
407 if (!xSpell.is() || !xSpell->isValid( SPELLML_SUPPORT, static_cast<sal_uInt16>(nLanguage), rProperties ))
408 return noMeanings;
409 Reference< XSpellAlternatives > xTmpRes = xSpell->spell( "<?xml?><query type='stem'><word>" +
410 aRTerm + "</word></query>", static_cast<sal_uInt16>(nLanguage), rProperties );
411 if (xTmpRes.is())
413 Sequence<OUString>seq = xTmpRes->getAlternatives();
414 if (seq.hasElements())
416 aRTerm = seq[0]; // XXX Use only the first stem
417 continue;
421 // stem the last word of the synonym (for categories after affixation)
422 aRTerm = aRTerm.trim();
423 sal_Int32 pos = aRTerm.lastIndexOf(' ');
// NOTE(review): aRTerm was just trimmed, so a space at index 0 looks
// impossible and this check appears unreachable; lastIndexOf presumably
// yields -1 when there is no space, in which case pos + 1 == 0 and the whole
// (trimmed) term is stemmed again below. Confirm the intended condition
// before changing it.
424 if (!pos)
425 return noMeanings;
426 xTmpRes = xSpell->spell( OUString::Concat("<?xml?><query type='stem'><word>") +
427 aRTerm.subView(pos + 1) + "</word></query>", static_cast<sal_uInt16>(nLanguage), rProperties );
428 if (xTmpRes.is())
430 Sequence<OUString>seq = xTmpRes->getAlternatives();
431 if (seq.hasElements())
// keep the unstemmed last word for the 'analyze' query and replace it by its
// first stem in the lookup term
433 aPTerm = aRTerm.copy(pos + 1);
434 aRTerm = aRTerm.subView(0, pos + 1) + seq[0];
435 #if 0
436 for (int i = 0; i < seq.getLength(); i++)
438 OString o = OUStringToOString(seq[i], RTL_TEXTENCODING_UTF8);
439 fprintf(stderr, "%d: %s\n", i + 1, o.pData->buffer);
441 #endif
442 continue;
445 break;
447 return noMeanings;
450 OUString SAL_CALL Thesaurus::getServiceDisplayName(const Locale& rLocale)
452 std::locale loc(Translate::Create("svt", LanguageTag(rLocale)));
453 return Translate::get(STR_DESCRIPTION_MYTHES, loc);
456 void SAL_CALL Thesaurus::initialize( const Sequence< Any >& rArguments )
458 MutexGuard aGuard( GetLinguMutex() );
460 if (pPropHelper)
461 return;
463 sal_Int32 nLen = rArguments.getLength();
464 // Accept one of two args so we can be compatible with the call site in GetAvailLocales()
465 // linguistic module
466 if (1 == nLen || 2 == nLen)
468 Reference< XLinguProperties > xPropSet;
469 rArguments.getConstArray()[0] >>= xPropSet;
470 assert(xPropSet);
472 //! Pointer allows for access of the non-UNO functions.
473 //! And the reference to the UNO-functions while increasing
474 //! the ref-count and will implicitly free the memory
475 //! when the object is no longer used.
476 pPropHelper = new PropertyHelper_Thesaurus( static_cast<XThesaurus *>(this), xPropSet );
477 pPropHelper->AddAsPropListener(); //! after a reference is established
479 else
480 OSL_FAIL( "wrong number of arguments in sequence" );
483 OUString Thesaurus::makeLowerCase(const OUString& aTerm, CharClass const * pCC)
485 if (pCC)
486 return pCC->lowercase(aTerm);
487 return aTerm;
490 OUString Thesaurus::makeUpperCase(const OUString& aTerm, CharClass const * pCC)
492 if (pCC)
493 return pCC->uppercase(aTerm);
494 return aTerm;
497 OUString Thesaurus::makeInitCap(const OUString& aTerm, CharClass const * pCC)
499 sal_Int32 tlen = aTerm.getLength();
500 if (pCC && tlen)
502 OUString bTemp = aTerm.copy(0,1);
503 if (tlen > 1)
505 return ( pCC->uppercase(bTemp, 0, 1)
506 + pCC->lowercase(aTerm,1,(tlen-1)) );
509 return pCC->uppercase(bTemp, 0, 1);
511 return aTerm;
514 void SAL_CALL Thesaurus::dispose()
516 MutexGuard aGuard( GetLinguMutex() );
518 if (!bDisposing)
520 bDisposing = true;
521 EventObject aEvtObj( static_cast<XThesaurus *>(this) );
522 aEvtListeners.disposeAndClear( aEvtObj );
523 if (pPropHelper)
525 pPropHelper->RemoveAsPropListener();
526 delete pPropHelper;
527 pPropHelper = nullptr;
532 void SAL_CALL Thesaurus::addEventListener( const Reference< XEventListener >& rxListener )
534 MutexGuard aGuard( GetLinguMutex() );
536 if (!bDisposing && rxListener.is())
537 aEvtListeners.addInterface( rxListener );
540 void SAL_CALL Thesaurus::removeEventListener( const Reference< XEventListener >& rxListener )
542 MutexGuard aGuard( GetLinguMutex() );
544 if (!bDisposing && rxListener.is())
545 aEvtListeners.removeInterface( rxListener );
548 // Service specific part
549 OUString SAL_CALL Thesaurus::getImplementationName()
551 return "org.openoffice.lingu.new.Thesaurus";
554 sal_Bool SAL_CALL Thesaurus::supportsService( const OUString& ServiceName )
556 return cppu::supportsService(this, ServiceName);
559 Sequence< OUString > SAL_CALL Thesaurus::getSupportedServiceNames()
561 return { SN_THESAURUS };
564 extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface*
565 lingucomponent_Thesaurus_get_implementation(
566 css::uno::XComponentContext* , css::uno::Sequence<css::uno::Any> const&)
568 return cppu::acquire(new Thesaurus());
571 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */