nss: upgrade to release 3.73
[LibreOffice.git] / i18npool / source / collator / collator_unicode.cxx
blobc05a9cb025c681ec6ee66368ac386d89e3313fa0
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <config_locales.h>
22 #include <sal/log.hxx>
24 #include <lrl_include.hxx>
26 #include <rtl/ustrbuf.hxx>
27 #include <i18nlangtag/languagetag.hxx>
28 #include <i18nlangtag/languagetagicu.hxx>
29 #include <collator_unicode.hxx>
30 #include <localedata.hxx>
31 #include <com/sun/star/i18n/CollatorOptions.hpp>
32 #include <cppuhelper/supportsservice.hxx>
34 using namespace ::com::sun::star;
35 using namespace ::com::sun::star::i18n;
36 using namespace ::com::sun::star::lang;
37 using namespace ::com::sun::star::uno;
39 namespace i18npool {
// Implementation name of this service; returned by getImplementationName()
// and listed by getSupportedServiceNames().
41 const char implementationName[] = "com.sun.star.i18n.Collator_Unicode";
43 Collator_Unicode::Collator_Unicode()
45 collator = nullptr;
46 uca_base = nullptr;
47 #ifndef DISABLE_DYNLOADING
48 hModule = nullptr;
49 #endif
52 Collator_Unicode::~Collator_Unicode()
54 collator.reset();
55 uca_base.reset();
56 #ifndef DISABLE_DYNLOADING
57 if (hModule) osl_unloadModule(hModule);
58 #endif
#ifdef DISABLE_DYNLOADING

extern "C" {

// For DISABLE_DYNLOADING the generated functions have names that start with
// get_collator_data_ to avoid clashing with a few functions in the generated
// libindex_data that are called just get_zh_pinyin, for instance.
//
// Each accessor for a rule image is declared together with the accessor
// that reports the size of that image; loadCollatorAlgorithm() refers to
// both by name.

const sal_uInt8* get_collator_data_ca_charset();
size_t get_collator_data_ca_charset_length();

const sal_uInt8* get_collator_data_cu_charset();
size_t get_collator_data_cu_charset_length();

const sal_uInt8* get_collator_data_dz_charset();
size_t get_collator_data_dz_charset_length();

const sal_uInt8* get_collator_data_hu_charset();
size_t get_collator_data_hu_charset_length();

const sal_uInt8* get_collator_data_ja_charset();
size_t get_collator_data_ja_charset_length();

const sal_uInt8* get_collator_data_ja_phonetic_alphanumeric_first();
size_t get_collator_data_ja_phonetic_alphanumeric_first_length();

const sal_uInt8* get_collator_data_ja_phonetic_alphanumeric_last();
size_t get_collator_data_ja_phonetic_alphanumeric_last_length();

const sal_uInt8* get_collator_data_ko_charset();
size_t get_collator_data_ko_charset_length();

const sal_uInt8* get_collator_data_ku_alphanumeric();
size_t get_collator_data_ku_alphanumeric_length();

const sal_uInt8* get_collator_data_ln_charset();
size_t get_collator_data_ln_charset_length();

const sal_uInt8* get_collator_data_my_dictionary();
size_t get_collator_data_my_dictionary_length();

const sal_uInt8* get_collator_data_ne_charset();
size_t get_collator_data_ne_charset_length();

const sal_uInt8* get_collator_data_sid_charset();
size_t get_collator_data_sid_charset_length();

const sal_uInt8* get_collator_data_vro_alphanumeric();
size_t get_collator_data_vro_alphanumeric_length();

const sal_uInt8* get_collator_data_zh_TW_charset();
size_t get_collator_data_zh_TW_charset_length();

const sal_uInt8* get_collator_data_zh_TW_radical();
size_t get_collator_data_zh_TW_radical_length();

const sal_uInt8* get_collator_data_zh_TW_stroke();
size_t get_collator_data_zh_TW_stroke_length();

const sal_uInt8* get_collator_data_zh_charset();
size_t get_collator_data_zh_charset_length();

const sal_uInt8* get_collator_data_zh_pinyin();
size_t get_collator_data_zh_pinyin_length();

const sal_uInt8* get_collator_data_zh_radical();
size_t get_collator_data_zh_radical_length();

const sal_uInt8* get_collator_data_zh_stroke();
size_t get_collator_data_zh_stroke_length();

const sal_uInt8* get_collator_data_zh_zhuyin();
size_t get_collator_data_zh_zhuyin_length();

}

#endif
120 sal_Int32 SAL_CALL
121 Collator_Unicode::compareSubstring( const OUString& str1, sal_Int32 off1, sal_Int32 len1,
122 const OUString& str2, sal_Int32 off2, sal_Int32 len2)
124 return collator->compare(reinterpret_cast<const UChar *>(str1.getStr()) + off1, len1, reinterpret_cast<const UChar *>(str2.getStr()) + off2, len2);
127 sal_Int32 SAL_CALL
128 Collator_Unicode::compareString( const OUString& str1, const OUString& str2)
130 return collator->compare(reinterpret_cast<const UChar *>(str1.getStr()), str1.getLength(),
131 reinterpret_cast<const UChar *>(str2.getStr()), str2.getLength());
#ifndef DISABLE_DYNLOADING

extern "C"
{
    // Intentionally empty: only the address of this function is used, as the
    // anchor passed to osl_loadModuleRelative() in loadCollatorAlgorithm().
    static void thisModule()
    {
    }
}

#endif
140 sal_Int32 SAL_CALL
141 Collator_Unicode::loadCollatorAlgorithm(const OUString& rAlgorithm, const lang::Locale& rLocale, sal_Int32 options)
143 if (!collator) {
144 UErrorCode status = U_ZERO_ERROR;
145 OUString rule = LocaleDataImpl::get()->getCollatorRuleByAlgorithm(rLocale, rAlgorithm);
146 if (!rule.isEmpty()) {
147 collator.reset( new icu::RuleBasedCollator(reinterpret_cast<const UChar *>(rule.getStr()), status) );
148 if (! U_SUCCESS(status)) {
149 OUString message = "icu::RuleBasedCollator ctor failed: " + OUString::createFromAscii(u_errorName(status));
150 SAL_WARN("i18npool", message);
151 throw RuntimeException(message);
154 if (!collator && OUString(LOCAL_RULE_LANGS).indexOf(rLocale.Language) >= 0) {
155 const sal_uInt8* (*func)() = nullptr;
156 size_t (*funclen)() = nullptr;
158 #ifndef DISABLE_DYNLOADING
159 OUStringBuffer aBuf;
160 #ifdef SAL_DLLPREFIX
161 aBuf.append(SAL_DLLPREFIX);
162 #endif
163 aBuf.append( "collator_data" ).append( SAL_DLLEXTENSION );
164 hModule = osl_loadModuleRelative( &thisModule, aBuf.makeStringAndClear().pData, SAL_LOADMODULE_DEFAULT );
165 if (hModule) {
166 aBuf.append("get_").append(rLocale.Language).append("_");
167 if ( rLocale.Language == "zh" ) {
168 OUString func_base = aBuf.makeStringAndClear();
169 if (OUString("TW HK MO").indexOf(rLocale.Country) >= 0)
171 func = reinterpret_cast<const sal_uInt8* (*)()>(osl_getFunctionSymbol(hModule,
172 OUString(func_base + "TW_" + rAlgorithm).pData));
173 funclen = reinterpret_cast<size_t (*)()>(osl_getFunctionSymbol(hModule,
174 OUString(func_base + "TW_" + rAlgorithm + "_length").pData));
176 if (!func)
178 func = reinterpret_cast<const sal_uInt8* (*)()>(osl_getFunctionSymbol(
179 hModule, OUString(func_base + rAlgorithm).pData));
180 funclen = reinterpret_cast<size_t (*)()>(osl_getFunctionSymbol(
181 hModule, OUString(func_base + rAlgorithm + "_length").pData));
183 } else {
184 if ( rLocale.Language == "ja" ) {
185 // replace algorithm name to implementation name.
186 if (rAlgorithm == "phonetic (alphanumeric first)")
187 aBuf.append("phonetic_alphanumeric_first");
188 else if (rAlgorithm == "phonetic (alphanumeric last)")
189 aBuf.append("phonetic_alphanumeric_last");
190 else
191 aBuf.append(rAlgorithm);
192 } else {
193 aBuf.append(rAlgorithm);
195 OUString func_base = aBuf.makeStringAndClear();
196 OUString funclen_base = func_base + "_length";
197 func = reinterpret_cast<const sal_uInt8* (*)()>(osl_getFunctionSymbol(hModule, func_base.pData));
198 funclen = reinterpret_cast<size_t (*)()>(osl_getFunctionSymbol(hModule, funclen_base.pData));
201 #else
202 if (false) {
204 #if WITH_LOCALE_ALL || WITH_LOCALE_ca
205 } else if ( rLocale.Language == "ca" ) {
206 if ( rAlgorithm == "charset" )
208 func = get_collator_data_ca_charset;
209 funclen = get_collator_data_ca_charset_length;
211 #endif
212 #if WITH_LOCALE_ALL || WITH_LOCALE_cu
213 } else if ( rLocale.Language == "cu" ) {
214 if ( rAlgorithm == "charset" )
216 func = get_collator_data_cu_charset;
217 funclen = get_collator_data_cu_charset_length;
219 #endif
220 #if WITH_LOCALE_ALL || WITH_LOCALE_dz
221 } else if ( rLocale.Language == "dz" || rLocale.Language == "bo" ) {
222 // 'bo' Tibetan uses the same collation rules as 'dz' Dzongkha
223 if ( rAlgorithm == "charset" )
225 func = get_collator_data_dz_charset;
226 funclen = get_collator_data_dz_charset_length;
228 #endif
229 #if WITH_LOCALE_ALL || WITH_LOCALE_hu
230 } else if ( rLocale.Language == "hu" ) {
231 if ( rAlgorithm == "charset" )
233 func = get_collator_data_hu_charset;
234 funclen = get_collator_data_hu_charset_length;
236 #endif
237 #if WITH_LOCALE_ALL || WITH_LOCALE_ja
238 } else if ( rLocale.Language == "ja" ) {
239 if ( rAlgorithm == "charset" )
241 func = get_collator_data_ja_charset;
242 funclen = get_collator_data_ja_charset_length;
244 else if ( rAlgorithm == "phonetic (alphanumeric first)" )
246 func = get_collator_data_ja_phonetic_alphanumeric_first;
247 funclen = get_collator_data_ja_phonetic_alphanumeric_first_length;
249 else if ( rAlgorithm == "phonetic (alphanumeric last)" )
251 func = get_collator_data_ja_phonetic_alphanumeric_last;
252 funclen = get_collator_data_ja_phonetic_alphanumeric_last_length;
254 #endif
255 #if WITH_LOCALE_ALL || WITH_LOCALE_ko
256 #if (U_ICU_VERSION_MAJOR_NUM < 53)
257 } else if ( rLocale.Language == "ko" ) {
258 if ( rAlgorithm == "charset" )
260 func = get_collator_data_ko_charset;
261 funclen = get_collator_data_ko_charset_length;
263 #endif
264 #endif
265 #if WITH_LOCALE_ALL || WITH_LOCALE_ku
266 } else if ( rLocale.Language == "ku" ) {
267 if ( rAlgorithm == "alphanumeric" )
269 func = get_collator_data_ku_alphanumeric;
270 funclen = get_collator_data_ku_alphanumeric_length;
272 #endif
273 #if WITH_LOCALE_ALL || WITH_LOCALE_ln
274 } else if ( rLocale.Language == "ln" ) {
275 if ( rAlgorithm == "charset" )
277 func = get_collator_data_ln_charset;
278 funclen = get_collator_data_ln_charset_length;
280 #endif
281 #if WITH_LOCALE_ALL || WITH_LOCALE_my
282 } else if ( rLocale.Language == "my" ) {
283 if ( rAlgorithm == "dictionary" )
285 func = get_collator_data_my_dictionary;
286 funclen = get_collator_data_my_dictionary_length;
288 #endif
289 #if WITH_LOCALE_ALL || WITH_LOCALE_ne
290 } else if ( rLocale.Language == "ne" ) {
291 if ( rAlgorithm == "charset" )
293 func = get_collator_data_ne_charset;
294 funclen = get_collator_data_ne_charset_length;
296 #endif
297 #if WITH_LOCALE_ALL || WITH_LOCALE_sid
298 } else if ( rLocale.Language == "sid" ) {
299 if ( rAlgorithm == "charset" )
301 func = get_collator_data_sid_charset;
302 funclen = get_collator_data_sid_charset_length;
304 #endif
305 #if WITH_LOCALE_ALL || WITH_LOCALE_vro
306 } else if ( rLocale.Language == "vro" ) {
307 if ( rAlgorithm == "alphanumeric" )
309 func = get_collator_data_vro_alphanumeric;
310 funclen = get_collator_data_vro_alphanumeric_length;
312 #endif
313 #if WITH_LOCALE_ALL || WITH_LOCALE_zh
314 } else if ( rLocale.Language == "zh" && (rLocale.Country == "TW" || rLocale.Country == "HK" || rLocale.Country == "MO") ) {
315 if ( rAlgorithm == "charset" )
317 func = get_collator_data_zh_TW_charset;
318 funclen = get_collator_data_zh_TW_charset_length;
320 else if ( rAlgorithm == "radical" )
322 func = get_collator_data_zh_TW_radical;
323 funclen = get_collator_data_zh_TW_radical_length;
325 else if ( rAlgorithm == "stroke" )
327 func = get_collator_data_zh_TW_stroke;
328 funclen = get_collator_data_zh_TW_stroke_length;
330 } else if ( rLocale.Language == "zh" ) {
331 if ( rAlgorithm == "charset" )
333 func = get_collator_data_zh_charset;
334 funclen = get_collator_data_zh_charset_length;
336 else if ( rAlgorithm == "pinyin" )
338 func = get_collator_data_zh_pinyin;
339 funclen = get_collator_data_zh_pinyin_length;
341 else if ( rAlgorithm == "radical" )
343 func = get_collator_data_zh_radical;
344 funclen = get_collator_data_zh_radical_length;
346 else if ( rAlgorithm == "stroke" )
348 func = get_collator_data_zh_stroke;
349 funclen = get_collator_data_zh_stroke_length;
351 else if ( rAlgorithm == "zhuyin" )
353 func = get_collator_data_zh_zhuyin;
354 funclen = get_collator_data_zh_zhuyin_length;
356 #endif
358 #endif // DISABLE_DYNLOADING
359 if (func && funclen) {
360 const sal_uInt8* ruleImage=func();
361 size_t ruleImageSize = funclen();
363 #if (U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM <= 2)
364 uca_base = new icu::RuleBasedCollator(static_cast<UChar*>(NULL), status);
365 #else
366 // Not only changed ICU 53.1 the API behavior that a negative
367 // length (ruleImageSize) now leads to failure, but also that
368 // the base RuleBasedCollator passed as uca_base here needs to
369 // have a base->tailoring == CollationRoot::getRoot() otherwise
370 // the init bails out as well, as it does for the previously
371 // used "empty" RuleBasedCollator.
372 // The default collator of the en-US locale would also fulfill
373 // the requirement. The collator of the actual locale or the
374 // NULL (default) locale does not.
375 uca_base.reset( static_cast<icu::RuleBasedCollator*>(icu::Collator::createInstance(
376 icu::Locale::getRoot(), status)) );
377 #endif
378 if (! U_SUCCESS(status)) {
379 OUString message = "icu::Collator::createInstance() failed: " + OUString::createFromAscii(u_errorName(status));
380 SAL_WARN("i18npool", message);
381 throw RuntimeException(message);
383 collator.reset( new icu::RuleBasedCollator(
384 reinterpret_cast<const uint8_t*>(ruleImage), ruleImageSize, uca_base.get(), status) );
385 if (! U_SUCCESS(status)) {
386 OUString message = "icu::RuleBasedCollator ctor failed: " + OUString::createFromAscii(u_errorName(status));
387 SAL_WARN("i18npool", message);
388 throw RuntimeException(message);
392 if (!collator) {
393 /** ICU collators are loaded using a locale only.
394 ICU uses Variant as collation algorithm name (like de__PHONEBOOK
395 locale), note the empty territory (Country) designator in this special
396 case here.
397 But sometimes the mapping fails, eg for German (from Germany) phonebook, we'll have "de_DE_PHONEBOOK"
398 this one won't be remapping to collation keyword specifiers "de@collation=phonebook"
399 See http://userguide.icu-project.org/locale#TOC-Variant-code, Level 2 canonicalization, 8.
400 So let variant empty and use the fourth arg of icuLocale "keywords"
401 See LanguageTagIcu::getIcuLocale from i18nlangtag/source/languagetag/languagetagicu.cxx
402 The icu::Locale constructor changes the algorithm name to
403 uppercase itself, so we don't have to bother with that.
405 icu::Locale icuLocale( LanguageTagIcu::getIcuLocale( LanguageTag( rLocale),
406 "", rAlgorithm.isEmpty() ? OUString("") : "collation=" + rAlgorithm));
408 // FIXME: apparently we get here in LOKit case only. When the language is Japanese, we pass "ja@collation=phonetic (alphanumeric first)" to ICU
409 // and ICU does not like this (U_ILLEGAL_ARGUMENT_ERROR). Subsequently LOKit crashes, because collator is nullptr.
410 if (!strcmp(icuLocale.getLanguage(), "ja"))
411 icuLocale = icu::Locale::getJapanese();
413 // load ICU collator
414 collator.reset( static_cast<icu::RuleBasedCollator*>( icu::Collator::createInstance(icuLocale, status) ) );
415 if (! U_SUCCESS(status)) {
416 OUString message = "icu::Collator::createInstance() failed: " + OUString::createFromAscii(u_errorName(status));
417 SAL_WARN("i18npool", message);
418 throw RuntimeException(message);
423 if (options & CollatorOptions::CollatorOptions_IGNORE_CASE_ACCENT)
424 collator->setStrength(icu::Collator::PRIMARY);
425 else if (options & CollatorOptions::CollatorOptions_IGNORE_CASE)
426 collator->setStrength(icu::Collator::SECONDARY);
427 else
428 collator->setStrength(icu::Collator::TERTIARY);
430 return 0;
434 OUString SAL_CALL
435 Collator_Unicode::getImplementationName()
437 return implementationName;
440 sal_Bool SAL_CALL
441 Collator_Unicode::supportsService(const OUString& rServiceName)
443 return cppu::supportsService(this, rServiceName);
446 Sequence< OUString > SAL_CALL
447 Collator_Unicode::getSupportedServiceNames()
449 Sequence< OUString > aRet { OUString(implementationName) };
450 return aRet;
455 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */