1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <config_locales.h>
22 #include <sal/log.hxx>
24 #include <lrl_include.hxx>
26 #include <rtl/ustrbuf.hxx>
27 #include <i18nlangtag/languagetag.hxx>
28 #include <i18nlangtag/languagetagicu.hxx>
29 #include <collator_unicode.hxx>
30 #include <localedata.hxx>
31 #include <com/sun/star/i18n/CollatorOptions.hpp>
32 #include <cppuhelper/supportsservice.hxx>
34 using namespace ::com::sun::star
;
35 using namespace ::com::sun::star::i18n
;
36 using namespace ::com::sun::star::lang
;
37 using namespace ::com::sun::star::uno
;
// Implementation name of this service. It is the value returned by
// getImplementationName() and the single entry of the sequence built in
// getSupportedServiceNames().
41 const char implementationName
[] = "com.sun.star.i18n.Collator_Unicode";
// Constructor.
// NOTE(review): the constructor body is not visible in this excerpt; only the
// DISABLE_DYNLOADING guard below is. Presumably the guarded part initialises
// hModule (the destructor tests it) - confirm against the complete file.
43 Collator_Unicode::Collator_Unicode()
47 #ifndef DISABLE_DYNLOADING
// Destructor. In builds with dynamic loading (DISABLE_DYNLOADING not defined)
// it unloads the module referenced by hModule, if one is set. hModule is
// assigned in loadCollatorAlgorithm() from osl_loadModuleRelative().
52 Collator_Unicode::~Collator_Unicode()
56 #ifndef DISABLE_DYNLOADING
// Only unload when a module handle is actually held.
57 if (hModule
) osl_unloadModule(hModule
);
// Declarations used only by builds without dynamic loading: in that case
// loadCollatorAlgorithm() assigns these functions directly to its func /
// funclen pointers instead of resolving symbols by name at run time.
61 #ifdef DISABLE_DYNLOADING
65 // For DISABLE_DYNLOADING the generated functions have names that
66 // start with get_collator_data_ to avoid clashing with a few
67 // functions in the generated libindex_data that are called just
68 // get_zh_pinyin for instance.
// Data getters, named get_collator_data_<language>[_TW]_<algorithm>.
// loadCollatorAlgorithm() passes the returned pointer as the rule image to
// the icu::RuleBasedCollator constructor.
70 const sal_uInt8
* get_collator_data_ca_charset();
71 const sal_uInt8
* get_collator_data_cu_charset();
72 const sal_uInt8
* get_collator_data_dz_charset();
73 const sal_uInt8
* get_collator_data_hu_charset();
74 const sal_uInt8
* get_collator_data_ja_charset();
75 const sal_uInt8
* get_collator_data_ja_phonetic_alphanumeric_first();
76 const sal_uInt8
* get_collator_data_ja_phonetic_alphanumeric_last();
77 const sal_uInt8
* get_collator_data_ko_charset();
78 const sal_uInt8
* get_collator_data_ku_alphanumeric();
79 const sal_uInt8
* get_collator_data_ln_charset();
80 const sal_uInt8
* get_collator_data_my_dictionary();
81 const sal_uInt8
* get_collator_data_ne_charset();
82 const sal_uInt8
* get_collator_data_sid_charset();
83 const sal_uInt8
* get_collator_data_vro_alphanumeric();
84 const sal_uInt8
* get_collator_data_zh_TW_charset();
85 const sal_uInt8
* get_collator_data_zh_TW_radical();
86 const sal_uInt8
* get_collator_data_zh_TW_stroke();
87 const sal_uInt8
* get_collator_data_zh_charset();
88 const sal_uInt8
* get_collator_data_zh_pinyin();
89 const sal_uInt8
* get_collator_data_zh_radical();
90 const sal_uInt8
* get_collator_data_zh_stroke();
91 const sal_uInt8
* get_collator_data_zh_zhuyin();
// Matching length getters, one per data getter above (same name plus
// "_length"). loadCollatorAlgorithm() passes the returned value as the size
// of the rule image.
93 size_t get_collator_data_ca_charset_length();
94 size_t get_collator_data_cu_charset_length();
95 size_t get_collator_data_dz_charset_length();
96 size_t get_collator_data_hu_charset_length();
97 size_t get_collator_data_ja_charset_length();
98 size_t get_collator_data_ja_phonetic_alphanumeric_first_length();
99 size_t get_collator_data_ja_phonetic_alphanumeric_last_length();
100 size_t get_collator_data_ko_charset_length();
101 size_t get_collator_data_ku_alphanumeric_length();
102 size_t get_collator_data_ln_charset_length();
103 size_t get_collator_data_my_dictionary_length();
104 size_t get_collator_data_ne_charset_length();
105 size_t get_collator_data_sid_charset_length();
106 size_t get_collator_data_vro_alphanumeric_length();
107 size_t get_collator_data_zh_TW_charset_length();
108 size_t get_collator_data_zh_TW_radical_length();
109 size_t get_collator_data_zh_TW_stroke_length();
110 size_t get_collator_data_zh_charset_length();
111 size_t get_collator_data_zh_pinyin_length();
112 size_t get_collator_data_zh_radical_length();
113 size_t get_collator_data_zh_stroke_length();
114 size_t get_collator_data_zh_zhuyin_length();
// Compares a section of str1 with a section of str2 using the ICU collator
// held in the member `collator`.
//
// @param str1  first string
// @param off1  start offset into str1, in UTF-16 code units (it is added to
//              the UChar pointer obtained from str1.getStr())
// @param len1  number of code units of str1 to compare
// @param str2  second string
// @param off2  start offset into str2, in UTF-16 code units
// @param len2  number of code units of str2 to compare
// @return      the result of icu::RuleBasedCollator::compare() for the two
//              sections
//
// NOTE(review): the offsets and lengths are forwarded to ICU without any
// range check against getLength(), and `collator` is dereferenced without a
// null check - this assumes the caller passes valid ranges and that
// loadCollatorAlgorithm() succeeded beforehand; confirm.
121 Collator_Unicode::compareSubstring( const OUString
& str1
, sal_Int32 off1
, sal_Int32 len1
,
122 const OUString
& str2
, sal_Int32 off2
, sal_Int32 len2
)
124 return collator
->compare(reinterpret_cast<const UChar
*>(str1
.getStr()) + off1
, len1
, reinterpret_cast<const UChar
*>(str2
.getStr()) + off2
, len2
);
// Compares two complete strings using the ICU collator held in the member
// `collator`. Each string is passed to ICU as a UChar pointer together with
// its full length.
//
// @param str1  first string
// @param str2  second string
// @return      the result of icu::RuleBasedCollator::compare()
//
// NOTE(review): `collator` is dereferenced without a null check - this
// assumes loadCollatorAlgorithm() succeeded beforehand; confirm.
128 Collator_Unicode::compareString( const OUString
& str1
, const OUString
& str2
)
130 return collator
->compare(reinterpret_cast<const UChar
*>(str1
.getStr()), str1
.getLength(),
131 reinterpret_cast<const UChar
*>(str2
.getStr()), str2
.getLength());
// Empty function that exists only for its address: loadCollatorAlgorithm()
// passes &thisModule to osl_loadModuleRelative() as the reference point for
// loading the collator data library. Only needed when dynamic loading is
// enabled.
134 #ifndef DISABLE_DYNLOADING
136 extern "C" { static void thisModule() {} }
// Creates the ICU collator for the given algorithm and locale, stores it in
// the member `collator`, and finally sets its strength from `options`.
//
// Sources for the collator, in the order they appear below:
//   1. a textual rule returned by LocaleDataImpl::getCollatorRuleByAlgorithm();
//   2. if there is no collator yet and LOCAL_RULE_LANGS contains the locale's
//      language: a binary rule image obtained through a pair of getter
//      functions (data + length). With dynamic loading they are resolved by
//      name from the collator_data library; with DISABLE_DYNLOADING they are
//      picked from the get_collator_data_* declarations;
//   3. icu::Collator::createInstance() for an ICU locale built from rLocale
//      with a "collation=<algorithm>" keyword.
//
// @param rAlgorithm  collation algorithm name, e.g. "charset", "pinyin" or
//                    "phonetic (alphanumeric first)"; may be empty
// @param rLocale     locale; its Language (and for "zh" its Country) selects
//                    the rule data
// @param options     CollatorOptions bit flags, evaluated at the end to choose
//                    the collator strength
// @throws RuntimeException  whenever ICU reports a failure status after one of
//                           the constructor / createInstance() calls
//
// NOTE(review): a number of lines (braces, `else` branches, #else / #endif
// directives and the final return) are not visible in this excerpt, so the
// exact nesting of the branches below has to be confirmed against the
// complete file.
141 Collator_Unicode::loadCollatorAlgorithm(const OUString
& rAlgorithm
, const lang::Locale
& rLocale
, sal_Int32 options
)
144 UErrorCode status
= U_ZERO_ERROR
;
// Step 1: textual rule from the locale data, if there is one.
145 OUString rule
= LocaleDataImpl::get()->getCollatorRuleByAlgorithm(rLocale
, rAlgorithm
);
146 if (!rule
.isEmpty()) {
147 collator
.reset( new icu::RuleBasedCollator(reinterpret_cast<const UChar
*>(rule
.getStr()), status
) );
148 if (! U_SUCCESS(status
)) {
149 OUString message
= "icu::RuleBasedCollator ctor failed: " + OUString::createFromAscii(u_errorName(status
));
150 SAL_WARN("i18npool", message
);
151 throw RuntimeException(message
);
// Step 2: no collator yet and the language is listed in LOCAL_RULE_LANGS,
// so look for a binary rule image.
// NOTE(review): indexOf() is a substring search, so any Language value that
// occurs anywhere inside LOCAL_RULE_LANGS matches - confirm this is intended.
154 if (!collator
&& OUString(LOCAL_RULE_LANGS
).indexOf(rLocale
.Language
) >= 0) {
// Getter for the rule image and getter for its size; both stay null when
// no matching data is found.
155 const sal_uInt8
* (*func
)() = nullptr;
156 size_t (*funclen
)() = nullptr;
// Dynamic build: resolve both getters by symbol name from the collator_data
// library.
158 #ifndef DISABLE_DYNLOADING
// Library file name: SAL_DLLPREFIX + "collator_data" + SAL_DLLEXTENSION.
// (The declaration of aBuf is not visible in this excerpt.)
161 aBuf
.append(SAL_DLLPREFIX
);
163 aBuf
.append( "collator_data" ).append( SAL_DLLEXTENSION
);
// NOTE(review): hModule is assigned here without first unloading a handle it
// may already hold, while the destructor unloads only once. If this function
// can run more than once per instance, the earlier handle is never released -
// confirm whether a guard exists in the lines not shown.
164 hModule
= osl_loadModuleRelative( &thisModule
, aBuf
.makeStringAndClear().pData
, SAL_LOADMODULE_DEFAULT
);
// Symbol names have the form get_<Language>_[TW_]<algorithm>[_length].
166 aBuf
.append("get_").append(rLocale
.Language
).append("_");
167 if ( rLocale
.Language
== "zh" ) {
168 OUString func_base
= aBuf
.makeStringAndClear();
// Countries TW, HK and MO use the "TW_" variants of the zh data.
// NOTE(review): this is also a substring search, so a Country such as "W"
// or "K" would match as well - confirm.
169 if (OUString("TW HK MO").indexOf(rLocale
.Country
) >= 0)
171 func
= reinterpret_cast<const sal_uInt8
* (*)()>(osl_getFunctionSymbol(hModule
,
172 OUString(func_base
+ "TW_" + rAlgorithm
).pData
));
173 funclen
= reinterpret_cast<size_t (*)()>(osl_getFunctionSymbol(hModule
,
174 OUString(func_base
+ "TW_" + rAlgorithm
+ "_length").pData
));
// Remaining zh locales (presumably the else branch; the line is not
// visible): no country infix in the symbol name.
178 func
= reinterpret_cast<const sal_uInt8
* (*)()>(osl_getFunctionSymbol(
179 hModule
, OUString(func_base
+ rAlgorithm
).pData
));
180 funclen
= reinterpret_cast<size_t (*)()>(osl_getFunctionSymbol(
181 hModule
, OUString(func_base
+ rAlgorithm
+ "_length").pData
));
// All other languages. For "ja" the two phonetic algorithm names contain
// spaces and parentheses, so they are replaced by the identifiers used in
// the generated function names.
184 if ( rLocale
.Language
== "ja" ) {
185 // Replace the algorithm name with the name used by the implementation.
186 if (rAlgorithm
== "phonetic (alphanumeric first)")
187 aBuf
.append("phonetic_alphanumeric_first");
188 else if (rAlgorithm
== "phonetic (alphanumeric last)")
189 aBuf
.append("phonetic_alphanumeric_last");
// Any other ja algorithm name is appended unchanged (presumably the final
// else of the chain above; the line is not visible).
191 aBuf
.append(rAlgorithm
);
// Languages other than ja (presumably the outer else branch): the algorithm
// name is appended unchanged.
193 aBuf
.append(rAlgorithm
);
// Resolve the data getter and the matching "_length" getter.
195 OUString func_base
= aBuf
.makeStringAndClear();
196 OUString funclen_base
= func_base
+ "_length";
197 func
= reinterpret_cast<const sal_uInt8
* (*)()>(osl_getFunctionSymbol(hModule
, func_base
.pData
));
198 funclen
= reinterpret_cast<size_t (*)()>(osl_getFunctionSymbol(hModule
, funclen_base
.pData
));
// Static build (presumably the #else part of the DISABLE_DYNLOADING check;
// the directive itself is not visible): choose the getter pair by comparing
// language and algorithm. Each language is compiled in only when
// WITH_LOCALE_ALL or the matching WITH_LOCALE_<lang> is set.
204 #if WITH_LOCALE_ALL || WITH_LOCALE_ca
205 } else if ( rLocale
.Language
== "ca" ) {
206 if ( rAlgorithm
== "charset" )
208 func
= get_collator_data_ca_charset
;
209 funclen
= get_collator_data_ca_charset_length
;
212 #if WITH_LOCALE_ALL || WITH_LOCALE_cu
213 } else if ( rLocale
.Language
== "cu" ) {
214 if ( rAlgorithm
== "charset" )
216 func
= get_collator_data_cu_charset
;
217 funclen
= get_collator_data_cu_charset_length
;
220 #if WITH_LOCALE_ALL || WITH_LOCALE_dz
221 } else if ( rLocale
.Language
== "dz" || rLocale
.Language
== "bo" ) {
222 // 'bo' Tibetan uses the same collation rules as 'dz' Dzongkha
223 if ( rAlgorithm
== "charset" )
225 func
= get_collator_data_dz_charset
;
226 funclen
= get_collator_data_dz_charset_length
;
229 #if WITH_LOCALE_ALL || WITH_LOCALE_hu
230 } else if ( rLocale
.Language
== "hu" ) {
231 if ( rAlgorithm
== "charset" )
233 func
= get_collator_data_hu_charset
;
234 funclen
= get_collator_data_hu_charset_length
;
237 #if WITH_LOCALE_ALL || WITH_LOCALE_ja
238 } else if ( rLocale
.Language
== "ja" ) {
239 if ( rAlgorithm
== "charset" )
241 func
= get_collator_data_ja_charset
;
242 funclen
= get_collator_data_ja_charset_length
;
244 else if ( rAlgorithm
== "phonetic (alphanumeric first)" )
246 func
= get_collator_data_ja_phonetic_alphanumeric_first
;
247 funclen
= get_collator_data_ja_phonetic_alphanumeric_first_length
;
249 else if ( rAlgorithm
== "phonetic (alphanumeric last)" )
251 func
= get_collator_data_ja_phonetic_alphanumeric_last
;
252 funclen
= get_collator_data_ja_phonetic_alphanumeric_last_length
;
// The bundled ko data is used only with ICU versions older than 53.
255 #if WITH_LOCALE_ALL || WITH_LOCALE_ko
256 #if (U_ICU_VERSION_MAJOR_NUM < 53)
257 } else if ( rLocale
.Language
== "ko" ) {
258 if ( rAlgorithm
== "charset" )
260 func
= get_collator_data_ko_charset
;
261 funclen
= get_collator_data_ko_charset_length
;
265 #if WITH_LOCALE_ALL || WITH_LOCALE_ku
266 } else if ( rLocale
.Language
== "ku" ) {
267 if ( rAlgorithm
== "alphanumeric" )
269 func
= get_collator_data_ku_alphanumeric
;
270 funclen
= get_collator_data_ku_alphanumeric_length
;
273 #if WITH_LOCALE_ALL || WITH_LOCALE_ln
274 } else if ( rLocale
.Language
== "ln" ) {
275 if ( rAlgorithm
== "charset" )
277 func
= get_collator_data_ln_charset
;
278 funclen
= get_collator_data_ln_charset_length
;
281 #if WITH_LOCALE_ALL || WITH_LOCALE_my
282 } else if ( rLocale
.Language
== "my" ) {
283 if ( rAlgorithm
== "dictionary" )
285 func
= get_collator_data_my_dictionary
;
286 funclen
= get_collator_data_my_dictionary_length
;
289 #if WITH_LOCALE_ALL || WITH_LOCALE_ne
290 } else if ( rLocale
.Language
== "ne" ) {
291 if ( rAlgorithm
== "charset" )
293 func
= get_collator_data_ne_charset
;
294 funclen
= get_collator_data_ne_charset_length
;
297 #if WITH_LOCALE_ALL || WITH_LOCALE_sid
298 } else if ( rLocale
.Language
== "sid" ) {
299 if ( rAlgorithm
== "charset" )
301 func
= get_collator_data_sid_charset
;
302 funclen
= get_collator_data_sid_charset_length
;
305 #if WITH_LOCALE_ALL || WITH_LOCALE_vro
306 } else if ( rLocale
.Language
== "vro" ) {
307 if ( rAlgorithm
== "alphanumeric" )
309 func
= get_collator_data_vro_alphanumeric
;
310 funclen
= get_collator_data_vro_alphanumeric_length
;
// zh: countries TW, HK and MO share the zh_TW data (compared exactly here,
// unlike the substring search in the dynamic branch); every other zh locale
// uses the plain zh data.
313 #if WITH_LOCALE_ALL || WITH_LOCALE_zh
314 } else if ( rLocale
.Language
== "zh" && (rLocale
.Country
== "TW" || rLocale
.Country
== "HK" || rLocale
.Country
== "MO") ) {
315 if ( rAlgorithm
== "charset" )
317 func
= get_collator_data_zh_TW_charset
;
318 funclen
= get_collator_data_zh_TW_charset_length
;
320 else if ( rAlgorithm
== "radical" )
322 func
= get_collator_data_zh_TW_radical
;
323 funclen
= get_collator_data_zh_TW_radical_length
;
325 else if ( rAlgorithm
== "stroke" )
327 func
= get_collator_data_zh_TW_stroke
;
328 funclen
= get_collator_data_zh_TW_stroke_length
;
330 } else if ( rLocale
.Language
== "zh" ) {
331 if ( rAlgorithm
== "charset" )
333 func
= get_collator_data_zh_charset
;
334 funclen
= get_collator_data_zh_charset_length
;
336 else if ( rAlgorithm
== "pinyin" )
338 func
= get_collator_data_zh_pinyin
;
339 funclen
= get_collator_data_zh_pinyin_length
;
341 else if ( rAlgorithm
== "radical" )
343 func
= get_collator_data_zh_radical
;
344 funclen
= get_collator_data_zh_radical_length
;
346 else if ( rAlgorithm
== "stroke" )
348 func
= get_collator_data_zh_stroke
;
349 funclen
= get_collator_data_zh_stroke_length
;
351 else if ( rAlgorithm
== "zhuyin" )
353 func
= get_collator_data_zh_zhuyin
;
354 funclen
= get_collator_data_zh_zhuyin_length
;
358 #endif // DISABLE_DYNLOADING
// Both getters were found: build the collator from the binary rule image.
// If either is null this part is skipped and `collator` stays unset here.
359 if (func
&& funclen
) {
360 const sal_uInt8
* ruleImage
=func();
361 size_t ruleImageSize
= funclen();
// ICU 4.2 and older: an "empty" RuleBasedCollator serves as the base.
363 #if (U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM <= 2)
364 uca_base
= new icu::RuleBasedCollator(static_cast<UChar
*>(NULL
), status
);
366 // ICU 53.1 not only changed the API behavior so that a negative
367 // length (ruleImageSize) now leads to failure; the base
368 // RuleBasedCollator passed as uca_base here also needs to
369 // have base->tailoring == CollationRoot::getRoot(), otherwise
370 // the init bails out as well, as it does for the previously
371 // used "empty" RuleBasedCollator.
372 // The default collator of the en-US locale would also fulfill
373 // the requirement. The collator of the actual locale or the
374 // NULL (default) locale does not.
// NOTE(review): the static_cast below assumes createInstance() returns a
// RuleBasedCollator; the cast is unchecked - confirm.
375 uca_base
.reset( static_cast<icu::RuleBasedCollator
*>(icu::Collator::createInstance(
376 icu::Locale::getRoot(), status
)) );
378 if (! U_SUCCESS(status
)) {
379 OUString message
= "icu::Collator::createInstance() failed: " + OUString::createFromAscii(u_errorName(status
));
380 SAL_WARN("i18npool", message
);
381 throw RuntimeException(message
);
// Instantiate the collator from the rule image, its size and the base
// collator created above.
383 collator
.reset( new icu::RuleBasedCollator(
384 reinterpret_cast<const uint8_t*>(ruleImage
), ruleImageSize
, uca_base
.get(), status
) );
385 if (! U_SUCCESS(status
)) {
386 OUString message
= "icu::RuleBasedCollator ctor failed: " + OUString::createFromAscii(u_errorName(status
));
387 SAL_WARN("i18npool", message
);
388 throw RuntimeException(message
);
393 /** ICU collators are loaded using a locale only.
394 ICU uses Variant as collation algorithm name (like de__PHONEBOOK
395 locale), note the empty territory (Country) designator in this special
397 But sometimes the mapping fails, e.g. for German (from Germany) phonebook we'll have "de_DE_PHONEBOOK";
398 this one won't be remapped to the collation keyword specifier "de@collation=phonebook".
399 See http://userguide.icu-project.org/locale#TOC-Variant-code, Level 2 canonicalization, 8.
400 So leave the variant empty and use the fourth arg of icuLocale, "keywords".
401 See LanguageTagIcu::getIcuLocale from i18nlangtag/source/languagetag/languagetagicu.cxx
402 The icu::Locale constructor changes the algorithm name to
403 uppercase itself, so we don't have to bother with that.
// Step 3: let ICU supply the collator for the locale. The algorithm, when
// not empty, is passed as a "collation=<algorithm>" keyword.
// NOTE(review): the line guarding this step is not visible in this excerpt;
// presumably it runs only when no collator was created above - confirm.
405 icu::Locale
icuLocale( LanguageTagIcu::getIcuLocale( LanguageTag( rLocale
),
406 "", rAlgorithm
.isEmpty() ? OUString("") : "collation=" + rAlgorithm
));
408 // FIXME: apparently we get here in LOKit case only. When the language is Japanese, we pass "ja@collation=phonetic (alphanumeric first)" to ICU
409 // and ICU does not like this (U_ILLEGAL_ARGUMENT_ERROR). Subsequently LOKit crashes, because collator is nullptr.
// Workaround for the FIXME above: for Japanese the keyword is dropped by
// replacing the locale with ICU's plain Japanese locale.
410 if (!strcmp(icuLocale
.getLanguage(), "ja"))
411 icuLocale
= icu::Locale::getJapanese();
// NOTE(review): unchecked static_cast, same assumption as for uca_base.
414 collator
.reset( static_cast<icu::RuleBasedCollator
*>( icu::Collator::createInstance(icuLocale
, status
) ) );
415 if (! U_SUCCESS(status
)) {
416 OUString message
= "icu::Collator::createInstance() failed: " + OUString::createFromAscii(u_errorName(status
));
417 SAL_WARN("i18npool", message
);
418 throw RuntimeException(message
);
// Strength selection: IGNORE_CASE_ACCENT is tested first and selects PRIMARY,
// IGNORE_CASE selects SECONDARY, and the remaining case (presumably the final
// else; the line is not visible) selects TERTIARY.
423 if (options
& CollatorOptions::CollatorOptions_IGNORE_CASE_ACCENT
)
424 collator
->setStrength(icu::Collator::PRIMARY
);
425 else if (options
& CollatorOptions::CollatorOptions_IGNORE_CASE
)
426 collator
->setStrength(icu::Collator::SECONDARY
);
428 collator
->setStrength(icu::Collator::TERTIARY
);
// Returns the implementation name of this service, i.e. the file-level
// constant implementationName.
435 Collator_Unicode::getImplementationName()
437 return implementationName
;
// Reports whether this object supports the service rServiceName. The check
// is delegated to the cppu::supportsService() helper, which is handed this
// object and the requested name.
//
// @param rServiceName  service name to look for
// @return              the result of cppu::supportsService()
441 Collator_Unicode::supportsService(const OUString
& rServiceName
)
443 return cppu::supportsService(this, rServiceName
);
// Builds the list of supported service names: a sequence with exactly one
// element, the implementation name.
// NOTE(review): the return statement is not visible in this excerpt;
// presumably aRet is returned - confirm against the complete file.
446 Sequence
< OUString
> SAL_CALL
447 Collator_Unicode::getSupportedServiceNames()
449 Sequence
< OUString
> aRet
{ OUString(implementationName
) };
455 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */