1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <config_locales.h>
22 #include <lrl_include.hxx>
24 #include <rtl/ustrbuf.hxx>
25 #include <i18nlangtag/languagetag.hxx>
26 #include <i18nlangtag/languagetagicu.hxx>
27 #include <collator_unicode.hxx>
28 #include <localedata.hxx>
29 #include <com/sun/star/i18n/CollatorOptions.hpp>
30 #include <cppuhelper/supportsservice.hxx>
32 using namespace ::com::sun::star
;
33 using namespace ::com::sun::star::i18n
;
34 using namespace ::com::sun::star::lang
;
35 using namespace ::com::sun::star::uno
;
// Implementation name of this component. getImplementationName() returns it
// and getSupportedServiceNames() uses it as the entry of its result sequence.
39 static const char implementationName
[] = "com.sun.star.i18n.Collator_Unicode";
// Constructor.
// NOTE(review): the constructor body is not visible in this extract; only the
// DISABLE_DYNLOADING guard below is. Presumably it wraps initialization of
// state that exists only when dynamic loading is enabled — confirm against
// the full file.
41 Collator_Unicode::Collator_Unicode()
45 #ifndef DISABLE_DYNLOADING
// Destructor. In builds where dynamic loading is enabled, unloads the module
// referenced by hModule if it is set (hModule is assigned from
// osl_loadModuleRelative() in loadCollatorAlgorithm()).
50 Collator_Unicode::~Collator_Unicode()
54 #ifndef DISABLE_DYNLOADING
55 if (hModule
) osl_unloadModule(hModule
);
59 #ifdef DISABLE_DYNLOADING
63 // In DISABLE_DYNLOADING builds the generated accessor functions have
64 // names that start with get_collator_data_, so that they do not clash
65 // with functions in the generated libindex_data, which have short names
66 // such as get_zh_pinyin.
// Each accessor below returns a pointer to a rule image; in
// loadCollatorAlgorithm() that pointer is passed to the
// icu::RuleBasedCollator constructor.
68 const sal_uInt8
* get_collator_data_ca_charset();
69 const sal_uInt8
* get_collator_data_cu_charset();
70 const sal_uInt8
* get_collator_data_dz_charset();
71 const sal_uInt8
* get_collator_data_hu_charset();
72 const sal_uInt8
* get_collator_data_ja_charset();
73 const sal_uInt8
* get_collator_data_ja_phonetic_alphanumeric_first();
74 const sal_uInt8
* get_collator_data_ja_phonetic_alphanumeric_last();
75 const sal_uInt8
* get_collator_data_ko_charset();
76 const sal_uInt8
* get_collator_data_ku_alphanumeric();
77 const sal_uInt8
* get_collator_data_ln_charset();
78 const sal_uInt8
* get_collator_data_my_dictionary();
79 const sal_uInt8
* get_collator_data_ne_charset();
80 const sal_uInt8
* get_collator_data_sid_charset();
81 const sal_uInt8
* get_collator_data_zh_TW_charset();
82 const sal_uInt8
* get_collator_data_zh_TW_radical();
83 const sal_uInt8
* get_collator_data_zh_TW_stroke();
84 const sal_uInt8
* get_collator_data_zh_charset();
85 const sal_uInt8
* get_collator_data_zh_pinyin();
86 const sal_uInt8
* get_collator_data_zh_radical();
87 const sal_uInt8
* get_collator_data_zh_stroke();
88 const sal_uInt8
* get_collator_data_zh_zhuyin();
// Length accessors, one per data accessor above. loadCollatorAlgorithm()
// passes the returned size together with the data pointer to the
// icu::RuleBasedCollator constructor.
90 size_t get_collator_data_ca_charset_length();
91 size_t get_collator_data_cu_charset_length();
92 size_t get_collator_data_dz_charset_length();
93 size_t get_collator_data_hu_charset_length();
94 size_t get_collator_data_ja_charset_length();
95 size_t get_collator_data_ja_phonetic_alphanumeric_first_length();
96 size_t get_collator_data_ja_phonetic_alphanumeric_last_length();
97 size_t get_collator_data_ko_charset_length();
98 size_t get_collator_data_ku_alphanumeric_length();
99 size_t get_collator_data_ln_charset_length();
100 size_t get_collator_data_my_dictionary_length();
101 size_t get_collator_data_ne_charset_length();
102 size_t get_collator_data_sid_charset_length();
103 size_t get_collator_data_zh_TW_charset_length();
104 size_t get_collator_data_zh_TW_radical_length();
105 size_t get_collator_data_zh_TW_stroke_length();
106 size_t get_collator_data_zh_charset_length();
107 size_t get_collator_data_zh_pinyin_length();
108 size_t get_collator_data_zh_radical_length();
109 size_t get_collator_data_zh_stroke_length();
110 size_t get_collator_data_zh_zhuyin_length();
// Compares a sub-range of str1 with a sub-range of str2 using the loaded
// collator and returns the result of the collator's compare() call.
//
// str1, off1, len1: first string, start offset into its buffer, and number
//                   of code units to compare.
// str2, off2, len2: the same for the second string.
//
// NOTE(review): in the visible code, collator is dereferenced without a null
// check and the offsets/lengths are forwarded without range checks — confirm
// that callers load an algorithm first and pass ranges inside the strings.
117 Collator_Unicode::compareSubstring( const OUString
& str1
, sal_Int32 off1
, sal_Int32 len1
,
118 const OUString
& str2
, sal_Int32 off2
, sal_Int32 len2
)
120 return collator
->compare(reinterpret_cast<const UChar
*>(str1
.getStr()) + off1
, len1
, reinterpret_cast<const UChar
*>(str2
.getStr()) + off2
, len2
);
// Compares str1 with str2 using the loaded collator and returns the result
// of the collator's compare() call. The string buffers are passed as UChar
// pointers without explicit lengths.
//
// NOTE(review): without lengths the comparison presumably ends at the first
// NUL code unit of each buffer — confirm which ICU compare() overload is
// selected here. collator is dereferenced without a null check.
124 Collator_Unicode::compareString( const OUString
& str1
, const OUString
& str2
)
126 return collator
->compare(reinterpret_cast<const UChar
*>(str1
.getStr()), reinterpret_cast<const UChar
*>(str2
.getStr()));
// Empty function with C linkage, compiled only when dynamic loading is
// enabled. Its address is passed as the first argument of
// osl_loadModuleRelative() in loadCollatorAlgorithm(); presumably it marks
// the module relative to which the collator data library is located —
// confirm against the osl module API.
129 #ifndef DISABLE_DYNLOADING
131 extern "C" { static void thisModule() {} }
// Sets up the ICU collator for the given algorithm and locale.
//
// The visible code obtains a collator from up to three sources:
//   1. A rule string from LocaleDataImpl::getCollatorRuleByAlgorithm(); if it
//      is non-empty, an icu::RuleBasedCollator is built from it.
//   2. If there is still no collator and rLocale.Language is found in
//      LOCAL_RULE_LANGS, a rule image returned by a pair of accessor
//      functions (data pointer + length). With dynamic loading the accessors
//      are looked up by name in the "collator_data" library; otherwise they
//      are chosen from the generated get_collator_data_* functions. The
//      collator is then built from that image on top of a root-locale base
//      collator (uca_base).
//   3. icu::Collator::createInstance() for an ICU locale that carries the
//      algorithm as a "collation=<algorithm>" keyword.
//      NOTE(review): the condition guarding step 3 is not visible in this
//      extract; presumably it runs only when steps 1 and 2 produced no
//      collator — confirm against the full file.
// Finally the collator strength is derived from the option flags.
//
// rAlgorithm: collation algorithm name, e.g. "charset", "pinyin",
//             "phonetic (alphanumeric first)".
// rLocale:    locale; its Language and Country fields select the rule data.
// options:    CollatorOptions bit flags (IGNORE_CASE_ACCENT, IGNORE_CASE).
//
// Throws RuntimeException whenever an ICU call leaves a failure status.
// NOTE(review): the return type and return statement are not visible in
// this extract.
136 Collator_Unicode::loadCollatorAlgorithm(const OUString
& rAlgorithm
, const lang::Locale
& rLocale
, sal_Int32 options
)
139 UErrorCode status
= U_ZERO_ERROR
;
// Step 1: rule string supplied by the locale data, if any.
140 OUString rule
= LocaleDataImpl::get()->getCollatorRuleByAlgorithm(rLocale
, rAlgorithm
);
141 if (!rule
.isEmpty()) {
142 collator
.reset( new icu::RuleBasedCollator(reinterpret_cast<const UChar
*>(rule
.getStr()), status
) );
143 if (! U_SUCCESS(status
)) throw RuntimeException();
// Step 2: rule image for languages listed in LOCAL_RULE_LANGS. func returns
// the image data and funclen its size; both stay null if nothing matches.
145 if (!collator
&& OUString(LOCAL_RULE_LANGS
).indexOf(rLocale
.Language
) >= 0) {
146 const sal_uInt8
* (*func
)() = nullptr;
147 size_t (*funclen
)() = nullptr;
149 #ifndef DISABLE_DYNLOADING
// Build the library file name from SAL_DLLPREFIX, "collator_data" and
// SAL_DLLEXTENSION, then load it relative to thisModule.
152 aBuf
.append(SAL_DLLPREFIX
);
154 aBuf
.append( "collator_data" ).append( SAL_DLLEXTENSION
);
155 hModule
= osl_loadModuleRelative( &thisModule
, aBuf
.makeStringAndClear().pData
, SAL_LOADMODULE_DEFAULT
);
// Symbol names looked up below have the form
// get_<language>_[TW_]<algorithm> and the same name with a _length suffix.
157 aBuf
.append("get_").append(rLocale
.Language
).append("_");
158 if ( rLocale
.Language
== "zh" ) {
159 OUString func_base
= aBuf
.makeStringAndClear();
// TW, HK and MO share the zh_TW data.
// NOTE(review): indexOf() is a substring search, whereas the branch without
// dynamic loading further down compares Country with == — confirm that the
// difference is intended.
160 if (OUString("TW HK MO").indexOf(rLocale
.Country
) >= 0)
162 func
= reinterpret_cast<const sal_uInt8
* (*)()>(osl_getFunctionSymbol(hModule
,
163 OUString(func_base
+ "TW_" + rAlgorithm
).pData
));
164 funclen
= reinterpret_cast<size_t (*)()>(osl_getFunctionSymbol(hModule
,
165 OUString(func_base
+ "TW_" + rAlgorithm
+ "_length").pData
));
169 func
= reinterpret_cast<const sal_uInt8
* (*)()>(osl_getFunctionSymbol(
170 hModule
, OUString(func_base
+ rAlgorithm
).pData
));
171 funclen
= reinterpret_cast<size_t (*)()>(osl_getFunctionSymbol(
172 hModule
, OUString(func_base
+ rAlgorithm
+ "_length").pData
));
175 if ( rLocale
.Language
== "ja" ) {
176 // Map the algorithm name to the name used in the function symbol.
177 if (rAlgorithm
== "phonetic (alphanumeric first)")
178 aBuf
.append("phonetic_alphanumeric_first");
179 else if (rAlgorithm
== "phonetic (alphanumeric last)")
180 aBuf
.append("phonetic_alphanumeric_last");
182 aBuf
.append(rAlgorithm
);
184 aBuf
.append(rAlgorithm
);
186 OUString func_base
= aBuf
.makeStringAndClear();
187 OUString funclen_base
= func_base
+ "_length";
188 func
= reinterpret_cast<const sal_uInt8
* (*)()>(osl_getFunctionSymbol(hModule
, func_base
.pData
));
189 funclen
= reinterpret_cast<size_t (*)()>(osl_getFunctionSymbol(hModule
, funclen_base
.pData
));
// Selection of the generated accessors by exact language (and, for zh,
// country) and algorithm name; each language is compiled in only when
// WITH_LOCALE_ALL or its WITH_LOCALE_<lang> macro is set.
// NOTE(review): presumably this is the DISABLE_DYNLOADING branch; the
// corresponding #else is not visible in this extract.
195 #if WITH_LOCALE_ALL || WITH_LOCALE_ca
196 } else if ( rLocale
.Language
== "ca" ) {
197 if ( rAlgorithm
== "charset" )
199 func
= get_collator_data_ca_charset
;
200 funclen
= get_collator_data_ca_charset_length
;
203 #if WITH_LOCALE_ALL || WITH_LOCALE_cu
204 } else if ( rLocale
.Language
== "cu" ) {
205 if ( rAlgorithm
== "charset" )
207 func
= get_collator_data_cu_charset
;
208 funclen
= get_collator_data_cu_charset_length
;
211 #if WITH_LOCALE_ALL || WITH_LOCALE_dz
212 } else if ( rLocale
.Language
== "dz" || rLocale
.Language
== "bo" ) {
213 // 'bo' Tibetan uses the same collation rules as 'dz' Dzongkha
214 if ( rAlgorithm
== "charset" )
216 func
= get_collator_data_dz_charset
;
217 funclen
= get_collator_data_dz_charset_length
;
220 #if WITH_LOCALE_ALL || WITH_LOCALE_hu
221 } else if ( rLocale
.Language
== "hu" ) {
222 if ( rAlgorithm
== "charset" )
224 func
= get_collator_data_hu_charset
;
225 funclen
= get_collator_data_hu_charset_length
;
228 #if WITH_LOCALE_ALL || WITH_LOCALE_ja
229 } else if ( rLocale
.Language
== "ja" ) {
230 if ( rAlgorithm
== "charset" )
232 func
= get_collator_data_ja_charset
;
233 funclen
= get_collator_data_ja_charset_length
;
235 else if ( rAlgorithm
== "phonetic (alphanumeric first)" )
237 func
= get_collator_data_ja_phonetic_alphanumeric_first
;
238 funclen
= get_collator_data_ja_phonetic_alphanumeric_first_length
;
240 else if ( rAlgorithm
== "phonetic (alphanumeric last)" )
242 func
= get_collator_data_ja_phonetic_alphanumeric_last
;
243 funclen
= get_collator_data_ja_phonetic_alphanumeric_last_length
;
246 #if WITH_LOCALE_ALL || WITH_LOCALE_ko
247 #if (U_ICU_VERSION_MAJOR_NUM < 53)
248 } else if ( rLocale
.Language
== "ko" ) {
249 if ( rAlgorithm
== "charset" )
251 func
= get_collator_data_ko_charset
;
252 funclen
= get_collator_data_ko_charset_length
;
256 #if WITH_LOCALE_ALL || WITH_LOCALE_ku
257 } else if ( rLocale
.Language
== "ku" ) {
258 if ( rAlgorithm
== "alphanumeric" )
260 func
= get_collator_data_ku_alphanumeric
;
261 funclen
= get_collator_data_ku_alphanumeric_length
;
264 #if WITH_LOCALE_ALL || WITH_LOCALE_ln
265 } else if ( rLocale
.Language
== "ln" ) {
266 if ( rAlgorithm
== "charset" )
268 func
= get_collator_data_ln_charset
;
269 funclen
= get_collator_data_ln_charset_length
;
272 #if WITH_LOCALE_ALL || WITH_LOCALE_my
273 } else if ( rLocale
.Language
== "my" ) {
274 if ( rAlgorithm
== "dictionary" )
276 func
= get_collator_data_my_dictionary
;
277 funclen
= get_collator_data_my_dictionary_length
;
280 #if WITH_LOCALE_ALL || WITH_LOCALE_ne
281 } else if ( rLocale
.Language
== "ne" ) {
282 if ( rAlgorithm
== "charset" )
284 func
= get_collator_data_ne_charset
;
285 funclen
= get_collator_data_ne_charset_length
;
288 #if WITH_LOCALE_ALL || WITH_LOCALE_sid
289 } else if ( rLocale
.Language
== "sid" ) {
290 if ( rAlgorithm
== "charset" )
292 func
= get_collator_data_sid_charset
;
293 funclen
= get_collator_data_sid_charset_length
;
296 #if WITH_LOCALE_ALL || WITH_LOCALE_zh
297 } else if ( rLocale
.Language
== "zh" && (rLocale
.Country
== "TW" || rLocale
.Country
== "HK" || rLocale
.Country
== "MO") ) {
298 if ( rAlgorithm
== "charset" )
300 func
= get_collator_data_zh_TW_charset
;
301 funclen
= get_collator_data_zh_TW_charset_length
;
303 else if ( rAlgorithm
== "radical" )
305 func
= get_collator_data_zh_TW_radical
;
306 funclen
= get_collator_data_zh_TW_radical_length
;
308 else if ( rAlgorithm
== "stroke" )
310 func
= get_collator_data_zh_TW_stroke
;
311 funclen
= get_collator_data_zh_TW_stroke_length
;
313 } else if ( rLocale
.Language
== "zh" ) {
314 if ( rAlgorithm
== "charset" )
316 func
= get_collator_data_zh_charset
;
317 funclen
= get_collator_data_zh_charset_length
;
319 else if ( rAlgorithm
== "pinyin" )
321 func
= get_collator_data_zh_pinyin
;
322 funclen
= get_collator_data_zh_pinyin_length
;
324 else if ( rAlgorithm
== "radical" )
326 func
= get_collator_data_zh_radical
;
327 funclen
= get_collator_data_zh_radical_length
;
329 else if ( rAlgorithm
== "stroke" )
331 func
= get_collator_data_zh_stroke
;
332 funclen
= get_collator_data_zh_stroke_length
;
334 else if ( rAlgorithm
== "zhuyin" )
336 func
= get_collator_data_zh_zhuyin
;
337 funclen
= get_collator_data_zh_zhuyin_length
;
341 #endif // DISABLE_DYNLOADING
// Only build a collator from the rule image when both accessors were found.
342 if (func
&& funclen
) {
343 const sal_uInt8
* ruleImage
=func();
344 size_t ruleImageSize
= funclen();
346 #if (U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM <= 2)
347 uca_base
= new icu::RuleBasedCollator(static_cast<UChar
*>(NULL
), status
);
349 // ICU 53.1 changed the API behavior in two ways: a negative
350 // length (ruleImageSize) now leads to failure, and the base
351 // RuleBasedCollator passed as uca_base here needs to have
352 // base->tailoring == CollationRoot::getRoot(), otherwise the
353 // init bails out as well, as it does for the previously used
354 // "empty" RuleBasedCollator.
355 // The default collator of the en-US locale would also fulfill
356 // the requirement. The collator of the actual locale or the
357 // NULL (default) locale does not.
358 uca_base
.reset( static_cast<icu::RuleBasedCollator
*>(icu::Collator::createInstance(
359 icu::Locale::getRoot(), status
)) );
361 if (! U_SUCCESS(status
)) throw RuntimeException();
362 collator
.reset( new icu::RuleBasedCollator(
363 reinterpret_cast<const uint8_t*>(ruleImage
), ruleImageSize
, uca_base
.get(), status
) );
364 if (! U_SUCCESS(status
)) throw RuntimeException();
368 /** ICU collators are loaded using a locale only.
369 ICU uses Variant as collation algorithm name (like de__PHONEBOOK
370 locale), note the empty territory (Country) designator in this special
372 But sometimes the mapping fails, e.g. for German (from Germany) phonebook we would have "de_DE_PHONEBOOK",
373 which is not remapped to the collation keyword specifier "de@collation=phonebook".
374 See http://userguide.icu-project.org/locale#TOC-Variant-code, Level 2 canonicalization, 8.
375 So leave the variant empty and use the fourth arg of icuLocale, "keywords".
376 See LanguageTagIcu::getIcuLocale from i18nlangtag/source/languagetag/languagetagicu.cxx
377 The icu::Locale constructor changes the algorithm name to
378 uppercase itself, so we don't have to bother with that.
380 icu::Locale
icuLocale( LanguageTagIcu::getIcuLocale( LanguageTag( rLocale
),
381 "", rAlgorithm
.isEmpty() ? OUString("") : "collation=" + rAlgorithm
));
384 collator
.reset( static_cast<icu::RuleBasedCollator
*>( icu::Collator::createInstance(icuLocale
, status
) ) );
385 if (! U_SUCCESS(status
)) throw RuntimeException();
// Derive the ICU strength from the option flags: IGNORE_CASE_ACCENT is
// tested first and selects PRIMARY, IGNORE_CASE selects SECONDARY, and
// TERTIARY is set on the remaining path.
389 if (options
& CollatorOptions::CollatorOptions_IGNORE_CASE_ACCENT
)
390 collator
->setStrength(icu::Collator::PRIMARY
);
391 else if (options
& CollatorOptions::CollatorOptions_IGNORE_CASE
)
392 collator
->setStrength(icu::Collator::SECONDARY
);
394 collator
->setStrength(icu::Collator::TERTIARY
);
// Returns the implementationName constant defined at the top of this file.
401 Collator_Unicode::getImplementationName()
403 return implementationName
;
// Reports whether rServiceName is supported, by delegating to
// cppu::supportsService() with this object and the requested name.
407 Collator_Unicode::supportsService(const OUString
& rServiceName
)
409 return cppu::supportsService(this, rServiceName
);
// Builds the sequence of supported service names; the visible code
// initializes it with implementationName as its entry.
// NOTE(review): the return statement is not visible in this extract;
// presumably aRet is returned — confirm against the full file.
412 Sequence
< OUString
> SAL_CALL
413 Collator_Unicode::getSupportedServiceNames()
415 Sequence
< OUString
> aRet
{ OUString(implementationName
) };
421 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */