1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <i18nutil/casefolding.hxx>
21 #include "casefolding_data.h"
22 #include <i18nutil/oneToOneMapping.hxx>
23 #include <i18nutil/widthfolding.hxx>
24 #include <i18nutil/transliteration.hxx>
25 #include <com/sun/star/lang/Locale.hpp>
26 #include <com/sun/star/uno/RuntimeException.hpp>
27 #include <rtl/character.hxx>
29 #include <unicode/uchar.h>
31 using namespace com::sun::star::lang
;
32 using namespace com::sun::star::uno
;
// Hand-written result tables for the characters whose case mapping depends on
// locale or context. Each table is named after the source code point and is
// indexed only by casefolding::getConditionalValue:
//   mapping_03a3  [0] final-sigma form U+03C2, [1] regular U+03C3
//   mapping_0307  [0] empty result (count 0),  [1] U+0307 kept as is
//   mapping_004a, mapping_012e
//                 [0] "lt" result followed by an explicit U+0307, [1] plain result
//   mapping_00cc, mapping_00cd, mapping_0128
//                 [0] "lt" decomposition i + U+0307 + accent,     [1] plain result
//   mapping_0049  [0] "lt" i + U+0307, [1] "tr"/"az" dotless U+0131, [2] plain U+0069
//   mapping_0069, mapping_0130
//                 [0] "tr"/"az" result, [1] result for every other language
// Every initialiser has the shape { 0, count, { up to three UTF-16 units } };
// the second member looks like Mapping::nmap (number of valid units in map) --
// TODO confirm against the declaration of Mapping.
36 const Mapping mapping_03a3
[] = {{0, 1, {0x03c2, 0, 0}},{0, 1, {0x03c3, 0, 0}}};
37 const Mapping mapping_0307
[] = {{0, 0, {0, 0, 0}},{0, 1, {0x0307, 0, 0}}};
38 const Mapping mapping_004a
[] = {{0, 2, {0x006a, 0x0307, 0}},{0, 1, {0x006a, 0, 0}}};
39 const Mapping mapping_012e
[] = {{0, 2, {0x012f, 0x0307, 0}},{0, 1, {0x012f, 0, 0}}};
40 const Mapping mapping_00cc
[] = {{0, 3, {0x0069, 0x0307, 0x0300}},{0, 1, {0x00ec, 0, 0}}};
41 const Mapping mapping_00cd
[] = {{0, 3, {0x0069, 0x0307, 0x0301}},{0, 1, {0x00ed, 0, 0}}};
42 const Mapping mapping_0128
[] = {{0, 3, {0x0069, 0x0307, 0x0303}},{0, 1, {0x0129, 0, 0}}};
43 const Mapping mapping_0049
[] = {{0, 2, {0x0069, 0x0307, 0}},{0, 1, {0x0131, 0, 0}},{0, 1, {0x0069, 0, 0}}};
44 const Mapping mapping_0069
[] = {{0, 1, {0x0130, 0, 0}},{0, 1, {0x0049, 0, 0}}};
45 const Mapping mapping_0130
[] = {{0, 1, {0x0069, 0, 0}},{0, 1, {0x0130, 0, 0}}};
// Compares the language of a variable named aLocale, which must be in scope
// at the point of use, with the given language code.
47 #define langIs(lang) (aLocale.Language == lang)
49 // Only the simple case is checked (plain 'i' U+0069 and 'j' U+006A); more complicated cases would still need to be handled.
50 #define type_i(ch) ((ch) == 0x0069 || (ch) == 0x006a)
// Reports whether ch is marked as a cased letter in the case-mapping tables.
// The lookup is two-level: CaseMappingIndex[msb] selects a 256-entry page, and
// the low byte of ch selects the entry inside that page of CaseMappingValue.
// NOTE(review): the definition of msb and any validity check on cmi are on
// lines not visible here; msb is presumably the high byte of ch -- confirm.
52 static bool cased_letter(sal_Unicode ch
)
55 int cmi
= CaseMappingIndex
[msb
];
// Entry index = page number * 256 + low byte of ch.
58 int cmv_idx
= (cmi
<< 8) + (ch
& 0xff);
// True when the entry's type shares any bit with MappingType::CasedLetterMask.
59 return bool(static_cast<MappingType
>(CaseMappingValue
[cmv_idx
].type
) & MappingType::CasedLetterMask
);
62 // For Lithuanian: condition for making the dot above explicit when lowercasing capital I's and J's
63 // whenever more accents follow above the letter.
// Matches the combining marks U+0300..U+0314, U+033D..U+0344, U+0346 and
// U+034A..U+034C. The argument is evaluated several times, so it must not
// have side effects.
64 #define accent_above(ch) (((ch) >= 0x0300 && (ch) <= 0x0314) || ((ch) >= 0x033D && (ch) <= 0x0344) || (ch) == 0x0346 || ((ch) >= 0x034A && (ch) <= 0x034C))
66 const Mapping
& casefolding::getConditionalValue(const sal_Unicode
* str
, sal_Int32 pos
, sal_Int32 len
, Locale
const & aLocale
, MappingType nMappingType
)
70 // final_sigma (not followed by cased and preceded by cased character)
71 // DOES NOT check ignorable sequence yet (more complicated implementation).
72 return !(pos
< len
&& cased_letter(str
[pos
+1])) && (pos
> 0 && cased_letter(str
[pos
-1])) ?
73 mapping_03a3
[0] : mapping_03a3
[1];
75 return (((nMappingType
== MappingType::LowerToUpper
&& langIs("lt")) ||
76 (nMappingType
== MappingType::UpperToLower
&& (langIs("tr") || langIs("az")))) &&
77 (pos
> 0 && type_i(str
[pos
-1]))) ? // after_i
78 mapping_0307
[0] : mapping_0307
[1];
80 return (langIs("tr") || langIs("az")) ? mapping_0130
[0] : mapping_0130
[1];
82 return (langIs("tr") || langIs("az")) ? mapping_0069
[0] : mapping_0069
[1];
83 case 0x0049: return langIs("lt") && pos
> len
&& accent_above(str
[pos
+1]) ? mapping_0049
[0] :
84 (langIs("tr") || langIs("az")) ? mapping_0049
[1] : mapping_0049
[2];
85 case 0x004a: return langIs("lt") && pos
> len
&& accent_above(str
[pos
+1]) ? mapping_004a
[0] : mapping_004a
[1];
86 case 0x012e: return langIs("lt") && pos
> len
&& accent_above(str
[pos
+1]) ? mapping_012e
[0] : mapping_012e
[1];
87 case 0x00cc: return langIs("lt") ? mapping_00cc
[0] : mapping_00cc
[1];
88 case 0x00cd: return langIs("lt") ? mapping_00cd
[0] : mapping_00cd
[1];
89 case 0x0128: return langIs("lt") ? mapping_0128
[0] : mapping_0128
[1];
91 // Should not come here
92 throw RuntimeException();
// Computes the mapping of the character that starts at str[pos] for the
// requested nMappingType and returns it by value as a Mapping (nmap result
// units stored in map[]). Table-driven for code points covered by
// CaseMappingIndex / CaseMappingValue / CaseMappingExtra, with an ICU
// fallback for everything else.
// NOTE(review): several lines of this function (declaration of c, the guard
// around the table lookup, break statements, final return) are not visible in
// this view; the comments below describe only the statements that are.
95 Mapping
casefolding::getValue(const sal_Unicode
* str
, sal_Int32 pos
, sal_Int32 len
, Locale
const & aLocale
, MappingType nMappingType
)
// A low surrogate that directly follows a high surrogate yields an empty
// mapping (nmap == 0): the pair is combined and mapped as one code point when
// this function is called on its high surrogate (see below).
97 if (pos
> 0 && rtl::isHighSurrogate(str
[pos
-1]) && rtl::isLowSurrogate(str
[pos
]))
98 return { 0, 0, { 0, 0, 0 } };
// Default result: the code unit maps to itself.
100 Mapping dummy
= { 0, 1, { str
[pos
], 0, 0 } };
// Combine a well-formed surrogate pair starting at pos into one code point.
103 if (pos
+ 1 < len
&& rtl::isHighSurrogate(str
[pos
]) && rtl::isLowSurrogate(str
[pos
+ 1]))
104 c
= rtl::combineSurrogates(str
[pos
], str
[pos
+ 1]);
// Page lookup: stays -1 when c lies beyond the range covered by
// CaseMappingIndex (one index entry per 256 code points).
108 sal_Int16 address
= -1;
109 if (c
< SAL_N_ELEMENTS(CaseMappingIndex
) * 256)
110 address
= CaseMappingIndex
[c
>> 8];
// Entry index = page number * 256 + low byte of c.
113 address
= (address
<< 8) + (c
& 0xFF);
// Only entries whose type includes the requested mapping type apply.
114 if (static_cast<MappingType
>(CaseMappingValue
[address
].type
) & nMappingType
) {
115 MappingType type
= static_cast<MappingType
>(CaseMappingValue
[address
].type
);
// NotValue: .value is not the mapped character itself.
116 if (type
& MappingType::NotValue
) {
// value == 0 marks a locale/context dependent mapping.
117 if (CaseMappingValue
[address
].value
== 0)
118 return getConditionalValue(str
, pos
, len
, aLocale
, nMappingType
);
// Otherwise .value is the first index of a run of at most MaxCaseMappingExtras
// entries in CaseMappingExtra; the first one matching nMappingType is used.
120 for (int map
= CaseMappingValue
[address
].value
;
121 map
< CaseMappingValue
[address
].value
+ MaxCaseMappingExtras
; map
++) {
122 if (static_cast<MappingType
>(CaseMappingExtra
[map
].type
) & nMappingType
) {
123 if (static_cast<MappingType
>(CaseMappingExtra
[map
].type
) & MappingType::NotValue
)
124 return getConditionalValue(str
, pos
, len
, aLocale
, nMappingType
);
126 return CaseMappingExtra
[map
];
129 // Should not come here
130 throw RuntimeException();
// Plain entry: .value is the single mapped code unit.
135 dummy
.map
[0] = CaseMappingValue
[address
].value
;
141 // If the code point is not supported by our case mapping tables,
142 // fallback to ICU functions.
143 // TODO: this does not handle special case mapping as these require
144 // using ustring.h APIs, which work on the whole string not character
146 // TODO: what is the difference between ToLower and UpperToLower etc.?
147 sal_uInt32 value
= c
;
148 switch (nMappingType
)
150 case MappingType::ToLower
:
151 case MappingType::UpperToLower
:
152 value
= u_tolower(c
);
154 case MappingType::ToUpper
:
155 case MappingType::LowerToUpper
:
156 value
= u_toupper(c
);
158 case MappingType::ToTitle
:
159 value
= u_totitle(c
);
161 case MappingType::SimpleFolding
:
162 case MappingType::FullFolding
:
163 value
= u_foldCase(c
, U_FOLD_CASE_DEFAULT
);
// Store the ICU result as UTF-16 in dummy.map; nmap receives the value
// returned by rtl::splitSurrogates.
168 dummy
.nmap
= rtl::splitSurrogates(value
, dummy
.map
);
// Helper for composing a kana character with a following Japanese sound mark.
// next is tested against U+3099 and U+309A (the combining voiced and
// semi-voiced sound marks); for those two values widthfolding::getCompositionChar
// is asked for the composition of current and next.
// NOTE(review): the return type and both return paths are on lines not visible
// here. current is taken by non-const reference, so it is presumably replaced
// by the composed character on success -- confirm.
174 is_ja_voice_sound_mark(sal_Unicode
& current
, sal_Unicode next
)
176 if (next
!= 0x3099 && next
!= 0x309a)
178 sal_Unicode c
= widthfolding::getCompositionChar(current
, next
);
// Produces the next code unit of str after applying the transformations
// selected in moduleLoaded (IGNORE_CASE, IGNORE_KANA, IGNORE_WIDTH).
//   idx  read position in str; advanced as input is consumed
//   e    carries a multi-unit case mapping across calls: e.element is the
//        current Mapping and e.current the next unit of it to hand out
// NOTE(review): the declaration of c, the condition guarding the reset of e,
// the bodies of some branches and the final return are on lines not visible
// in this view.
184 sal_Unicode
casefolding::getNextChar(const sal_Unicode
*str
, sal_Int32
& idx
, sal_Int32 len
, MappingElement
& e
, Locale
const & aLocale
, MappingType nMappingType
, TransliterationFlags moduleLoaded
)
188 e
= MappingElement();
194 if (moduleLoaded
& TransliterationFlags::IGNORE_CASE
) {
// All units of the previous mapping have been handed out: map the next input
// character and step idx past it.
195 if( e
.current
>= e
.element
.nmap
) {
196 e
.element
= getValue(str
, idx
++, len
, aLocale
, nMappingType
);
// Hand out the next unit of the current mapping.
199 c
= e
.element
.map
[e
.current
++];
204 if (moduleLoaded
& TransliterationFlags::IGNORE_KANA
) {
// Applies to c in 0x3040..0x3094 and 0x309d..0x309f; the adjustment itself is
// not visible here (presumably a hiragana-to-katakana shift -- confirm).
205 if ((0x3040 <= c
&& c
<= 0x3094) || (0x309d <= c
&& c
<= 0x309f))
209 // composition: KA + voice-mark --> GA. see halfwidthToFullwidth.cxx for detail
210 if (moduleLoaded
& TransliterationFlags::IGNORE_WIDTH
) {
// The table reference is a function-local static, so it is obtained once.
211 static oneToOneMapping
& half2fullTable
= widthfolding::gethalf2fullTable();
212 c
= half2fullTable
[c
];
// For c in 0x3040..0x30ff with another input unit remaining, try to compose
// c with that unit (also passed through the half-to-full table).
213 if (0x3040 <= c
&& c
<= 0x30ff && idx
< len
&&
214 is_ja_voice_sound_mark(c
, half2fullTable
[*(str
+ idx
)]))
223 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */