1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "base/i18n/icu_string_conversions.h"
9 #include "base/basictypes.h"
10 #include "base/logging.h"
11 #include "base/memory/scoped_ptr.h"
12 #include "base/strings/string_util.h"
13 #include "base/strings/utf_string_conversions.h"
14 #include "third_party/icu/source/common/unicode/ucnv.h"
15 #include "third_party/icu/source/common/unicode/ucnv_cb.h"
16 #include "third_party/icu/source/common/unicode/ucnv_err.h"
17 #include "third_party/icu/source/common/unicode/unorm.h"
18 #include "third_party/icu/source/common/unicode/ustring.h"
23 // ToUnicodeCallbackSubstitute() is based on UCNV_TO_U_CALLBACK_SUBSTITUTE
24 // in source/common/ucnv_err.c.
26 // Copyright (c) 1995-2006 International Business Machines Corporation
29 // All rights reserved.
32 // Permission is hereby granted, free of charge, to any person obtaining a
33 // copy of this software and associated documentation files (the "Software"),
34 // to deal in the Software without restriction, including without limitation
35 // the rights to use, copy, modify, merge, publish, distribute, and/or
36 // sell copies of the Software, and to permit persons to whom the Software
37 // is furnished to do so, provided that the above copyright notice(s) and
38 // this permission notice appear in all copies of the Software and that
39 // both the above copyright notice(s) and this permission notice appear in
40 // supporting documentation.
42 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
43 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
44 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
45 // OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
46 // INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT
47 // OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
48 // OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
49 // OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
50 // OR PERFORMANCE OF THIS SOFTWARE.
52 // Except as contained in this notice, the name of a copyright holder
53 // shall not be used in advertising or otherwise to promote the sale, use
54 // or other dealings in this Software without prior written authorization
55 // of the copyright holder.
57 // ___________________________________________________________________________
59 // All trademarks and registered trademarks mentioned herein are the property
60 // of their respective owners.
// Converter-to-Unicode error callback. From the lines visible here: when
// |reason| is at most UCNV_IRREGULAR, and either |context| is NULL or
// |context| points at the character 'i' while |reason| is UCNV_UNASSIGNED,
// it writes one kReplacementChar (U+FFFD) to the output through
// ucnv_cbToUWriteUChars().
// NOTE(review): part of this definition (the trailing parameters, including
// |err|, and the closing braces) is not visible in this excerpt.
62 void ToUnicodeCallbackSubstitute(const void* context
,
63 UConverterToUnicodeArgs
*to_args
,
64 const char* code_units
,
66 UConverterCallbackReason reason
,
// The single UChar this callback emits (see the ucnv_cbToUWriteUChars() call
// below).
68 static const UChar kReplacementChar
= 0xFFFD;
69 if (reason
<= UCNV_IRREGULAR
) {
// |context| is read as a pointer to char; only the value 'i' combined with
// UCNV_UNASSIGNED (or a NULL |context|) takes the substitution path.
70 if (context
== NULL
||
71 (*(reinterpret_cast<const char*>(context
)) == 'i' &&
72 reason
== UCNV_UNASSIGNED
)) {
74 ucnv_cbToUWriteUChars(to_args
, &kReplacementChar
, 1, 0, err
);
76 // else the caller must have set the error code accordingly.
78 // else ignore the reset, close and clone calls.
// Encodes the |uchar_len| UTF-16 code units starting at |uchar_src| into
// |encoded| using |converter|. |on_error| selects which ICU from-Unicode
// callback is installed on |converter| before converting.
// |converter| is passed to ucnv_close() after the conversion, so the caller
// must not use it afterwards. |encoded| is cleared on the error path.
// NOTE(review): the switch header, the remaining callback arguments and the
// return statements are not visible in this excerpt.
81 bool ConvertFromUTF16(UConverter
* converter
, const UChar
* uchar_src
,
82 int uchar_len
, OnStringConversionError::Type on_error
,
83 std::string
* encoded
) {
// Size the output for the largest byte count ICU reports for |uchar_len|
// code units with this converter.
84 int encoded_max_length
= UCNV_GET_MAX_BYTES_FOR_STRING(uchar_len
,
85 ucnv_getMaxCharSize(converter
));
86 encoded
->resize(encoded_max_length
);
88 UErrorCode status
= U_ZERO_ERROR
;
90 // Setup our error handler.
92 case OnStringConversionError::FAIL
:
93 ucnv_setFromUCallBack(converter
, UCNV_FROM_U_CALLBACK_STOP
, 0,
// SKIP and SUBSTITUTE both install the same SKIP callback.
96 case OnStringConversionError::SKIP
:
97 case OnStringConversionError::SUBSTITUTE
:
98 ucnv_setFromUCallBack(converter
, UCNV_FROM_U_CALLBACK_SKIP
, 0,
105 // ucnv_fromUChars returns size not including terminating null
106 int actual_size
= ucnv_fromUChars(converter
, &(*encoded
)[0],
107 encoded_max_length
, uchar_src
, uchar_len
, &status
);
// Trim the output to the size reported by ucnv_fromUChars().
108 encoded
->resize(actual_size
);
109 ucnv_close(converter
);
110 if (U_SUCCESS(status
))
112 encoded
->clear(); // Make sure the output is empty on error.
// Installs a to-Unicode error callback on |converter| according to
// |on_error|:
//   FAIL       -> UCNV_TO_U_CALLBACK_STOP
//   SKIP       -> UCNV_TO_U_CALLBACK_SKIP
//   SUBSTITUTE -> ToUnicodeCallbackSubstitute (defined in this file)
// NOTE(review): the switch header and the remaining arguments of each
// ucnv_setToUCallBack() call (presumably including |status|) are not visible
// in this excerpt.
116 // Set up our error handler for ToUTF-16 converters
117 void SetUpErrorHandlerForToUChars(OnStringConversionError::Type on_error
,
118 UConverter
* converter
, UErrorCode
* status
) {
120 case OnStringConversionError::FAIL
:
121 ucnv_setToUCallBack(converter
, UCNV_TO_U_CALLBACK_STOP
, 0,
124 case OnStringConversionError::SKIP
:
125 ucnv_setToUCallBack(converter
, UCNV_TO_U_CALLBACK_SKIP
, 0,
128 case OnStringConversionError::SUBSTITUTE
:
129 ucnv_setToUCallBack(converter
, ToUnicodeCallbackSubstitute
, 0,
// Returns the ICU converter type for UTF-32, either UCNV_UTF32_BigEndian or
// UCNV_UTF32_LittleEndian.
// NOTE(review): the condition that selects between the two returns is not
// visible in this excerpt; going by the function name it is presumably a
// compile-time check of the platform byte order -- confirm.
137 inline UConverterType
utf32_platform_endian() {
139 return UCNV_UTF32_BigEndian
;
141 return UCNV_UTF32_LittleEndian
;
147 // Codepage <-> Wide/UTF-16 ---------------------------------------------------
// Encodes |utf16| into the codepage named by |codepage_name| and stores the
// bytes in |encoded|. Opens an ICU converter for |codepage_name| and hands it
// to ConvertFromUTF16() together with |on_error|; ConvertFromUTF16() closes
// the converter.
// NOTE(review): the body of the open-failure branch is not visible in this
// excerpt.
149 bool UTF16ToCodepage(const string16
& utf16
,
150 const char* codepage_name
,
151 OnStringConversionError::Type on_error
,
152 std::string
* encoded
) {
155 UErrorCode status
= U_ZERO_ERROR
;
156 UConverter
* converter
= ucnv_open(codepage_name
, &status
);
157 if (!U_SUCCESS(status
))
// The length is narrowed to int because ConvertFromUTF16() takes an int
// count.
160 return ConvertFromUTF16(converter
, utf16
.c_str(),
161 static_cast<int>(utf16
.length()), on_error
, encoded
);
// Decodes |encoded|, interpreted in the codepage named by |codepage_name|,
// into UTF-16 and stores the result through |utf16|. |on_error| selects the
// to-Unicode error callback via SetUpErrorHandlerForToUChars(). The output is
// cleared when the ICU status reports failure after conversion.
// NOTE(review): the declaration of the |utf16| output parameter and the
// return statements are not visible in this excerpt.
164 bool CodepageToUTF16(const std::string
& encoded
,
165 const char* codepage_name
,
166 OnStringConversionError::Type on_error
,
170 UErrorCode status
= U_ZERO_ERROR
;
171 UConverter
* converter
= ucnv_open(codepage_name
, &status
);
172 if (!U_SUCCESS(status
))
175 // Even in the worst case, the maximum length in 2-byte units of UTF-16
176 // output would be at most the same as the number of bytes in input. There
177 // is no single-byte encoding in which a character is mapped to a
178 // non-BMP character requiring two 2-byte units.
180 // Moreover, non-BMP characters in legacy multibyte encodings
181 // (e.g. EUC-JP, GB18030) take at least 2 bytes. The only exceptions are
182 // BOCU and SCSU, but we don't care about them.
// One extra code unit is added on top of the input byte count.
183 size_t uchar_max_length
= encoded
.length() + 1;
185 SetUpErrorHandlerForToUChars(on_error
, converter
, &status
);
186 scoped_ptr
<char16
[]> buffer(new char16
[uchar_max_length
]);
187 int actual_size
= ucnv_toUChars(converter
, buffer
.get(),
188 static_cast<int>(uchar_max_length
), encoded
.data(),
189 static_cast<int>(encoded
.length()), &status
);
// The converter is closed before |status| is inspected, so it is released on
// both the success and the failure path.
190 ucnv_close(converter
);
191 if (!U_SUCCESS(status
)) {
192 utf16
->clear(); // Make sure the output is empty on error.
// Copy only the |actual_size| code units reported by ucnv_toUChars().
196 utf16
->assign(buffer
.get(), actual_size
);
// Encodes |wide| into the codepage named by |codepage_name| and stores the
// bytes in |encoded|.
// When WCHAR_T_IS_UTF16 is defined this forwards directly to
// UTF16ToCodepage(). When WCHAR_T_IS_UTF32 is defined, |wide| is first
// converted to UTF-16 with u_strFromUTF32() and the result is passed to
// ConvertFromUTF16(), which closes the converter opened here.
// NOTE(review): the declaration of |utf16_len| and the body of the
// open-failure branch are not visible in this excerpt.
200 bool WideToCodepage(const std::wstring
& wide
,
201 const char* codepage_name
,
202 OnStringConversionError::Type on_error
,
203 std::string
* encoded
) {
204 #if defined(WCHAR_T_IS_UTF16)
205 return UTF16ToCodepage(wide
, codepage_name
, on_error
, encoded
);
206 #elif defined(WCHAR_T_IS_UTF32)
209 UErrorCode status
= U_ZERO_ERROR
;
210 UConverter
* converter
= ucnv_open(codepage_name
, &status
);
211 if (!U_SUCCESS(status
))
215 // When wchar_t is wider than UChar (16 bits), transform |wide| into a
216 // UChar* string. Size the UChar* buffer to be large enough to hold twice
217 // as many UTF-16 code units (UChar's) as there are Unicode code points,
218 // in case each code point translates to a UTF-16 surrogate pair,
219 // and leave room for a NUL terminator.
220 std::vector
<UChar
> utf16(wide
.length() * 2 + 1);
// |wide|'s storage is reinterpreted as UChar32 for u_strFromUTF32(); the
// number of UTF-16 code units produced is written to |utf16_len|.
221 u_strFromUTF32(&utf16
[0], utf16
.size(), &utf16_len
,
222 reinterpret_cast<const UChar32
*>(wide
.c_str()),
223 wide
.length(), &status
);
224 DCHECK(U_SUCCESS(status
)) << "failed to convert wstring to UChar*";
226 return ConvertFromUTF16(converter
, &utf16
[0], utf16_len
, on_error
, encoded
);
227 #endif // defined(WCHAR_T_IS_UTF32)
// Decodes |encoded|, interpreted in the codepage named by |codepage_name|,
// into |wide|.
// When WCHAR_T_IS_UTF16 is defined this forwards directly to
// CodepageToUTF16(). When WCHAR_T_IS_UTF32 is defined, the bytes are
// converted with ucnv_toAlgorithmic() to the UTF-32 variant returned by
// utf32_platform_endian(), written straight into a wchar_t buffer.
// |on_error| selects the error callback via SetUpErrorHandlerForToUChars().
// |wide| is cleared when the ICU status reports failure after conversion.
// NOTE(review): the return statements are not visible in this excerpt.
230 bool CodepageToWide(const std::string
& encoded
,
231 const char* codepage_name
,
232 OnStringConversionError::Type on_error
,
233 std::wstring
* wide
) {
234 #if defined(WCHAR_T_IS_UTF16)
235 return CodepageToUTF16(encoded
, codepage_name
, on_error
, wide
);
236 #elif defined(WCHAR_T_IS_UTF32)
239 UErrorCode status
= U_ZERO_ERROR
;
240 UConverter
* converter
= ucnv_open(codepage_name
, &status
);
241 if (!U_SUCCESS(status
))
244 // The maximum length in 4 byte unit of UTF-32 output would be
245 // at most the same as the number of bytes in input. In the worst
246 // case of GB18030 (excluding escape-based encodings like ISO-2022-JP),
247 // this can be 4 times larger than actually needed.
248 size_t wchar_max_length
= encoded
.length() + 1;
250 SetUpErrorHandlerForToUChars(on_error
, converter
, &status
);
251 scoped_ptr
<wchar_t[]> buffer(new wchar_t[wchar_max_length
]);
// The destination is handed over as a char buffer, so its capacity is given
// in bytes (element count times sizeof(wchar_t)).
252 int actual_size
= ucnv_toAlgorithmic(utf32_platform_endian(), converter
,
253 reinterpret_cast<char*>(buffer
.get()),
254 static_cast<int>(wchar_max_length
) * sizeof(wchar_t), encoded
.data(),
255 static_cast<int>(encoded
.length()), &status
);
256 ucnv_close(converter
);
257 if (!U_SUCCESS(status
)) {
258 wide
->clear(); // Make sure the output is empty on error.
262 // actual_size is # of bytes.
// Convert the byte count back to a wchar_t count before copying.
263 wide
->assign(buffer
.get(), actual_size
/ sizeof(wchar_t));
265 #endif // defined(WCHAR_T_IS_UTF32)
// Converts |text| from the encoding named by |charset| to UTF-16 (using the
// FAIL error policy), normalizes it with unorm_normalize() in UNORM_NFC mode,
// and writes the UTF-8 form of the normalized string to |result| via
// UTF16ToUTF8(), whose return value is returned.
// NOTE(review): the declaration of |utf16| and the bodies of the two failure
// branches are not visible in this excerpt.
268 bool ConvertToUtf8AndNormalize(const std::string
& text
,
269 const std::string
& charset
,
270 std::string
* result
) {
273 if (!CodepageToUTF16(
274 text
, charset
.c_str(), OnStringConversionError::FAIL
, &utf16
))
277 UErrorCode status
= U_ZERO_ERROR
;
// NOTE(review): the normalization buffer holds utf16.length() + 1 code
// units; confirm this is always enough for NFC output, or that an overflow
// is reported through |status| and rejected by the check below.
278 size_t max_length
= utf16
.length() + 1;
279 string16 normalized_utf16
;
280 scoped_ptr
<char16
[]> buffer(new char16
[max_length
]);
281 int actual_length
= unorm_normalize(
282 utf16
.c_str(), utf16
.length(), UNORM_NFC
, 0,
283 buffer
.get(), static_cast<int>(max_length
), &status
);
284 if (!U_SUCCESS(status
))
// Keep only the |actual_length| code units reported by unorm_normalize().
286 normalized_utf16
.assign(buffer
.get(), actual_length
);
288 return UTF16ToUTF8(normalized_utf16
.data(),
289 normalized_utf16
.length(), result
);