base/i18n/icu_string_conversions.cc

   1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "base/i18n/icu_string_conversions.h"
   6
   7 #include <vector>
   8
   9 #include "base/basictypes.h"
  10 #include "base/logging.h"
  11 #include "base/memory/scoped_ptr.h"
  12 #include "base/strings/string_util.h"
  13 #include "base/strings/utf_string_conversions.h"
  14 #include "third_party/icu/source/common/unicode/ucnv.h"
  15 #include "third_party/icu/source/common/unicode/ucnv_cb.h"
  16 #include "third_party/icu/source/common/unicode/ucnv_err.h"
  17 #include "third_party/icu/source/common/unicode/unorm.h"
  18 #include "third_party/icu/source/common/unicode/ustring.h"
  19
  20 namespace base {
  21
  22 namespace {
  23 // ToUnicodeCallbackSubstitute() is based on UCNV_TO_U_CALLBACK_SUBSTITUTE
  24 // in source/common/ucnv_err.c.
  25
  26 // Copyright (c) 1995-2006 International Business Machines Corporation
  27 // and others
  28 //
  29 // All rights reserved.
  30 //
  31
  32 // Permission is hereby granted, free of charge, to any person obtaining a
  33 // copy of this software and associated documentation files (the "Software"),
  34 // to deal in the Software without restriction, including without limitation
  35 // the rights to use, copy, modify, merge, publish, distribute, and/or
  36 // sell copies of the Software, and to permit persons to whom the Software
  37 // is furnished to do so, provided that the above copyright notice(s) and
  38 // this permission notice appear in all copies of the Software and that
  39 // both the above copyright notice(s) and this permission notice appear in
  40 // supporting documentation.
  41 //
  42 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  43 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  44 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
  45 // OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
  46 // INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT
  47 // OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
  48 // OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
  49 // OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
  50 // OR PERFORMANCE OF THIS SOFTWARE.
  51 //
  52 // Except as contained in this notice, the name of a copyright holder
  53 // shall not be used in advertising or otherwise to promote the sale, use
  54 // or other dealings in this Software without prior written authorization
  55 // of the copyright holder.
  56
  57 //  ___________________________________________________________________________
  58 //
  59 // All trademarks and registered trademarks mentioned herein are the property
  60 // of their respective owners.
  61
  62 void ToUnicodeCallbackSubstitute(const void* context,
  63                                  UConverterToUnicodeArgs *to_args,
  64                                  const char* code_units,
  65                                  int32_t length,
  66                                  UConverterCallbackReason reason,
  67                                  UErrorCode * err) {
  68   static const UChar kReplacementChar = 0xFFFD;
  69   if (reason <= UCNV_IRREGULAR) {
  70       if (context == NULL ||
  71           (*(reinterpret_cast<const char*>(context)) == 'i' &&
  72            reason == UCNV_UNASSIGNED)) {
  73         *err = U_ZERO_ERROR;
  74         ucnv_cbToUWriteUChars(to_args, &kReplacementChar, 1, 0, err);
  75       }
  76       // else the caller must have set the error code accordingly.
  77   }
  78   // else ignore the reset, close and clone calls.
  79 }
  80
  81 bool ConvertFromUTF16(UConverter* converter, const UChar* uchar_src,
  82                       int uchar_len, OnStringConversionError::Type on_error,
  83                       std::string* encoded) {
  84   int encoded_max_length = UCNV_GET_MAX_BYTES_FOR_STRING(uchar_len,
  85       ucnv_getMaxCharSize(converter));
  86   encoded->resize(encoded_max_length);
  87
  88   UErrorCode status = U_ZERO_ERROR;
  89
  90   // Setup our error handler.
  91   switch (on_error) {
  92     case OnStringConversionError::FAIL:
  93       ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_STOP, 0,
  94                             NULL, NULL, &status);
  95       break;
  96     case OnStringConversionError::SKIP:
  97     case OnStringConversionError::SUBSTITUTE:
  98       ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_SKIP, 0,
  99                             NULL, NULL, &status);
 100       break;
 101     default:
 102       NOTREACHED();
 103   }
 104
 105   // ucnv_fromUChars returns size not including terminating null
 106   int actual_size = ucnv_fromUChars(converter, &(*encoded)[0],
 107       encoded_max_length, uchar_src, uchar_len, &status);
 108   encoded->resize(actual_size);
 109   ucnv_close(converter);
 110   if (U_SUCCESS(status))
 111     return true;
 112   encoded->clear();  // Make sure the output is empty on error.
 113   return false;
 114 }
 115
 116 // Set up our error handler for ToUTF-16 converters
 117 void SetUpErrorHandlerForToUChars(OnStringConversionError::Type on_error,
 118                                   UConverter* converter, UErrorCode* status) {
 119   switch (on_error) {
 120     case OnStringConversionError::FAIL:
 121       ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_STOP, 0,
 122                           NULL, NULL, status);
 123       break;
 124     case OnStringConversionError::SKIP:
 125       ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_SKIP, 0,
 126                           NULL, NULL, status);
 127       break;
 128     case OnStringConversionError::SUBSTITUTE:
 129       ucnv_setToUCallBack(converter, ToUnicodeCallbackSubstitute, 0,
 130                           NULL, NULL, status);
 131       break;
 132     default:
 133       NOTREACHED();
 134   }
 135 }
 136
 137 inline UConverterType utf32_platform_endian() {
 138 #if U_IS_BIG_ENDIAN
 139   return UCNV_UTF32_BigEndian;
 140 #else
 141   return UCNV_UTF32_LittleEndian;
 142 #endif
 143 }
 144
 145 }  // namespace
 146
 147 // Codepage <-> Wide/UTF-16  ---------------------------------------------------
 148
 149 bool UTF16ToCodepage(const string16& utf16,
 150                      const char* codepage_name,
 151                      OnStringConversionError::Type on_error,
 152                      std::string* encoded) {
 153   encoded->clear();
 154
 155   UErrorCode status = U_ZERO_ERROR;
 156   UConverter* converter = ucnv_open(codepage_name, &status);
 157   if (!U_SUCCESS(status))
 158     return false;
 159
 160   return ConvertFromUTF16(converter, utf16.c_str(),
 161                           static_cast<int>(utf16.length()), on_error, encoded);
 162 }
 163
 164 bool CodepageToUTF16(const std::string& encoded,
 165                      const char* codepage_name,
 166                      OnStringConversionError::Type on_error,
 167                      string16* utf16) {
 168   utf16->clear();
 169
 170   UErrorCode status = U_ZERO_ERROR;
 171   UConverter* converter = ucnv_open(codepage_name, &status);
 172   if (!U_SUCCESS(status))
 173     return false;
 174
 175   // Even in the worst case, the maximum length in 2-byte units of UTF-16
 176   // output would be at most the same as the number of bytes in input. There
 177   // is no single-byte encoding in which a character is mapped to a
 178   // non-BMP character requiring two 2-byte units.
 179   //
 180   // Moreover, non-BMP characters in legacy multibyte encodings
 181   // (e.g. EUC-JP, GB18030) take at least 2 bytes. The only exceptions are
 182   // BOCU and SCSU, but we don't care about them.
 183   size_t uchar_max_length = encoded.length() + 1;
 184
 185   SetUpErrorHandlerForToUChars(on_error, converter, &status);
 186   scoped_ptr<char16[]> buffer(new char16[uchar_max_length]);
 187   int actual_size = ucnv_toUChars(converter, buffer.get(),
 188       static_cast<int>(uchar_max_length), encoded.data(),
 189       static_cast<int>(encoded.length()), &status);
 190   ucnv_close(converter);
 191   if (!U_SUCCESS(status)) {
 192     utf16->clear();  // Make sure the output is empty on error.
 193     return false;
 194   }
 195
 196   utf16->assign(buffer.get(), actual_size);
 197   return true;
 198 }
 199
 200 bool WideToCodepage(const std::wstring& wide,
 201                     const char* codepage_name,
 202                     OnStringConversionError::Type on_error,
 203                     std::string* encoded) {
 204 #if defined(WCHAR_T_IS_UTF16)
 205   return UTF16ToCodepage(wide, codepage_name, on_error, encoded);
 206 #elif defined(WCHAR_T_IS_UTF32)
 207   encoded->clear();
 208
 209   UErrorCode status = U_ZERO_ERROR;
 210   UConverter* converter = ucnv_open(codepage_name, &status);
 211   if (!U_SUCCESS(status))
 212     return false;
 213
 214   int utf16_len;
 215   // When wchar_t is wider than UChar (16 bits), transform |wide| into a
 216   // UChar* string.  Size the UChar* buffer to be large enough to hold twice
 217   // as many UTF-16 code units (UChar's) as there are Unicode code points,
 218   // in case each code points translates to a UTF-16 surrogate pair,
 219   // and leave room for a NUL terminator.
 220   std::vector<UChar> utf16(wide.length() * 2 + 1);
 221   u_strFromUTF32(&utf16[0], utf16.size(), &utf16_len,
 222                  reinterpret_cast<const UChar32*>(wide.c_str()),
 223                  wide.length(), &status);
 224   DCHECK(U_SUCCESS(status)) << "failed to convert wstring to UChar*";
 225
 226   return ConvertFromUTF16(converter, &utf16[0], utf16_len, on_error, encoded);
 227 #endif  // defined(WCHAR_T_IS_UTF32)
 228 }
 229
 230 bool CodepageToWide(const std::string& encoded,
 231                     const char* codepage_name,
 232                     OnStringConversionError::Type on_error,
 233                     std::wstring* wide) {
 234 #if defined(WCHAR_T_IS_UTF16)
 235   return CodepageToUTF16(encoded, codepage_name, on_error, wide);
 236 #elif defined(WCHAR_T_IS_UTF32)
 237   wide->clear();
 238
 239   UErrorCode status = U_ZERO_ERROR;
 240   UConverter* converter = ucnv_open(codepage_name, &status);
 241   if (!U_SUCCESS(status))
 242     return false;
 243
 244   // The maximum length in 4 byte unit of UTF-32 output would be
 245   // at most the same as the number of bytes in input. In the worst
 246   // case of GB18030 (excluding escaped-based encodings like ISO-2022-JP),
 247   // this can be 4 times larger than actually needed.
 248   size_t wchar_max_length = encoded.length() + 1;
 249
 250   SetUpErrorHandlerForToUChars(on_error, converter, &status);
 251   scoped_ptr<wchar_t[]> buffer(new wchar_t[wchar_max_length]);
 252   int actual_size = ucnv_toAlgorithmic(utf32_platform_endian(), converter,
 253       reinterpret_cast<char*>(buffer.get()),
 254       static_cast<int>(wchar_max_length) * sizeof(wchar_t), encoded.data(),
 255       static_cast<int>(encoded.length()), &status);
 256   ucnv_close(converter);
 257   if (!U_SUCCESS(status)) {
 258     wide->clear();  // Make sure the output is empty on error.
 259     return false;
 260   }
 261
 262   // actual_size is # of bytes.
 263   wide->assign(buffer.get(), actual_size / sizeof(wchar_t));
 264   return true;
 265 #endif  // defined(WCHAR_T_IS_UTF32)
 266 }
 267
 268 bool ConvertToUtf8AndNormalize(const std::string& text,
 269                                const std::string& charset,
 270                                std::string* result) {
 271   result->clear();
 272   string16 utf16;
 273   if (!CodepageToUTF16(
 274       text, charset.c_str(), OnStringConversionError::FAIL, &utf16))
 275     return false;
 276
 277   UErrorCode status = U_ZERO_ERROR;
 278   size_t max_length = utf16.length() + 1;
 279   string16 normalized_utf16;
 280   scoped_ptr<char16[]> buffer(new char16[max_length]);
 281   int actual_length = unorm_normalize(
 282       utf16.c_str(), utf16.length(), UNORM_NFC, 0,
 283       buffer.get(), static_cast<int>(max_length), &status);
 284   if (!U_SUCCESS(status))
 285     return false;
 286   normalized_utf16.assign(buffer.get(), actual_length);
 287
 288   return UTF16ToUTF8(normalized_utf16.data(),
 289                      normalized_utf16.length(), result);
 290 }
 291
 292 }  // namespace base