base/i18n/icu_string_conversions_unittest.cc

   1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include <math.h>
   6 #include <stdarg.h>
   7
   8 #include <limits>
   9 #include <sstream>
  10
  11 #include "base/basictypes.h"
  12 #include "base/format_macros.h"
  13 #include "base/i18n/icu_string_conversions.h"
  14 #include "base/logging.h"
  15 #include "base/strings/string_piece.h"
  16 #include "base/strings/stringprintf.h"
  17 #include "base/strings/utf_string_conversions.h"
  18 #include "testing/gtest/include/gtest/gtest.h"
  19
  20 namespace base {
  21
  22 namespace {
  23
  24 // Given a null-terminated string of wchar_t with each wchar_t representing
  25 // a UTF-16 code unit, returns a string16 made up of wchar_t's in the input.
  26 // Each wchar_t should be <= 0xFFFF and a non-BMP character (> U+FFFF)
  27 // should be represented as a surrogate pair (two UTF-16 units)
  28 // *even* where wchar_t is 32-bit (Linux and Mac).
  29 //
  30 // This is to help write tests for functions with string16 params until
  31 // the C++ 0x UTF-16 literal is well-supported by compilers.
  32 string16 BuildString16(const wchar_t* s) {
  33 #if defined(WCHAR_T_IS_UTF16)
  34   return string16(s);
  35 #elif defined(WCHAR_T_IS_UTF32)
  36   string16 u16;
  37   while (*s != 0) {
  38     DCHECK_LE(static_cast<unsigned int>(*s), 0xFFFFu);
  39     u16.push_back(*s++);
  40   }
  41   return u16;
  42 #endif
  43 }
  44
  45 }  // namespace
  46
  47 // kConverterCodepageCases is not comprehensive. There are a number of cases
  48 // to add if we really want to have a comprehensive coverage of various
  49 // codepages and their 'idiosyncrasies'. Currently, the only implementation
  50 // for CodepageTo* and *ToCodepage uses ICU, which has a very extensive
  51 // set of tests for the charset conversion. So, we can get away with a
  52 // relatively small number of cases listed below.
  53 //
  54 // Note about |u16_wide| in the following struct.
  55 // On Windows, the field is always identical to |wide|. On Mac and Linux,
  56 // it's identical as long as there's no character outside the
  57 // BMP (<= U+FFFF). When there is, it is different from |wide| and
  58 // is not a real wide string (UTF-32 string) in that each wchar_t in
  59 // the string is a UTF-16 code unit zero-extended to be 32-bit
  60 // even when the code unit belongs to a surrogate pair.
  61 // For instance, a Unicode string (U+0041 U+010000) is represented as
  62 // L"\x0041\xD800\xDC00" instead of L"\x0041\x10000".
  63 // To avoid the clutter, |u16_wide| will be set to NULL
  64 // if it's identical to |wide| on *all* platforms.
  65
  66 static const struct {
  67   const char* codepage_name;
  68   const char* encoded;
  69   OnStringConversionError::Type on_error;
  70   bool success;
  71   const wchar_t* wide;
  72   const wchar_t* u16_wide;
  73 } kConvertCodepageCases[] = {
  74   // Test a case where the input cannot be decoded, using SKIP, FAIL
  75   // and SUBSTITUTE error handling rules. "A7 41" is valid, but "A6" isn't.
  76   {"big5",
  77    "\xA7\x41\xA6",
  78    OnStringConversionError::FAIL,
  79    false,
  80    L"",
  81    NULL},
  82   {"big5",
  83    "\xA7\x41\xA6",
  84    OnStringConversionError::SKIP,
  85    true,
  86    L"\x4F60",
  87    NULL},
  88   {"big5",
  89    "\xA7\x41\xA6",
  90    OnStringConversionError::SUBSTITUTE,
  91    true,
  92    L"\x4F60\xFFFD",
  93    NULL},
  94   // Arabic (ISO-8859)
  95   {"iso-8859-6",
  96    "\xC7\xEE\xE4\xD3\xF1\xEE\xE4\xC7\xE5\xEF" " "
  97    "\xD9\xEE\xE4\xEE\xEA\xF2\xE3\xEF\xE5\xF2",
  98    OnStringConversionError::FAIL,
  99    true,
 100    L"\x0627\x064E\x0644\x0633\x0651\x064E\x0644\x0627\x0645\x064F" L" "
 101    L"\x0639\x064E\x0644\x064E\x064A\x0652\x0643\x064F\x0645\x0652",
 102    NULL},
 103   // Chinese Simplified (GB2312)
 104   {"gb2312",
 105    "\xC4\xE3\xBA\xC3",
 106    OnStringConversionError::FAIL,
 107    true,
 108    L"\x4F60\x597D",
 109    NULL},
 110   // Chinese (GB18030) : 4 byte sequences mapped to BMP characters
 111   {"gb18030",
 112    "\x81\x30\x84\x36\xA1\xA7",
 113    OnStringConversionError::FAIL,
 114    true,
 115    L"\x00A5\x00A8",
 116    NULL},
 117   // Chinese (GB18030) : A 4 byte sequence mapped to plane 2 (U+20000)
 118   {"gb18030",
 119    "\x95\x32\x82\x36\xD2\xBB",
 120    OnStringConversionError::FAIL,
 121    true,
 122 #if defined(WCHAR_T_IS_UTF16)
 123    L"\xD840\xDC00\x4E00",
 124 #elif defined(WCHAR_T_IS_UTF32)
 125    L"\x20000\x4E00",
 126 #endif
 127    L"\xD840\xDC00\x4E00"},
 128   {"big5",
 129    "\xA7\x41\xA6\x6E",
 130    OnStringConversionError::FAIL,
 131    true,
 132    L"\x4F60\x597D",
 133    NULL},
 134   // Greek (ISO-8859)
 135   {"iso-8859-7",
 136    "\xE3\xE5\xE9\xDC" " " "\xF3\xEF\xF5",
 137    OnStringConversionError::FAIL,
 138    true,
 139    L"\x03B3\x03B5\x03B9\x03AC" L" " L"\x03C3\x03BF\x03C5",
 140    NULL},
 141   // Hebrew (Windows)
 142   {"windows-1255",
 143    "\xF9\xD1\xC8\xEC\xE5\xC9\xED",
 144    OnStringConversionError::FAIL,
 145    true,
 146    L"\x05E9\x05C1\x05B8\x05DC\x05D5\x05B9\x05DD",
 147    NULL},
 148   // Korean (EUC)
 149   {"euc-kr",
 150    "\xBE\xC8\xB3\xE7\xC7\xCF\xBC\xBC\xBF\xE4",
 151    OnStringConversionError::FAIL,
 152    true,
 153    L"\xC548\xB155\xD558\xC138\xC694",
 154    NULL},
 155   // Japanese (EUC)
 156   {"euc-jp",
 157    "\xA4\xB3\xA4\xF3\xA4\xCB\xA4\xC1\xA4\xCF\xB0\xEC\x8E\xA6",
 158    OnStringConversionError::FAIL,
 159    true,
 160    L"\x3053\x3093\x306B\x3061\x306F\x4E00\xFF66",
 161    NULL},
 162   // Japanese (ISO-2022)
 163   {"iso-2022-jp",
 164    "\x1B$B" "\x24\x33\x24\x73\x24\x4B\x24\x41\x24\x4F\x30\x6C" "\x1B(B"
 165    "ab" "\x1B(J" "\x5C\x7E#$" "\x1B(B",
 166    OnStringConversionError::FAIL,
 167    true,
 168    L"\x3053\x3093\x306B\x3061\x306F\x4E00" L"ab\x00A5\x203E#$",
 169    NULL},
 170   // Japanese (Shift-JIS)
 171   {"sjis",
 172    "\x82\xB1\x82\xF1\x82\xC9\x82\xBF\x82\xCD\x88\xEA\xA6",
 173    OnStringConversionError::FAIL,
 174    true,
 175    L"\x3053\x3093\x306B\x3061\x306F\x4E00\xFF66",
 176    NULL},
 177   // Russian (KOI8)
 178   {"koi8-r",
 179    "\xDA\xC4\xD2\xC1\xD7\xD3\xD4\xD7\xD5\xCA\xD4\xC5",
 180    OnStringConversionError::FAIL,
 181    true,
 182    L"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432"
 183    L"\x0443\x0439\x0442\x0435",
 184    NULL},
 185   // Thai (windows-874)
 186   {"windows-874",
 187    "\xCA\xC7\xD1\xCA\xB4\xD5" "\xA4\xC3\xD1\xBA",
 188    OnStringConversionError::FAIL,
 189    true,
 190    L"\x0E2A\x0E27\x0E31\x0E2A\x0E14\x0E35"
 191    L"\x0E04\x0E23\x0e31\x0E1A",
 192    NULL},
 193 };
 194
 195 TEST(ICUStringConversionsTest, ConvertBetweenCodepageAndUTF16) {
 196   for (size_t i = 0; i < arraysize(kConvertCodepageCases); ++i) {
 197     SCOPED_TRACE(base::StringPrintf(
 198                      "Test[%" PRIuS "]: <encoded: %s> <codepage: %s>", i,
 199                      kConvertCodepageCases[i].encoded,
 200                      kConvertCodepageCases[i].codepage_name));
 201
 202     string16 utf16;
 203     bool success = CodepageToUTF16(kConvertCodepageCases[i].encoded,
 204                                    kConvertCodepageCases[i].codepage_name,
 205                                    kConvertCodepageCases[i].on_error,
 206                                    &utf16);
 207     string16 utf16_expected;
 208     if (kConvertCodepageCases[i].u16_wide == NULL)
 209       utf16_expected = BuildString16(kConvertCodepageCases[i].wide);
 210     else
 211       utf16_expected = BuildString16(kConvertCodepageCases[i].u16_wide);
 212     EXPECT_EQ(kConvertCodepageCases[i].success, success);
 213     EXPECT_EQ(utf16_expected, utf16);
 214
 215     // When decoding was successful and nothing was skipped, we also check the
 216     // reverse conversion. See also the corresponding comment in
 217     // ConvertBetweenCodepageAndWide.
 218     if (success &&
 219         kConvertCodepageCases[i].on_error == OnStringConversionError::FAIL) {
 220       std::string encoded;
 221       success = UTF16ToCodepage(utf16, kConvertCodepageCases[i].codepage_name,
 222                                 kConvertCodepageCases[i].on_error, &encoded);
 223       EXPECT_EQ(kConvertCodepageCases[i].success, success);
 224       EXPECT_EQ(kConvertCodepageCases[i].encoded, encoded);
 225     }
 226   }
 227 }
 228
 229 static const struct {
 230   const char* encoded;
 231   const char* codepage_name;
 232   bool expected_success;
 233   const char* expected_value;
 234 } kConvertAndNormalizeCases[] = {
 235   {"foo-\xe4.html", "iso-8859-1", true, "foo-\xc3\xa4.html"},
 236   {"foo-\xe4.html", "iso-8859-7", true, "foo-\xce\xb4.html"},
 237   {"foo-\xe4.html", "foo-bar", false, ""},
 238   // HTML Encoding spec treats US-ASCII as synonymous with windows-1252
 239   {"foo-\xff.html", "ascii", true, "foo-\xc3\xbf.html"},
 240   {"foo.html", "ascii", true, "foo.html"},
 241   {"foo-a\xcc\x88.html", "utf-8", true, "foo-\xc3\xa4.html"},
 242   {"\x95\x32\x82\x36\xD2\xBB", "gb18030", true, "\xF0\xA0\x80\x80\xE4\xB8\x80"},
 243   {"\xA7\x41\xA6\x6E", "big5", true, "\xE4\xBD\xA0\xE5\xA5\xBD"},
 244   // Windows-1258 does have a combining character at xD2 (which is U+0309).
 245   // The sequence of (U+00E2, U+0309) is also encoded as U+1EA9.
 246   {"foo\xE2\xD2", "windows-1258", true, "foo\xE1\xBA\xA9"},
 247   {"", "iso-8859-1", true, ""},
 248 };
 249 TEST(ICUStringConversionsTest, ConvertToUtf8AndNormalize) {
 250   std::string result;
 251   for (size_t i = 0; i < arraysize(kConvertAndNormalizeCases); ++i) {
 252     SCOPED_TRACE(base::StringPrintf(
 253                      "Test[%" PRIuS "]: <encoded: %s> <codepage: %s>", i,
 254                      kConvertAndNormalizeCases[i].encoded,
 255                      kConvertAndNormalizeCases[i].codepage_name));
 256
 257     bool success = ConvertToUtf8AndNormalize(
 258         kConvertAndNormalizeCases[i].encoded,
 259         kConvertAndNormalizeCases[i].codepage_name, &result);
 260     EXPECT_EQ(kConvertAndNormalizeCases[i].expected_success, success);
 261     EXPECT_EQ(kConvertAndNormalizeCases[i].expected_value, result);
 262   }
 263 }
 264
 265 }  // namespace base