chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc

   1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include <string>
   6 #include <vector>
   7
   8 #include "base/format_macros.h"
   9 #include "base/strings/string_split.h"
  10 #include "base/strings/stringprintf.h"
  11 #include "base/strings/utf_string_conversions.h"
  12 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h"
  13 #include "testing/gtest/include/gtest/gtest.h"
  14
  15 namespace {
  16
  17 struct TestCase {
  18     const char* language;
  19     bool allow_contraction;
  20     const wchar_t* expected_words;
  21 };
  22
  23 }  // namespace
  24
  25 // Tests whether or not our SpellcheckWordIterator can extract only words used
  26 // by the specified language from a multi-language text.
  27 TEST(SpellcheckWordIteratorTest, SplitWord) {
  28   // An input text. This text includes words of several languages. (Some words
  29   // are not separated with whitespace characters.) Our SpellcheckWordIterator
  30   // should extract only the words used by the specified language from this text
  31   // and normalize them so our spell-checker can check their spellings.
  32   const wchar_t kTestText[] =
  33       // Graphic characters
  34       L"!@#$%^&*()"
  35       // Latin (including a contraction character and a ligature).
  36       L"hello:hello a\xFB03x"
  37       // Greek
  38       L"\x03B3\x03B5\x03B9\x03AC\x0020\x03C3\x03BF\x03C5"
  39       // Cyrillic
  40       L"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432"
  41       L"\x0443\x0439\x0442\x0435"
  42       // Hebrew (including niqquds)
  43       L"\x05e9\x05c1\x05b8\x05dc\x05d5\x05b9\x05dd "
  44       // Hebrew words with U+0027 and U+05F3
  45       L"\x05e6\x0027\x05d9\x05e4\x05e1 \x05e6\x05F3\x05d9\x05e4\x05e1 "
  46       // Hebrew words with U+0022 and U+05F4
  47       L"\x05e6\x05d4\x0022\x05dc \x05e6\x05d4\x05f4\x05dc "
  48       // Hebrew words enclosed with ASCII quotes.
  49       L"\"\x05e6\x05d4\x0022\x05dc\" '\x05e9\x05c1\x05b8\x05dc\x05d5'"
  50       // Arabic (including vowel marks)
  51       L"\x0627\x064e\x0644\x0633\x064e\x0651\x0644\x0627"
  52       L"\x0645\x064f\x0020\x0639\x064e\x0644\x064e\x064a"
  53       L"\x0652\x0643\x064f\x0645\x0652"
  54       // Hindi
  55       L"\x0930\x093E\x091C\x0927\x093E\x0928"
  56       // Thai
  57       L"\x0e2a\x0e27\x0e31\x0e2a\x0e14\x0e35\x0020\x0e04"
  58       L"\x0e23\x0e31\x0e1a"
  59       // Hiraganas
  60       L"\x3053\x3093\x306B\x3061\x306F"
  61       // CJKV ideographs
  62       L"\x4F60\x597D"
  63       // Hangul Syllables
  64       L"\xC548\xB155\xD558\xC138\xC694"
  65       // Full-width latin : Hello
  66       L"\xFF28\xFF45\xFF4C\xFF4C\xFF4F "
  67       L"e.g.,";
  68
  69   // The languages and expected results used in this test.
  70   static const TestCase kTestCases[] = {
  71     {
  72       // English (keep contraction words)
  73       "en-US", true, L"hello:hello affix Hello e.g"
  74     }, {
  75       // English (split contraction words)
  76       "en-US", false, L"hello hello affix Hello e g"
  77     }, {
  78       // Greek
  79       "el-GR", true,
  80       L"\x03B3\x03B5\x03B9\x03AC\x0020\x03C3\x03BF\x03C5"
  81     }, {
  82       // Russian
  83       "ru-RU", true,
  84       L"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432"
  85       L"\x0443\x0439\x0442\x0435"
  86     }, {
  87       // Hebrew
  88       "he-IL", true,
  89       L"\x05e9\x05dc\x05d5\x05dd "
  90       L"\x05e6\x0027\x05d9\x05e4\x05e1 \x05e6\x05F3\x05d9\x05e4\x05e1 "
  91       L"\x05e6\x05d4\x0022\x05dc \x05e6\x05d4\x05f4\x05dc "
  92       L"\x05e6\x05d4\x0022\x05dc \x05e9\x05dc\x05d5"
  93     }, {
  94       // Arabic
  95       "ar", true,
  96       L"\x0627\x0644\x0633\x0644\x0627\x0645\x0020\x0639"
  97       L"\x0644\x064a\x0643\x0645"
  98     }, {
  99       // Hindi
 100       "hi-IN", true,
 101       L"\x0930\x093E\x091C\x0927\x093E\x0928"
 102     }, {
 103       // Thai
 104       "th-TH", true,
 105       L"\x0e2a\x0e27\x0e31\x0e2a\x0e14\x0e35\x0020\x0e04"
 106       L"\x0e23\x0e31\x0e1a"
 107     }, {
 108       // Korean
 109       "ko-KR", true,
 110       L"\x110b\x1161\x11ab\x1102\x1167\x11bc\x1112\x1161"
 111       L"\x1109\x1166\x110b\x116d"
 112     },
 113   };
 114
 115   for (size_t i = 0; i < arraysize(kTestCases); ++i) {
 116     SCOPED_TRACE(base::StringPrintf("kTestCases[%" PRIuS "]: language=%s", i,
 117                                     kTestCases[i].language));
 118
 119     SpellcheckCharAttribute attributes;
 120     attributes.SetDefaultLanguage(kTestCases[i].language);
 121
 122     base::string16 input(base::WideToUTF16(kTestText));
 123     SpellcheckWordIterator iterator;
 124     EXPECT_TRUE(iterator.Initialize(&attributes,
 125                                     kTestCases[i].allow_contraction));
 126     EXPECT_TRUE(iterator.SetText(input.c_str(), input.length()));
 127
 128     std::vector<base::string16> expected_words;
 129     base::SplitString(
 130         base::WideToUTF16(kTestCases[i].expected_words), ' ', &expected_words);
 131
 132     base::string16 actual_word;
 133     int actual_start, actual_end;
 134     size_t index = 0;
 135     while (iterator.GetNextWord(&actual_word, &actual_start, &actual_end)) {
 136       EXPECT_TRUE(index < expected_words.size());
 137       if (index < expected_words.size())
 138         EXPECT_EQ(expected_words[index], actual_word);
 139       ++index;
 140     }
 141   }
 142 }
 143
 144 // Tests whether our SpellcheckWordIterator extracts an empty word without
 145 // getting stuck in an infinite loop when inputting a Khmer text. (This is a
 146 // regression test for Issue 46278.)
 147 TEST(SpellcheckWordIteratorTest, RuleSetConsistency) {
 148   SpellcheckCharAttribute attributes;
 149   attributes.SetDefaultLanguage("en-US");
 150
 151   const wchar_t kTestText[] = L"\x1791\x17c1\x002e";
 152   base::string16 input(base::WideToUTF16(kTestText));
 153
 154   SpellcheckWordIterator iterator;
 155   EXPECT_TRUE(iterator.Initialize(&attributes, true));
 156   EXPECT_TRUE(iterator.SetText(input.c_str(), input.length()));
 157
 158   // When SpellcheckWordIterator uses an inconsistent ICU ruleset, the following
 159   // iterator.GetNextWord() call gets stuck in an infinite loop. Therefore, this
 160   // test succeeds if this call returns without timeouts.
 161   base::string16 actual_word;
 162   int actual_start, actual_end;
 163   EXPECT_FALSE(iterator.GetNextWord(&actual_word, &actual_start, &actual_end));
 164   EXPECT_EQ(0, actual_start);
 165   EXPECT_EQ(0, actual_end);
 166 }
 167
 168 // Vertify our SpellcheckWordIterator can treat ASCII numbers as word characters
 169 // on LTR languages. On the other hand, it should not treat ASCII numbers as
 170 // word characters on RTL languages because they change the text direction from
 171 // RTL to LTR.
 172 TEST(SpellcheckWordIteratorTest, TreatNumbersAsWordCharacters) {
 173   // A set of a language, a dummy word, and a text direction used in this test.
 174   // For each language, this test splits a dummy word, which consists of ASCII
 175   // numbers and an alphabet of the language, into words. When ASCII numbers are
 176   // treated as word characters, the split word becomes equal to the dummy word.
 177   // Otherwise, the split word does not include ASCII numbers.
 178   static const struct {
 179     const char* language;
 180     const wchar_t* text;
 181     bool left_to_right;
 182   } kTestCases[] = {
 183     {
 184       // English
 185       "en-US", L"0123456789" L"a", true,
 186     }, {
 187       // Greek
 188       "el-GR", L"0123456789" L"\x03B1", true,
 189     }, {
 190       // Russian
 191       "ru-RU", L"0123456789" L"\x0430", true,
 192     }, {
 193       // Hebrew
 194       "he-IL", L"0123456789" L"\x05D0", false,
 195     }, {
 196       // Arabic
 197       "ar",  L"0123456789" L"\x0627", false,
 198     }, {
 199       // Hindi
 200       "hi-IN", L"0123456789" L"\x0905", true,
 201     }, {
 202       // Thai
 203       "th-TH", L"0123456789" L"\x0e01", true,
 204     }, {
 205       // Korean
 206       "ko-KR", L"0123456789" L"\x1100\x1161", true,
 207     },
 208   };
 209
 210   for (size_t i = 0; i < arraysize(kTestCases); ++i) {
 211     SCOPED_TRACE(base::StringPrintf("kTestCases[%" PRIuS "]: language=%s", i,
 212                                     kTestCases[i].language));
 213
 214     SpellcheckCharAttribute attributes;
 215     attributes.SetDefaultLanguage(kTestCases[i].language);
 216
 217     base::string16 input_word(base::WideToUTF16(kTestCases[i].text));
 218     SpellcheckWordIterator iterator;
 219     EXPECT_TRUE(iterator.Initialize(&attributes, true));
 220     EXPECT_TRUE(iterator.SetText(input_word.c_str(), input_word.length()));
 221
 222     base::string16 actual_word;
 223     int actual_start, actual_end;
 224     EXPECT_TRUE(iterator.GetNextWord(&actual_word, &actual_start, &actual_end));
 225     if (kTestCases[i].left_to_right)
 226       EXPECT_EQ(input_word, actual_word);
 227     else
 228       EXPECT_NE(input_word, actual_word);
 229   }
 230 }
 231
 232 TEST(SpellcheckWordIteratorTest, Initialization) {
 233   // Test initialization works when a default language is set.
 234   {
 235     SpellcheckCharAttribute attributes;
 236     attributes.SetDefaultLanguage("en-US");
 237
 238     SpellcheckWordIterator iterator;
 239     EXPECT_TRUE(iterator.Initialize(&attributes, true));
 240   }
 241
 242   // Test initialization fails when no default language is set.
 243   {
 244     SpellcheckCharAttribute attributes;
 245
 246     SpellcheckWordIterator iterator;
 247     EXPECT_FALSE(iterator.Initialize(&attributes, true));
 248   }
 249 }