Pin Chrome's shortcut to the Win10 Start menu on install and OS upgrade.
[chromium-blink-merge.git] / chrome / renderer / spellchecker / spellcheck_worditerator_unittest.cc
blob08809ded8e7905613cfbdafbd361a970d7d5a333
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include <string>
6 #include <vector>
8 #include "base/format_macros.h"
9 #include "base/strings/string_split.h"
10 #include "base/strings/stringprintf.h"
11 #include "base/strings/utf_string_conversions.h"
12 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h"
13 #include "testing/gtest/include/gtest/gtest.h"
15 namespace {
// One row of the expectation table used by the SplitWord test: a language,
// a contraction setting, and the words the iterator should produce for it.
17 struct TestCase {
// Language code passed to SpellcheckCharAttribute::SetDefaultLanguage().
18 const char* language;
// Passed as the second argument of SpellcheckWordIterator::Initialize().
19 bool allow_contraction;
// Expected words written as one string; the test splits it on ' ' before
// comparing against the iterator's output.
20 const wchar_t* expected_words;
23 } // namespace
25 // Tests whether or not our SpellcheckWordIterator can extract only words used
26 // by the specified language from a multi-language text.
27 TEST(SpellcheckWordIteratorTest, SplitWord) {
28 // An input text. This text includes words of several languages. (Some words
29 // are not separated with whitespace characters.) Our SpellcheckWordIterator
30 // should extract only the words used by the specified language from this text
31 // and normalize them so our spell-checker can check their spellings.
32 const wchar_t kTestText[] =
33 // Graphic characters
34 L"!@#$%^&*()"
35 // Latin (including a contraction character and a ligature).
36 L"hello:hello a\xFB03x"
37 // Greek
38 L"\x03B3\x03B5\x03B9\x03AC\x0020\x03C3\x03BF\x03C5"
39 // Cyrillic
40 L"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432"
41 L"\x0443\x0439\x0442\x0435"
42 // Hebrew (including niqquds)
43 L"\x05e9\x05c1\x05b8\x05dc\x05d5\x05b9\x05dd "
44 // Hebrew words with U+0027 and U+05F3
45 L"\x05e6\x0027\x05d9\x05e4\x05e1 \x05e6\x05F3\x05d9\x05e4\x05e1 "
46 // Hebrew words with U+0022 and U+05F4
47 L"\x05e6\x05d4\x0022\x05dc \x05e6\x05d4\x05f4\x05dc "
48 // Hebrew words enclosed with ASCII quotes.
49 L"\"\x05e6\x05d4\x0022\x05dc\" '\x05e9\x05c1\x05b8\x05dc\x05d5'"
50 // Arabic (including vowel marks)
51 L"\x0627\x064e\x0644\x0633\x064e\x0651\x0644\x0627"
52 L"\x0645\x064f\x0020\x0639\x064e\x0644\x064e\x064a"
53 L"\x0652\x0643\x064f\x0645\x0652"
54 // Hindi
55 L"\x0930\x093E\x091C\x0927\x093E\x0928"
56 // Thai
57 L"\x0e2a\x0e27\x0e31\x0e2a\x0e14\x0e35\x0020\x0e04"
58 L"\x0e23\x0e31\x0e1a"
59 // Hiraganas
60 L"\x3053\x3093\x306B\x3061\x306F"
61 // CJKV ideographs
62 L"\x4F60\x597D"
63 // Hangul Syllables
64 L"\xC548\xB155\xD558\xC138\xC694"
65 // Full-width latin : Hello
66 L"\xFF28\xFF45\xFF4C\xFF4C\xFF4F "
67 L"e.g.,";
69 // The languages and expected results used in this test.
// Each row supplies, in order: language, allow_contraction, expected_words.
// The expected words of a row are separated by single spaces.
70 static const TestCase kTestCases[] = {
72 // English (keep contraction words)
73 "en-US", true, L"hello:hello affix Hello e.g"
74 }, {
75 // English (split contraction words)
76 "en-US", false, L"hello hello affix Hello e g"
77 }, {
78 // Greek
79 "el-GR", true,
80 L"\x03B3\x03B5\x03B9\x03AC\x0020\x03C3\x03BF\x03C5"
81 }, {
82 // Russian
83 "ru-RU", true,
84 L"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432"
85 L"\x0443\x0439\x0442\x0435"
86 }, {
87 // Hebrew
88 "he-IL", true,
89 L"\x05e9\x05dc\x05d5\x05dd "
90 L"\x05e6\x0027\x05d9\x05e4\x05e1 \x05e6\x05F3\x05d9\x05e4\x05e1 "
91 L"\x05e6\x05d4\x0022\x05dc \x05e6\x05d4\x05f4\x05dc "
92 L"\x05e6\x05d4\x0022\x05dc \x05e9\x05dc\x05d5"
93 }, {
94 // Arabic
95 "ar", true,
96 L"\x0627\x0644\x0633\x0644\x0627\x0645\x0020\x0639"
97 L"\x0644\x064a\x0643\x0645"
98 }, {
99 // Hindi
100 "hi-IN", true,
101 L"\x0930\x093E\x091C\x0927\x093E\x0928"
102 }, {
103 // Thai
104 "th-TH", true,
105 L"\x0e2a\x0e27\x0e31\x0e2a\x0e14\x0e35\x0020\x0e04"
106 L"\x0e23\x0e31\x0e1a"
107 }, {
108 // Korean
// The expected text is spelled with U+11xx code points, while the input
// above uses the code points U+C548 U+B155 U+D558 U+C138 U+C694.
109 "ko-KR", true,
110 L"\x110b\x1161\x11ab\x1102\x1167\x11bc\x1112\x1161"
111 L"\x1109\x1166\x110b\x116d"
// Run the iterator over the same kTestText once for every table row.
115 for (size_t i = 0; i < arraysize(kTestCases); ++i) {
// Tag any failure inside this iteration with the row index and language.
116 SCOPED_TRACE(base::StringPrintf("kTestCases[%" PRIuS "]: language=%s", i,
117 kTestCases[i].language));
// Set up the character attributes for this row's language.
119 SpellcheckCharAttribute attributes;
120 attributes.SetDefaultLanguage(kTestCases[i].language);
// Convert the wide-character input to UTF-16 and give it to a fresh iterator.
122 base::string16 input(base::WideToUTF16(kTestText));
123 SpellcheckWordIterator iterator;
124 EXPECT_TRUE(iterator.Initialize(&attributes,
125 kTestCases[i].allow_contraction));
126 EXPECT_TRUE(iterator.SetText(input.c_str(), input.length()));
// Split the row's expected string on ' ', trimming whitespace and keeping
// every piece (SPLIT_WANT_ALL).
128 std::vector<base::string16> expected_words = base::SplitString(
129 base::WideToUTF16(kTestCases[i].expected_words),
130 base::string16(1, ' '), base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL);
// Compare the extracted words with the expected ones in order. A word past
// the end of the expected list fails the EXPECT_TRUE and is not compared.
132 base::string16 actual_word;
133 int actual_start, actual_end;
134 size_t index = 0;
135 while (iterator.GetNextWord(&actual_word, &actual_start, &actual_end)) {
136 EXPECT_TRUE(index < expected_words.size());
137 if (index < expected_words.size())
138 EXPECT_EQ(expected_words[index], actual_word);
139 ++index;
// NOTE(review): nothing visible here compares index with
// expected_words.size() after the loop, so an iterator that returns too few
// words would appear to pass. Lines seem to be missing from this listing;
// confirm against the original file.
144 // Tests whether our SpellcheckWordIterator extracts an empty word without
145 // getting stuck in an infinite loop when inputting a Khmer text. (This is a
146 // regression test for Issue 46278.)
147 TEST(SpellcheckWordIteratorTest, RuleSetConsistency) {
// The iterator is configured for en-US even though the input is not English.
148 SpellcheckCharAttribute attributes;
149 attributes.SetDefaultLanguage("en-US");
// Input: U+1791, U+17C1, then an ASCII full stop (U+002E).
151 const wchar_t kTestText[] = L"\x1791\x17c1\x002e";
152 base::string16 input(base::WideToUTF16(kTestText));
154 SpellcheckWordIterator iterator;
155 EXPECT_TRUE(iterator.Initialize(&attributes, true));
156 EXPECT_TRUE(iterator.SetText(input.c_str(), input.length()));
158 // When SpellcheckWordIterator uses an inconsistent ICU ruleset, the following
159 // iterator.GetNextWord() call gets stuck in an infinite loop. Therefore, this
160 // test succeeds if this call returns without timeouts.
161 base::string16 actual_word;
// NOTE(review): actual_start and actual_end have no initializers, so the two
// EXPECT_EQ checks below presumably rely on GetNextWord() writing 0 to both
// even when it returns false. Confirm against the iterator implementation.
162 int actual_start, actual_end;
163 EXPECT_FALSE(iterator.GetNextWord(&actual_word, &actual_start, &actual_end));
164 EXPECT_EQ(0, actual_start);
165 EXPECT_EQ(0, actual_end);
168 // Verify our SpellcheckWordIterator can treat ASCII numbers as word characters
169 // on LTR languages. On the other hand, it should not treat ASCII numbers as
170 // word characters on RTL languages because they change the text direction from
171 // RTL to LTR.
172 TEST(SpellcheckWordIteratorTest, TreatNumbersAsWordCharacters) {
173 // A set of a language, a dummy word, and a text direction used in this test.
174 // For each language, this test splits a dummy word, which consists of ASCII
175 // numbers and an alphabet of the language, into words. When ASCII numbers are
176 // treated as word characters, the split word becomes equal to the dummy word.
177 // Otherwise, the split word does not include ASCII numbers.
178 static const struct {
// Language code passed to SpellcheckCharAttribute::SetDefaultLanguage().
179 const char* language;
// Dummy word: the digits "0123456789" followed by letter(s) of the language.
180 const wchar_t* text;
// true: the first extracted word must equal the dummy word;
// false: it must differ from it.
181 bool left_to_right;
182 } kTestCases[] = {
184 // English
185 "en-US", L"0123456789" L"a", true,
186 }, {
187 // Greek
188 "el-GR", L"0123456789" L"\x03B1", true,
189 }, {
190 // Russian
191 "ru-RU", L"0123456789" L"\x0430", true,
192 }, {
193 // Hebrew
194 "he-IL", L"0123456789" L"\x05D0", false,
195 }, {
196 // Arabic
197 "ar", L"0123456789" L"\x0627", false,
198 }, {
199 // Hindi
200 "hi-IN", L"0123456789" L"\x0905", true,
201 }, {
202 // Thai
203 "th-TH", L"0123456789" L"\x0e01", true,
204 }, {
205 // Korean
206 "ko-KR", L"0123456789" L"\x1100\x1161", true,
// Check only the first word the iterator returns for each row.
210 for (size_t i = 0; i < arraysize(kTestCases); ++i) {
// Tag any failure inside this iteration with the row index and language.
211 SCOPED_TRACE(base::StringPrintf("kTestCases[%" PRIuS "]: language=%s", i,
212 kTestCases[i].language));
214 SpellcheckCharAttribute attributes;
215 attributes.SetDefaultLanguage(kTestCases[i].language);
// Contractions are always allowed here (second Initialize() argument is true).
217 base::string16 input_word(base::WideToUTF16(kTestCases[i].text));
218 SpellcheckWordIterator iterator;
219 EXPECT_TRUE(iterator.Initialize(&attributes, true));
220 EXPECT_TRUE(iterator.SetText(input_word.c_str(), input_word.length()));
222 base::string16 actual_word;
223 int actual_start, actual_end;
224 EXPECT_TRUE(iterator.GetNextWord(&actual_word, &actual_start, &actual_end));
// Rows flagged left_to_right must get the whole dummy word back, digits
// included. For the other rows the test only requires that the result
// differs from the dummy word; it does not pin what the result is.
225 if (kTestCases[i].left_to_right)
226 EXPECT_EQ(input_word, actual_word);
227 else
228 EXPECT_NE(input_word, actual_word);
232 // Verify SpellcheckWordIterator treats typographical apostrophe as a part of
233 // the word.
234 TEST(SpellcheckWordIteratorTest, TypographicalApostropheIsPartOfWord) {
// Each row pairs an English locale with a single word containing an
// apostrophe. The same four locales are used for both apostrophe forms.
235 static const struct {
236 const char* language;
237 const wchar_t* word;
238 } kTestCases[] = {
239 // Typewriter apostrophe:
241 "en-AU", L"you're"
242 }, {
243 "en-CA", L"you're"
244 }, {
245 "en-GB", L"you're"
246 }, {
247 "en-US", L"you're"
249 // Typographical apostrophe:
// (written as the escape \x2019 in the literals below)
251 "en-AU", L"you\x2019re"
252 }, {
253 "en-CA", L"you\x2019re"
254 }, {
255 "en-GB", L"you\x2019re"
256 }, {
257 "en-US", L"you\x2019re"
// NOTE(review): unlike the loops in SplitWord and
// TreatNumbersAsWordCharacters, this loop has no SCOPED_TRACE, so a failure
// does not report which row or language triggered it.
261 for (size_t i = 0; i < arraysize(kTestCases); ++i) {
262 SpellcheckCharAttribute attributes;
263 attributes.SetDefaultLanguage(kTestCases[i].language);
265 base::string16 input_word(base::WideToUTF16(kTestCases[i].word));
266 SpellcheckWordIterator iterator;
267 EXPECT_TRUE(iterator.Initialize(&attributes, true));
268 EXPECT_TRUE(iterator.SetText(input_word.c_str(), input_word.length()));
270 base::string16 actual_word;
271 int actual_start, actual_end;
272 EXPECT_TRUE(iterator.GetNextWord(&actual_word, &actual_start, &actual_end));
// The first extracted word must be the entire input: same text, starting at
// offset 0 and ending at the input's length.
273 EXPECT_EQ(input_word, actual_word);
274 EXPECT_EQ(0, actual_start);
// actual_end is an int; cast it to the string's size_type so both sides of
// the comparison have the same type.
275 EXPECT_EQ(input_word.length(),
276 static_cast<base::string16::size_type>(actual_end));
// Checks that SpellcheckWordIterator::Initialize() returns true when the
// attributes have a default language and false when none was set.
// NOTE(review): `attributes` and `iterator` are each declared twice below;
// presumably each case sits in its own brace-delimited scope and those
// brace-only lines are missing from this listing. Confirm against the
// original file.
280 TEST(SpellcheckWordIteratorTest, Initialization) {
281 // Test initialization works when a default language is set.
283 SpellcheckCharAttribute attributes;
284 attributes.SetDefaultLanguage("en-US");
286 SpellcheckWordIterator iterator;
287 EXPECT_TRUE(iterator.Initialize(&attributes, true));
290 // Test initialization fails when no default language is set.
// SetDefaultLanguage() is deliberately not called on this instance.
292 SpellcheckCharAttribute attributes;
294 SpellcheckWordIterator iterator;
295 EXPECT_FALSE(iterator.Initialize(&attributes, true));