1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
8 #include "base/format_macros.h"
9 #include "base/strings/string_split.h"
10 #include "base/strings/stringprintf.h"
11 #include "base/strings/utf_string_conversions.h"
12 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h"
13 #include "testing/gtest/include/gtest/gtest.h"
// NOTE(review): the enclosing struct header is not visible in this view;
// these are presumably fields of the TestCase type used by kTestCases in the
// SplitWord test.
// Forwarded by SplitWord as the second argument of
// SpellcheckWordIterator::Initialize().
19 bool allow_contraction
;
// Wide-string list of the words the iterator is expected to produce. SplitWord
// converts it with base::WideToUTF16() and passes it, together with ' ' and an
// output vector, to a call whose name is not visible here (presumably a
// string-split helper).
20 const wchar_t* expected_words
;
25 // Tests whether or not our SpellcheckWordIterator can extract only words used
26 // by the specified language from a multi-language text.
27 TEST(SpellcheckWordIteratorTest
, SplitWord
) {
28 // An input text. This text includes words of several languages. (Some words
29 // are not separated with whitespace characters.) Our SpellcheckWordIterator
30 // should extract only the words used by the specified language from this text
31 // and normalize them so our spell-checker can check their spellings.
32 const wchar_t kTestText
[] =
35 // Latin (including a contraction character and a ligature).
36 L
"hello:hello a\xFB03x"
// Greek-block code points (U+03xx): two groups separated by U+0020.
38 L
"\x03B3\x03B5\x03B9\x03AC\x0020\x03C3\x03BF\x03C5"
// Cyrillic-block code points (U+04xx), continued in the next literal.
40 L
"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432"
41 L
"\x0443\x0439\x0442\x0435"
42 // Hebrew (including niqquds)
43 L
"\x05e9\x05c1\x05b8\x05dc\x05d5\x05b9\x05dd "
44 // Hebrew words with U+0027 and U+05F3
45 L
"\x05e6\x0027\x05d9\x05e4\x05e1 \x05e6\x05F3\x05d9\x05e4\x05e1 "
46 // Hebrew words with U+0022 and U+05F4
47 L
"\x05e6\x05d4\x0022\x05dc \x05e6\x05d4\x05f4\x05dc "
48 // Hebrew words enclosed with ASCII quotes.
49 L
"\"\x05e6\x05d4\x0022\x05dc\" '\x05e9\x05c1\x05b8\x05dc\x05d5'"
50 // Arabic (including vowel marks)
51 L
"\x0627\x064e\x0644\x0633\x064e\x0651\x0644\x0627"
52 L
"\x0645\x064f\x0020\x0639\x064e\x0644\x064e\x064a"
53 L
"\x0652\x0643\x064f\x0645\x0652"
// Devanagari-block code points (U+09xx).
55 L
"\x0930\x093E\x091C\x0927\x093E\x0928"
// Thai-block code points (U+0Exx) with an embedded U+0020.
57 L
"\x0e2a\x0e27\x0e31\x0e2a\x0e14\x0e35\x0020\x0e04"
// Hiragana code points (U+30xx).
60 L
"\x3053\x3093\x306B\x3061\x306F"
// Precomposed Hangul syllables (all within U+AC00..U+D7A3).
64 L
"\xC548\xB155\xD558\xC138\xC694"
65 // Full-width latin : Hello
66 L
"\xFF28\xFF45\xFF4C\xFF4C\xFF4F "
69 // The languages and expected results used in this test.
70 static const TestCase kTestCases
[] = {
72 // English (keep contraction words)
// "affix" and "Hello" are the expected forms of the input's U+FB03 ligature
// word and its full-width Latin word.
73 "en-US", true, L
"hello:hello affix Hello e.g"
75 // English (split contraction words)
76 "en-US", false, L
"hello hello affix Hello e g"
// NOTE(review): for the entries below only the expected-word literals are
// visible in this view; the language and allow_contraction fields are on lines
// not shown. Scripts are identified from the code-point values.
// Greek-script expected words (identical to the Greek input literal).
80 L
"\x03B3\x03B5\x03B9\x03AC\x0020\x03C3\x03BF\x03C5"
// Cyrillic-script expected word (identical to the Cyrillic input literals).
84 L
"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432"
85 L
"\x0443\x0439\x0442\x0435"
// Hebrew expected words: the input's Hebrew words with the U+05B8, U+05B9 and
// U+05C1 points removed and the enclosing ASCII quotes dropped; the in-word
// U+0027/U+05F3 and U+0022/U+05F4 characters are kept.
89 L
"\x05e9\x05dc\x05d5\x05dd "
90 L
"\x05e6\x0027\x05d9\x05e4\x05e1 \x05e6\x05F3\x05d9\x05e4\x05e1 "
91 L
"\x05e6\x05d4\x0022\x05dc \x05e6\x05d4\x05f4\x05dc "
92 L
"\x05e6\x05d4\x0022\x05dc \x05e9\x05dc\x05d5"
// Arabic expected words: the Arabic input with the U+064E, U+064F, U+0651 and
// U+0652 marks removed.
96 L
"\x0627\x0644\x0633\x0644\x0627\x0645\x0020\x0639"
97 L
"\x0644\x064a\x0643\x0645"
// Devanagari expected word (identical to the Devanagari input literal).
101 L
"\x0930\x093E\x091C\x0927\x093E\x0928"
// Thai-script expected words.
105 L
"\x0e2a\x0e27\x0e31\x0e2a\x0e14\x0e35\x0020\x0e04"
106 L
"\x0e23\x0e31\x0e1a"
// Hangul expected word written as conjoining Jamo (U+11xx) rather than the
// precomposed syllables that appear in kTestText.
110 L
"\x110b\x1161\x11ab\x1102\x1167\x11bc\x1112\x1161"
111 L
"\x1109\x1166\x110b\x116d"
// Run the iterator over the same kTestText once per kTestCases entry.
115 for (size_t i
= 0; i
< arraysize(kTestCases
); ++i
) {
// Label any failure below with the entry index and its language.
116 SCOPED_TRACE(base::StringPrintf("kTestCases[%" PRIuS
"]: language=%s", i
,
117 kTestCases
[i
].language
));
// Configure the character attributes for this entry's language.
119 SpellcheckCharAttribute attributes
;
120 attributes
.SetDefaultLanguage(kTestCases
[i
].language
);
// Convert the wide test text to UTF-16 and hand it to a fresh iterator; both
// Initialize() and SetText() are expected to succeed.
122 base::string16
input(base::WideToUTF16(kTestText
));
123 SpellcheckWordIterator iterator
;
124 EXPECT_TRUE(iterator
.Initialize(&attributes
,
125 kTestCases
[i
].allow_contraction
));
126 EXPECT_TRUE(iterator
.SetText(input
.c_str(), input
.length()));
// Build the list of expected words from the entry's expected_words string.
// NOTE(review): the name of the call taking these arguments is on a line not
// visible here; presumably it splits on ' ' into expected_words.
128 std::vector
<base::string16
> expected_words
;
130 base::WideToUTF16(kTestCases
[i
].expected_words
), ' ', &expected_words
);
132 base::string16 actual_word
;
133 int actual_start
, actual_end
;
// Compare each word returned by the iterator with the expected word at the
// same position.
// NOTE(review): `index` has no visible declaration or increment; presumably a
// size_t counter declared before the loop and advanced on lines missing from
// this view.
135 while (iterator
.GetNextWord(&actual_word
, &actual_start
, &actual_end
)) {
136 EXPECT_TRUE(index
< expected_words
.size());
// The guard keeps expected_words[index] in range when the iterator yields
// more words than expected.
137 if (index
< expected_words
.size())
138 EXPECT_EQ(expected_words
[index
], actual_word
);
144 // Tests whether our SpellcheckWordIterator extracts an empty word without
145 // getting stuck in an infinite loop when inputting a Khmer text. (This is a
146 // regression test for Issue 46278.)
147 TEST(SpellcheckWordIteratorTest
, RuleSetConsistency
) {
// The iterator is configured for en-US while the input below is not Latin.
148 SpellcheckCharAttribute attributes
;
149 attributes
.SetDefaultLanguage("en-US");
// Two Khmer-block code points (U+1791, U+17C1) followed by U+002E ('.').
151 const wchar_t kTestText
[] = L
"\x1791\x17c1\x002e";
152 base::string16
input(base::WideToUTF16(kTestText
));
// Initialize() and SetText() themselves are expected to succeed.
154 SpellcheckWordIterator iterator
;
155 EXPECT_TRUE(iterator
.Initialize(&attributes
, true));
156 EXPECT_TRUE(iterator
.SetText(input
.c_str(), input
.length()));
158 // When SpellcheckWordIterator uses an inconsistent ICU ruleset, the following
159 // iterator.GetNextWord() call gets stuck in an infinite loop. Therefore, this
160 // test succeeds if this call returns without timeouts.
161 base::string16 actual_word
;
162 int actual_start
, actual_end
;
// No word is expected: GetNextWord() must return false.
163 EXPECT_FALSE(iterator
.GetNextWord(&actual_word
, &actual_start
, &actual_end
));
// NOTE(review): actual_start and actual_end are not initialized above, so
// these checks rely on GetNextWord() writing 0 to both even when it returns
// false -- confirm against the implementation.
164 EXPECT_EQ(0, actual_start
);
165 EXPECT_EQ(0, actual_end
);
168 // Verify our SpellcheckWordIterator can treat ASCII numbers as word characters
169 // on LTR languages. On the other hand, it should not treat ASCII numbers as
170 // word characters on RTL languages because they change the text direction from
// RTL to LTR.
172 TEST(SpellcheckWordIteratorTest
, TreatNumbersAsWordCharacters
) {
173 // A set of a language, a dummy word, and a text direction used in this test.
174 // For each language, this test splits a dummy word, which consists of ASCII
175 // numbers and an alphabet of the language, into words. When ASCII numbers are
176 // treated as word characters, the split word becomes equal to the dummy word.
177 // Otherwise, the split word does not include ASCII numbers.
178 static const struct {
179 const char* language
;
// Each entry: a language tag, the ten ASCII digits immediately followed by
// letter(s) from that language's script (read below as .text), and the flag
// read below as .left_to_right.
// NOTE(review): the declarations of the text and left_to_right fields and the
// kTestCases array name are on lines not visible in this view.
185 "en-US", L
"0123456789" L
"a", true,
188 "el-GR", L
"0123456789" L
"\x03B1", true,
191 "ru-RU", L
"0123456789" L
"\x0430", true,
// he-IL and ar are the only visible entries whose flag is false.
194 "he-IL", L
"0123456789" L
"\x05D0", false,
197 "ar", L
"0123456789" L
"\x0627", false,
200 "hi-IN", L
"0123456789" L
"\x0905", true,
203 "th-TH", L
"0123456789" L
"\x0e01", true,
206 "ko-KR", L
"0123456789" L
"\x1100\x1161", true,
// Check every entry with a freshly configured attribute object and iterator.
210 for (size_t i
= 0; i
< ARRAYSIZE_UNSAFE(kTestCases
); ++i
) {
// Label any failure below with the entry index and its language.
211 SCOPED_TRACE(base::StringPrintf("kTestCases[%" PRIuS
"]: language=%s", i
,
212 kTestCases
[i
].language
));
214 SpellcheckCharAttribute attributes
;
215 attributes
.SetDefaultLanguage(kTestCases
[i
].language
);
// The entry's text is the entire input handed to the iterator;
// allow_contraction is fixed to true here.
217 base::string16
input_word(base::WideToUTF16(kTestCases
[i
].text
));
218 SpellcheckWordIterator iterator
;
219 EXPECT_TRUE(iterator
.Initialize(&attributes
, true));
220 EXPECT_TRUE(iterator
.SetText(input_word
.c_str(), input_word
.length()));
// Every language is expected to yield at least one word from its input.
222 base::string16 actual_word
;
223 int actual_start
, actual_end
;
224 EXPECT_TRUE(iterator
.GetNextWord(&actual_word
, &actual_start
, &actual_end
));
// When the flag is true the first word must equal the whole digits+letter
// input; otherwise it must differ from it.
// NOTE(review): the `else` joining the two expectations is presumably on a
// line missing from this view.
225 if (kTestCases
[i
].left_to_right
)
226 EXPECT_EQ(input_word
, actual_word
);
228 EXPECT_NE(input_word
, actual_word
);
// Checks the return value of SpellcheckWordIterator::Initialize() with and
// without a default language set on the SpellcheckCharAttribute it is given.
// NOTE(review): `attributes` and `iterator` are declared twice below;
// presumably each case sits in its own brace scope on lines missing from this
// view.
232 TEST(SpellcheckWordIteratorTest
, Initialization
) {
233 // Test initialization works when a default language is set.
235 SpellcheckCharAttribute attributes
;
236 attributes
.SetDefaultLanguage("en-US");
238 SpellcheckWordIterator iterator
;
239 EXPECT_TRUE(iterator
.Initialize(&attributes
, true));
242 // Test initialization fails when no default language is set.
// SetDefaultLanguage() is deliberately not called on this attributes object.
244 SpellcheckCharAttribute attributes
;
246 SpellcheckWordIterator iterator
;
247 EXPECT_FALSE(iterator
.Initialize(&attributes
, true));