1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
8 #include "base/format_macros.h"
9 #include "base/i18n/break_iterator.h"
10 #include "base/strings/string_split.h"
11 #include "base/strings/stringprintf.h"
12 #include "base/strings/utf_string_conversions.h"
13 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h"
14 #include "testing/gtest/include/gtest/gtest.h"
16 using base::i18n::BreakIterator
;
22 bool allow_contraction
;
23 const wchar_t* expected_words
;
// Returns the rule set for |language|: sets |language| as the default language
// on a fresh SpellcheckCharAttribute and returns the result of
// GetRuleSet(true).
// NOTE(review): the `true` argument is presumably the allow-contraction flag;
// confirm against SpellcheckCharAttribute::GetRuleSet().
26 base::string16
GetRulesForLanguage(const std::string
& language
) {
27 SpellcheckCharAttribute attribute
;
28 attribute
.SetDefaultLanguage(language
);
29 return attribute
.GetRuleSet(true);
34 // Tests whether or not our SpellcheckWordIterator can extract words used by the
35 // specified language from a multi-language text.
36 TEST(SpellcheckWordIteratorTest
, SplitWord
) {
37 // An input text. This text includes words of several languages. (Some words
38 // are not separated with whitespace characters.) Our SpellcheckWordIterator
39 // should extract the words used by the specified language from this text and
40 // normalize them so our spell-checker can check their spellings. If
41 // characters are found that are not from the specified language the test
43 const wchar_t kTestText
[] =
46 // Latin (including a contraction character and a ligature).
47 L
"hello:hello a\xFB03x"
// Greek-script text (code points in the U+0370-U+03FF block).
49 L
"\x03B3\x03B5\x03B9\x03AC\x0020\x03C3\x03BF\x03C5"
// Cyrillic-script text (code points in the U+0400-U+04FF block).
51 L
"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432"
52 L
"\x0443\x0439\x0442\x0435"
53 // Hebrew (including niqquds)
54 L
"\x05e9\x05c1\x05b8\x05dc\x05d5\x05b9\x05dd "
55 // Hebrew words with U+0027 and U+05F3
56 L
"\x05e6\x0027\x05d9\x05e4\x05e1 \x05e6\x05F3\x05d9\x05e4\x05e1 "
57 // Hebrew words with U+0022 and U+05F4
58 L
"\x05e6\x05d4\x0022\x05dc \x05e6\x05d4\x05f4\x05dc "
59 // Hebrew words enclosed with ASCII quotes.
60 L
"\"\x05e6\x05d4\x0022\x05dc\" '\x05e9\x05c1\x05b8\x05dc\x05d5'"
61 // Arabic (including vowel marks)
62 L
"\x0627\x064e\x0644\x0633\x064e\x0651\x0644\x0627"
63 L
"\x0645\x064f\x0020\x0639\x064e\x0644\x064e\x064a"
64 L
"\x0652\x0643\x064f\x0645\x0652"
// Devanagari-script text (code points in the U+0900-U+097F block).
66 L
"\x0930\x093E\x091C\x0927\x093E\x0928"
// Thai-script text (code points in the U+0E00-U+0E7F block).
68 L
"\x0e2a\x0e27\x0e31\x0e2a\x0e14\x0e35\x0020\x0e04"
// Hiragana text (code points in the U+3040-U+309F block).
71 L
"\x3053\x3093\x306B\x3061\x306F"
// Precomposed Hangul syllables (code points in the U+AC00-U+D7A3 block).
75 L
"\xC548\xB155\xD558\xC138\xC694"
76 // Full-width latin : Hello
77 L
"\xFF28\xFF45\xFF4C\xFF4C\xFF4F "
80 // The languages and expected results used in this test.
81 static const TestCase kTestCases
[] = {
83 // English (keep contraction words)
// The U+FB03 ligature in the input ("a\xFB03x") is expected back as "affix",
// and the full-width "Hello" is expected back as ASCII "Hello".
84 "en-US", true, L
"hello:hello affix Hello e.g"
86 // English (split contraction words)
87 "en-US", false, L
"hello hello affix Hello e g"
// Expected Greek words (same code points as the Greek run in kTestText).
91 L
"\x03B3\x03B5\x03B9\x03AC\x0020\x03C3\x03BF\x03C5"
// Expected Cyrillic word (same code points as the Cyrillic run in kTestText).
95 L
"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432"
96 L
"\x0443\x0439\x0442\x0435"
// Expected Hebrew words. Compared with the input, the niqqud marks (U+05B8,
// U+05B9, U+05C1) are absent and the enclosing ASCII quotes of the last two
// words are dropped; U+0027/U+05F3 and U+0022/U+05F4 inside words are kept.
100 L
"\x05e9\x05dc\x05d5\x05dd "
101 L
"\x05e6\x0027\x05d9\x05e4\x05e1 \x05e6\x05F3\x05d9\x05e4\x05e1 "
102 L
"\x05e6\x05d4\x0022\x05dc \x05e6\x05d4\x05f4\x05dc "
103 L
"\x05e6\x05d4\x0022\x05dc \x05e9\x05dc\x05d5"
// Expected Arabic words. Compared with the input, the vowel marks (U+064E,
// U+064F, U+0651, U+0652) are absent.
107 L
"\x0627\x0644\x0633\x0644\x0627\x0645\x0020\x0639"
108 L
"\x0644\x064a\x0643\x0645"
// Expected Devanagari word (same code points as in kTestText).
112 L
"\x0930\x093E\x091C\x0927\x093E\x0928"
// Expected Thai words.
116 L
"\x0e2a\x0e27\x0e31\x0e2a\x0e14\x0e35\x0020\x0e04"
117 L
"\x0e23\x0e31\x0e1a"
// Expected Korean word, written with conjoining Hangul Jamo (U+1100 block)
// rather than the precomposed syllables used in kTestText.
121 L
"\x110b\x1161\x11ab\x1102\x1167\x11bc\x1112\x1161"
122 L
"\x1109\x1166\x110b\x116d"
// Run the word iterator over the same kTestText once per test case.
126 for (size_t i
= 0; i
< arraysize(kTestCases
); ++i
) {
127 SCOPED_TRACE(base::StringPrintf("kTestCases[%" PRIuS
"]: language=%s", i
,
128 kTestCases
[i
].language
));
130 SpellcheckCharAttribute attributes
;
131 attributes
.SetDefaultLanguage(kTestCases
[i
].language
);
133 base::string16
input(base::WideToUTF16(kTestText
));
134 SpellcheckWordIterator iterator
;
135 EXPECT_TRUE(iterator
.Initialize(&attributes
,
136 kTestCases
[i
].allow_contraction
));
137 EXPECT_TRUE(iterator
.SetText(input
.c_str(), input
.length()));
// Split the expected string on single spaces into the list of words the
// iterator should produce, in order.
139 std::vector
<base::string16
> expected_words
= base::SplitString(
140 base::WideToUTF16(kTestCases
[i
].expected_words
),
141 base::string16(1, ' '), base::TRIM_WHITESPACE
, base::SPLIT_WANT_ALL
);
143 base::string16 actual_word
;
144 int actual_start
, actual_end
;
// Step through the text until the iterator reports IS_END_OF_TEXT.
146 for (SpellcheckWordIterator::WordIteratorStatus status
=
147 iterator
.GetNextWord(&actual_word
, &actual_start
, &actual_end
);
148 status
!= SpellcheckWordIterator::IS_END_OF_TEXT
;
150 iterator
.GetNextWord(&actual_word
, &actual_start
, &actual_end
)) {
151 if (status
== SpellcheckWordIterator::WordIteratorStatus::IS_SKIPPABLE
)
// The extracted word must match the expected word at position |index|; the
// bounds check guards the expected_words[index] access below.
154 EXPECT_TRUE(index
< expected_words
.size());
155 if (index
< expected_words
.size())
156 EXPECT_EQ(expected_words
[index
], actual_word
);
162 // Tests whether our SpellcheckWordIterator extracts an empty word without
163 // getting stuck in an infinite loop when inputting a Khmer text. (This is a
164 // regression test for Issue 46278.)
165 TEST(SpellcheckWordIteratorTest
, RuleSetConsistency
) {
166 SpellcheckCharAttribute attributes
;
167 attributes
.SetDefaultLanguage("en-US");
// Two Khmer-block code points (U+1791, U+17C1) followed by an ASCII period,
// iterated with the "en-US" attributes set up above.
169 const wchar_t kTestText
[] = L
"\x1791\x17c1\x002e";
170 base::string16
input(base::WideToUTF16(kTestText
));
172 SpellcheckWordIterator iterator
;
173 EXPECT_TRUE(iterator
.Initialize(&attributes
, true));
174 EXPECT_TRUE(iterator
.SetText(input
.c_str(), input
.length()));
176 // When SpellcheckWordIterator uses an inconsistent ICU ruleset, the following
177 // iterator.GetNextWord() calls get stuck in an infinite loop. Therefore, this
178 // test succeeds if this call returns without timeouts.
179 base::string16 actual_word
;
180 int actual_start
, actual_end
;
181 SpellcheckWordIterator::WordIteratorStatus status
;
// Keep calling GetNextWord() for as long as it reports IS_SKIPPABLE.
182 for (status
= iterator
.GetNextWord(&actual_word
, &actual_start
, &actual_end
);
183 status
== SpellcheckWordIterator::IS_SKIPPABLE
;
185 iterator
.GetNextWord(&actual_word
, &actual_start
, &actual_end
)) {
// The first non-skippable status must be IS_END_OF_TEXT, with both offsets
// equal to 0.
189 EXPECT_EQ(SpellcheckWordIterator::WordIteratorStatus::IS_END_OF_TEXT
, status
);
190 EXPECT_EQ(0, actual_start
);
191 EXPECT_EQ(0, actual_end
);
194 // Verify our SpellcheckWordIterator can treat ASCII numbers as word characters
195 // on LTR languages. On the other hand, it should not treat ASCII numbers as
196 // word characters on RTL languages because they change the text direction from
198 TEST(SpellcheckWordIteratorTest
, TreatNumbersAsWordCharacters
) {
199 // A set of a language, a dummy word, and a text direction used in this test.
200 // For each language, this test splits a dummy word, which consists of ASCII
201 // numbers and an alphabet of the language, into words. When ASCII numbers are
202 // treated as word characters, the split word becomes equal to the dummy word.
203 // Otherwise, the split word does not include ASCII numbers.
204 static const struct {
205 const char* language
;
// Each entry below: language tag, dummy word (the ASCII digits 0-9 followed by
// characters of that language's script), and the left_to_right flag. Only
// "he-IL" and "ar" are marked false.
211 "en-US", L
"0123456789" L
"a", true,
214 "el-GR", L
"0123456789" L
"\x03B1", true,
217 "ru-RU", L
"0123456789" L
"\x0430", true,
220 "he-IL", L
"0123456789" L
"\x05D0", false,
223 "ar", L
"0123456789" L
"\x0627", false,
226 "hi-IN", L
"0123456789" L
"\x0905", true,
229 "th-TH", L
"0123456789" L
"\x0e01", true,
232 "ko-KR", L
"0123456789" L
"\x1100\x1161", true,
// Split each dummy word with a word iterator configured for its language.
236 for (size_t i
= 0; i
< arraysize(kTestCases
); ++i
) {
237 SCOPED_TRACE(base::StringPrintf("kTestCases[%" PRIuS
"]: language=%s", i
,
238 kTestCases
[i
].language
));
240 SpellcheckCharAttribute attributes
;
241 attributes
.SetDefaultLanguage(kTestCases
[i
].language
);
243 base::string16
input_word(base::WideToUTF16(kTestCases
[i
].text
));
244 SpellcheckWordIterator iterator
;
245 EXPECT_TRUE(iterator
.Initialize(&attributes
, true));
246 EXPECT_TRUE(iterator
.SetText(input_word
.c_str(), input_word
.length()));
248 base::string16 actual_word
;
249 int actual_start
, actual_end
;
250 SpellcheckWordIterator::WordIteratorStatus status
;
252 iterator
.GetNextWord(&actual_word
, &actual_start
, &actual_end
);
253 status
== SpellcheckWordIterator::IS_SKIPPABLE
;
255 iterator
.GetNextWord(&actual_word
, &actual_start
, &actual_end
)) {
// The first non-skippable result must be a word. For left-to-right entries it
// must equal the whole dummy word (digits included); otherwise it must differ
// from the dummy word.
259 EXPECT_EQ(SpellcheckWordIterator::WordIteratorStatus::IS_WORD
, status
);
260 if (kTestCases
[i
].left_to_right
)
261 EXPECT_EQ(input_word
, actual_word
);
263 EXPECT_NE(input_word
, actual_word
);
267 // Verify SpellcheckWordIterator treats typographical apostrophe as a part of
269 TEST(SpellcheckWordIteratorTest
, TypographicalApostropheIsPartOfWord
) {
270 static const struct {
271 const char* language
;
274 // Typewriter apostrophe:
284 // Typographical apostrophe:
// Each word below has U+2019 between "you" and "re".
286 "en-AU", L
"you\x2019re"
288 "en-CA", L
"you\x2019re"
290 "en-GB", L
"you\x2019re"
292 "en-US", L
"you\x2019re"
// Run each word through an iterator configured for the entry's language.
296 for (size_t i
= 0; i
< arraysize(kTestCases
); ++i
) {
297 SpellcheckCharAttribute attributes
;
298 attributes
.SetDefaultLanguage(kTestCases
[i
].language
);
300 base::string16
input_word(base::WideToUTF16(kTestCases
[i
].word
));
301 SpellcheckWordIterator iterator
;
302 EXPECT_TRUE(iterator
.Initialize(&attributes
, true));
303 EXPECT_TRUE(iterator
.SetText(input_word
.c_str(), input_word
.length()));
305 base::string16 actual_word
;
306 int actual_start
, actual_end
;
307 SpellcheckWordIterator::WordIteratorStatus status
;
309 iterator
.GetNextWord(&actual_word
, &actual_start
, &actual_end
);
310 status
== SpellcheckWordIterator::IS_SKIPPABLE
;
311 iterator
.GetNextWord(&actual_word
, &actual_start
, &actual_end
)) {
// The whole input must come back as a single word: start offset 0 and end
// offset equal to the input length. |actual_end| is an int, so it is cast to
// the string's size_type for the comparison.
315 EXPECT_EQ(SpellcheckWordIterator::WordIteratorStatus::IS_WORD
, status
);
316 EXPECT_EQ(input_word
, actual_word
);
317 EXPECT_EQ(0, actual_start
);
318 EXPECT_EQ(input_word
.length(),
319 static_cast<base::string16::size_type
>(actual_end
));
// Checks that SpellcheckWordIterator::Initialize() returns true when the
// SpellcheckCharAttribute has a default language set and false when it has
// none.
323 TEST(SpellcheckWordIteratorTest
, Initialization
) {
324 // Test initialization works when a default language is set.
326 SpellcheckCharAttribute attributes
;
327 attributes
.SetDefaultLanguage("en-US");
329 SpellcheckWordIterator iterator
;
330 EXPECT_TRUE(iterator
.Initialize(&attributes
, true));
333 // Test initialization fails when no default language is set.
335 SpellcheckCharAttribute attributes
;
337 SpellcheckWordIterator iterator
;
338 EXPECT_FALSE(iterator
.Initialize(&attributes
, true));
342 // This test uses English rules to check that different character set
343 // combinations properly find word breaks and skippable characters.
// With the "en-US" rules the assertions below expect "foo" and "Can" as word
// breaks, the Khmer pair as one skippable token, and each Russian character,
// space and period as its own skippable token.
344 TEST(SpellcheckWordIteratorTest
, FindSkippableWordsEnglish
) {
345 // A string containing the English word "foo", followed by two Khmer
346 // characters, the English word "Can", and then two Russian characters and
349 base::WideToUTF16(L
"foo \x1791\x17C1 Can \x041C\x0438..."));
350 BreakIterator
iter(text
, GetRulesForLanguage("en-US"));
351 ASSERT_TRUE(iter
.Init());
353 EXPECT_TRUE(iter
.Advance());
// Finds "foo".
355 EXPECT_EQ(base::UTF8ToUTF16("foo"), iter
.GetString());
356 EXPECT_EQ(iter
.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK
);
357 EXPECT_TRUE(iter
.Advance());
358 // Finds the space and then the Khmer characters.
359 EXPECT_EQ(base::UTF8ToUTF16(" "), iter
.GetString());
360 EXPECT_EQ(iter
.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD
);
361 EXPECT_TRUE(iter
.Advance());
362 EXPECT_EQ(base::WideToUTF16(L
"\x1791\x17C1"), iter
.GetString());
363 EXPECT_EQ(iter
.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD
);
364 EXPECT_TRUE(iter
.Advance());
365 // Finds the next space and "Can".
366 EXPECT_EQ(base::UTF8ToUTF16(" "), iter
.GetString());
367 EXPECT_EQ(iter
.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD
);
368 EXPECT_TRUE(iter
.Advance());
369 EXPECT_EQ(base::UTF8ToUTF16("Can"), iter
.GetString());
370 EXPECT_EQ(iter
.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK
);
371 EXPECT_TRUE(iter
.Advance());
372 // Finds the next space and each Russian character.
373 EXPECT_EQ(base::UTF8ToUTF16(" "), iter
.GetString());
374 EXPECT_EQ(iter
.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD
);
375 EXPECT_TRUE(iter
.Advance());
376 EXPECT_EQ(base::WideToUTF16(L
"\x041C"), iter
.GetString());
377 EXPECT_EQ(iter
.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD
);
378 EXPECT_TRUE(iter
.Advance());
379 EXPECT_EQ(base::WideToUTF16(L
"\x0438"), iter
.GetString());
380 EXPECT_EQ(iter
.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD
);
381 EXPECT_TRUE(iter
.Advance());
382 // Finds the periods at the end.
383 EXPECT_EQ(base::UTF8ToUTF16("."), iter
.GetString());
384 EXPECT_EQ(iter
.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD
);
385 EXPECT_TRUE(iter
.Advance());
386 EXPECT_EQ(base::UTF8ToUTF16("."), iter
.GetString());
387 EXPECT_EQ(iter
.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD
);
388 EXPECT_TRUE(iter
.Advance());
389 EXPECT_EQ(base::UTF8ToUTF16("."), iter
.GetString());
390 EXPECT_EQ(iter
.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD
);
// Every token of the text has been consumed, so Advance() must return false.
391 EXPECT_FALSE(iter
.Advance());
394 // This test uses Russian rules to check that different character set
395 // combinations properly find word breaks and skippable characters.
// With the "ru-RU" rules the assertions below expect the two Russian characters
// together as the only word break; each letter of "Can", each space and each
// punctuation mark is its own skippable token, and the Khmer pair is one
// skippable token.
396 TEST(SpellcheckWordIteratorTest
, FindSkippableWordsRussian
) {
397 // A string containing punctuation followed by two Russian characters, the
398 // English word "Can", and then two Khmer characters.
399 base::string16
text(base::WideToUTF16(L
".;\x041C\x0438 Can \x1791\x17C1 "));
400 BreakIterator
iter(text
, GetRulesForLanguage("ru-RU"));
401 ASSERT_TRUE(iter
.Init());
403 EXPECT_TRUE(iter
.Advance());
404 // Finds the period and semicolon.
405 EXPECT_EQ(base::UTF8ToUTF16("."), iter
.GetString());
406 EXPECT_EQ(iter
.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD
);
407 EXPECT_TRUE(iter
.Advance());
408 EXPECT_EQ(base::UTF8ToUTF16(";"), iter
.GetString());
409 EXPECT_EQ(iter
.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD
);
410 EXPECT_TRUE(iter
.Advance());
411 // Finds all the Russian characters.
412 EXPECT_EQ(base::WideToUTF16(L
"\x041C\x0438"), iter
.GetString());
413 EXPECT_EQ(iter
.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK
);
414 EXPECT_TRUE(iter
.Advance());
415 // Finds the space and each character in "Can".
416 EXPECT_EQ(base::UTF8ToUTF16(" "), iter
.GetString());
417 EXPECT_EQ(iter
.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD
);
418 EXPECT_TRUE(iter
.Advance());
419 EXPECT_EQ(base::UTF8ToUTF16("C"), iter
.GetString());
420 EXPECT_EQ(iter
.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD
);
421 EXPECT_TRUE(iter
.Advance());
422 EXPECT_EQ(base::UTF8ToUTF16("a"), iter
.GetString());
423 EXPECT_EQ(iter
.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD
);
424 EXPECT_TRUE(iter
.Advance());
425 EXPECT_EQ(base::UTF8ToUTF16("n"), iter
.GetString());
426 EXPECT_EQ(iter
.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD
);
427 EXPECT_TRUE(iter
.Advance());
428 // Finds the next space, the Khmer characters, and the last two spaces.
429 EXPECT_EQ(base::UTF8ToUTF16(" "), iter
.GetString());
430 EXPECT_EQ(iter
.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD
);
431 EXPECT_TRUE(iter
.Advance());
432 EXPECT_EQ(base::WideToUTF16(L
"\x1791\x17C1"), iter
.GetString());
433 EXPECT_EQ(iter
.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD
);
434 EXPECT_TRUE(iter
.Advance());
435 EXPECT_EQ(base::UTF8ToUTF16(" "), iter
.GetString());
436 EXPECT_EQ(iter
.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD
);
437 EXPECT_TRUE(iter
.Advance());
438 EXPECT_EQ(base::UTF8ToUTF16(" "), iter
.GetString());
439 EXPECT_EQ(iter
.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD
);
// Every token of the text has been consumed, so Advance() must return false.
440 EXPECT_FALSE(iter
.Advance());
443 // This test uses Khmer rules to check that different character set combinations
444 // properly find word breaks and skippable characters. Khmer does not use spaces
445 // between words and uses a dictionary to determine word breaks instead.
// With the "km" rules the assertions below expect only the Khmer runs as word
// breaks; each Russian character, each letter of "zoo", the spaces and the
// punctuation marks are individual skippable tokens.
446 TEST(SpellcheckWordIteratorTest
, FindSkippableWordsKhmer
) {
447 // A string containing two Russian characters followed by two, three, and
448 // two-character Khmer words, and then English characters and punctuation.
449 base::string16
text(base::WideToUTF16(
450 L
"\x041C\x0438 \x178F\x17BE\x179B\x17C4\x1780\x1798\x1780zoo. ,"));
451 BreakIterator
iter(text
, GetRulesForLanguage("km"));
452 ASSERT_TRUE(iter
.Init());
454 EXPECT_TRUE(iter
.Advance());
455 // Finds each Russian character and the space.
456 EXPECT_EQ(base::WideToUTF16(L
"\x041C"), iter
.GetString());
457 EXPECT_EQ(iter
.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD
);
458 EXPECT_TRUE(iter
.Advance());
459 EXPECT_EQ(base::WideToUTF16(L
"\x0438"), iter
.GetString());
460 EXPECT_EQ(iter
.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD
);
461 EXPECT_TRUE(iter
.Advance());
462 EXPECT_EQ(base::UTF8ToUTF16(" "), iter
.GetString());
463 EXPECT_EQ(iter
.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD
);
464 EXPECT_TRUE(iter
.Advance());
465 // Finds the first two-character Khmer word.
466 EXPECT_EQ(base::WideToUTF16(L
"\x178F\x17BE"), iter
.GetString());
467 EXPECT_EQ(iter
.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK
);
468 EXPECT_TRUE(iter
.Advance());
469 // Finds the three-character Khmer word and then the next two-character word.
470 // Note: Technically these are two different Khmer words so the Khmer language
471 // rule should find a break between them but due to the heuristic/statistical
472 // nature of the Khmer word breaker it does not.
473 EXPECT_EQ(base::WideToUTF16(L
"\x179B\x17C4\x1780\x1798\x1780"),
475 EXPECT_EQ(iter
.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK
);
476 EXPECT_TRUE(iter
.Advance());
477 // Finds each character in "zoo".
478 EXPECT_EQ(base::UTF8ToUTF16("z"), iter
.GetString());
479 EXPECT_EQ(iter
.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD
);
480 EXPECT_TRUE(iter
.Advance());
481 EXPECT_EQ(base::UTF8ToUTF16("o"), iter
.GetString());
482 EXPECT_EQ(iter
.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD
);
483 EXPECT_TRUE(iter
.Advance());
484 EXPECT_EQ(base::UTF8ToUTF16("o"), iter
.GetString());
485 EXPECT_EQ(iter
.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD
);
486 EXPECT_TRUE(iter
.Advance());
487 // Finds the period, space, and comma.
488 EXPECT_EQ(base::UTF8ToUTF16("."), iter
.GetString());
489 EXPECT_EQ(iter
.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD
);
490 EXPECT_TRUE(iter
.Advance());
491 EXPECT_EQ(base::UTF8ToUTF16(" "), iter
.GetString());
492 EXPECT_EQ(iter
.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD
);
493 EXPECT_TRUE(iter
.Advance());
494 EXPECT_EQ(base::UTF8ToUTF16(","), iter
.GetString());
495 EXPECT_EQ(iter
.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD
);
// Every token of the text has been consumed, so Advance() must return false.
496 EXPECT_FALSE(iter
.Advance());