chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc

   1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include <string>
   6 #include <vector>
   7
   8 #include "base/format_macros.h"
   9 #include "base/i18n/break_iterator.h"
  10 #include "base/strings/string_split.h"
  11 #include "base/strings/stringprintf.h"
  12 #include "base/strings/utf_string_conversions.h"
  13 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h"
  14 #include "testing/gtest/include/gtest/gtest.h"
  15
  16 using base::i18n::BreakIterator;
  17
  18 namespace {
  19
  20 struct TestCase {
  21     const char* language;
  22     bool allow_contraction;
  23     const wchar_t* expected_words;
  24 };
  25
  26 base::string16 GetRulesForLanguage(const std::string& language) {
  27   SpellcheckCharAttribute attribute;
  28   attribute.SetDefaultLanguage(language);
  29   return attribute.GetRuleSet(true);
  30 }
  31
  32 }  // namespace
  33
  34 // Tests whether or not our SpellcheckWordIterator can extract words used by the
  35 // specified language from a multi-language text.
  36 TEST(SpellcheckWordIteratorTest, SplitWord) {
  37   // An input text. This text includes words of several languages. (Some words
  38   // are not separated with whitespace characters.) Our SpellcheckWordIterator
  39   // should extract the words used by the specified language from this text and
  40   // normalize them so our spell-checker can check their spellings. If
  41   // characters are found that are not from the specified language the test
  42   // skips them.
  43   const wchar_t kTestText[] =
  44       // Graphic characters
  45       L"!@#$%^&*()"
  46       // Latin (including a contraction character and a ligature).
  47       L"hello:hello a\xFB03x"
  48       // Greek
  49       L"\x03B3\x03B5\x03B9\x03AC\x0020\x03C3\x03BF\x03C5"
  50       // Cyrillic
  51       L"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432"
  52       L"\x0443\x0439\x0442\x0435"
  53       // Hebrew (including niqquds)
  54       L"\x05e9\x05c1\x05b8\x05dc\x05d5\x05b9\x05dd "
  55       // Hebrew words with U+0027 and U+05F3
  56       L"\x05e6\x0027\x05d9\x05e4\x05e1 \x05e6\x05F3\x05d9\x05e4\x05e1 "
  57       // Hebrew words with U+0022 and U+05F4
  58       L"\x05e6\x05d4\x0022\x05dc \x05e6\x05d4\x05f4\x05dc "
  59       // Hebrew words enclosed with ASCII quotes.
  60       L"\"\x05e6\x05d4\x0022\x05dc\" '\x05e9\x05c1\x05b8\x05dc\x05d5'"
  61       // Arabic (including vowel marks)
  62       L"\x0627\x064e\x0644\x0633\x064e\x0651\x0644\x0627"
  63       L"\x0645\x064f\x0020\x0639\x064e\x0644\x064e\x064a"
  64       L"\x0652\x0643\x064f\x0645\x0652"
  65       // Hindi
  66       L"\x0930\x093E\x091C\x0927\x093E\x0928"
  67       // Thai
  68       L"\x0e2a\x0e27\x0e31\x0e2a\x0e14\x0e35\x0020\x0e04"
  69       L"\x0e23\x0e31\x0e1a"
  70       // Hiraganas
  71       L"\x3053\x3093\x306B\x3061\x306F"
  72       // CJKV ideographs
  73       L"\x4F60\x597D"
  74       // Hangul Syllables
  75       L"\xC548\xB155\xD558\xC138\xC694"
  76       // Full-width latin : Hello
  77       L"\xFF28\xFF45\xFF4C\xFF4C\xFF4F "
  78       L"e.g.,";
  79
  80   // The languages and expected results used in this test.
  81   static const TestCase kTestCases[] = {
  82     {
  83       // English (keep contraction words)
  84       "en-US", true, L"hello:hello affix Hello e.g"
  85     }, {
  86       // English (split contraction words)
  87       "en-US", false, L"hello hello affix Hello e g"
  88     }, {
  89       // Greek
  90       "el-GR", true,
  91       L"\x03B3\x03B5\x03B9\x03AC\x0020\x03C3\x03BF\x03C5"
  92     }, {
  93       // Russian
  94       "ru-RU", true,
  95       L"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432"
  96       L"\x0443\x0439\x0442\x0435"
  97     }, {
  98       // Hebrew
  99       "he-IL", true,
 100       L"\x05e9\x05dc\x05d5\x05dd "
 101       L"\x05e6\x0027\x05d9\x05e4\x05e1 \x05e6\x05F3\x05d9\x05e4\x05e1 "
 102       L"\x05e6\x05d4\x0022\x05dc \x05e6\x05d4\x05f4\x05dc "
 103       L"\x05e6\x05d4\x0022\x05dc \x05e9\x05dc\x05d5"
 104     }, {
 105       // Arabic
 106       "ar", true,
 107       L"\x0627\x0644\x0633\x0644\x0627\x0645\x0020\x0639"
 108       L"\x0644\x064a\x0643\x0645"
 109     }, {
 110       // Hindi
 111       "hi-IN", true,
 112       L"\x0930\x093E\x091C\x0927\x093E\x0928"
 113     }, {
 114       // Thai
 115       "th-TH", true,
 116       L"\x0e2a\x0e27\x0e31\x0e2a\x0e14\x0e35\x0020\x0e04"
 117       L"\x0e23\x0e31\x0e1a"
 118     }, {
 119       // Korean
 120       "ko-KR", true,
 121       L"\x110b\x1161\x11ab\x1102\x1167\x11bc\x1112\x1161"
 122       L"\x1109\x1166\x110b\x116d"
 123     },
 124   };
 125
 126   for (size_t i = 0; i < arraysize(kTestCases); ++i) {
 127     SCOPED_TRACE(base::StringPrintf("kTestCases[%" PRIuS "]: language=%s", i,
 128                                     kTestCases[i].language));
 129
 130     SpellcheckCharAttribute attributes;
 131     attributes.SetDefaultLanguage(kTestCases[i].language);
 132
 133     base::string16 input(base::WideToUTF16(kTestText));
 134     SpellcheckWordIterator iterator;
 135     EXPECT_TRUE(iterator.Initialize(&attributes,
 136                                     kTestCases[i].allow_contraction));
 137     EXPECT_TRUE(iterator.SetText(input.c_str(), input.length()));
 138
 139     std::vector<base::string16> expected_words = base::SplitString(
 140         base::WideToUTF16(kTestCases[i].expected_words),
 141         base::string16(1, ' '), base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL);
 142
 143     base::string16 actual_word;
 144     int actual_start, actual_end;
 145     size_t index = 0;
 146     for (SpellcheckWordIterator::WordIteratorStatus status =
 147              iterator.GetNextWord(&actual_word, &actual_start, &actual_end);
 148          status != SpellcheckWordIterator::IS_END_OF_TEXT;
 149          status =
 150              iterator.GetNextWord(&actual_word, &actual_start, &actual_end)) {
 151       if (status == SpellcheckWordIterator::WordIteratorStatus::IS_SKIPPABLE)
 152         continue;
 153
 154       EXPECT_TRUE(index < expected_words.size());
 155       if (index < expected_words.size())
 156         EXPECT_EQ(expected_words[index], actual_word);
 157       ++index;
 158     }
 159   }
 160 }
 161
 162 // Tests whether our SpellcheckWordIterator extracts an empty word without
 163 // getting stuck in an infinite loop when inputting a Khmer text. (This is a
 164 // regression test for Issue 46278.)
 165 TEST(SpellcheckWordIteratorTest, RuleSetConsistency) {
 166   SpellcheckCharAttribute attributes;
 167   attributes.SetDefaultLanguage("en-US");
 168
 169   const wchar_t kTestText[] = L"\x1791\x17c1\x002e";
 170   base::string16 input(base::WideToUTF16(kTestText));
 171
 172   SpellcheckWordIterator iterator;
 173   EXPECT_TRUE(iterator.Initialize(&attributes, true));
 174   EXPECT_TRUE(iterator.SetText(input.c_str(), input.length()));
 175
 176   // When SpellcheckWordIterator uses an inconsistent ICU ruleset, the following
 177   // iterator.GetNextWord() calls get stuck in an infinite loop. Therefore, this
 178   // test succeeds if this call returns without timeouts.
 179   base::string16 actual_word;
 180   int actual_start, actual_end;
 181   SpellcheckWordIterator::WordIteratorStatus status;
 182   for (status = iterator.GetNextWord(&actual_word, &actual_start, &actual_end);
 183        status == SpellcheckWordIterator::IS_SKIPPABLE;
 184        status =
 185            iterator.GetNextWord(&actual_word, &actual_start, &actual_end)) {
 186     continue;
 187   }
 188
 189   EXPECT_EQ(SpellcheckWordIterator::WordIteratorStatus::IS_END_OF_TEXT, status);
 190   EXPECT_EQ(0, actual_start);
 191   EXPECT_EQ(0, actual_end);
 192 }
 193
 194 // Vertify our SpellcheckWordIterator can treat ASCII numbers as word characters
 195 // on LTR languages. On the other hand, it should not treat ASCII numbers as
 196 // word characters on RTL languages because they change the text direction from
 197 // RTL to LTR.
 198 TEST(SpellcheckWordIteratorTest, TreatNumbersAsWordCharacters) {
 199   // A set of a language, a dummy word, and a text direction used in this test.
 200   // For each language, this test splits a dummy word, which consists of ASCII
 201   // numbers and an alphabet of the language, into words. When ASCII numbers are
 202   // treated as word characters, the split word becomes equal to the dummy word.
 203   // Otherwise, the split word does not include ASCII numbers.
 204   static const struct {
 205     const char* language;
 206     const wchar_t* text;
 207     bool left_to_right;
 208   } kTestCases[] = {
 209     {
 210       // English
 211       "en-US", L"0123456789" L"a", true,
 212     }, {
 213       // Greek
 214       "el-GR", L"0123456789" L"\x03B1", true,
 215     }, {
 216       // Russian
 217       "ru-RU", L"0123456789" L"\x0430", true,
 218     }, {
 219       // Hebrew
 220       "he-IL", L"0123456789" L"\x05D0", false,
 221     }, {
 222       // Arabic
 223       "ar",  L"0123456789" L"\x0627", false,
 224     }, {
 225       // Hindi
 226       "hi-IN", L"0123456789" L"\x0905", true,
 227     }, {
 228       // Thai
 229       "th-TH", L"0123456789" L"\x0e01", true,
 230     }, {
 231       // Korean
 232       "ko-KR", L"0123456789" L"\x1100\x1161", true,
 233     },
 234   };
 235
 236   for (size_t i = 0; i < arraysize(kTestCases); ++i) {
 237     SCOPED_TRACE(base::StringPrintf("kTestCases[%" PRIuS "]: language=%s", i,
 238                                     kTestCases[i].language));
 239
 240     SpellcheckCharAttribute attributes;
 241     attributes.SetDefaultLanguage(kTestCases[i].language);
 242
 243     base::string16 input_word(base::WideToUTF16(kTestCases[i].text));
 244     SpellcheckWordIterator iterator;
 245     EXPECT_TRUE(iterator.Initialize(&attributes, true));
 246     EXPECT_TRUE(iterator.SetText(input_word.c_str(), input_word.length()));
 247
 248     base::string16 actual_word;
 249     int actual_start, actual_end;
 250     SpellcheckWordIterator::WordIteratorStatus status;
 251     for (status =
 252              iterator.GetNextWord(&actual_word, &actual_start, &actual_end);
 253          status == SpellcheckWordIterator::IS_SKIPPABLE;
 254          status =
 255              iterator.GetNextWord(&actual_word, &actual_start, &actual_end)) {
 256       continue;
 257     }
 258
 259     EXPECT_EQ(SpellcheckWordIterator::WordIteratorStatus::IS_WORD, status);
 260     if (kTestCases[i].left_to_right)
 261       EXPECT_EQ(input_word, actual_word);
 262     else
 263       EXPECT_NE(input_word, actual_word);
 264   }
 265 }
 266
 267 // Vertify SpellcheckWordIterator treats typographical apostrophe as a part of
 268 // the word.
 269 TEST(SpellcheckWordIteratorTest, TypographicalApostropheIsPartOfWord) {
 270   static const struct {
 271     const char* language;
 272     const wchar_t* word;
 273   } kTestCases[] = {
 274     // Typewriter apostrophe:
 275     {
 276       "en-AU", L"you're"
 277     }, {
 278       "en-CA", L"you're"
 279     }, {
 280       "en-GB", L"you're"
 281     }, {
 282       "en-US", L"you're"
 283     },
 284     // Typographical apostrophe:
 285     {
 286       "en-AU", L"you\x2019re"
 287     }, {
 288       "en-CA", L"you\x2019re"
 289     }, {
 290       "en-GB", L"you\x2019re"
 291     }, {
 292       "en-US", L"you\x2019re"
 293     },
 294   };
 295
 296   for (size_t i = 0; i < arraysize(kTestCases); ++i) {
 297     SpellcheckCharAttribute attributes;
 298     attributes.SetDefaultLanguage(kTestCases[i].language);
 299
 300     base::string16 input_word(base::WideToUTF16(kTestCases[i].word));
 301     SpellcheckWordIterator iterator;
 302     EXPECT_TRUE(iterator.Initialize(&attributes, true));
 303     EXPECT_TRUE(iterator.SetText(input_word.c_str(), input_word.length()));
 304
 305     base::string16 actual_word;
 306     int actual_start, actual_end;
 307     SpellcheckWordIterator::WordIteratorStatus status;
 308     for (status =
 309              iterator.GetNextWord(&actual_word, &actual_start, &actual_end);
 310          status == SpellcheckWordIterator::IS_SKIPPABLE;
 311          iterator.GetNextWord(&actual_word, &actual_start, &actual_end)) {
 312       continue;
 313     }
 314
 315     EXPECT_EQ(SpellcheckWordIterator::WordIteratorStatus::IS_WORD, status);
 316     EXPECT_EQ(input_word, actual_word);
 317     EXPECT_EQ(0, actual_start);
 318     EXPECT_EQ(input_word.length(),
 319               static_cast<base::string16::size_type>(actual_end));
 320   }
 321 }
 322
 323 TEST(SpellcheckWordIteratorTest, Initialization) {
 324   // Test initialization works when a default language is set.
 325   {
 326     SpellcheckCharAttribute attributes;
 327     attributes.SetDefaultLanguage("en-US");
 328
 329     SpellcheckWordIterator iterator;
 330     EXPECT_TRUE(iterator.Initialize(&attributes, true));
 331   }
 332
 333   // Test initialization fails when no default language is set.
 334   {
 335     SpellcheckCharAttribute attributes;
 336
 337     SpellcheckWordIterator iterator;
 338     EXPECT_FALSE(iterator.Initialize(&attributes, true));
 339   }
 340 }
 341
 342 // This test uses English rules to check that different character set
 343 // combinations properly find word breaks and skippable characters.
 344 TEST(SpellcheckWordIteratorTest, FindSkippableWordsEnglish) {
 345   // A string containing the English word "foo", followed by two Khmer
 346   // characters, the English word "Can", and then two Russian characters and
 347   // punctuation.
 348   base::string16 text(
 349       base::WideToUTF16(L"foo \x1791\x17C1 Can \x041C\x0438..."));
 350   BreakIterator iter(text, GetRulesForLanguage("en-US"));
 351   ASSERT_TRUE(iter.Init());
 352
 353   EXPECT_TRUE(iter.Advance());
 354   // Finds "foo".
 355   EXPECT_EQ(base::UTF8ToUTF16("foo"), iter.GetString());
 356   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);
 357   EXPECT_TRUE(iter.Advance());
 358   // Finds the space and then the Khmer characters.
 359   EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());
 360   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
 361   EXPECT_TRUE(iter.Advance());
 362   EXPECT_EQ(base::WideToUTF16(L"\x1791\x17C1"), iter.GetString());
 363   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
 364   EXPECT_TRUE(iter.Advance());
 365   // Finds the next space and "Can".
 366   EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());
 367   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
 368   EXPECT_TRUE(iter.Advance());
 369   EXPECT_EQ(base::UTF8ToUTF16("Can"), iter.GetString());
 370   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);
 371   EXPECT_TRUE(iter.Advance());
 372   // Finds the next space and each Russian character.
 373   EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());
 374   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
 375   EXPECT_TRUE(iter.Advance());
 376   EXPECT_EQ(base::WideToUTF16(L"\x041C"), iter.GetString());
 377   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
 378   EXPECT_TRUE(iter.Advance());
 379   EXPECT_EQ(base::WideToUTF16(L"\x0438"), iter.GetString());
 380   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
 381   EXPECT_TRUE(iter.Advance());
 382   // Finds the periods at the end.
 383   EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString());
 384   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
 385   EXPECT_TRUE(iter.Advance());
 386   EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString());
 387   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
 388   EXPECT_TRUE(iter.Advance());
 389   EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString());
 390   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
 391   EXPECT_FALSE(iter.Advance());
 392 }
 393
 394 // This test uses Russian rules to check that different character set
 395 // combinations properly find word breaks and skippable characters.
 396 TEST(SpellcheckWordIteratorTest, FindSkippableWordsRussian) {
 397   // A string containing punctuation followed by two Russian characters, the
 398   // English word "Can", and then two Khmer characters.
 399   base::string16 text(base::WideToUTF16(L".;\x041C\x0438 Can \x1791\x17C1  "));
 400   BreakIterator iter(text, GetRulesForLanguage("ru-RU"));
 401   ASSERT_TRUE(iter.Init());
 402
 403   EXPECT_TRUE(iter.Advance());
 404   // Finds the period and semicolon.
 405   EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString());
 406   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
 407   EXPECT_TRUE(iter.Advance());
 408   EXPECT_EQ(base::UTF8ToUTF16(";"), iter.GetString());
 409   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
 410   EXPECT_TRUE(iter.Advance());
 411   // Finds all the Russian characters.
 412   EXPECT_EQ(base::WideToUTF16(L"\x041C\x0438"), iter.GetString());
 413   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);
 414   EXPECT_TRUE(iter.Advance());
 415   // Finds the space and each character in "Can".
 416   EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());
 417   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
 418   EXPECT_TRUE(iter.Advance());
 419   EXPECT_EQ(base::UTF8ToUTF16("C"), iter.GetString());
 420   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
 421   EXPECT_TRUE(iter.Advance());
 422   EXPECT_EQ(base::UTF8ToUTF16("a"), iter.GetString());
 423   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
 424   EXPECT_TRUE(iter.Advance());
 425   EXPECT_EQ(base::UTF8ToUTF16("n"), iter.GetString());
 426   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
 427   EXPECT_TRUE(iter.Advance());
 428   // Finds the next space, the Khmer characters, and the last two spaces.
 429   EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());
 430   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
 431   EXPECT_TRUE(iter.Advance());
 432   EXPECT_EQ(base::WideToUTF16(L"\x1791\x17C1"), iter.GetString());
 433   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
 434   EXPECT_TRUE(iter.Advance());
 435   EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());
 436   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
 437   EXPECT_TRUE(iter.Advance());
 438   EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());
 439   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
 440   EXPECT_FALSE(iter.Advance());
 441 }
 442
 443 // This test uses Khmer rules to check that different character set combinations
 444 // properly find word breaks and skippable characters. Khmer does not use spaces
 445 // between words and uses a dictionary to determine word breaks instead.
 446 TEST(SpellcheckWordIteratorTest, FindSkippableWordsKhmer) {
 447   // A string containing two Russian characters followed by two, three, and
 448   // two-character Khmer words, and then English characters and punctuation.
 449   base::string16 text(base::WideToUTF16(
 450       L"\x041C\x0438 \x178F\x17BE\x179B\x17C4\x1780\x1798\x1780zoo. ,"));
 451   BreakIterator iter(text, GetRulesForLanguage("km"));
 452   ASSERT_TRUE(iter.Init());
 453
 454   EXPECT_TRUE(iter.Advance());
 455   // Finds each Russian character and the space.
 456   EXPECT_EQ(base::WideToUTF16(L"\x041C"), iter.GetString());
 457   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
 458   EXPECT_TRUE(iter.Advance());
 459   EXPECT_EQ(base::WideToUTF16(L"\x0438"), iter.GetString());
 460   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
 461   EXPECT_TRUE(iter.Advance());
 462   EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());
 463   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
 464   EXPECT_TRUE(iter.Advance());
 465   // Finds the first two-character Khmer word.
 466   EXPECT_EQ(base::WideToUTF16(L"\x178F\x17BE"), iter.GetString());
 467   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);
 468   EXPECT_TRUE(iter.Advance());
 469   // Finds the three-character Khmer word and then the next two-character word.
 470   // Note: Technically these are two different Khmer words so the Khmer language
 471   // rule should find a break between them but due to the heuristic/statistical
 472   // nature of the Khmer word breaker it does not.
 473   EXPECT_EQ(base::WideToUTF16(L"\x179B\x17C4\x1780\x1798\x1780"),
 474             iter.GetString());
 475   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);
 476   EXPECT_TRUE(iter.Advance());
 477   // Finds each character in "zoo".
 478   EXPECT_EQ(base::UTF8ToUTF16("z"), iter.GetString());
 479   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
 480   EXPECT_TRUE(iter.Advance());
 481   EXPECT_EQ(base::UTF8ToUTF16("o"), iter.GetString());
 482   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
 483   EXPECT_TRUE(iter.Advance());
 484   EXPECT_EQ(base::UTF8ToUTF16("o"), iter.GetString());
 485   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
 486   EXPECT_TRUE(iter.Advance());
 487   // Finds the period, space, and comma.
 488   EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString());
 489   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
 490   EXPECT_TRUE(iter.Advance());
 491   EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());
 492   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
 493   EXPECT_TRUE(iter.Advance());
 494   EXPECT_EQ(base::UTF8ToUTF16(","), iter.GetString());
 495   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
 496   EXPECT_FALSE(iter.Advance());
 497 }