chrome/renderer/spellchecker/spellcheck_worditerator.h

   1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 // Defines an iterator class that enumerates words supported by our spellchecker
   6 // from multi-language text. This class is used for filtering out characters
   7 // not supported by our spellchecker.
   8
   9 #ifndef CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_
  10 #define CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_
  11
  12 #include <string>
  13
  14 #include "base/basictypes.h"
  15 #include "base/memory/scoped_ptr.h"
  16 #include "base/strings/string16.h"
  17 #include "third_party/icu/source/common/unicode/uscript.h"
  18
  19 namespace base {
  20 namespace i18n {
  21 class BreakIterator;  // Only used below as a scoped_ptr member type.
  22 } // namespace i18n
  23 } // namespace base
  24
  25 // A class which encapsulates language-specific operations used by
  26 // SpellcheckWordIterator. When we set the spellchecker language, this class
  27 // creates rule sets that filter out the characters not supported by the
  28 // spellchecker. (Please read the comment in the SpellcheckWordIterator class
  29 // about how to use this class.)
  30 class SpellcheckCharAttribute {
  31  public:
  32   SpellcheckCharAttribute();
  33   ~SpellcheckCharAttribute();
  34
  35   // Sets the language of the spellchecker. When this function is called with an
  36   // ISO language code, this function creates the custom rule-sets used by
  37   // the ICU break iterator so it can extract only words used by the language.
  38   // GetRuleSet() returns the rule-sets created in this function.
  39   void SetDefaultLanguage(const std::string& language);
  40
  41   // Returns a custom rule-set string used by the ICU break iterator. This class
  42   // has two rule-sets, one splits a contraction and the other does not, so we
  43   // can split a concatenated word (e.g. "seven-year-old") into words (e.g.
  44   // "seven", "year", and "old") and check their spellings. The result string is
  45   // encoded in UTF-16 since ICU needs UTF-16 strings.
  46   base::string16 GetRuleSet(bool allow_contraction) const;
  47
  48   // Outputs a character only if it is a word character. (Please read the
  49   // comments in CreateRuleSets() for why we need this function.)
  50   bool OutputChar(UChar c, base::string16* output) const;
  51
  52  private:
  53   // Creates the rule-sets that return words possibly used by the given
  54   // language. Unfortunately, these rule-sets are not perfect and have some
  55   // false-positives. For example, they return combined accent marks even though
  56   // we need English words only. We call OutputChar() to filter out such
  57   // false-positive characters.
  58   void CreateRuleSets(const std::string& language);
  59
  60   // Outputs a character only if it is one used by the given language. These
  61   // functions are called from OutputChar().
  62   bool OutputArabic(UChar c, base::string16* output) const;
  63   bool OutputHangul(UChar c, base::string16* output) const;
  64   bool OutputHebrew(UChar c, base::string16* output) const;
  65   bool OutputDefault(UChar c, base::string16* output) const;
  66
  67   // The custom rule-set strings used by the ICU break iterator. Since it is not
  68   // so easy to create custom rule-sets from an ISO language code, this class
  69   // saves the rule-set strings created when we set the language.
  70   base::string16 ruleset_allow_contraction_;
  71   base::string16 ruleset_disallow_contraction_;
  72
  73   // The script code used by this language.
  74   UScriptCode script_code_;
  75
  76   DISALLOW_COPY_AND_ASSIGN(SpellcheckCharAttribute);
  77 };
  78
  79 // A class which extracts words that can be checked for spelling from a
  80 // multi-language string. The ICU word-break iterator does not discard some
  81 // punctuation characters attached to a word. For example, when we set a word
  82 // "_hello_" to a word-break iterator, it just returns "_hello_". Neither does
  83 // it discard characters not used by the language. For example, it returns
  84 // Russian words even though we need English words only. To extract only the
  85 // words whose spellings our spellchecker can check, this class uses custom
  86 // rule-sets created by the SpellcheckCharAttribute class. Also, this class
  87 // normalizes extracted words so our spellchecker can check the spellings of
  88 // words that include ligatures, combined characters, full-width characters,
  89 // etc. This class uses UTF-16 strings as its input and output strings since
  90 // UTF-16 is the native encoding of ICU, which avoids unnecessary conversions
  91 // when changing the encoding of this string for our spellchecker. (Chrome can
  92 // use two or more spellcheckers and we cannot assume their encodings.)
  93 // The following snippet is an example that extracts words with this class.
  94 //
  95 //   // Creates the language-specific attributes for US English.
  96 //   SpellcheckCharAttribute attribute;
  97 //   attribute.SetDefaultLanguage("en-US");
  98 //
  99 //   // Set up a SpellcheckWordIterator object which extracts English words,
 100 //   // and retrieve them.
 101 //   SpellcheckWordIterator iterator;
 102 //   base::string16 text(base::UTF8ToUTF16("this is a test."));
 103 //   iterator.Initialize(&attribute, true);
 104 //   iterator.SetText(text.c_str(), text.length());
 105 //
 106 //   base::string16 word;
 107 //   int offset;
 108 //   int length;
 109 //   while (iterator.GetNextWord(&word, &offset, &length) !=
 110 //          SpellcheckWordIterator::IS_END_OF_TEXT) {
 111 //     ...
 112 //   }
 113 class SpellcheckWordIterator {
 114  public:
 115   enum WordIteratorStatus {
 116     // The end of a sequence of text that the iterator recognizes as characters
 117     // that can form a word.
 118     IS_WORD,
 119     // Non-word characters that the iterator can skip past, such as punctuation,
 120     // whitespace, and characters from another character set.
 121     IS_SKIPPABLE,
 122     // The end of the text that the iterator is going over.
 123     IS_END_OF_TEXT
 124   };
 125
 126   SpellcheckWordIterator();
 127   ~SpellcheckWordIterator();
 128
 129   // Initializes a word-iterator object with the language-specific attribute. If
 130   // we need to split contractions and concatenated words, call this function
 131   // with its 'allow_contraction' parameter false. (This function uses lots of
 132   // temporary memory to compile a custom word-break rule into an automaton.)
 133   bool Initialize(const SpellcheckCharAttribute* attribute,
 134                   bool allow_contraction);
 135
 136   // Returns whether this word iterator is initialized.
 137   bool IsInitialized() const;
 138
 139   // Sets text to be iterated. (This text does not have to be NULL-terminated.)
 140   // This function also resets internal state so we can reuse this iterator
 141   // without calling Initialize().
 142   bool SetText(const base::char16* text, size_t length);
 143
 144   // Advances |iterator_| through |text_| and gets the current status of the
 145   // word iterator within |text_|:
 146   //
 147   //  - Returns IS_WORD if the iterator just found the end of a sequence of word
 148   //    characters and it was able to normalize the sequence. This stores the
 149   //    normalized string into |word_string| and stores the position and length
 150   //    into |word_start| and |word_length| respectively. Keep in mind that
 151   //    since this function normalizes the output word, the length of
 152   //    |word_string| may be different from the |word_length|. Therefore, when
 153   //    we call functions that change the input text, such as
 154   //    string16::replace(), we need to use |word_start| and |word_length| as
 155   //    listed in the following snippet:
 156   //
 157   //      if (iterator.GetNextWord(&word, &offset, &length) == IS_WORD)
 158   //        text.replace(offset, length, word);
 159   //
 160   //  - Returns IS_SKIPPABLE if the iterator just found a character that the
 161   //    iterator can skip past such as punctuation, whitespace, and characters
 162   //    from another character set. This stores the character, position, and
 163   //    length into |word_string|, |word_start|, and |word_length| respectively.
 164   //
 165   //  - Returns IS_END_OF_TEXT if the iterator has reached the end of |text_|.
 166   SpellcheckWordIterator::WordIteratorStatus
 167   GetNextWord(base::string16* word_string, int* word_start, int* word_length);
 168
 169   // Releases all the resources attached to this object.
 170   void Reset();
 171
 172  private:
 173   // Normalizes a non-terminated string returned from an ICU word-break
 174   // iterator. A word returned from an ICU break iterator may include characters
 175   // not supported by our spellchecker, e.g. ligatures, combining characters,
 176   // full-width letters, etc. This function replaces such characters with
 177   // alternative characters supported by our spellchecker. This function also
 178   // calls SpellcheckCharAttribute::OutputChar() to filter out false-positive
 179   // characters.
 180   bool Normalize(int input_start,
 181                  int input_length,
 182                  base::string16* output_string) const;
 183
 184   // The pointer to the input string from which we are extracting words.
 185   const base::char16* text_;
 186
 187   // The language-specific attributes used for filtering out non-word
 188   // characters.
 189   const SpellcheckCharAttribute* attribute_;
 190
 191   // The break iterator.
 192   scoped_ptr<base::i18n::BreakIterator> iterator_;
 193
 194   DISALLOW_COPY_AND_ASSIGN(SpellcheckWordIterator);
 195 };
 196
 197 #endif  // CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_
 198