1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 // Defines an iterator class that enumerates words supported by our spellchecker
6 // from multi-language text. This class is used for filtering out characters
7 // not supported by our spellchecker.
9 #ifndef CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_
10 #define CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_
14 #include "base/basictypes.h"
15 #include "base/memory/scoped_ptr.h"
16 #include "base/strings/string16.h"
17 #include "third_party/icu/source/common/unicode/uscript.h"
25 // A class which encapsulates language-specific operations used by
26 // SpellcheckWordIterator. When we set the spellchecker language, this class
27 // creates rule sets that filter out the characters not supported by the
28 // spellchecker. (Please read the comment in the SpellcheckWordIterator class
29 // about how to use this class.)
30 class SpellcheckCharAttribute
{
// The usage snippet in the SpellcheckWordIterator class comment constructs an
// instance and calls SetDefaultLanguage() before handing it to an iterator.
32 SpellcheckCharAttribute();
33 ~SpellcheckCharAttribute();
35 // Sets the language of the spellchecker. When this function is called with an
36 // ISO language code, this function creates the custom rule-sets used by
37 // the ICU break iterator so it can extract only words used by the language.
38 // GetRuleSet() returns the rule-sets created in this function.
39 void SetDefaultLanguage(const std::string
& language
);
41 // Returns a custom rule-set string used by the ICU break iterator. This class
42 // has two rule-sets, one splits a contraction and the other does not, so we
43 // can split a concatenated word (e.g. "seven-year-old") into words (e.g.
44 // "seven", "year", and "old") and check their spellings. The result string is
45 // encoded in UTF-16 since ICU needs UTF-16 strings.
// NOTE(review): |allow_contraction| presumably selects between
// ruleset_allow_contraction_ and ruleset_disallow_contraction_ -- confirm
// against the implementation.
46 base::string16
GetRuleSet(bool allow_contraction
) const;
48 // Outputs a character only if it is a word character. (Please read the
49 // comments in CreateRuleSets() to see why we need this function.)
// NOTE(review): the meaning of the bool return value and whether |c| is
// appended to or replaces |output| are not stated here -- confirm against the
// implementation.
50 bool OutputChar(UChar c
, base::string16
* output
) const;
53 // Creates the rule-sets that return words possibly used by the given
54 // language. Unfortunately, these rule-sets are not perfect and have some
55 // false-positives. For example, they return combined accent marks even though
56 // we need English words only. We call OutputChar() to filter out such
57 // false-positive characters.
58 void CreateRuleSets(const std::string
& language
);
60 // Outputs a character only if it is one used by the given language. These
61 // functions are called from OutputChar().
// NOTE(review): OutputChar() presumably picks one of these based on
// script_code_ -- confirm against the implementation.
62 bool OutputArabic(UChar c
, base::string16
* output
) const;
63 bool OutputHangul(UChar c
, base::string16
* output
) const;
64 bool OutputHebrew(UChar c
, base::string16
* output
) const;
65 bool OutputDefault(UChar c
, base::string16
* output
) const;
67 // The custom rule-set strings used by ICU break iterator. Since it is not so
68 // easy to create custom rule-sets from an ISO language code, this class
69 // saves these rule-set strings created when we set the language.
70 base::string16 ruleset_allow_contraction_
;
71 base::string16 ruleset_disallow_contraction_
;
73 // The script code used by this language.
74 UScriptCode script_code_
;
76 DISALLOW_COPY_AND_ASSIGN(SpellcheckCharAttribute
);
79 // A class which extracts words that can be checked for spelling from a
80 // multi-language string. The ICU word-break iterator does not discard some
81 // punctuation characters attached to a word. For example, when we set a word
82 // "_hello_" to a word-break iterator, it just returns "_hello_". Neither does
83 // it discard characters not used by the language. For example, it returns
84 // Russian words even though we need English words only. To extract only the
85 // words whose spellings our spellchecker can check, this class uses custom
86 // rule-sets created by the SpellcheckCharAttribute class. Also, this class
87 // normalizes extracted words so our spellchecker can check the spellings of
88 // words that include ligatures, combined characters, full-width characters,
89 // etc. This class uses UTF-16 strings as its input and output strings since
90 // UTF-16 is the native encoding of ICU and this avoids unnecessary conversions
91 // when changing the encoding of this string for our spellchecker. (Chrome can
92 // use two or more spellcheckers and we cannot assume their encodings.)
93 // The following snippet is an example that extracts words with this class.
95 // // Creates the language-specific attributes for US English.
96 // SpellcheckCharAttribute attribute;
97 // attribute.SetDefaultLanguage("en-US");
99 // // Set up a SpellcheckWordIterator object which extracts English words,
100 // // and retrieve them.
101 // SpellcheckWordIterator iterator;
102 // base::string16 text(base::UTF8ToUTF16("this is a test."));
103 // iterator.Initialize(&attribute, true);
104 // iterator.SetText(text.c_str(), text.length());
106 // base::string16 word;
109 // while (iterator.GetNextWord(&word, &offset, &length)) {
113 class SpellcheckWordIterator
{
115 SpellcheckWordIterator();
116 ~SpellcheckWordIterator();
118 // Initializes a word-iterator object with the language-specific attribute. If
119 // we need to split contractions and concatenated words, call this function
120 // with its 'allow_contraction' parameter false. (This function uses lots of
121 // temporary memory to compile a custom word-break rule into an automaton.)
// NOTE(review): |attribute| is presumably kept in the raw pointer member
// attribute_, so it likely has to outlive this iterator -- confirm against
// the implementation.
122 bool Initialize(const SpellcheckCharAttribute
* attribute
,
123 bool allow_contraction
);
125 // Returns whether this word iterator is initialized.
126 bool IsInitialized() const;
128 // Sets the text to be iterated. (This text does not have to be
129 // NULL-terminated.) This function also resets internal state so we can reuse
130 // this iterator without calling Initialize().
// NOTE(review): the text_ member is a raw pointer, so the caller's buffer
// presumably has to stay alive while words are being retrieved -- confirm
// against the implementation.
131 bool SetText(const base::char16
* text
, size_t length
);
133 // Retrieves a word (or a contraction), stores its copy to 'word_string', and
134 // stores the position and the length of the input word to 'word_start' and
135 // 'word_length'. Since this function normalizes the output word, the length
136 // of 'word_string' may be different from 'word_length'. Therefore, when we
137 // call functions that change the input text, such as string16::replace(), we
138 // need to use 'word_start' and 'word_length' as in the following snippet.
140 // while(iterator.GetNextWord(&word, &offset, &length))
141 // text.replace(offset, length, word);
143 bool GetNextWord(base::string16
* word_string
,
147 // Releases all the resources attached to this object.
151 // Normalizes a non-terminated string returned from an ICU word-break
152 // iterator. A word returned from an ICU break iterator may include characters
153 // not supported by our spellchecker, e.g. ligatures, combining characters,
154 // full-width letters, etc. This function replaces such characters with
155 // alternative characters supported by our spellchecker. This function also
156 // calls SpellcheckCharAttribute::OutputChar() to filter out false-positive
158 bool Normalize(int input_start
,
160 base::string16
* output_string
) const;
162 // The pointer to the input string from which we are extracting words.
163 const base::char16
* text_
;
165 // The language-specific attributes used for filtering out non-word
167 const SpellcheckCharAttribute
* attribute_
;
169 // The break iterator.
170 scoped_ptr
<base::i18n::BreakIterator
> iterator_
;
172 DISALLOW_COPY_AND_ASSIGN(SpellcheckWordIterator
);
175 #endif // CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_