Make hitting "Enter" submit the add/change profile dialog.
[chromium-blink-merge.git] / chrome / tools / convert_dict / aff_reader.cc
blobb24a0d87a4e79885fa2054ad6b752937d84f0f0d
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/tools/convert_dict/aff_reader.h"
7 #include <algorithm>
9 #include "base/file_util.h"
10 #include "base/i18n/icu_string_conversions.h"
11 #include "base/stringprintf.h"
12 #include "base/string_split.h"
13 #include "base/utf_string_conversions.h"
14 #include "chrome/tools/convert_dict/hunspell_reader.h"
16 namespace convert_dict {
18 namespace {
// Returns true if |str| begins with the case-sensitive, NULL-terminated ASCII
// string |with|. An empty |with| matches every string. |with| must not be
// NULL.
bool StringBeginsWith(const std::string& str, const char* with) {
  // Walk the prefix; running out of |str| before the prefix ends is a
  // mismatch, exactly like a differing character.
  for (size_t i = 0; with[i] != 0; ++i) {
    if (i >= str.size() || str[i] != with[i])
      return false;
  }
  return true;
}
// Collapses every run of consecutive ' ' characters in |*str| down to a single
// space, in place. Only the space character is considered; tabs and other
// whitespace are left untouched.
void CollapseDuplicateSpaces(std::string* str) {
  // Single-pass compaction: |write| trails |read| and only advances for
  // characters that are kept, so no per-character erase is needed.
  size_t write = 0;
  bool prev_space = false;
  for (size_t read = 0; read < str->length(); ++read) {
    const char c = (*str)[read];
    const bool is_space = (c == ' ');
    if (is_space && prev_space)
      continue;  // Second or later space of a run: drop it.
    (*str)[write++] = c;
    prev_space = is_space;
  }
  str->resize(write);
}
48 } // namespace
50 AffReader::AffReader(const FilePath& path)
51 : has_indexed_affixes_(false) {
52 file_ = file_util::OpenFile(path, "r");
54 // Default to Latin1 in case the file doesn't specify it.
55 encoding_ = "ISO8859-1";
58 AffReader::~AffReader() {
59 if (file_)
60 file_util::CloseFile(file_);
63 bool AffReader::Read() {
64 if (!file_)
65 return false;
67 // TODO(brettw) handle byte order mark.
69 bool got_command = false;
70 bool got_first_af = false;
71 bool got_first_rep = false;
73 has_indexed_affixes_ = false;
75 while (!feof(file_)) {
76 std::string line = ReadLine(file_);
78 // Save comment lines before any commands.
79 if (!got_command && !line.empty() && line[0] == '#') {
80 intro_comment_.append(line);
81 intro_comment_.push_back('\n');
82 continue;
85 StripComment(&line);
86 if (line.empty())
87 continue;
88 got_command = true;
90 if (StringBeginsWith(line, "SET ")) {
91 // Character set encoding.
92 encoding_ = line.substr(4);
93 TrimLine(&encoding_);
94 } else if (StringBeginsWith(line, "AF ")) {
95 // Affix. The first one is the number of ones following which we don't
96 // bother with.
97 has_indexed_affixes_ = true;
98 if (got_first_af) {
99 std::string group(line.substr(3));
100 AddAffixGroup(&group);
101 } else {
102 got_first_af = true;
104 } else if (StringBeginsWith(line, "SFX ") ||
105 StringBeginsWith(line, "PFX ")) {
106 AddAffix(&line);
107 } else if (StringBeginsWith(line, "REP ")) {
108 // The first rep line is the number of ones following which we don't
109 // bother with.
110 if (got_first_rep) {
111 std::string replacement(line.substr(4));
112 AddReplacement(&replacement);
113 } else {
114 got_first_rep = true;
116 } else if (StringBeginsWith(line, "TRY ") ||
117 StringBeginsWith(line, "MAP ")) {
118 HandleEncodedCommand(line);
119 } else if (StringBeginsWith(line, "IGNORE ")) {
120 printf("We don't support the IGNORE command yet. This would change how "
121 "we would insert things in our lookup table.\n");
122 exit(1);
123 } else if (StringBeginsWith(line, "COMPLEXPREFIXES ")) {
124 printf("We don't support the COMPLEXPREFIXES command yet. This would "
125 "mean we have to insert words backwards as well (I think)\n");
126 exit(1);
127 } else {
128 // All other commands get stored in the other commands list.
129 HandleRawCommand(line);
133 return true;
136 bool AffReader::EncodingToUTF8(const std::string& encoded,
137 std::string* utf8) const {
138 std::wstring wide_word;
139 if (!base::CodepageToWide(encoded, encoding(),
140 base::OnStringConversionError::FAIL, &wide_word))
141 return false;
142 *utf8 = WideToUTF8(wide_word);
143 return true;
146 int AffReader::GetAFIndexForAFString(const std::string& af_string) {
147 std::map<std::string, int>::iterator found = affix_groups_.find(af_string);
148 if (found != affix_groups_.end())
149 return found->second;
150 std::string my_string(af_string);
151 return AddAffixGroup(&my_string);
154 // We convert the data from our map to an indexed list, and also prefix each
155 // line with "AF" for the parser to read later.
156 std::vector<std::string> AffReader::GetAffixGroups() const {
157 int max_id = 0;
158 for (std::map<std::string, int>::const_iterator i = affix_groups_.begin();
159 i != affix_groups_.end(); ++i) {
160 if (i->second > max_id)
161 max_id = i->second;
164 std::vector<std::string> ret;
166 ret.resize(max_id);
167 for (std::map<std::string, int>::const_iterator i = affix_groups_.begin();
168 i != affix_groups_.end(); ++i) {
169 // Convert the indices into 1-based.
170 ret[i->second - 1] = std::string("AF ") + i->first;
173 return ret;
176 int AffReader::AddAffixGroup(std::string* rule) {
177 TrimLine(rule);
179 // We use the 1-based index of the rule. This matches the way Hunspell
180 // refers to the numbers.
181 int affix_id = static_cast<int>(affix_groups_.size()) + 1;
182 affix_groups_.insert(std::make_pair(*rule, affix_id));
183 return affix_id;
186 void AffReader::AddAffix(std::string* rule) {
187 TrimLine(rule);
188 CollapseDuplicateSpaces(rule);
190 // These lines have two forms:
191 // AFX D Y 4 <- First line, lists how many affixes for "D" there are.
192 // AFX D 0 d e <- Following lines.
193 // We want to ensure the two last groups on the last line are encoded in
194 // UTF-8, and we want to make sure that the affix identifier "D" is *not*
195 // encoded, since that's basically an 8-bit identifier.
197 // Count to the third space. Everything after that will be re-encoded. This
198 // will re-encode the number on the first line, but that will be a NOP. If
199 // there are not that many groups, we won't reencode it, but pass it through.
200 int found_spaces = 0;
201 std::string token;
202 for (size_t i = 0; i < rule->length(); i++) {
203 if ((*rule)[i] == ' ') {
204 found_spaces++;
205 if (found_spaces == 3) {
206 size_t part_start = i;
207 std::string part;
208 if (token[0] != 'Y' && token[0] != 'N') {
209 // This token represents a stripping prefix or suffix, which is
210 // either a length or a string to be replaced.
211 // We also reencode them to UTF-8.
212 part_start = i - token.length();
214 part = rule->substr(part_start); // From here to end.
216 if (part.find('-') != std::string::npos) {
217 // This rule has a morph rule used by old Hungarian dictionaries.
218 // When a line has a morph rule, its format becomes as listed below.
219 // AFX D 0 d e - M
220 // To make hunspell work more happily, replace this morph rule with
221 // a compound flag as listed below.
222 // AFX D 0 d/M e
223 std::vector<std::string> tokens;
224 base::SplitString(part, ' ', &tokens);
225 if (tokens.size() >= 5) {
226 part = base::StringPrintf("%s %s/%s %s",
227 tokens[0].c_str(),
228 tokens[1].c_str(),
229 tokens[4].c_str(),
230 tokens[2].c_str());
234 size_t slash_index = part.find('/');
235 if (slash_index != std::string::npos && !has_indexed_affixes()) {
236 // This can also have a rule string associated with it following a
237 // slash. For example:
238 // PFX P 0 foo/Y .
239 // The "Y" is a flag. For example, the aff file might have a line:
240 // COMPOUNDFLAG Y
241 // so that means that this prefix would be a compound one.
243 // It expects these rules to use the same alias rules as the .dic
244 // file. We've forced it to use aliases, which is a numerical index
245 // instead of these character flags, and this needs to be consistent.
247 std::string before_flags = part.substr(0, slash_index + 1);
249 // After the slash are both the flags, then whitespace, then the part
250 // that tells us what to strip.
251 std::vector<std::string> after_slash;
252 base::SplitString(part.substr(slash_index + 1), ' ', &after_slash);
253 if (after_slash.size() == 0) {
254 printf("ERROR: Found 0 terms after slash in affix rule '%s', "
255 "but need at least 2.\n",
256 part.c_str());
257 return;
259 if (after_slash.size() == 1) {
260 printf("WARNING: Found 1 term after slash in affix rule '%s', "
261 "but expected at least 2. Adding '.'.\n",
262 part.c_str());
263 after_slash.push_back(".");
265 // Note that we may get a third term here which is the morphological
266 // description of this rule. This happens in the tests only, so we can
267 // just ignore it.
269 part = base::StringPrintf("%s%d %s",
270 before_flags.c_str(),
271 GetAFIndexForAFString(after_slash[0]),
272 after_slash[1].c_str());
275 // Reencode from here
276 std::string reencoded;
277 if (!EncodingToUTF8(part, &reencoded)) {
278 printf("ERROR: Cannot encode affix rule part '%s' to utf8.\n",
279 part.c_str());
280 break;
283 *rule = rule->substr(0, part_start) + reencoded;
284 break;
286 token.clear();
287 } else {
288 token.push_back((*rule)[i]);
292 affix_rules_.push_back(*rule);
295 void AffReader::AddReplacement(std::string* rule) {
296 TrimLine(rule);
297 CollapseDuplicateSpaces(rule);
299 std::string utf8rule;
300 if (!EncodingToUTF8(*rule, &utf8rule)) {
301 printf("ERROR: Cannot encode replacement rule '%s' to utf8.\n",
302 rule->c_str());
303 return;
306 // The first space separates key and value.
307 size_t space_index = utf8rule.find(' ');
308 if (space_index == std::string::npos) {
309 printf("ERROR: Did not find a space in '%s'.\n", utf8rule.c_str());
310 return;
312 std::vector<std::string> split;
313 split.push_back(utf8rule.substr(0, space_index));
314 split.push_back(utf8rule.substr(space_index + 1));
316 // Underscores are used to represent spaces in most aff files
317 // (since the line is parsed on spaces).
318 std::replace(split[0].begin(), split[0].end(), '_', ' ');
319 std::replace(split[1].begin(), split[1].end(), '_', ' ');
321 replacements_.push_back(std::make_pair(split[0], split[1]));
324 void AffReader::HandleRawCommand(const std::string& line) {
325 other_commands_.push_back(line);
328 void AffReader::HandleEncodedCommand(const std::string& line) {
329 std::string utf8;
330 if (!EncodingToUTF8(line, &utf8)) {
331 printf("ERROR: Cannot encode command '%s' to utf8.\n", line.c_str());
332 return;
334 other_commands_.push_back(utf8);
337 } // namespace convert_dict