1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/tools/convert_dict/aff_reader.h"
9 #include "base/file_util.h"
10 #include "base/i18n/icu_string_conversions.h"
11 #include "base/strings/string_split.h"
12 #include "base/strings/stringprintf.h"
13 #include "base/strings/utf_string_conversions.h"
14 #include "chrome/tools/convert_dict/hunspell_reader.h"
16 namespace convert_dict
{
20 // Returns true if the given line begins with the given case-sensitive
21 // NULL-terminated ASCII string.
// |str| is the text being tested; |with| is the prefix to look for.
22 bool StringBeginsWith(const std::string
& str
, const char* with
) {
// NOTE(review): the declaration/initialization of |cur| is not visible in
// this listing.
// Advance while |cur| is still inside |str| and |with| has not reached its
// terminating NUL.
24 while (cur
< str
.size() && with
[cur
] != 0) {
// Detect a mismatch at the current position. NOTE(review): the statement
// executed on mismatch is not visible in this listing.
25 if (str
[cur
] != with
[cur
])
// The result is true exactly when |with| has a NUL at position |cur|, i.e.
// the whole prefix was consumed before the loop stopped.
29 return with
[cur
] == 0;
32 // Collapses runs of spaces to only one space.
// The string pointed to by |str| is edited in place (characters are removed
// with erase()).
33 void CollapseDuplicateSpaces(std::string
* str
) {
// NOTE(review): |prev_space| is declared as int but initialized with the
// boolean literal false; presumably it tracks whether the previous character
// was a space -- confirm against the full source.
34 int prev_space
= false;
// Index-based scan; the length is re-read on every iteration, so it reflects
// any erasures made inside the loop.
35 for (size_t i
= 0; i
< str
->length(); i
++) {
36 if ((*str
)[i
] == ' ') {
// Remove the character at index |i|. NOTE(review): the condition guarding
// this erase and any index adjustment after it are not visible here.
38 str
->erase(str
->begin() + i
);
48 // Print an error message and terminate execution
// |fmt| is followed by a variable argument list. Callers in this file pass
// format strings containing "%s" placeholders together with C-string
// arguments.
// NOTE(review): the body is not visible in this listing.
49 void Panic(const char* fmt
, ...) {
// Constructs a reader for the file at |path|. The file is opened in "r" mode
// through base::OpenFile and the handle is kept in |file_|;
// |has_indexed_affixes_| starts out false.
61 AffReader::AffReader(const base::FilePath
& path
)
62 : has_indexed_affixes_(false) {
// NOTE(review): no check of the returned handle is visible in this listing.
63 file_
= base::OpenFile(path
, "r");
65 // Default to Latin1 in case the file doesn't specify it.
// Read() overwrites this value when it encounters a "SET " line.
66 encoding_
= "ISO8859-1";
// Closes the handle stored in |file_| (opened by the constructor) via
// base::CloseFile.
69 AffReader::~AffReader() {
71 base::CloseFile(file_
);
// Reads |file_| one line at a time until end-of-file and dispatches each line
// on its leading command keyword ("SET ", "AF ", "SFX ", "PFX ", "REP ",
// "TRY ", "MAP ", "IGNORE ", "COMPLEXPREFIXES "); anything else is stored as a
// raw command.
// NOTE(review): several statements of this function, including its return,
// are not visible in this listing.
74 bool AffReader::Read() {
78 // TODO(brettw) handle byte order mark.
// Per-call state. |got_command| gates the collection of leading comment
// lines below; |got_first_rep| is set in the "REP " branch.
// NOTE(review): the uses of |got_first_af| are not visible here.
80 bool got_command
= false;
81 bool got_first_af
= false;
82 bool got_first_rep
= false;
// Reset on every call; set to true again when an "AF " line is seen.
84 has_indexed_affixes_
= false;
86 while (!feof(file_
)) {
87 std::string line
= ReadLine(file_
);
89 // Save comment lines before any commands.
// A comment line is a non-empty line whose first character is '#'. It is
// appended to |intro_comment_| followed by a newline.
90 if (!got_command
&& !line
.empty() && line
[0] == '#') {
91 intro_comment_
.append(line
);
92 intro_comment_
.push_back('\n');
101 if (StringBeginsWith(line
, "SET ")) {
102 // Character set encoding.
// Everything after the 4-character "SET " prefix becomes |encoding_|, which
// is then passed through TrimLine.
103 encoding_
= line
.substr(4);
104 TrimLine(&encoding_
);
105 } else if (StringBeginsWith(line
, "AF ")) {
106 // Affix. The first one is the number of ones following which we don't
// Seeing any "AF " line marks the file as using indexed affixes.
108 has_indexed_affixes_
= true;
// The text after the 3-character "AF " prefix is registered as an affix
// group. NOTE(review): the condition around this call is not visible here.
110 std::string
group(line
.substr(3));
111 AddAffixGroup(&group
);
// NOTE(review): the body of the SFX/PFX branch is not visible here.
115 } else if (StringBeginsWith(line
, "SFX ") ||
116 StringBeginsWith(line
, "PFX ")) {
118 } else if (StringBeginsWith(line
, "REP ")) {
119 // The first rep line is the number of ones following which we don't
// The text after the 4-character "REP " prefix is handed to AddReplacement.
122 std::string
replacement(line
.substr(4));
123 AddReplacement(&replacement
);
125 got_first_rep
= true;
// TRY and MAP lines are converted to UTF-8 before being stored.
127 } else if (StringBeginsWith(line
, "TRY ") ||
128 StringBeginsWith(line
, "MAP ")) {
129 HandleEncodedCommand(line
);
// IGNORE and COMPLEXPREFIXES are rejected through Panic.
130 } else if (StringBeginsWith(line
, "IGNORE ")) {
131 Panic("We don't support the IGNORE command yet. This would change how "
132 "we would insert things in our lookup table.");
133 } else if (StringBeginsWith(line
, "COMPLEXPREFIXES ")) {
134 Panic("We don't support the COMPLEXPREFIXES command yet. This would "
135 "mean we have to insert words backwards as well (I think)");
137 // All other commands get stored in the other commands list.
138 HandleRawCommand(line
);
// Converts |encoded| from the code page named by encoding() to UTF-8 and
// writes the result to |*utf8|. The conversion goes through an intermediate
// wide string. Callers in this file treat a false return value as failure.
// NOTE(review): the return statements are not visible in this listing.
145 bool AffReader::EncodingToUTF8(const std::string
& encoded
,
146 std::string
* utf8
) const {
147 std::wstring wide_word
;
// Step 1: decode into |wide_word|, requesting the FAIL policy for
// conversion errors.
148 if (!base::CodepageToWide(encoded
, encoding(),
149 base::OnStringConversionError::FAIL
, &wide_word
))
// Step 2: re-encode the wide string as UTF-8 into the output parameter.
151 *utf8
= base::WideToUTF8(wide_word
);
// Returns the index associated with |af_string| in |affix_groups_|. When the
// string is not present yet, it is registered through AddAffixGroup and the
// value AddAffixGroup returns is passed back.
155 int AffReader::GetAFIndexForAFString(const std::string
& af_string
) {
156 std::map
<std::string
, int>::iterator found
= affix_groups_
.find(af_string
);
// Already known: reuse the stored index.
157 if (found
!= affix_groups_
.end())
158 return found
->second
;
// AddAffixGroup takes a non-const pointer, so hand it a local copy instead
// of the const reference parameter.
159 std::string
my_string(af_string
);
160 return AddAffixGroup(&my_string
);
163 // We convert the data from our map to an indexed list, and also prefix each
164 // line with "AF" for the parser to read later.
// Each output entry is the literal "AF " followed by the group's key string.
165 std::vector
<std::string
> AffReader::GetAffixGroups() const {
// First pass: compare every stored id against |max_id|.
// NOTE(review): the declaration of |max_id| and the statement executed when
// the comparison succeeds are not visible here; presumably this finds the
// largest id -- confirm.
167 for (std::map
<std::string
, int>::const_iterator i
= affix_groups_
.begin();
168 i
!= affix_groups_
.end(); ++i
) {
169 if (i
->second
> max_id
)
// NOTE(review): how |ret| is sized before the indexed writes below is not
// visible in this listing.
173 std::vector
<std::string
> ret
;
// Second pass: place each group at the slot derived from its id.
176 for (std::map
<std::string
, int>::const_iterator i
= affix_groups_
.begin();
177 i
!= affix_groups_
.end(); ++i
) {
178 // Ids are 1-based (see AddAffixGroup); subtract 1 for the 0-based slot.
179 ret
[i
->second
- 1] = std::string("AF ") + i
->first
;
// Registers |*rule| in |affix_groups_| under the next free id, computed as
// the current number of entries plus one.
// NOTE(review): the return statement and any preprocessing of |rule| are not
// visible in this listing.
185 int AffReader::AddAffixGroup(std::string
* rule
) {
188 // We use the 1-based index of the rule. This matches the way Hunspell
189 // refers to the numbers.
190 int affix_id
= static_cast<int>(affix_groups_
.size()) + 1;
// std::map::insert does not overwrite: if |*rule| is already a key, the
// existing mapping is kept.
191 affix_groups_
.insert(std::make_pair(*rule
, affix_id
));
// Normalizes one affix rule line in place and appends it to |affix_rules_|.
// Runs of spaces are collapsed first. The tail of the rule is then optionally
// rewritten (morph-rule and flag handling below), converted to UTF-8 with
// EncodingToUTF8, and spliced back onto the untouched head of the rule.
// NOTE(review): a number of statements (e.g. the declarations of |token| and
// |part| and the counting of spaces) are not visible in this listing.
195 void AffReader::AddAffix(std::string
* rule
) {
197 CollapseDuplicateSpaces(rule
);
199 // These lines have two forms:
200 // AFX D Y 4 <- First line, lists how many affixes for "D" there are.
201 // AFX D 0 d e <- Following lines.
202 // We want to ensure the two last groups on the last line are encoded in
203 // UTF-8, and we want to make sure that the affix identifier "D" is *not*
204 // encoded, since that's basically an 8-bit identifier.
206 // Count to the third space. Everything after that will be re-encoded. This
207 // will re-encode the number on the first line, but that will be a NOP. If
208 // there are not that many groups, we won't reencode it, but pass it through.
209 int found_spaces
= 0;
211 for (size_t i
= 0; i
< rule
->length(); i
++) {
212 if ((*rule
)[i
] == ' ') {
214 if (found_spaces
== 3) {
// By default the re-encoded part starts at this space.
215 size_t part_start
= i
;
// |token| holds the characters accumulated before this space; 'Y' or 'N'
// as its first character leaves |part_start| where it is.
217 if (token
[0] != 'Y' && token
[0] != 'N') {
218 // This token represents a stripping prefix or suffix, which is
219 // either a length or a string to be replaced.
220 // We also reencode them to UTF-8.
// Move the start back so that the token itself is part of the tail.
221 part_start
= i
- token
.length();
223 part
= rule
->substr(part_start
); // From here to end.
225 if (part
.find('-') != std::string::npos
) {
226 // This rule has a morph rule used by old Hungarian dictionaries.
227 // When a line has a morph rule, its format becomes as listed below.
229 // To make hunspell work more happily, replace this morph rule with
230 // a compound flag as listed below.
// Split the tail on single spaces; it is rewritten only when at least five
// fields are present.
232 std::vector
<std::string
> tokens
;
233 base::SplitString(part
, ' ', &tokens
);
234 if (tokens
.size() >= 5) {
235 part
= base::StringPrintf("%s %s/%s %s",
// Locate the first '/' in the tail. The flag rewrite below applies only
// when the file does not use indexed affixes.
243 size_t slash_index
= part
.find('/');
244 if (slash_index
!= std::string::npos
&& !has_indexed_affixes()) {
245 // This can also have a rule string associated with it following a
246 // slash. For example:
248 // The "Y" is a flag. For example, the aff file might have a line:
250 // so that means that this prefix would be a compound one.
252 // It expects these rules to use the same alias rules as the .dic
253 // file. We've forced it to use aliases, which is a numerical index
254 // instead of these character flags, and this needs to be consistent.
// Everything up to and including the slash is kept verbatim.
256 std::string before_flags
= part
.substr(0, slash_index
+ 1);
258 // After the slash are both the flags, then whitespace, then the part
259 // that tells us what to strip.
260 std::vector
<std::string
> after_slash
;
261 base::SplitString(part
.substr(slash_index
+ 1), ' ', &after_slash
);
// No terms after the slash is fatal.
262 if (after_slash
.size() == 0) {
263 Panic("Found 0 terms after slash in affix rule '%s', "
264 "but need at least 2.",
// Exactly one term is tolerated: warn and supply "." as the second term.
267 if (after_slash
.size() == 1) {
268 printf("WARNING: Found 1 term after slash in affix rule '%s', "
269 "but expected at least 2. Adding '.'.\n",
271 after_slash
.push_back(".");
273 // Note that we may get a third term here which is the morphological
274 // description of this rule. This happens in the tests only, so we can
// Rebuild the tail as: text through '/', the numeric AF index for the flag
// string (first term), a space, and the second term. Terms beyond the
// second are not carried over.
277 part
= base::StringPrintf("%s%d %s",
278 before_flags
.c_str(),
279 GetAFIndexForAFString(after_slash
[0]),
280 after_slash
[1].c_str());
283 // Reencode from here
284 std::string reencoded
;
285 if (!EncodingToUTF8(part
, &reencoded
))
286 Panic("Cannot encode affix rule part '%s' to utf8.", part
.c_str());
// Splice the UTF-8 tail onto the head of the rule, which is left unchanged.
288 *rule
= rule
->substr(0, part_start
) + reencoded
;
// Accumulate the current character into |token|. NOTE(review): the branch
// structure around this statement is not visible in this listing.
293 token
.push_back((*rule
)[i
]);
// Record the (possibly rewritten) rule.
297 affix_rules_
.push_back(*rule
);
// Parses one replacement rule and appends it to |replacements_| as a
// (key, value) pair. |*rule| has its runs of spaces collapsed in place and is
// then converted to UTF-8. The UTF-8 text is split at its first space, and
// underscores in both halves are turned into spaces.
300 void AffReader::AddReplacement(std::string
* rule
) {
302 CollapseDuplicateSpaces(rule
);
// Work on a UTF-8 copy; a conversion failure is reported through Panic.
304 std::string utf8rule
;
305 if (!EncodingToUTF8(*rule
, &utf8rule
))
306 Panic("Cannot encode replacement rule '%s' to utf8.", rule
->c_str());
308 // The first space separates key and value.
309 size_t space_index
= utf8rule
.find(' ');
310 if (space_index
== std::string::npos
)
311 Panic("Did not find a space in '%s'.", utf8rule
.c_str());
// split[0] is the text before the first space and split[1] is everything
// after it (later spaces stay in the value).
313 std::vector
<std::string
> split
;
314 split
.push_back(utf8rule
.substr(0, space_index
));
315 split
.push_back(utf8rule
.substr(space_index
+ 1));
317 // Underscores are used to represent spaces in most aff files
318 // (since the line is parsed on spaces).
319 std::replace(split
[0].begin(), split
[0].end(), '_', ' ');
320 std::replace(split
[1].begin(), split
[1].end(), '_', ' ');
322 replacements_
.push_back(std::make_pair(split
[0], split
[1]));
// Appends |line| unchanged to |other_commands_|.
325 void AffReader::HandleRawCommand(const std::string
& line
) {
326 other_commands_
.push_back(line
);
// Converts |line| to UTF-8 with EncodingToUTF8 and appends the converted
// text to |other_commands_|. A conversion failure is reported through Panic.
// NOTE(review): the declaration of |utf8| is not visible in this listing.
329 void AffReader::HandleEncodedCommand(const std::string
& line
) {
331 if (!EncodingToUTF8(line
, &utf8
))
332 Panic("Cannot encode command '%s' to utf8.", line
.c_str());
333 other_commands_
.push_back(utf8
);
336 } // namespace convert_dict