1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/tools/convert_dict/aff_reader.h"
9 #include "base/file_util.h"
10 #include "base/i18n/icu_string_conversions.h"
11 #include "base/stringprintf.h"
12 #include "base/string_split.h"
13 #include "base/utf_string_conversions.h"
14 #include "chrome/tools/convert_dict/hunspell_reader.h"
16 namespace convert_dict
{
// Returns true if |str| begins with |with|, a NULL-terminated ASCII string.
// The comparison is case-sensitive. An empty |with| matches every |str|.
bool StringBeginsWith(const std::string& str, const char* with) {
  size_t cur = 0;
  while (cur < str.size() && with[cur] != 0) {
    if (str[cur] != with[cur])
      return false;  // Mismatch inside the prefix.
    ++cur;
  }
  // Either the whole prefix matched (its terminator was reached), or |str| ran
  // out first, in which case |with| is longer than |str| and cannot match.
  return with[cur] == 0;
}
// Collapses every run of consecutive spaces in |str| down to a single space.
// Only the ' ' character is considered; tabs and other whitespace are left
// untouched. |str| is modified in place.
void CollapseDuplicateSpaces(std::string* str) {
  bool prev_space = false;
  size_t out = 0;  // Next write position; never ahead of the read position.
  for (size_t in = 0; in < str->length(); ++in) {
    const char c = (*str)[in];
    const bool is_space = (c == ' ');
    // Drop a space that directly follows another space.
    if (is_space && prev_space)
      continue;
    (*str)[out++] = c;
    prev_space = is_space;
  }
  // Everything past |out| is leftover from the characters that were dropped.
  str->resize(out);
}
50 AffReader::AffReader(const FilePath
& path
)
51 : has_indexed_affixes_(false) {
52 file_
= file_util::OpenFile(path
, "r");
54 // Default to Latin1 in case the file doesn't specify it.
55 encoding_
= "ISO8859-1";
58 AffReader::~AffReader() {
60 file_util::CloseFile(file_
);
// Reads the .aff file line by line with ReadLine() until feof() reports the
// end of the file, and dispatches each line on its leading keyword:
//   "SET "                        -> the text from offset 4 becomes encoding_.
//   "AF "                         -> sets has_indexed_affixes_; the text from
//                                    offset 3 is passed to AddAffixGroup().
//   "SFX " / "PFX "               -> affix rules (presumably AddAffix();
//                                    the call is not visible - confirm).
//   "REP "                        -> the text from offset 4 is passed to
//                                    AddReplacement().
//   "TRY " / "MAP "               -> HandleEncodedCommand().
//   "IGNORE ", "COMPLEXPREFIXES " -> unsupported; a message is printed.
//   anything else                 -> HandleRawCommand().
// Lines starting with '#' that appear before the first command are appended,
// each followed by '\n', to intro_comment_.
// NOTE(review): file_ is passed to feof() with no visible null check - confirm
// a failed open is handled before the loop.
// NOTE(review): per the inline comments the first "AF" and first "REP" line
// only carry a count; got_first_af / got_first_rep presumably guard against
// storing them, and got_command presumably flips on the first command -
// confirm the guarding statements and the return values.
63 bool AffReader::Read() {
67 // TODO(brettw) handle byte order mark.
69 bool got_command
= false;
70 bool got_first_af
= false;
71 bool got_first_rep
= false;
// Reset so that a file without "AF" lines is reported as non-indexed.
73 has_indexed_affixes_
= false;
// Process the file one line at a time.
75 while (!feof(file_
)) {
76 std::string line
= ReadLine(file_
);
78 // Save comment lines before any commands.
79 if (!got_command
&& !line
.empty() && line
[0] == '#') {
80 intro_comment_
.append(line
);
81 intro_comment_
.push_back('\n');
90 if (StringBeginsWith(line
, "SET ")) {
91 // Character set encoding.
// Everything after the 4-character "SET " keyword names the encoding.
92 encoding_
= line
.substr(4);
94 } else if (StringBeginsWith(line
, "AF ")) {
95 // Affix. The first one is the number of ones following which we don't
97 has_indexed_affixes_
= true;
// Strip the 3-character "AF " keyword; the remainder is the group itself.
99 std::string
group(line
.substr(3));
100 AddAffixGroup(&group
);
104 } else if (StringBeginsWith(line
, "SFX ") ||
105 StringBeginsWith(line
, "PFX ")) {
107 } else if (StringBeginsWith(line
, "REP ")) {
108 // The first rep line is the number of ones following which we don't
// Strip the 4-character "REP " keyword before handing the pair on.
111 std::string
replacement(line
.substr(4));
112 AddReplacement(&replacement
);
114 got_first_rep
= true;
116 } else if (StringBeginsWith(line
, "TRY ") ||
117 StringBeginsWith(line
, "MAP ")) {
// These commands carry text in the file's encoding and get re-encoded.
118 HandleEncodedCommand(line
);
119 } else if (StringBeginsWith(line
, "IGNORE ")) {
120 printf("We don't support the IGNORE command yet. This would change how "
121 "we would insert things in our lookup table.\n");
123 } else if (StringBeginsWith(line
, "COMPLEXPREFIXES ")) {
124 printf("We don't support the COMPLEXPREFIXES command yet. This would "
125 "mean we have to insert words backwards as well (I think)\n");
128 // All other commands get stored in the other commands list.
129 HandleRawCommand(line
);
136 bool AffReader::EncodingToUTF8(const std::string
& encoded
,
137 std::string
* utf8
) const {
138 std::wstring wide_word
;
139 if (!base::CodepageToWide(encoded
, encoding(),
140 base::OnStringConversionError::FAIL
, &wide_word
))
142 *utf8
= WideToUTF8(wide_word
);
146 int AffReader::GetAFIndexForAFString(const std::string
& af_string
) {
147 std::map
<std::string
, int>::iterator found
= affix_groups_
.find(af_string
);
148 if (found
!= affix_groups_
.end())
149 return found
->second
;
150 std::string
my_string(af_string
);
151 return AddAffixGroup(&my_string
);
154 // We convert the data from our map to an indexed list, and also prefix each
155 // line with "AF" for the parser to read later.
156 std::vector
<std::string
> AffReader::GetAffixGroups() const {
158 for (std::map
<std::string
, int>::const_iterator i
= affix_groups_
.begin();
159 i
!= affix_groups_
.end(); ++i
) {
160 if (i
->second
> max_id
)
164 std::vector
<std::string
> ret
;
167 for (std::map
<std::string
, int>::const_iterator i
= affix_groups_
.begin();
168 i
!= affix_groups_
.end(); ++i
) {
169 // Convert the indices into 1-based.
170 ret
[i
->second
- 1] = std::string("AF ") + i
->first
;
// Registers the affix group |*rule| in affix_groups_ under the next free id,
// computed as the current number of stored groups plus one.
// std::map::insert() does not overwrite an existing key, so a rule that is
// already present keeps the id it was first given.
// NOTE(review): GetAFIndexForAFString() returns this function's result as the
// group's id, so presumably affix_id is returned - confirm.
176 int AffReader::AddAffixGroup(std::string
* rule
) {
179 // We use the 1-based index of the rule. This matches the way Hunspell
180 // refers to the numbers.
181 int affix_id
= static_cast<int>(affix_groups_
.size()) + 1;
182 affix_groups_
.insert(std::make_pair(*rule
, affix_id
));
// Processes one affix rule line held in |*rule| and appends the (possibly
// rewritten) rule to affix_rules_.
//
// Runs of spaces are collapsed first, then the line is scanned for its third
// space. Starting there - or at the start of the preceding token when that
// token does not begin with 'Y' or 'N' - the rest of the line, |part|, is:
//   1. reformatted with StringPrintf when it contains a '-' and splits into at
//      least 5 space-separated tokens (the old Hungarian morph-rule form);
//   2. when it contains a '/' and has_indexed_affixes() is false, rebuilt as
//      "<text up to and including '/'><AF index> <next term>", where the index
//      comes from GetAFIndexForAFString() applied to the first term after the
//      slash, and a missing second term is replaced by ".";
//   3. converted to UTF-8 with EncodingToUTF8().
// |*rule| is then replaced by its untouched head followed by the converted
// part.
// NOTE(review): |token| and |part| are used without a visible declaration and
// found_spaces is never visibly incremented - confirm those statements exist.
// NOTE(review): presumably the error branches leave the loop or function right
// after their printf, and the scan stops once the rule has been rewritten -
// confirm.
186 void AffReader::AddAffix(std::string
* rule
) {
188 CollapseDuplicateSpaces(rule
);
190 // These lines have two forms:
191 // AFX D Y 4 <- First line, lists how many affixes for "D" there are.
192 // AFX D 0 d e <- Following lines.
193 // We want to ensure the two last groups on the last line are encoded in
194 // UTF-8, and we want to make sure that the affix identifier "D" is *not*
195 // encoded, since that's basically an 8-bit identifier.
197 // Count to the third space. Everything after that will be re-encoded. This
198 // will re-encode the number on the first line, but that will be a NOP. If
199 // there are not that many groups, we won't reencode it, but pass it through.
200 int found_spaces
= 0;
202 for (size_t i
= 0; i
< rule
->length(); i
++) {
203 if ((*rule
)[i
] == ' ') {
205 if (found_spaces
== 3) {
// By default the re-encoded region starts at the third space itself.
206 size_t part_start
= i
;
208 if (token
[0] != 'Y' && token
[0] != 'N') {
209 // This token represents a stripping prefix or suffix, which is
210 // either a length or a string to be replaced.
211 // We also reencode them to UTF-8.
// Back up so the region also covers the token just before this space.
212 part_start
= i
- token
.length();
214 part
= rule
->substr(part_start
); // From here to end.
216 if (part
.find('-') != std::string::npos
) {
217 // This rule has a morph rule used by old Hungarian dictionaries.
218 // When a line has a morph rule, its format becomes as listed below.
220 // To make hunspell work more happily, replace this morph rule with
221 // a compound flag as listed below.
223 std::vector
<std::string
> tokens
;
224 base::SplitString(part
, ' ', &tokens
);
225 if (tokens
.size() >= 5) {
226 part
= base::StringPrintf("%s %s/%s %s",
234 size_t slash_index
= part
.find('/');
235 if (slash_index
!= std::string::npos
&& !has_indexed_affixes()) {
236 // This can also have a rule string associated with it following a
237 // slash. For example:
239 // The "Y" is a flag. For example, the aff file might have a line:
241 // so that means that this prefix would be a compound one.
243 // It expects these rules to use the same alias rules as the .dic
244 // file. We've forced it to use aliases, which is a numerical index
245 // instead of these character flags, and this needs to be consistent.
// Keep everything up to and including the slash unchanged.
247 std::string before_flags
= part
.substr(0, slash_index
+ 1);
249 // After the slash are both the flags, then whitespace, then the part
250 // that tells us what to strip.
251 std::vector
<std::string
> after_slash
;
252 base::SplitString(part
.substr(slash_index
+ 1), ' ', &after_slash
);
253 if (after_slash
.size() == 0) {
254 printf("ERROR: Found 0 terms after slash in affix rule '%s', "
255 "but need at least 2.\n",
259 if (after_slash
.size() == 1) {
260 printf("WARNING: Found 1 term after slash in affix rule '%s', "
261 "but expected at least 2. Adding '.'.\n",
263 after_slash
.push_back(".");
265 // Note that we may get a third term here which is the morphological
266 // description of this rule. This happens in the tests only, so we can
// Replace the character flags with their numeric AF alias index.
269 part
= base::StringPrintf("%s%d %s",
270 before_flags
.c_str(),
271 GetAFIndexForAFString(after_slash
[0]),
272 after_slash
[1].c_str());
275 // Reencode from here
276 std::string reencoded
;
277 if (!EncodingToUTF8(part
, &reencoded
)) {
278 printf("ERROR: Cannot encode affix rule part '%s' to utf8.\n",
// Splice the untouched head of the rule back onto the re-encoded tail.
283 *rule
= rule
->substr(0, part_start
) + reencoded
;
// Non-space characters accumulate into the current token.
288 token
.push_back((*rule
)[i
]);
292 affix_rules_
.push_back(*rule
);
// Parses one replacement rule (Read() passes the text that follows "REP ")
// and appends it to replacements_ as a (key, value) pair.
//
// |*rule| has its runs of spaces collapsed in place and is converted to UTF-8.
// The converted text is split at its first space: what precedes the space is
// the key and everything after it is the value. Underscores in both halves are
// then turned into spaces. A failed conversion or a missing space is reported
// with printf.
// NOTE(review): presumably each error path returns right after its printf;
// otherwise substr() would run with space_index == npos - confirm.
295 void AffReader::AddReplacement(std::string
* rule
) {
297 CollapseDuplicateSpaces(rule
);
299 std::string utf8rule
;
300 if (!EncodingToUTF8(*rule
, &utf8rule
)) {
301 printf("ERROR: Cannot encode replacement rule '%s' to utf8.\n",
306 // The first space separates key and value.
307 size_t space_index
= utf8rule
.find(' ');
308 if (space_index
== std::string::npos
) {
309 printf("ERROR: Did not find a space in '%s'.\n", utf8rule
.c_str());
// split[0] is the key, split[1] the value.
312 std::vector
<std::string
> split
;
313 split
.push_back(utf8rule
.substr(0, space_index
));
314 split
.push_back(utf8rule
.substr(space_index
+ 1));
316 // Underscores are used to represent spaces in most aff files
317 // (since the line is parsed on spaces).
318 std::replace(split
[0].begin(), split
[0].end(), '_', ' ');
319 std::replace(split
[1].begin(), split
[1].end(), '_', ' ');
321 replacements_
.push_back(std::make_pair(split
[0], split
[1]));
324 void AffReader::HandleRawCommand(const std::string
& line
) {
325 other_commands_
.push_back(line
);
328 void AffReader::HandleEncodedCommand(const std::string
& line
) {
330 if (!EncodingToUTF8(line
, &utf8
)) {
331 printf("ERROR: Cannot encode command '%s' to utf8.\n", line
.c_str());
334 other_commands_
.push_back(utf8
);
337 } // namespace convert_dict