1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/tools/convert_dict/dic_reader.h"
10 #include "base/files/file_util.h"
11 #include "base/strings/string_util.h"
12 #include "chrome/tools/convert_dict/aff_reader.h"
13 #include "chrome/tools/convert_dict/hunspell_reader.h"
15 namespace convert_dict
{
// Maps each unique word to the set of unique affix group IDs associated with
// it. Using a std::set for the IDs means a repeated (word, affix) pair is
// stored only once.
typedef std::map<std::string, std::set<int> > WordSet;
22 void SplitDicLine(const std::string
& line
, std::vector
<std::string
>* output
) {
23 // We split the line on a slash not preceded by a backslash. A slash at the
24 // beginning of the line is not a separator either.
25 size_t slash_index
= line
.size();
26 for (size_t i
= 0; i
< line
.size(); i
++) {
27 if (line
[i
] == '/' && i
> 0 && line
[i
- 1] != '\\') {
35 // Everything before the slash index is the first term. We also need to
36 // convert all escaped slashes ("\/" sequences) to regular slashes.
37 std::string word
= line
.substr(0, slash_index
);
38 base::ReplaceSubstringsAfterOffset(&word
, 0, "\\/", "/");
39 output
->push_back(word
);
41 // Everything (if anything) after the slash is the second.
42 if (slash_index
< line
.size() - 1)
43 output
->push_back(line
.substr(slash_index
+ 1));
46 // This function reads words from a .dic file, or a .dic_delta file. Note that
47 // we read 'all' the words in the file, irrespective of the word count given
48 // in the first non empty line of a .dic file. Also note that, for a .dic_delta
49 // file, the first line actually does _not_ have the number of words. In order
50 // to control this, we use the |file_has_word_count_in_the_first_line|
51 // parameter to tell this method whether the first non empty line in the file
52 // contains the number of words or not. If it does, skip the first line. If it
53 // does not, then the first line contains a word.
54 bool PopulateWordSet(WordSet
* word_set
, FILE* file
, AffReader
* aff_reader
,
55 const char* file_type
, const char* encoding
,
56 bool file_has_word_count_in_the_first_line
) {
59 std::string line
= ReadLine(file
);
65 if (file_has_word_count_in_the_first_line
) {
66 // Skip the first nonempty line, this is the line count. We don't bother
67 // with it and just read all the lines.
68 file_has_word_count_in_the_first_line
= false;
72 std::vector
<std::string
> split
;
73 SplitDicLine(line
, &split
);
74 if (split
.empty() || split
.size() > 2) {
75 printf("Line %d has extra slashes in the %s file\n", line_number
,
80 // The first part is the word, the second (optional) part is the affix. We
81 // always use UTF-8 as the encoding to simplify life.
83 std::string
encoding_string(encoding
);
84 if (encoding_string
== "UTF-8") {
86 } else if (!aff_reader
->EncodingToUTF8(split
[0], &utf8word
)) {
87 printf("Unable to convert line %d from %s to UTF-8 in the %s file\n",
88 line_number
, encoding
, file_type
);
92 // We always convert the affix to an index. 0 means no affix.
94 if (split
.size() == 2) {
95 // Got a rule, which is the stuff after the slash. The line may also have
96 // an optional term separated by a tab. This is the morphological
97 // description. We don't care about this (it is used in the tests to
98 // generate a nice dump), so we remove it.
99 size_t split1_tab_offset
= split
[1].find('\t');
100 if (split1_tab_offset
!= std::string::npos
)
101 split
[1] = split
[1].substr(0, split1_tab_offset
);
103 if (aff_reader
->has_indexed_affixes())
104 affix_index
= atoi(split
[1].c_str());
106 affix_index
= aff_reader
->GetAFIndexForAFString(split
[1]);
109 // Discard the morphological description if it is attached to the first
110 // token. (It is attached to the first token if a word doesn't have affix
112 size_t word_tab_offset
= utf8word
.find('\t');
113 if (word_tab_offset
!= std::string::npos
)
114 utf8word
= utf8word
.substr(0, word_tab_offset
);
116 WordSet::iterator found
= word_set
->find(utf8word
);
117 std::set
<int> affix_vector
;
118 affix_vector
.insert(affix_index
);
120 if (found
== word_set
->end())
121 word_set
->insert(std::make_pair(utf8word
, affix_vector
));
123 found
->second
.insert(affix_index
);
131 DicReader::DicReader(const base::FilePath
& path
) {
132 file_
= base::OpenFile(path
, "r");
134 base::FilePath additional_path
=
135 path
.ReplaceExtension(FILE_PATH_LITERAL("dic_delta"));
136 additional_words_file_
= base::OpenFile(additional_path
, "r");
138 if (additional_words_file_
)
139 printf("Reading %" PRFilePath
" ...\n", additional_path
.value().c_str());
141 printf("%" PRFilePath
" not found.\n", additional_path
.value().c_str());
144 DicReader::~DicReader() {
146 base::CloseFile(file_
);
147 if (additional_words_file_
)
148 base::CloseFile(additional_words_file_
);
151 bool DicReader::Read(AffReader
* aff_reader
) {
157 // Add words from the dic file to the word set.
158 // Note that the first line is the word count in the file.
159 if (!PopulateWordSet(&word_set
, file_
, aff_reader
, "dic",
160 aff_reader
->encoding(), true))
163 // Add words from the .dic_delta file to the word set, if it exists.
164 // The first line is the first word to add. Word count line is not present.
165 // NOTE: These additional words should be encoded as UTF-8.
166 if (additional_words_file_
!= NULL
) {
167 PopulateWordSet(&word_set
, additional_words_file_
, aff_reader
, "dic delta",
170 // Make sure the words are sorted, they may be unsorted in the input.
171 for (WordSet::iterator word
= word_set
.begin(); word
!= word_set
.end();
173 std::vector
<int> affixes
;
174 for (std::set
<int>::iterator aff
= word
->second
.begin();
175 aff
!= word
->second
.end(); ++aff
)
176 affixes
.push_back(*aff
);
178 // Double check that the affixes are sorted. This isn't strictly necessary
179 // but it's nice for the file to have a fixed layout.
180 std::sort(affixes
.begin(), affixes
.end());
181 std::reverse(affixes
.begin(), affixes
.end());
182 words_
.push_back(std::make_pair(word
->first
, affixes
));
185 // Double-check that the words are sorted.
186 std::sort(words_
.begin(), words_
.end());
190 } // namespace convert_dict