1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 // This tool converts Hunspell .aff/.dic pairs to a combined binary dictionary
6 // format (.bdic). This format is more compact, and can be more efficiently
7 // read by the client application.
9 // We do this conversion manually before publishing dictionary files. It is not
10 // part of any build process.
12 // See PrintHelp() below for usage.
16 #include "base/at_exit.h"
17 #include "base/file_util.h"
18 #include "base/files/file_path.h"
19 #include "base/i18n/icu_util.h"
20 #include "base/logging.h"
21 #include "base/process/memory.h"
22 #include "base/strings/string_util.h"
23 #include "chrome/tools/convert_dict/aff_reader.h"
24 #include "chrome/tools/convert_dict/dic_reader.h"
25 #include "third_party/hunspell/google/bdict_reader.h"
26 #include "third_party/hunspell/google/bdict_writer.h"
30 // Compares the given word list with the serialized trie to make sure they
// agree: the list and the trie are walked in lockstep, and for each entry the
// word text, the number of affix indices, and every individual affix index
// are compared. A diagnostic is printed when a mismatch is found.
//
// org_words:  the expected entries; each entry's .first is compared with the
//             word text and its .second is indexed as the list of affix ids.
// serialized: the serialized dictionary bytes, handed to BDictReader::Init().
//
// NOTE(review): lines are missing from this excerpt (the embedded numbering
// skips, e.g. 35 -> 37 and 44 -> 46). The declaration of |buf|, the statements
// that follow each printf, the increment of the inner loop and the closing
// braces are not visible, so return values are not documented here;
// presumably each diagnostic is followed by a failure return -- confirm
// against the full file.
32 bool VerifyWords(const convert_dict::DicReader::WordList
& org_words
,
33 const std::string
& serialized
) {
// Hand the raw bytes of the serialized dictionary to the reader. A failing
// Init() is reported as an invalid BDict.
34 hunspell::BDictReader reader
;
35 if (!reader
.Init(reinterpret_cast<const unsigned char*>(serialized
.data()),
37 printf("BDict is invalid\n");
// Iterator used below to pull entries back out of the serialized data, one
// Advance() call per expected word.
40 hunspell::WordIterator iter
= reader
.GetAllWordIterator();
// Output array that Advance() fills with the affix ids of the current word.
42 int affix_ids
[hunspell::BDict::MAX_AFFIXES_PER_WORD
];
// Size passed to Advance() together with |buf|, the word text buffer.
44 static const int buf_size
= 128;
// One iterator step per entry of the original list, in list order.
46 for (size_t i
= 0; i
< org_words
.size(); i
++) {
47 int affix_matches
= iter
.Advance(buf
, buf_size
, affix_ids
);
// A result of 0 is treated as the iterator having run out of words while
// the original list still has entries.
48 if (affix_matches
== 0) {
49 printf("Found the end before we expected\n");
// The text read back must equal the original word.
// NOTE(review): the messages below say "word #" but print the word text
// (%s with |buf|), not the index i.
53 if (org_words
[i
].first
!= buf
) {
54 printf("Word doesn't match, word #%s\n", buf
);
// The count returned by Advance() must equal the number of affix ids stored
// for the original word.
58 if (affix_matches
!= static_cast<int>(org_words
[i
].second
.size())) {
59 printf("Different number of affix indices, word #%s\n", buf
);
63 // Check the individual affix indices.
// Element-wise comparison of the ids read back against the original ids.
64 for (size_t affix_index
= 0; affix_index
< org_words
[i
].second
.size();
66 if (affix_ids
[affix_index
] != org_words
[i
].second
[affix_index
]) {
67 printf("Index doesn't match, word #%s\n", buf
);
// Usage text for the tool: a single argument, the dictionary base name, from
// which the .dic, .dic_delta and .aff inputs and the .bdic output are derived.
// NOTE(review): presumably this is the body of PrintHelp(), which the file
// header refers to; its signature and return statement are not visible in
// this excerpt (the embedded numbering skips 78 and everything after 80).
77 printf("Usage: convert_dict <dicfile base name>\n\n");
79 printf(" convert_dict en-US\nwill read en-US.dic, en-US.dic_delta, and "
80 "en-US.aff from the current directory and generate en-US.bdic\n\n");
// Program entry point: reads <base>.aff and <base>.dic, serializes them into
// a BDict, verifies the result with VerifyWords(), and writes <base>.bdic.
//
// NOTE(review): two signatures appear back to back here (wmain with wchar_t*
// arguments and main with char* arguments). Presumably a preprocessor
// conditional selecting one of them was dropped from this excerpt, along with
// the argument-count check, the early exits after each error message and the
// final return -- confirm against the full file.
87 int wmain(int argc
, wchar_t* argv
[]) {
89 int main(int argc
, char* argv
[]) {
91 base::EnableTerminationOnHeapCorruption();
// Process-wide setup done before any file is touched.
95 base::AtExitManager exit_manager
;
96 base::i18n::InitializeICU();
// argv[1] is the dictionary base name; every input and output path below is
// derived from it by swapping the extension.
98 base::FilePath file_base
= base::FilePath(argv
[1]);
// Step 1: read the affix (.aff) file.
100 base::FilePath aff_path
=
101 file_base
.ReplaceExtension(FILE_PATH_LITERAL(".aff"));
102 printf("Reading %" PRFilePath
" ...\n", aff_path
.value().c_str());
103 convert_dict::AffReader
aff_reader(aff_path
);
104 if (!aff_reader
.Read()) {
105 printf("Unable to read the aff file.\n");
// Step 2: read the word list (.dic) file. The reader is given the affix
// reader, so this step depends on step 1 having run.
109 base::FilePath dic_path
=
110 file_base
.ReplaceExtension(FILE_PATH_LITERAL(".dic"));
111 printf("Reading %" PRFilePath
" ...\n", dic_path
.value().c_str());
112 // DicReader will also read the .dic_delta file.
113 convert_dict::DicReader
dic_reader(dic_path
);
114 if (!dic_reader
.Read(&aff_reader
)) {
115 printf("Unable to read the dic file.\n");
// Step 3: copy everything collected by the two readers into the writer.
119 hunspell::BDictWriter writer
;
120 writer
.SetComment(aff_reader
.comments());
121 writer
.SetAffixRules(aff_reader
.affix_rules());
122 writer
.SetAffixGroups(aff_reader
.GetAffixGroups());
123 writer
.SetReplacements(aff_reader
.replacements());
124 writer
.SetOtherCommands(aff_reader
.other_commands());
125 writer
.SetWords(dic_reader
.words());
// Step 4: produce the serialized dictionary as an in-memory string.
127 printf("Serializing...\n");
128 std::string serialized
= writer
.GetBDict();
// Step 5: read the serialized data back and compare it with the word list
// that was just written.
// NOTE(review): unlike the other messages, the error text below has no
// trailing newline.
130 printf("Verifying...\n");
131 if (!VerifyWords(dic_reader
.words(), serialized
)) {
132 printf("ERROR converting, the dictionary does not check out OK.");
// Step 6: write the serialized bytes to <base>.bdic in binary mode.
136 base::FilePath out_path
=
137 file_base
.ReplaceExtension(FILE_PATH_LITERAL(".bdic"));
138 printf("Writing %" PRFilePath
" ...\n", out_path
.value().c_str());
139 FILE* out_file
= base::OpenFile(out_path
, "wb");
// NOTE(review): the condition guarding this message is not visible in this
// excerpt; presumably it is a check that out_file was opened -- confirm.
141 printf("ERROR writing file\n");
// Write the whole buffer in one call; the CHECK asserts that the number of
// bytes written equals the buffer size.
144 size_t written
= fwrite(&serialized
[0], 1, serialized
.size(), out_file
);
145 CHECK(written
== serialized
.size());
146 base::CloseFile(out_file
);