1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "third_party/hunspell/google/bdict_writer.h"
7 #include "base/logging.h"
8 #include "base/strings/stringprintf.h"
9 #include "third_party/hunspell/google/bdict.h"
13 // Represents one node the word trie in memory. This does not have to be very
14 // efficient since it is only used when building.
18 UNDEFINED
, // Uninitialized storage type.
19 LEAF
, // Word with no additional string data.
20 LEAFMORE
, // Word with additional suffix following.
21 LIST16
, // List of sub-nodes with 16-bit relative offsets.
22 LIST8
, // List of sub-nodes with 8-bit relative offsets.
23 LOOKUP32
, // Lookup table with 32-bit absolute offsets.
24 LOOKUP16
, // LOokup table with 16-bit relative offsets.
27 DicNode() : addition(0), storage(UNDEFINED
) {
31 for (size_t i
= 0; i
< children
.size(); i
++)
35 bool is_leaf() const { return children
.empty(); }
37 // When non-zero, this character is the additional level that this
38 // node represents. This will be 0 for some leaf nodes when there is no
39 // addition and for the root node.
42 std::vector
<DicNode
*> children
;
44 // When there are no children, this is a leaf node and this "addition string"
45 // is appended to the result. When there are children, this will be empty.
46 std::string leaf_addition
;
48 // For leaf nodes, this are the indices into the affix table.
49 std::vector
<int> affix_indices
;
51 // Initially uninitialized, ComputeStorage() will fill this in with the
52 // desired serialization method.
58 void SerializeTrie(const DicNode
* node
, std::string
* output
);
60 // Returns true if the nth character in the given word is |ch|. Will return
61 // false when there is no nth character. Note that this will also match an
62 // implicit NULL at the end of the string.
63 bool NthCharacterIs(const std::string
& word
, size_t n
, char ch
) {
64 if (word
.length() < n
) // Want to allow n == length() to catch the NULL.
66 return word
.c_str()[n
] == ch
; // Use c_str() to get NULL terminator.
69 // Recursively build the trie data structure for the range in the |words| list
70 // in [begin, end). It is assumed that all words in that range will have the
71 // same |node_depth - 2| characters at the beginning. This node will key off of
72 // the |node_depth - 1| character, with a special case for the root.
74 // |prefix_chars| is how deep this node is in the trie (and corresponds to how
75 // many letters of the word we will skip). The root level will have
76 // |prefix_chars| of 0.
78 // The given |node| will be filled with the data. The return value is the
79 // index into the |words| vector of the next word to process. It will be
80 // equal to |end| when all words have been consumed.
81 size_t BuildTrie(const BDictWriter::WordList
& words
,
82 size_t begin
, size_t end
,
83 size_t node_depth
, DicNode
* node
) {
84 // Find the prefix that this node represents.
85 const std::string
& begin_str
= words
[begin
].first
;
86 if (begin_str
.length() < node_depth
) {
89 node
->affix_indices
= words
[begin
].second
;
93 // Now find the range of words sharing this prefix.
95 if (node_depth
== 0 && begin
== 0) {
96 // Special case the root node.
97 match_count
= end
- begin
;
101 node
->addition
= begin_str
[node_depth
- 1];
102 // We know the strings should have [0, node_depth-1) characters at the
103 // beginning already matching, so we only need to check the new one.
104 while (begin
+ match_count
< end
&&
105 NthCharacterIs(words
[begin
+ match_count
].first
,
106 node_depth
- 1, node
->addition
))
110 if (match_count
== 1) {
111 // Just found a leaf node with no other words sharing its prefix. Save any
112 // remaining characters and we're done.
113 node
->affix_indices
= words
[begin
].second
;
114 node
->leaf_addition
= begin_str
.substr(node_depth
);
118 // We have a range of words, add them as children of this node.
120 while (i
< begin
+ match_count
) {
121 DicNode
* cur
= new DicNode
;
122 i
= BuildTrie(words
, i
, begin
+ match_count
, node_depth
+ 1, cur
);
123 node
->children
.push_back(cur
);
126 return begin
+ match_count
;
129 // Lookup tables are complicated. They can have a magic 0th entry not counted
130 // in the table dimensions, and also have indices only for the used sub-range.
131 // This function will compute the starting point and size of a lookup table,
132 // in addition to whether it should have the magic 0th entry for the given
133 // list of child nodes.
134 void ComputeLookupStrategyDetails(const std::vector
<DicNode
*>& children
,
138 *has_0th_entry
= false;
141 if (children
.empty())
144 size_t first_offset
= 0;
145 if (children
[0]->addition
== 0) {
146 *has_0th_entry
= true;
150 if (children
.size() == first_offset
)
153 *first_item
= static_cast<unsigned char>(children
[first_offset
]->addition
);
154 unsigned char last_item
= children
[children
.size() - 1]->addition
;
155 *list_size
= last_item
- *first_item
+ 1;
158 // Recursively fills in the storage strategy for this node and each of its
159 // children. This must be done before actually serializing because the storage
160 // mode will depend on the size of the children.
161 size_t ComputeTrieStorage(DicNode
* node
) {
162 if (node
->is_leaf()) {
163 // The additional affix list holds affixes when there is more than one. Each
164 // entry is two bytes, plus an additional FFFF terminator.
165 size_t supplimentary_size
= 0;
166 if (node
->affix_indices
[0] > BDict::LEAF_NODE_MAX_FIRST_AFFIX_ID
) {
167 // We cannot store the first affix ID of the affix list into a leaf node.
168 // In this case, we have to store all the affix IDs and a terminator
169 // into a supplimentary list.
170 supplimentary_size
= node
->affix_indices
.size() * 2 + 2;
171 } else if (node
->affix_indices
.size() > 1) {
172 // We can store the first affix ID of the affix list into a leaf node.
173 // In this case, we need to store the remaining affix IDs and a
174 // terminator into a supplimentary list.
175 supplimentary_size
= node
->affix_indices
.size() * 2;
178 if (node
->leaf_addition
.empty()) {
179 node
->storage
= DicNode::LEAF
;
180 return 2 + supplimentary_size
;
182 node
->storage
= DicNode::LEAFMORE
;
183 // Signature & affix (2) + null for leaf_addition (1) = 3
184 return 3 + node
->leaf_addition
.size() + supplimentary_size
;
187 // Recursively compute the size of the children for non-leaf nodes.
188 size_t child_size
= 0;
189 for (size_t i
= 0; i
< node
->children
.size(); i
++)
190 child_size
+= ComputeTrieStorage(node
->children
[i
]);
192 // Fixed size is only 1 byte which is the ID byte and the count combined.
193 static const int kListHeaderSize
= 1;
195 // Lists can only store up to 16 items.
196 static const size_t kListThreshold
= 16;
197 if (node
->children
.size() < kListThreshold
&& child_size
<= 0xFF) {
198 node
->storage
= DicNode::LIST8
;
199 return kListHeaderSize
+ node
->children
.size() * 2 + child_size
;
202 if (node
->children
.size() < kListThreshold
&& child_size
<= 0xFFFF) {
203 node
->storage
= DicNode::LIST16
;
204 // Fixed size is one byte plus 3 for each table entry.
205 return kListHeaderSize
+ node
->children
.size() * 3 + child_size
;
208 static const int kTableHeaderSize
= 2; // Type + table size.
211 int first_table_item
, table_item_count
;
212 ComputeLookupStrategyDetails(node
->children
, &has_0th_item
,
213 &first_table_item
, &table_item_count
);
214 if (child_size
+ kTableHeaderSize
+ (has_0th_item
? 2 : 0) +
215 table_item_count
* 2 < 0xFFFF) {
216 // Use 16-bit addressing since the children will fit.
217 node
->storage
= DicNode::LOOKUP16
;
218 return kTableHeaderSize
+ (has_0th_item
? 2 : 0) + table_item_count
* 2 +
222 // Use 32-bit addressing as a last resort.
223 node
->storage
= DicNode::LOOKUP32
;
224 return kTableHeaderSize
+ (has_0th_item
? 4 : 0) + table_item_count
* 4 +
228 // Serializes the given node when it is DicNode::LEAF* to the output.
229 void SerializeLeaf(const DicNode
* node
, std::string
* output
) {
230 // The low 6 bits of the ID byte are the high 6 bits of the first affix ID.
231 int first_affix
= node
->affix_indices
.size() ? node
->affix_indices
[0] : 0;
233 // We may store the first value with the node or in the supplimentary list.
234 size_t first_affix_in_supplimentary_list
= 1;
235 if (first_affix
> BDict::LEAF_NODE_MAX_FIRST_AFFIX_ID
) {
236 // There are not enough bits for this value, move it to the supplimentary
237 // list where there are more bits per value.
238 first_affix_in_supplimentary_list
= 0;
239 first_affix
= BDict::FIRST_AFFIX_IS_UNUSED
;
242 unsigned char id_byte
= (first_affix
>> 8) &
243 BDict::LEAF_NODE_FIRST_BYTE_AFFIX_MASK
;
245 // The next two bits indicates an additional string and more affixes.
246 if (node
->storage
== DicNode::LEAFMORE
)
247 id_byte
|= BDict::LEAF_NODE_ADDITIONAL_VALUE
;
248 if (node
->affix_indices
.size() > 1 || first_affix_in_supplimentary_list
== 0)
249 id_byte
|= BDict::LEAF_NODE_FOLLOWING_VALUE
;
250 output
->push_back(id_byte
);
252 // Following is the low 8 bits of the affix index.
253 output
->push_back(first_affix
& 0xff);
255 // Handle the optional addition with NULL terminator.
256 if (node
->storage
== DicNode::LEAFMORE
) {
257 for (size_t i
= 0; i
< node
->leaf_addition
.size() + 1; i
++)
258 output
->push_back(node
->leaf_addition
.c_str()[i
]);
261 // Handle any following affixes. We already wrote the 0th one.
262 if (node
->affix_indices
.size() > first_affix_in_supplimentary_list
) {
263 for (size_t i
= first_affix_in_supplimentary_list
;
264 i
< node
->affix_indices
.size() && i
< BDict::MAX_AFFIXES_PER_WORD
;
266 output
->push_back(static_cast<char>(node
->affix_indices
[i
] & 0xFF));
268 static_cast<char>((node
->affix_indices
[i
] >> 8) & 0xFF));
271 // Terminator for affix list. We use 0xFFFF.
272 output
->push_back(static_cast<unsigned char>(0xFF));
273 output
->push_back(static_cast<unsigned char>(0xFF));
277 // Serializes the given node when it is DicNode::LIST* to the output.
278 void SerializeList(const DicNode
* node
, std::string
* output
) {
279 bool is_8_bit
= node
->storage
== DicNode::LIST8
;
280 unsigned char id_byte
= BDict::LIST_NODE_TYPE_VALUE
|
281 (is_8_bit
? 0 : BDict::LIST_NODE_16BIT_VALUE
);
282 id_byte
|= node
->children
.size(); // We assume the size is < 4 bits.
283 output
->push_back(id_byte
);
285 // Reserve enough room for the lookup table (either 2 or 3 bytes per entry).
286 int bytes_per_entry
= (is_8_bit
? 2 : 3);
287 size_t table_begin
= output
->size();
288 output
->resize(output
->size() + node
->children
.size() * bytes_per_entry
);
289 size_t children_begin
= output
->size();
291 for (size_t i
= 0; i
< node
->children
.size(); i
++) {
292 // First is the character this entry represents.
293 (*output
)[table_begin
+ i
* bytes_per_entry
] = node
->children
[i
]->addition
;
295 // Next is the 8- or 16-bit offset.
296 size_t offset
= output
->size() - children_begin
;
298 DCHECK(offset
<= 0xFF);
299 (*output
)[table_begin
+ i
* bytes_per_entry
+ 1] =
300 static_cast<char>(offset
& 0xFF);
302 unsigned short* output16
= reinterpret_cast<unsigned short*>(
303 &(*output
)[table_begin
+ i
* bytes_per_entry
+ 1]);
304 *output16
= static_cast<unsigned short>(offset
);
307 // Now append the children's data.
308 SerializeTrie(node
->children
[i
], output
);
312 // Serializes the given node when it is DicNode::LOOKUP* to the output.
313 void SerializeLookup(const DicNode
* node
, std::string
* output
) {
314 unsigned char id_byte
= BDict::LOOKUP_NODE_TYPE_VALUE
;
317 int first_table_item
, table_item_count
;
318 ComputeLookupStrategyDetails(node
->children
, &has_0th_item
,
319 &first_table_item
, &table_item_count
);
321 // Set the extra bits in the ID byte.
322 bool is_32_bit
= (node
->storage
== DicNode::LOOKUP32
);
324 id_byte
|= BDict::LOOKUP_NODE_32BIT_VALUE
;
326 id_byte
|= BDict::LOOKUP_NODE_0TH_VALUE
;
328 size_t begin_offset
= output
->size();
330 output
->push_back(id_byte
);
331 output
->push_back(static_cast<char>(first_table_item
));
332 output
->push_back(static_cast<char>(table_item_count
));
334 // Save room for the lookup table and the optional 0th item.
335 int bytes_per_entry
= (is_32_bit
? 4 : 2);
336 size_t zeroth_item_offset
= output
->size();
338 output
->resize(output
->size() + bytes_per_entry
);
339 size_t table_begin
= output
->size();
340 output
->resize(output
->size() + table_item_count
* bytes_per_entry
);
342 // Append the children.
343 for (size_t i
= 0; i
< node
->children
.size(); i
++) {
344 size_t offset
= output
->size();
346 // Compute the location at which we'll store the offset of the child data.
347 // We may be writing the magic 0th item.
348 size_t offset_offset
;
349 if (i
== 0 && has_0th_item
) {
350 offset_offset
= zeroth_item_offset
;
352 int table_index
= static_cast<unsigned char>(node
->children
[i
]->addition
) - first_table_item
;
353 offset_offset
= table_begin
+ table_index
* bytes_per_entry
;
358 // Use 32-bit absolute offsets.
359 // FIXME(brettw) use bit cast.
360 unsigned* offset32
= reinterpret_cast<unsigned*>(&(*output
)[offset_offset
]);
361 *offset32
= static_cast<unsigned>(output
->size());
363 // Use 16-bit relative offsets.
364 unsigned short* offset16
= reinterpret_cast<unsigned short*>(&(*output
)[offset_offset
]);
365 *offset16
= static_cast<unsigned short>(output
->size() - begin_offset
);
368 SerializeTrie(node
->children
[i
], output
);
372 // Recursively serializes this node and all of its children to the output.
373 void SerializeTrie(const DicNode
* node
, std::string
* output
) {
374 if (node
->storage
== DicNode::LEAF
||
375 node
->storage
== DicNode::LEAFMORE
) {
376 SerializeLeaf(node
, output
);
377 } else if (node
->storage
== DicNode::LIST8
||
378 node
->storage
== DicNode::LIST16
) {
379 SerializeList(node
, output
);
380 } else if (node
->storage
== DicNode::LOOKUP16
||
381 node
->storage
== DicNode::LOOKUP32
) {
382 SerializeLookup(node
, output
);
386 // Serializes the given list of strings with 0 bytes separating them. The end
387 // will be marked by a double-0.
388 void SerializeStringListNullTerm(const std::vector
<std::string
>& strings
,
389 std::string
* output
) {
390 for (size_t i
= 0; i
< strings
.size(); i
++) {
391 // Can't tolerate empty strings since the'll mark the end.
392 if (strings
[i
].empty())
393 output
->push_back(' ');
395 output
->append(strings
[i
]);
396 output
->push_back(0);
398 output
->push_back(0);
401 void SerializeReplacements(
402 const std::vector
< std::pair
<std::string
, std::string
> >& repl
,
403 std::string
* output
) {
404 for (size_t i
= 0; i
< repl
.size(); i
++) {
405 output
->append(repl
[i
].first
);
406 output
->push_back(0);
407 output
->append(repl
[i
].second
);
408 output
->push_back(0);
410 output
->push_back(0);
415 BDictWriter::BDictWriter() : trie_root_(NULL
) {
418 BDictWriter::~BDictWriter() {
422 void BDictWriter::SetWords(const WordList
& words
) {
423 trie_root_
= new DicNode
;
424 BuildTrie(words
, 0, words
.size(), 0, trie_root_
);
427 std::string
BDictWriter::GetBDict() const {
430 // Save room for the header. This will be populated at the end.
431 ret
.resize(sizeof(hunspell::BDict::Header
));
433 // Serialize the affix portion.
434 size_t aff_offset
= ret
.size();
437 // Serialize the dictionary words.
438 size_t dic_offset
= ret
.size();
439 ret
.reserve(ret
.size() + ComputeTrieStorage(trie_root_
));
440 SerializeTrie(trie_root_
, &ret
);
442 // Fill the header last, now that we have the data.
443 hunspell::BDict::Header
* header
=
444 reinterpret_cast<hunspell::BDict::Header
*>(&ret
[0]);
445 header
->signature
= hunspell::BDict::SIGNATURE
;
446 header
->major_version
= hunspell::BDict::MAJOR_VERSION
;
447 header
->minor_version
= hunspell::BDict::MINOR_VERSION
;
448 header
->aff_offset
= static_cast<uint32
>(aff_offset
);
449 header
->dic_offset
= static_cast<uint32
>(dic_offset
);
451 // Write the MD5 digest of the affix information and the dictionary words at
452 // the end of the BDic header.
453 if (header
->major_version
>= 2)
454 base::MD5Sum(&ret
[aff_offset
], ret
.size() - aff_offset
, &header
->digest
);
459 void BDictWriter::SerializeAff(std::string
* output
) const {
460 // Reserve enough room for the header.
461 size_t header_offset
= output
->size();
462 output
->resize(output
->size() + sizeof(hunspell::BDict::AffHeader
));
464 // Write the comment.
465 output
->push_back('\n');
466 output
->append(comment_
);
467 output
->push_back('\n');
469 // We need a magic first AF line that lists the number of following ones.
470 size_t affix_group_offset
= output
->size();
471 output
->append(base::StringPrintf("AF %d",
472 static_cast<int>(affix_groups_
.size())));
473 output
->push_back(0);
474 SerializeStringListNullTerm(affix_groups_
, output
);
476 size_t affix_rule_offset
= output
->size();
477 SerializeStringListNullTerm(affix_rules_
, output
);
479 size_t rep_offset
= output
->size();
480 SerializeReplacements(replacements_
, output
);
482 size_t other_offset
= output
->size();
483 SerializeStringListNullTerm(other_commands_
, output
);
485 // Add the header now that we know the offsets.
486 hunspell::BDict::AffHeader
* header
=
487 reinterpret_cast<hunspell::BDict::AffHeader
*>(&(*output
)[header_offset
]);
488 header
->affix_group_offset
= static_cast<uint32
>(affix_group_offset
);
489 header
->affix_rule_offset
= static_cast<uint32
>(affix_rule_offset
);
490 header
->rep_offset
= static_cast<uint32
>(rep_offset
);
491 header
->other_offset
= static_cast<uint32
>(other_offset
);
494 } // namespace hunspell