Update mojo sdk to rev 1dc8a9a5db73d3718d99917fadf31f5fb2ebad4f
[chromium-blink-merge.git] / third_party / hunspell / google / bdict_writer.cc
blobcd39f101fdf136dca7999b34264ef8637ee5cabf
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "third_party/hunspell/google/bdict_writer.h"
7 #include "base/logging.h"
8 #include "base/strings/stringprintf.h"
9 #include "third_party/hunspell/google/bdict.h"
11 namespace hunspell {
13 // Represents one node the word trie in memory. This does not have to be very
14 // efficient since it is only used when building.
15 class DicNode {
16 public:
17 enum StorageType {
18 UNDEFINED, // Uninitialized storage type.
19 LEAF, // Word with no additional string data.
20 LEAFMORE, // Word with additional suffix following.
21 LIST16, // List of sub-nodes with 16-bit relative offsets.
22 LIST8, // List of sub-nodes with 8-bit relative offsets.
23 LOOKUP32, // Lookup table with 32-bit absolute offsets.
24 LOOKUP16, // LOokup table with 16-bit relative offsets.
27 DicNode() : addition(0), storage(UNDEFINED) {
30 ~DicNode() {
31 for (size_t i = 0; i < children.size(); i++)
32 delete children[i];
35 bool is_leaf() const { return children.empty(); }
37 // When non-zero, this character is the additional level that this
38 // node represents. This will be 0 for some leaf nodes when there is no
39 // addition and for the root node.
40 char addition;
42 std::vector<DicNode*> children;
44 // When there are no children, this is a leaf node and this "addition string"
45 // is appended to the result. When there are children, this will be empty.
46 std::string leaf_addition;
48 // For leaf nodes, this are the indices into the affix table.
49 std::vector<int> affix_indices;
51 // Initially uninitialized, ComputeStorage() will fill this in with the
52 // desired serialization method.
53 StorageType storage;
56 namespace {
58 void SerializeTrie(const DicNode* node, std::string* output);
60 // Returns true if the nth character in the given word is |ch|. Will return
61 // false when there is no nth character. Note that this will also match an
62 // implicit NULL at the end of the string.
63 bool NthCharacterIs(const std::string& word, size_t n, char ch) {
64 if (word.length() < n) // Want to allow n == length() to catch the NULL.
65 return false;
66 return word.c_str()[n] == ch; // Use c_str() to get NULL terminator.
69 // Recursively build the trie data structure for the range in the |words| list
70 // in [begin, end). It is assumed that all words in that range will have the
71 // same |node_depth - 2| characters at the beginning. This node will key off of
72 // the |node_depth - 1| character, with a special case for the root.
74 // |prefix_chars| is how deep this node is in the trie (and corresponds to how
75 // many letters of the word we will skip). The root level will have
76 // |prefix_chars| of 0.
78 // The given |node| will be filled with the data. The return value is the
79 // index into the |words| vector of the next word to process. It will be
80 // equal to |end| when all words have been consumed.
81 size_t BuildTrie(const BDictWriter::WordList& words,
82 size_t begin, size_t end,
83 size_t node_depth, DicNode* node) {
84 // Find the prefix that this node represents.
85 const std::string& begin_str = words[begin].first;
86 if (begin_str.length() < node_depth) {
87 // Singleton.
88 node->addition = 0;
89 node->affix_indices = words[begin].second;
90 return begin + 1;
93 // Now find the range of words sharing this prefix.
94 size_t match_count;
95 if (node_depth == 0 && begin == 0) {
96 // Special case the root node.
97 match_count = end - begin;
98 node->addition = 0;
99 } else {
100 match_count = 0;
101 node->addition = begin_str[node_depth - 1];
102 // We know the strings should have [0, node_depth-1) characters at the
103 // beginning already matching, so we only need to check the new one.
104 while (begin + match_count < end &&
105 NthCharacterIs(words[begin + match_count].first,
106 node_depth - 1, node->addition))
107 match_count++;
110 if (match_count == 1) {
111 // Just found a leaf node with no other words sharing its prefix. Save any
112 // remaining characters and we're done.
113 node->affix_indices = words[begin].second;
114 node->leaf_addition = begin_str.substr(node_depth);
115 return begin + 1;
118 // We have a range of words, add them as children of this node.
119 size_t i = begin;
120 while (i < begin + match_count) {
121 DicNode* cur = new DicNode;
122 i = BuildTrie(words, i, begin + match_count, node_depth + 1, cur);
123 node->children.push_back(cur);
126 return begin + match_count;
129 // Lookup tables are complicated. They can have a magic 0th entry not counted
130 // in the table dimensions, and also have indices only for the used sub-range.
131 // This function will compute the starting point and size of a lookup table,
132 // in addition to whether it should have the magic 0th entry for the given
133 // list of child nodes.
134 void ComputeLookupStrategyDetails(const std::vector<DicNode*>& children,
135 bool* has_0th_entry,
136 int* first_item,
137 int* list_size) {
138 *has_0th_entry = false;
139 *first_item = 0;
140 *list_size = 0;
141 if (children.empty())
142 return;
144 size_t first_offset = 0;
145 if (children[0]->addition == 0) {
146 *has_0th_entry = true;
147 first_offset++;
150 if (children.size() == first_offset)
151 return;
153 *first_item = static_cast<unsigned char>(children[first_offset]->addition);
154 unsigned char last_item = children[children.size() - 1]->addition;
155 *list_size = last_item - *first_item + 1;
158 // Recursively fills in the storage strategy for this node and each of its
159 // children. This must be done before actually serializing because the storage
160 // mode will depend on the size of the children.
161 size_t ComputeTrieStorage(DicNode* node) {
162 if (node->is_leaf()) {
163 // The additional affix list holds affixes when there is more than one. Each
164 // entry is two bytes, plus an additional FFFF terminator.
165 size_t supplimentary_size = 0;
166 if (node->affix_indices[0] > BDict::LEAF_NODE_MAX_FIRST_AFFIX_ID) {
167 // We cannot store the first affix ID of the affix list into a leaf node.
168 // In this case, we have to store all the affix IDs and a terminator
169 // into a supplimentary list.
170 supplimentary_size = node->affix_indices.size() * 2 + 2;
171 } else if (node->affix_indices.size() > 1) {
172 // We can store the first affix ID of the affix list into a leaf node.
173 // In this case, we need to store the remaining affix IDs and a
174 // terminator into a supplimentary list.
175 supplimentary_size = node->affix_indices.size() * 2;
178 if (node->leaf_addition.empty()) {
179 node->storage = DicNode::LEAF;
180 return 2 + supplimentary_size;
182 node->storage = DicNode::LEAFMORE;
183 // Signature & affix (2) + null for leaf_addition (1) = 3
184 return 3 + node->leaf_addition.size() + supplimentary_size;
187 // Recursively compute the size of the children for non-leaf nodes.
188 size_t child_size = 0;
189 for (size_t i = 0; i < node->children.size(); i++)
190 child_size += ComputeTrieStorage(node->children[i]);
192 // Fixed size is only 1 byte which is the ID byte and the count combined.
193 static const int kListHeaderSize = 1;
195 // Lists can only store up to 16 items.
196 static const size_t kListThreshold = 16;
197 if (node->children.size() < kListThreshold && child_size <= 0xFF) {
198 node->storage = DicNode::LIST8;
199 return kListHeaderSize + node->children.size() * 2 + child_size;
202 if (node->children.size() < kListThreshold && child_size <= 0xFFFF) {
203 node->storage = DicNode::LIST16;
204 // Fixed size is one byte plus 3 for each table entry.
205 return kListHeaderSize + node->children.size() * 3 + child_size;
208 static const int kTableHeaderSize = 2; // Type + table size.
210 bool has_0th_item;
211 int first_table_item, table_item_count;
212 ComputeLookupStrategyDetails(node->children, &has_0th_item,
213 &first_table_item, &table_item_count);
214 if (child_size + kTableHeaderSize + (has_0th_item ? 2 : 0) +
215 table_item_count * 2 < 0xFFFF) {
216 // Use 16-bit addressing since the children will fit.
217 node->storage = DicNode::LOOKUP16;
218 return kTableHeaderSize + (has_0th_item ? 2 : 0) + table_item_count * 2 +
219 child_size;
222 // Use 32-bit addressing as a last resort.
223 node->storage = DicNode::LOOKUP32;
224 return kTableHeaderSize + (has_0th_item ? 4 : 0) + table_item_count * 4 +
225 child_size;
228 // Serializes the given node when it is DicNode::LEAF* to the output.
229 void SerializeLeaf(const DicNode* node, std::string* output) {
230 // The low 6 bits of the ID byte are the high 6 bits of the first affix ID.
231 int first_affix = node->affix_indices.size() ? node->affix_indices[0] : 0;
233 // We may store the first value with the node or in the supplimentary list.
234 size_t first_affix_in_supplimentary_list = 1;
235 if (first_affix > BDict::LEAF_NODE_MAX_FIRST_AFFIX_ID) {
236 // There are not enough bits for this value, move it to the supplimentary
237 // list where there are more bits per value.
238 first_affix_in_supplimentary_list = 0;
239 first_affix = BDict::FIRST_AFFIX_IS_UNUSED;
242 unsigned char id_byte = (first_affix >> 8) &
243 BDict::LEAF_NODE_FIRST_BYTE_AFFIX_MASK;
245 // The next two bits indicates an additional string and more affixes.
246 if (node->storage == DicNode::LEAFMORE)
247 id_byte |= BDict::LEAF_NODE_ADDITIONAL_VALUE;
248 if (node->affix_indices.size() > 1 || first_affix_in_supplimentary_list == 0)
249 id_byte |= BDict::LEAF_NODE_FOLLOWING_VALUE;
250 output->push_back(id_byte);
252 // Following is the low 8 bits of the affix index.
253 output->push_back(first_affix & 0xff);
255 // Handle the optional addition with NULL terminator.
256 if (node->storage == DicNode::LEAFMORE) {
257 for (size_t i = 0; i < node->leaf_addition.size() + 1; i++)
258 output->push_back(node->leaf_addition.c_str()[i]);
261 // Handle any following affixes. We already wrote the 0th one.
262 if (node->affix_indices.size() > first_affix_in_supplimentary_list) {
263 for (size_t i = first_affix_in_supplimentary_list;
264 i < node->affix_indices.size() && i < BDict::MAX_AFFIXES_PER_WORD;
265 i++) {
266 output->push_back(static_cast<char>(node->affix_indices[i] & 0xFF));
267 output->push_back(
268 static_cast<char>((node->affix_indices[i] >> 8) & 0xFF));
271 // Terminator for affix list. We use 0xFFFF.
272 output->push_back(static_cast<unsigned char>(0xFF));
273 output->push_back(static_cast<unsigned char>(0xFF));
277 // Serializes the given node when it is DicNode::LIST* to the output.
278 void SerializeList(const DicNode* node, std::string* output) {
279 bool is_8_bit = node->storage == DicNode::LIST8;
280 unsigned char id_byte = BDict::LIST_NODE_TYPE_VALUE |
281 (is_8_bit ? 0 : BDict::LIST_NODE_16BIT_VALUE);
282 id_byte |= node->children.size(); // We assume the size is < 4 bits.
283 output->push_back(id_byte);
285 // Reserve enough room for the lookup table (either 2 or 3 bytes per entry).
286 int bytes_per_entry = (is_8_bit ? 2 : 3);
287 size_t table_begin = output->size();
288 output->resize(output->size() + node->children.size() * bytes_per_entry);
289 size_t children_begin = output->size();
291 for (size_t i = 0; i < node->children.size(); i++) {
292 // First is the character this entry represents.
293 (*output)[table_begin + i * bytes_per_entry] = node->children[i]->addition;
295 // Next is the 8- or 16-bit offset.
296 size_t offset = output->size() - children_begin;
297 if (is_8_bit) {
298 DCHECK(offset <= 0xFF);
299 (*output)[table_begin + i * bytes_per_entry + 1] =
300 static_cast<char>(offset & 0xFF);
301 } else {
302 unsigned short* output16 = reinterpret_cast<unsigned short*>(
303 &(*output)[table_begin + i * bytes_per_entry + 1]);
304 *output16 = static_cast<unsigned short>(offset);
307 // Now append the children's data.
308 SerializeTrie(node->children[i], output);
312 // Serializes the given node when it is DicNode::LOOKUP* to the output.
313 void SerializeLookup(const DicNode* node, std::string* output) {
314 unsigned char id_byte = BDict::LOOKUP_NODE_TYPE_VALUE;
316 bool has_0th_item;
317 int first_table_item, table_item_count;
318 ComputeLookupStrategyDetails(node->children, &has_0th_item,
319 &first_table_item, &table_item_count);
321 // Set the extra bits in the ID byte.
322 bool is_32_bit = (node->storage == DicNode::LOOKUP32);
323 if (is_32_bit)
324 id_byte |= BDict::LOOKUP_NODE_32BIT_VALUE;
325 if (has_0th_item)
326 id_byte |= BDict::LOOKUP_NODE_0TH_VALUE;
328 size_t begin_offset = output->size();
330 output->push_back(id_byte);
331 output->push_back(static_cast<char>(first_table_item));
332 output->push_back(static_cast<char>(table_item_count));
334 // Save room for the lookup table and the optional 0th item.
335 int bytes_per_entry = (is_32_bit ? 4 : 2);
336 size_t zeroth_item_offset = output->size();
337 if (has_0th_item)
338 output->resize(output->size() + bytes_per_entry);
339 size_t table_begin = output->size();
340 output->resize(output->size() + table_item_count * bytes_per_entry);
342 // Append the children.
343 for (size_t i = 0; i < node->children.size(); i++) {
344 size_t offset = output->size();
346 // Compute the location at which we'll store the offset of the child data.
347 // We may be writing the magic 0th item.
348 size_t offset_offset;
349 if (i == 0 && has_0th_item) {
350 offset_offset = zeroth_item_offset;
351 } else {
352 int table_index = static_cast<unsigned char>(node->children[i]->addition) - first_table_item;
353 offset_offset = table_begin + table_index * bytes_per_entry;
356 // Write the offset.
357 if (is_32_bit) {
358 // Use 32-bit absolute offsets.
359 // FIXME(brettw) use bit cast.
360 unsigned* offset32 = reinterpret_cast<unsigned*>(&(*output)[offset_offset]);
361 *offset32 = static_cast<unsigned>(output->size());
362 } else {
363 // Use 16-bit relative offsets.
364 unsigned short* offset16 = reinterpret_cast<unsigned short*>(&(*output)[offset_offset]);
365 *offset16 = static_cast<unsigned short>(output->size() - begin_offset);
368 SerializeTrie(node->children[i], output);
372 // Recursively serializes this node and all of its children to the output.
373 void SerializeTrie(const DicNode* node, std::string* output) {
374 if (node->storage == DicNode::LEAF ||
375 node->storage == DicNode::LEAFMORE) {
376 SerializeLeaf(node, output);
377 } else if (node->storage == DicNode::LIST8 ||
378 node->storage == DicNode::LIST16) {
379 SerializeList(node, output);
380 } else if (node->storage == DicNode::LOOKUP16 ||
381 node->storage == DicNode::LOOKUP32) {
382 SerializeLookup(node, output);
386 void SerializeStringList(const std::vector<std::string>& list,
387 std::string* output) {
388 for (size_t i = 0; i < list.size(); i++) {
389 if (i != 0)
390 output->push_back('\n');
391 output->append(list[i]);
393 output->push_back(0);
397 // Appends the given uint32 to the given string.
398 void AppendUint32(uint32 a, std::string* output) {
399 size_t offset = output->size();
400 output->resize(offset + 4);
401 memcpy(&(*output)[offset], &a, sizeof(uint32));
404 // Serializes the given list of strings with 0 bytes separating them. The end
405 // will be marked by a double-0.
406 void SerializeStringListNullTerm(const std::vector<std::string>& strings,
407 std::string* output) {
408 for (size_t i = 0; i < strings.size(); i++) {
409 // Can't tolerate empty strings since the'll mark the end.
410 if (strings[i].empty())
411 output->push_back(' ');
412 else
413 output->append(strings[i]);
414 output->push_back(0);
416 output->push_back(0);
419 void SerializeReplacements(
420 const std::vector< std::pair<std::string, std::string> >& repl,
421 std::string* output) {
422 for (size_t i = 0; i < repl.size(); i++) {
423 output->append(repl[i].first);
424 output->push_back(0);
425 output->append(repl[i].second);
426 output->push_back(0);
428 output->push_back(0);
431 } // namespace
433 BDictWriter::BDictWriter() : trie_root_(NULL) {
436 BDictWriter::~BDictWriter() {
437 delete trie_root_;
440 void BDictWriter::SetWords(const WordList& words) {
441 trie_root_ = new DicNode;
442 BuildTrie(words, 0, words.size(), 0, trie_root_);
445 std::string BDictWriter::GetBDict() const {
446 std::string ret;
448 // Save room for the header. This will be populated at the end.
449 ret.resize(sizeof(hunspell::BDict::Header));
451 // Serialize the affix portion.
452 size_t aff_offset = ret.size();
453 SerializeAff(&ret);
455 // Serialize the dictionary words.
456 size_t dic_offset = ret.size();
457 ret.reserve(ret.size() + ComputeTrieStorage(trie_root_));
458 SerializeTrie(trie_root_, &ret);
460 // Fill the header last, now that we have the data.
461 hunspell::BDict::Header* header =
462 reinterpret_cast<hunspell::BDict::Header*>(&ret[0]);
463 header->signature = hunspell::BDict::SIGNATURE;
464 header->major_version = hunspell::BDict::MAJOR_VERSION;
465 header->minor_version = hunspell::BDict::MINOR_VERSION;
466 header->aff_offset = static_cast<uint32>(aff_offset);
467 header->dic_offset = static_cast<uint32>(dic_offset);
469 // Write the MD5 digest of the affix information and the dictionary words at
470 // the end of the BDic header.
471 if (header->major_version >= 2)
472 base::MD5Sum(&ret[aff_offset], ret.size() - aff_offset, &header->digest);
474 return ret;
477 void BDictWriter::SerializeAff(std::string* output) const {
478 // Reserve enough room for the header.
479 size_t header_offset = output->size();
480 output->resize(output->size() + sizeof(hunspell::BDict::AffHeader));
482 // Write the comment.
483 output->push_back('\n');
484 output->append(comment_);
485 output->push_back('\n');
487 // We need a magic first AF line that lists the number of following ones.
488 size_t affix_group_offset = output->size();
489 output->append(base::StringPrintf("AF %d",
490 static_cast<int>(affix_groups_.size())));
491 output->push_back(0);
492 SerializeStringListNullTerm(affix_groups_, output);
494 size_t affix_rule_offset = output->size();
495 SerializeStringListNullTerm(affix_rules_, output);
497 size_t rep_offset = output->size();
498 SerializeReplacements(replacements_, output);
500 size_t other_offset = output->size();
501 SerializeStringListNullTerm(other_commands_, output);
503 // Add the header now that we know the offsets.
504 hunspell::BDict::AffHeader* header =
505 reinterpret_cast<hunspell::BDict::AffHeader*>(&(*output)[header_offset]);
506 header->affix_group_offset = static_cast<uint32>(affix_group_offset);
507 header->affix_rule_offset = static_cast<uint32>(affix_rule_offset);
508 header->rep_offset = static_cast<uint32>(rep_offset);
509 header->other_offset = static_cast<uint32>(other_offset);
512 } // namespace hunspell