third_party/hunspell/google/bdict_writer.cc

   1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "third_party/hunspell/google/bdict_writer.h"
   6
   7 #include "base/logging.h"
   8 #include "base/strings/stringprintf.h"
   9 #include "third_party/hunspell/google/bdict.h"
  10
  11 namespace hunspell {
  12
  13 // Represents one node the word trie in memory. This does not have to be very
  14 // efficient since it is only used when building.
  15 class DicNode {
  16  public:
  17   enum StorageType {
  18     UNDEFINED,  // Uninitialized storage type.
  19     LEAF,       // Word with no additional string data.
  20     LEAFMORE,   // Word with additional suffix following.
  21     LIST16,     // List of sub-nodes with 16-bit relative offsets.
  22     LIST8,      // List of sub-nodes with 8-bit relative offsets.
  23     LOOKUP32,   // Lookup table with 32-bit absolute offsets.
  24     LOOKUP16,   // LOokup table with 16-bit relative offsets.
  25   };
  26
  27   DicNode() : addition(0), storage(UNDEFINED) {
  28   }
  29
  30   ~DicNode() {
  31     for (size_t i = 0; i < children.size(); i++)
  32       delete children[i];
  33   }
  34
  35   bool is_leaf() const { return children.empty(); }
  36
  37   // When non-zero, this character is the additional level that this
  38   // node represents. This will be 0 for some leaf nodes when there is no
  39   // addition and for the root node.
  40   char addition;
  41
  42   std::vector<DicNode*> children;
  43
  44   // When there are no children, this is a leaf node and this "addition string"
  45   // is appended to the result. When there are children, this will be empty.
  46   std::string leaf_addition;
  47
  48   // For leaf nodes, this are the indices into the affix table.
  49   std::vector<int> affix_indices;
  50
  51   // Initially uninitialized, ComputeStorage() will fill this in with the
  52   // desired serialization method.
  53   StorageType storage;
  54 };
  55
  56 namespace {
  57
  58 void SerializeTrie(const DicNode* node, std::string* output);
  59
  60 // Returns true if the nth character in the given word is |ch|. Will return
  61 // false when there is no nth character. Note that this will also match an
  62 // implicit NULL at the end of the string.
  63 bool NthCharacterIs(const std::string& word, size_t n, char ch) {
  64   if (word.length() < n)  // Want to allow n == length() to catch the NULL.
  65     return false;
  66   return word.c_str()[n] == ch;  // Use c_str() to get NULL terminator.
  67 }
  68
  69 // Recursively build the trie data structure for the range in the |words| list
  70 // in [begin, end). It is assumed that all words in that range will have the
  71 // same |node_depth - 2| characters at the beginning. This node will key off of
  72 // the |node_depth - 1| character, with a special case for the root.
  73 //
  74 // |prefix_chars| is how deep this node is in the trie (and corresponds to how
  75 // many letters of the word we will skip). The root level will have
  76 // |prefix_chars| of 0.
  77 //
  78 // The given |node| will be filled with the data. The return value is the
  79 // index into the |words| vector of the next word to process. It will be
  80 // equal to |end| when all words have been consumed.
  81 size_t BuildTrie(const BDictWriter::WordList& words,
  82                  size_t begin, size_t end,
  83                  size_t node_depth, DicNode* node) {
  84   // Find the prefix that this node represents.
  85   const std::string& begin_str = words[begin].first;
  86   if (begin_str.length() < node_depth) {
  87     // Singleton.
  88     node->addition = 0;
  89     node->affix_indices = words[begin].second;
  90     return begin + 1;
  91   }
  92
  93   // Now find the range of words sharing this prefix.
  94   size_t match_count;
  95   if (node_depth == 0 && begin == 0) {
  96     // Special case the root node.
  97     match_count = end - begin;
  98     node->addition = 0;
  99   } else {
 100     match_count = 0;
 101     node->addition = begin_str[node_depth - 1];
 102     // We know the strings should have [0, node_depth-1) characters at the
 103     // beginning already matching, so we only need to check the new one.
 104     while (begin + match_count < end &&
 105            NthCharacterIs(words[begin + match_count].first,
 106                           node_depth - 1, node->addition))
 107       match_count++;
 108   }
 109
 110   if (match_count == 1) {
 111     // Just found a leaf node with no other words sharing its prefix. Save any
 112     // remaining characters and we're done.
 113     node->affix_indices = words[begin].second;
 114     node->leaf_addition = begin_str.substr(node_depth);
 115     return begin + 1;
 116   }
 117
 118   // We have a range of words, add them as children of this node.
 119   size_t i = begin;
 120   while (i < begin + match_count) {
 121     DicNode* cur = new DicNode;
 122     i = BuildTrie(words, i, begin + match_count, node_depth + 1, cur);
 123     node->children.push_back(cur);
 124   }
 125
 126   return begin + match_count;
 127 }
 128
 129 // Lookup tables are complicated. They can have a magic 0th entry not counted
 130 // in the table dimensions, and also have indices only for the used sub-range.
 131 // This function will compute the starting point and size of a lookup table,
 132 // in addition to whether it should have the magic 0th entry for the given
 133 // list of child nodes.
 134 void ComputeLookupStrategyDetails(const std::vector<DicNode*>& children,
 135                                   bool* has_0th_entry,
 136                                   int* first_item,
 137                                   int* list_size) {
 138   *has_0th_entry = false;
 139   *first_item = 0;
 140   *list_size = 0;
 141   if (children.empty())
 142     return;
 143
 144   size_t first_offset = 0;
 145   if (children[0]->addition == 0) {
 146     *has_0th_entry = true;
 147     first_offset++;
 148   }
 149
 150   if (children.size() == first_offset)
 151     return;
 152
 153   *first_item = static_cast<unsigned char>(children[first_offset]->addition);
 154   unsigned char last_item = children[children.size() - 1]->addition;
 155   *list_size = last_item - *first_item + 1;
 156 }
 157
 158 // Recursively fills in the storage strategy for this node and each of its
 159 // children. This must be done before actually serializing because the storage
 160 // mode will depend on the size of the children.
 161 size_t ComputeTrieStorage(DicNode* node) {
 162   if (node->is_leaf()) {
 163     // The additional affix list holds affixes when there is more than one. Each
 164     // entry is two bytes, plus an additional FFFF terminator.
 165     size_t supplimentary_size = 0;
 166     if (node->affix_indices[0] > BDict::LEAF_NODE_MAX_FIRST_AFFIX_ID) {
 167       // We cannot store the first affix ID of the affix list into a leaf node.
 168       // In this case, we have to store all the affix IDs and a terminator
 169       // into a supplimentary list.
 170       supplimentary_size = node->affix_indices.size() * 2 + 2;
 171     } else if (node->affix_indices.size() > 1) {
 172       // We can store the first affix ID of the affix list into a leaf node.
 173       // In this case, we need to store the remaining affix IDs and a
 174       // terminator into a supplimentary list.
 175       supplimentary_size = node->affix_indices.size() * 2;
 176     }
 177
 178     if (node->leaf_addition.empty()) {
 179       node->storage = DicNode::LEAF;
 180       return 2 + supplimentary_size;
 181     }
 182     node->storage = DicNode::LEAFMORE;
 183     // Signature & affix (2) + null for leaf_addition (1) = 3
 184     return 3 + node->leaf_addition.size() + supplimentary_size;
 185   }
 186
 187   // Recursively compute the size of the children for non-leaf nodes.
 188   size_t child_size = 0;
 189   for (size_t i = 0; i < node->children.size(); i++)
 190     child_size += ComputeTrieStorage(node->children[i]);
 191
 192   // Fixed size is only 1 byte which is the ID byte and the count combined.
 193   static const int kListHeaderSize = 1;
 194
 195   // Lists can only store up to 16 items.
 196   static const size_t kListThreshold = 16;
 197   if (node->children.size() < kListThreshold && child_size <= 0xFF) {
 198     node->storage = DicNode::LIST8;
 199     return kListHeaderSize + node->children.size() * 2 + child_size;
 200   }
 201
 202   if (node->children.size() < kListThreshold && child_size <= 0xFFFF) {
 203     node->storage = DicNode::LIST16;
 204     // Fixed size is one byte plus 3 for each table entry.
 205     return kListHeaderSize + node->children.size() * 3 + child_size;
 206   }
 207
 208   static const int kTableHeaderSize = 2;  // Type + table size.
 209
 210   bool has_0th_item;
 211   int first_table_item, table_item_count;
 212   ComputeLookupStrategyDetails(node->children, &has_0th_item,
 213                                &first_table_item, &table_item_count);
 214   if (child_size + kTableHeaderSize + (has_0th_item ? 2 : 0) +
 215       table_item_count * 2 < 0xFFFF) {
 216     // Use 16-bit addressing since the children will fit.
 217     node->storage = DicNode::LOOKUP16;
 218     return kTableHeaderSize + (has_0th_item ? 2 : 0) + table_item_count * 2 +
 219         child_size;
 220   }
 221
 222   // Use 32-bit addressing as a last resort.
 223   node->storage = DicNode::LOOKUP32;
 224   return kTableHeaderSize + (has_0th_item ? 4 : 0) + table_item_count * 4 +
 225       child_size;
 226 }
 227
 228 // Serializes the given node when it is DicNode::LEAF* to the output.
 229 void SerializeLeaf(const DicNode* node, std::string* output) {
 230   // The low 6 bits of the ID byte are the high 6 bits of the first affix ID.
 231   int first_affix = node->affix_indices.size() ? node->affix_indices[0] : 0;
 232
 233   // We may store the first value with the node or in the supplimentary list.
 234   size_t first_affix_in_supplimentary_list = 1;
 235   if (first_affix > BDict::LEAF_NODE_MAX_FIRST_AFFIX_ID) {
 236     // There are not enough bits for this value, move it to the supplimentary
 237     // list where there are more bits per value.
 238     first_affix_in_supplimentary_list = 0;
 239     first_affix = BDict::FIRST_AFFIX_IS_UNUSED;
 240   }
 241
 242   unsigned char id_byte = (first_affix >> 8) &
 243       BDict::LEAF_NODE_FIRST_BYTE_AFFIX_MASK;
 244
 245   // The next two bits indicates an additional string and more affixes.
 246   if (node->storage == DicNode::LEAFMORE)
 247     id_byte |= BDict::LEAF_NODE_ADDITIONAL_VALUE;
 248   if (node->affix_indices.size() > 1 || first_affix_in_supplimentary_list == 0)
 249     id_byte |= BDict::LEAF_NODE_FOLLOWING_VALUE;
 250   output->push_back(id_byte);
 251
 252   // Following is the low 8 bits of the affix index.
 253   output->push_back(first_affix & 0xff);
 254
 255   // Handle the optional addition with NULL terminator.
 256   if (node->storage == DicNode::LEAFMORE) {
 257     for (size_t i = 0; i < node->leaf_addition.size() + 1; i++)
 258       output->push_back(node->leaf_addition.c_str()[i]);
 259   }
 260
 261   // Handle any following affixes. We already wrote the 0th one.
 262   if (node->affix_indices.size() > first_affix_in_supplimentary_list) {
 263     for (size_t i = first_affix_in_supplimentary_list;
 264          i < node->affix_indices.size() && i < BDict::MAX_AFFIXES_PER_WORD;
 265          i++) {
 266       output->push_back(static_cast<char>(node->affix_indices[i] & 0xFF));
 267       output->push_back(
 268           static_cast<char>((node->affix_indices[i] >> 8) & 0xFF));
 269     }
 270
 271     // Terminator for affix list. We use 0xFFFF.
 272     output->push_back(static_cast<unsigned char>(0xFF));
 273     output->push_back(static_cast<unsigned char>(0xFF));
 274   }
 275 }
 276
 277 // Serializes the given node when it is DicNode::LIST* to the output.
 278 void SerializeList(const DicNode* node, std::string* output) {
 279   bool is_8_bit = node->storage == DicNode::LIST8;
 280   unsigned char id_byte = BDict::LIST_NODE_TYPE_VALUE |
 281       (is_8_bit ? 0 : BDict::LIST_NODE_16BIT_VALUE);
 282   id_byte |= node->children.size();  // We assume the size is < 4 bits.
 283   output->push_back(id_byte);
 284
 285   // Reserve enough room for the lookup table (either 2 or 3 bytes per entry).
 286   int bytes_per_entry = (is_8_bit ? 2 : 3);
 287   size_t table_begin = output->size();
 288   output->resize(output->size() + node->children.size() * bytes_per_entry);
 289   size_t children_begin = output->size();
 290
 291   for (size_t i = 0; i < node->children.size(); i++) {
 292     // First is the character this entry represents.
 293     (*output)[table_begin + i * bytes_per_entry] = node->children[i]->addition;
 294
 295     // Next is the 8- or 16-bit offset.
 296     size_t offset = output->size() - children_begin;
 297     if (is_8_bit) {
 298       DCHECK(offset <= 0xFF);
 299       (*output)[table_begin + i * bytes_per_entry + 1] =
 300           static_cast<char>(offset & 0xFF);
 301     } else {
 302       unsigned short* output16 = reinterpret_cast<unsigned short*>(
 303           &(*output)[table_begin + i * bytes_per_entry + 1]);
 304       *output16 = static_cast<unsigned short>(offset);
 305     }
 306
 307     // Now append the children's data.
 308     SerializeTrie(node->children[i], output);
 309   }
 310 }
 311
 312 // Serializes the given node when it is DicNode::LOOKUP* to the output.
 313 void SerializeLookup(const DicNode* node, std::string* output) {
 314   unsigned char id_byte = BDict::LOOKUP_NODE_TYPE_VALUE;
 315
 316   bool has_0th_item;
 317   int first_table_item, table_item_count;
 318   ComputeLookupStrategyDetails(node->children, &has_0th_item,
 319                                &first_table_item, &table_item_count);
 320
 321   // Set the extra bits in the ID byte.
 322   bool is_32_bit = (node->storage == DicNode::LOOKUP32);
 323   if (is_32_bit)
 324     id_byte |= BDict::LOOKUP_NODE_32BIT_VALUE;
 325   if (has_0th_item)
 326     id_byte |= BDict::LOOKUP_NODE_0TH_VALUE;
 327
 328   size_t begin_offset = output->size();
 329
 330   output->push_back(id_byte);
 331   output->push_back(static_cast<char>(first_table_item));
 332   output->push_back(static_cast<char>(table_item_count));
 333
 334   // Save room for the lookup table and the optional 0th item.
 335   int bytes_per_entry = (is_32_bit ? 4 : 2);
 336   size_t zeroth_item_offset = output->size();
 337   if (has_0th_item)
 338     output->resize(output->size() + bytes_per_entry);
 339   size_t table_begin = output->size();
 340   output->resize(output->size() + table_item_count * bytes_per_entry);
 341
 342   // Append the children.
 343   for (size_t i = 0; i < node->children.size(); i++) {
 344     size_t offset = output->size();
 345
 346     // Compute the location at which we'll store the offset of the child data.
 347     // We may be writing the magic 0th item.
 348     size_t offset_offset;
 349     if (i == 0 && has_0th_item) {
 350       offset_offset = zeroth_item_offset;
 351     } else {
 352       int table_index = static_cast<unsigned char>(node->children[i]->addition) - first_table_item;
 353       offset_offset = table_begin + table_index * bytes_per_entry;
 354     }
 355
 356     // Write the offset.
 357     if (is_32_bit) {
 358       // Use 32-bit absolute offsets.
 359       // FIXME(brettw) use bit cast.
 360       unsigned* offset32 = reinterpret_cast<unsigned*>(&(*output)[offset_offset]);
 361       *offset32 = static_cast<unsigned>(output->size());
 362     } else {
 363       // Use 16-bit relative offsets.
 364       unsigned short* offset16 = reinterpret_cast<unsigned short*>(&(*output)[offset_offset]);
 365       *offset16 = static_cast<unsigned short>(output->size() - begin_offset);
 366     }
 367
 368     SerializeTrie(node->children[i], output);
 369   }
 370 }
 371
 372 // Recursively serializes this node and all of its children to the output.
 373 void SerializeTrie(const DicNode* node, std::string* output) {
 374   if (node->storage == DicNode::LEAF ||
 375       node->storage == DicNode::LEAFMORE) {
 376     SerializeLeaf(node, output);
 377   } else if (node->storage == DicNode::LIST8 ||
 378              node->storage == DicNode::LIST16) {
 379     SerializeList(node, output);
 380   } else if (node->storage == DicNode::LOOKUP16 ||
 381              node->storage == DicNode::LOOKUP32) {
 382     SerializeLookup(node, output);
 383   }
 384 }
 385
 386 // Serializes the given list of strings with 0 bytes separating them. The end
 387 // will be marked by a double-0.
 388 void SerializeStringListNullTerm(const std::vector<std::string>& strings,
 389                                  std::string* output) {
 390   for (size_t i = 0; i < strings.size(); i++) {
 391     // Can't tolerate empty strings since the'll mark the end.
 392     if (strings[i].empty())
 393       output->push_back(' ');
 394     else
 395       output->append(strings[i]);
 396     output->push_back(0);
 397   }
 398   output->push_back(0);
 399 }
 400
 401 void SerializeReplacements(
 402     const std::vector< std::pair<std::string, std::string> >& repl,
 403     std::string* output) {
 404   for (size_t i = 0; i < repl.size(); i++) {
 405     output->append(repl[i].first);
 406     output->push_back(0);
 407     output->append(repl[i].second);
 408     output->push_back(0);
 409   }
 410   output->push_back(0);
 411 }
 412
 413 }  // namespace
 414
 415 BDictWriter::BDictWriter() : trie_root_(NULL) {
 416 }
 417
 418 BDictWriter::~BDictWriter() {
 419   delete trie_root_;
 420 }
 421
 422 void BDictWriter::SetWords(const WordList& words) {
 423   trie_root_ = new DicNode;
 424   BuildTrie(words, 0, words.size(), 0, trie_root_);
 425 }
 426
 427 std::string BDictWriter::GetBDict() const {
 428   std::string ret;
 429
 430   // Save room for the header. This will be populated at the end.
 431   ret.resize(sizeof(hunspell::BDict::Header));
 432
 433   // Serialize the affix portion.
 434   size_t aff_offset = ret.size();
 435   SerializeAff(&ret);
 436
 437   // Serialize the dictionary words.
 438   size_t dic_offset = ret.size();
 439   ret.reserve(ret.size() + ComputeTrieStorage(trie_root_));
 440   SerializeTrie(trie_root_, &ret);
 441
 442   // Fill the header last, now that we have the data.
 443   hunspell::BDict::Header* header =
 444       reinterpret_cast<hunspell::BDict::Header*>(&ret[0]);
 445   header->signature = hunspell::BDict::SIGNATURE;
 446   header->major_version = hunspell::BDict::MAJOR_VERSION;
 447   header->minor_version = hunspell::BDict::MINOR_VERSION;
 448   header->aff_offset = static_cast<uint32>(aff_offset);
 449   header->dic_offset = static_cast<uint32>(dic_offset);
 450
 451   // Write the MD5 digest of the affix information and the dictionary words at
 452   // the end of the BDic header.
 453   if (header->major_version >= 2)
 454     base::MD5Sum(&ret[aff_offset], ret.size() - aff_offset, &header->digest);
 455
 456   return ret;
 457 }
 458
 459 void BDictWriter::SerializeAff(std::string* output) const {
 460   // Reserve enough room for the header.
 461   size_t header_offset = output->size();
 462   output->resize(output->size() + sizeof(hunspell::BDict::AffHeader));
 463
 464   // Write the comment.
 465   output->push_back('\n');
 466   output->append(comment_);
 467   output->push_back('\n');
 468
 469   // We need a magic first AF line that lists the number of following ones.
 470   size_t affix_group_offset = output->size();
 471   output->append(base::StringPrintf("AF %d",
 472                                     static_cast<int>(affix_groups_.size())));
 473   output->push_back(0);
 474   SerializeStringListNullTerm(affix_groups_, output);
 475
 476   size_t affix_rule_offset = output->size();
 477   SerializeStringListNullTerm(affix_rules_, output);
 478
 479   size_t rep_offset = output->size();
 480   SerializeReplacements(replacements_, output);
 481
 482   size_t other_offset = output->size();
 483   SerializeStringListNullTerm(other_commands_, output);
 484
 485   // Add the header now that we know the offsets.
 486   hunspell::BDict::AffHeader* header =
 487       reinterpret_cast<hunspell::BDict::AffHeader*>(&(*output)[header_offset]);
 488   header->affix_group_offset = static_cast<uint32>(affix_group_offset);
 489   header->affix_rule_offset = static_cast<uint32>(affix_rule_offset);
 490   header->rep_offset = static_cast<uint32>(rep_offset);
 491   header->other_offset = static_cast<uint32>(other_offset);
 492 }
 493
 494 }  // namespace hunspell