third_party/hunspell/google/bdict_reader.cc

   1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "third_party/hunspell/google/bdict_reader.h"
   6
   7 #include "base/logging.h"
   8
   9 namespace hunspell {
  10
  11 // Like the "Visitor" design pattern, this lightweight object provides an
  12 // interface around a serialized trie node at the given address in the memory.
  13 class NodeReader {
  14  public:
  15   // Return values for GetChildAt.
  16   enum FindResult {
  17     // A node is found.
  18     FIND_NODE,
  19
  20     // There are no more children for this node, no child node is returned.
  21     FIND_DONE,
  22
  23     // There is no node at this location, but there are more if you continue
  24     // iterating. This happens when there is a lookup node with empty entries.
  25     FIND_NOTHING
  26   };
  27
  28   // The default constructor makes an invalid reader.
  29   NodeReader();
  30   NodeReader(const unsigned char* bdict_data, size_t bdict_length,
  31              size_t node_offset, int node_depth);
  32
  33   // Returns true if the reader is valid. False means you shouldn't use it.
  34   bool is_valid() const { return is_valid_; }
  35
  36   // Recursively finds the given NULL terminated word.
  37   // See BDictReader::FindWord.
  38   int FindWord(const unsigned char* word,
  39                int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const;
  40
  41   // Allows iterating over the children of this node. When it returns
  42   // FIND_NODE, |*result| will be populated with the reader for the found node.
  43   // The first index is 0. The single character for this node will be placed
  44   // into |*found_char|.
  45   FindResult GetChildAt(int index, char* found_char, NodeReader* result) const;
  46
  47   // Leaf ----------------------------------------------------------------------
  48
  49   inline bool is_leaf() const {
  50     // If id_byte() sets is_valid_ to false, we need an extra check to avoid
  51     // returning true for this type.
  52     return (id_byte() & BDict::LEAF_NODE_TYPE_MASK) ==
  53         BDict::LEAF_NODE_TYPE_VALUE && is_valid_;
  54   }
  55
  56   // If this is a leaf node with an additional string, this function will return
  57   // a pointer to the beginning of the additional string. It will be NULL
  58   // terminated. If it is not a leaf or has no additional string, it will return
  59   // NULL.
  60   inline const unsigned char* additional_string_for_leaf() const {
  61     // Leaf nodes with additional strings start with bits "01" in the ID byte.
  62     if ((id_byte() & BDict::LEAF_NODE_ADDITIONAL_MASK) ==
  63       BDict::LEAF_NODE_ADDITIONAL_VALUE) {
  64       if (node_offset_ < (bdict_length_ - 2))
  65         return &bdict_data_[node_offset_ + 2];  // Starts after the 2 byte ID.
  66       // Otherwise the dictionary is corrupt.
  67       is_valid_ = false;
  68     }
  69     return NULL;
  70   }
  71
  72   // Returns the first affix ID corresponding to the given leaf node. The
  73   // current node must be a leaf or this will do the wrong thing. There may be
  74   // additional affix IDs following the node when leaf_has_following is set,
  75   // but this will not handle those.
  76   inline int affix_id_for_leaf() const {
  77     if (node_offset_ >= bdict_length_ - 2) {
  78       is_valid_ = false;
  79       return 0;
  80     }
  81     // Take the lowest 6 bits of the first byte, and all 8 bits of the second.
  82     return ((bdict_data_[node_offset_ + 0] &
  83              BDict::LEAF_NODE_FIRST_BYTE_AFFIX_MASK) << 8) +
  84            bdict_data_[node_offset_ + 1];
  85   }
  86
  87   // Returns true if there is a list of additional affix matches following this
  88   // leaf node.
  89   inline bool leaf_has_following() const {
  90     return ((id_byte() & BDict::LEAF_NODE_FOLLOWING_MASK) ==
  91         BDict::LEAF_NODE_FOLLOWING_VALUE);
  92   }
  93
  94   // Fills the affix indices into the output array given a matching leaf node.
  95   // |additional_bytes| is the number of bytes of the additional string,
  96   // including the NULL terminator, following this leaf node. This will be 0 if
  97   // there is no additional string.
  98   int FillAffixesForLeafMatch(
  99       size_t additional_bytes,
 100       int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const;
 101
 102   // Lookup --------------------------------------------------------------------
 103
 104   inline bool is_lookup() const {
 105     return (id_byte() & BDict::LOOKUP_NODE_TYPE_MASK) ==
 106         BDict::LOOKUP_NODE_TYPE_VALUE;
 107   }
 108
 109   inline bool is_lookup_32() const {
 110     return (id_byte() & BDict::LOOKUP_NODE_32BIT_MASK) ==
 111         BDict::LOOKUP_NODE_32BIT_VALUE;
 112   }
 113
 114   inline bool lookup_has_0th() const {
 115     return (id_byte() & BDict::LOOKUP_NODE_0TH_MASK) ==
 116         BDict::LOOKUP_NODE_0TH_VALUE;
 117   }
 118
 119   // Returns the first entry after the lookup table header. When there is a
 120   // magic 0th entry, it will be that address.
 121   // The caller checks that the result is in-bounds.
 122   inline size_t zeroth_entry_offset() const {
 123     return node_offset_ + 3;
 124   }
 125
 126   // Returns the index of the first element in the lookup table. This skips any
 127   // magic 0th entry.
 128   // The caller checks that the result is in-bounds.
 129   size_t lookup_table_offset() const {
 130     size_t table_offset = zeroth_entry_offset();
 131     if (lookup_has_0th())
 132       return table_offset + (is_lookup_32() ? 4 : 2);
 133     return table_offset;
 134   }
 135
 136   inline int lookup_first_char() const {
 137     if (node_offset_ >= bdict_length_ - 1) {
 138       is_valid_ = false;
 139       return 0;
 140     }
 141     return bdict_data_[node_offset_ + 1];
 142   }
 143
 144   inline int lookup_num_chars() const {
 145     if (node_offset_ >= bdict_length_ - 2) {
 146       is_valid_ = false;
 147       return 0;
 148     }
 149     return bdict_data_[node_offset_ + 2];
 150   }
 151
 152   // Computes a node reader for the magic 0th entry of the table. This assumes
 153   // it has a 0th entry. This will always return FOUND_NODE (for compatilibility
 154   // with GetChildAt).
 155   FindResult ReaderForLookup0th(NodeReader* result) const;
 156
 157   // Gets a node reader for the |offset|th element in the table, not counting
 158   // the magic 0th element, if any (so passing 0 here will give you the first
 159   // element in the regular lookup table). The offset is assumed to be valid.
 160   //
 161   // |child_node_char| is the character value that the child node will
 162   // represent. The single character for this node will be placed into
 163   // |*found_char|.
 164   FindResult ReaderForLookupAt(size_t index, char* found_char,
 165                                NodeReader* result) const;
 166
 167   // List ----------------------------------------------------------------------
 168
 169   inline bool is_list() const {
 170     return (id_byte() & BDict::LIST_NODE_TYPE_MASK) ==
 171         BDict::LIST_NODE_TYPE_VALUE;
 172   }
 173
 174   inline int is_list_16() const {
 175     // 16 bit lst nodes have the high 4 bits of 1.
 176     return (id_byte() & BDict::LIST_NODE_16BIT_MASK) ==
 177         BDict::LIST_NODE_16BIT_VALUE;
 178   }
 179
 180   inline size_t list_item_count() const {
 181     // The list count is stored in the low 4 bits of the ID.
 182     return id_byte() & BDict::LIST_NODE_COUNT_MASK;
 183   }
 184
 185   // Returns a NodeReader for the list item with the given index. The single
 186   // character for this node will be placed into |*found_char|.
 187   FindResult ReaderForListAt(size_t index, char* found_char,
 188                              NodeReader* result) const;
 189
 190  private:
 191   inline unsigned char id_byte() const {
 192     if (!is_valid_)
 193       return 0;  // Don't continue with a corrupt node.
 194     if (node_offset_ >= bdict_length_) {
 195       // Return zero if out of bounds; we'll check is_valid_ in caller.
 196       is_valid_ = false;
 197       return 0;
 198     }
 199     return bdict_data_[node_offset_];
 200   }
 201
 202   // Checks the given leaf node to see if it's a match for the given word.
 203   // The parameters and return values are the same as BDictReader::FindWord.
 204   int CompareLeafNode(const unsigned char* word,
 205                       int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const;
 206
 207   // Recursive calls used by FindWord to look up child nodes of different types.
 208   int FindInLookup(const unsigned char* word,
 209                    int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const;
 210   int FindInList(const unsigned char* word,
 211                  int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const;
 212
 213   // The entire bdict file. This will be NULL if it is invalid.
 214   const unsigned char* bdict_data_;
 215   size_t bdict_length_;
 216   // Points to the end of the file (for length checking convenience).
 217   const unsigned char* bdict_end_;
 218
 219   // Absolute offset within |bdict_data_| of the beginning of this node.
 220   size_t node_offset_;
 221
 222   // The character index into the word that this node represents.
 223   int node_depth_;
 224
 225   // Signals that dictionary corruption was found during node traversal.
 226   mutable bool is_valid_;
 227 };
 228
 229 NodeReader::NodeReader()
 230     : bdict_data_(NULL),
 231       bdict_length_(0),
 232       bdict_end_(NULL),
 233       node_offset_(0),
 234       node_depth_(0),
 235       is_valid_(false) {
 236 }
 237
 238 NodeReader::NodeReader(const unsigned char* bdict_data, size_t bdict_length,
 239                        size_t node_offset, int node_depth)
 240     : bdict_data_(bdict_data),
 241       bdict_length_(bdict_length),
 242       bdict_end_(bdict_data + bdict_length),
 243       node_offset_(node_offset),
 244       node_depth_(node_depth),
 245       is_valid_(bdict_data != NULL && node_offset < bdict_length) {
 246 }
 247
 248 int NodeReader::FindWord(const unsigned char* word,
 249                          int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const {
 250   // Return 0 if the dictionary is corrupt as BDictReader::FindWord() does.
 251   if (!bdict_data_ || node_offset_ > bdict_length_)
 252     return 0;
 253
 254   if (is_leaf())
 255     return CompareLeafNode(word, affix_indices);
 256
 257   if (is_lookup())
 258     return FindInLookup(word, affix_indices);
 259   if (is_list())
 260     return FindInList(word, affix_indices);
 261   return 0;  // Corrupt file.
 262 }
 263
 264 NodeReader::FindResult NodeReader::GetChildAt(int index, char* found_char,
 265                                               NodeReader* result) const {
 266   if (is_lookup()) {
 267     if (lookup_has_0th()) {
 268       if (index == 0) {
 269         *found_char = 0;
 270         return ReaderForLookup0th(result);
 271       }
 272       index--;  // Make index relative to the non-0th-element table.
 273     }
 274     return ReaderForLookupAt(index, found_char, result);
 275   }
 276   if (is_list()) {
 277     return ReaderForListAt(index, found_char, result);
 278   }
 279   return FIND_DONE;
 280 }
 281
 282 int NodeReader::CompareLeafNode(
 283     const unsigned char* word,
 284     int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const {
 285   // See if there is an additional string.
 286   const unsigned char* additional = additional_string_for_leaf();
 287   if (!additional) {
 288     // No additional string. This means we should have reached the end of the
 289     // word to get a match.
 290     if (word[node_depth_] != 0)
 291       return 0;
 292     return FillAffixesForLeafMatch(0, affix_indices);
 293   }
 294
 295   // Check the additional string.
 296   int cur = 0;
 297   while (&additional[cur] < bdict_end_ && additional[cur]) {
 298     if (word[node_depth_ + cur] != additional[cur])
 299       return 0;  // Not a match.
 300     cur++;
 301   }
 302
 303   if (&additional[cur] == bdict_end_) {
 304     is_valid_ = false;
 305     return 0;
 306   }
 307
 308   // Got to the end of the additional string, the word should also be over for
 309   // a match (the same as above).
 310   if (word[node_depth_ + cur] != 0)
 311     return 0;
 312   return FillAffixesForLeafMatch(cur + 1, affix_indices);
 313 }
 314
 315 int NodeReader::FillAffixesForLeafMatch(
 316     size_t additional_bytes,
 317     int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const {
 318   // The first match is easy, it always comes from the affix_id included in the
 319   // leaf node.
 320   affix_indices[0] = affix_id_for_leaf();
 321
 322   if (!leaf_has_following() && affix_indices[0] != BDict::FIRST_AFFIX_IS_UNUSED)
 323     return 1;  // Common case: no additional affix group IDs.
 324
 325   // We may or may not need to ignore that first value we just read, since it
 326   // could be a dummy placeholder value. The |list_offset| is the starting
 327   // position in the output list to write the rest of the values, which may
 328   // overwrite the first value.
 329   int list_offset = 1;
 330   if (affix_indices[0] == BDict::FIRST_AFFIX_IS_UNUSED)
 331     list_offset = 0;
 332
 333   // Save the end pointer (accounting for an odd number of bytes).
 334   size_t array_start = node_offset_ + additional_bytes + 2;
 335   const uint16* const bdict_short_end = reinterpret_cast<const uint16*>(
 336       &bdict_data_[((bdict_length_ - array_start) & -2) + array_start]);
 337   // Process all remaining matches.
 338   const uint16* following_array = reinterpret_cast<const uint16*>(
 339       &bdict_data_[array_start]);
 340   for (int i = 0; i < BDict::MAX_AFFIXES_PER_WORD - list_offset; i++) {
 341     if (&following_array[i] >= bdict_short_end) {
 342       is_valid_ = false;
 343       return 0;
 344     }
 345     if (following_array[i] == BDict::LEAF_NODE_FOLLOWING_LIST_TERMINATOR)
 346       return i + list_offset;  // Found the end of the list.
 347     affix_indices[i + list_offset] = following_array[i];
 348   }
 349   return BDict::MAX_AFFIXES_PER_WORD;
 350 }
 351
 352 int NodeReader::FindInLookup(
 353     const unsigned char* word,
 354     int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const {
 355   unsigned char next_char = word[node_depth_];
 356
 357   NodeReader child_reader;
 358   if (next_char == 0 && lookup_has_0th()) {
 359     if (ReaderForLookup0th(&child_reader) != FIND_NODE)
 360       return 0;
 361   } else {
 362     // Look up in the regular part of the table.
 363     int offset_in_table = static_cast<int>(next_char) - lookup_first_char();
 364     if (offset_in_table < 0 || offset_in_table > lookup_num_chars())
 365       return 0;  // Table can not include this value.
 366
 367     char dummy_char;
 368     if (ReaderForLookupAt(offset_in_table, &dummy_char, &child_reader) !=
 369         FIND_NODE)
 370       return 0;
 371     DCHECK(dummy_char == static_cast<char>(next_char));
 372   }
 373
 374   if (!child_reader.is_valid())
 375     return 0;  // Something is messed up.
 376
 377   // Now recurse into that child node.
 378   return child_reader.FindWord(word, affix_indices);
 379 }
 380
 381 NodeReader::FindResult NodeReader::ReaderForLookup0th(
 382     NodeReader* result) const {
 383   size_t child_offset;
 384   if (is_lookup_32()) {
 385     child_offset = *reinterpret_cast<const unsigned int*>(
 386         &bdict_data_[zeroth_entry_offset()]);
 387   } else {
 388     child_offset = *reinterpret_cast<const unsigned short*>(
 389         &bdict_data_[zeroth_entry_offset()]);
 390     child_offset += node_offset_;
 391   }
 392
 393   // Range check the offset;
 394   if (child_offset >= bdict_length_) {
 395     is_valid_ = false;
 396     return FIND_DONE;
 397   }
 398
 399   // Now recurse into that child node. We don't advance to the next character
 400   // here since the 0th element will be a leaf (see ReaderForLookupAt).
 401   *result = NodeReader(bdict_data_, bdict_length_, child_offset, node_depth_);
 402   return FIND_NODE;
 403 }
 404
 405 NodeReader::FindResult NodeReader::ReaderForLookupAt(
 406     size_t index,
 407     char* found_char,
 408     NodeReader* result) const {
 409   const unsigned char* table_begin = &bdict_data_[lookup_table_offset()];
 410
 411   if (index >= static_cast<size_t>(lookup_num_chars()) || !is_valid_)
 412     return FIND_DONE;
 413
 414   size_t child_offset;
 415   if (is_lookup_32()) {
 416     // Table contains 32-bit absolute offsets.
 417     child_offset =
 418         reinterpret_cast<const unsigned int*>(table_begin)[index];
 419     if (!child_offset)
 420       return FIND_NOTHING;  // This entry in the table is empty.
 421   } else {
 422     // Table contains 16-bit offsets relative to the current node.
 423     child_offset =
 424         reinterpret_cast<const unsigned short*>(table_begin)[index];
 425     if (!child_offset)
 426       return FIND_NOTHING;  // This entry in the table is empty.
 427     child_offset += node_offset_;
 428   }
 429
 430   // Range check the offset;
 431   if (child_offset >= bdict_length_) {
 432     is_valid_ = false;
 433     return FIND_DONE;  // Error.
 434   }
 435
 436   // This is a bit tricky. When we've just reached the end of a word, the word
 437   // itself will be stored in a leaf "node" off of this node. That node, of
 438   // course, will want to know that it's the end of the word and so we have to
 439   // have it use the same index into the word as we're using at this level.
 440   //
 441   // This happens when there is a word in the dictionary that is a strict
 442   // prefix of other words in the dictionary, and so we'll have a non-leaf
 443   // node representing the entire word before the ending leaf node.
 444   //
 445   // In all other cases, we want to advance to the next character. Even if the
 446   // child node is a leaf, it will have an additional character that it will
 447   // want to check.
 448   *found_char = static_cast<char>(index + lookup_first_char());
 449   if (!is_valid_)
 450     return FIND_DONE;
 451   int char_advance = *found_char == 0 ? 0 : 1;
 452
 453   *result = NodeReader(bdict_data_, bdict_length_,
 454                        child_offset, node_depth_ + char_advance);
 455   return FIND_NODE;
 456 }
 457
 458 int NodeReader::FindInList(
 459     const unsigned char* word,
 460     int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const {
 461   unsigned char next_char = word[node_depth_];
 462
 463   // TODO(brettw) replace with binary search.
 464   size_t list_count = list_item_count();
 465   const unsigned char* list_begin = &bdict_data_[node_offset_ + 1];
 466
 467   int bytes_per_index = (is_list_16() ? 3 : 2);
 468
 469   for (size_t i = 0; i < list_count; i++) {
 470     const unsigned char* list_current = &list_begin[i * bytes_per_index];
 471     if (list_current >= bdict_end_) {
 472       is_valid_ = false;
 473       return 0;
 474     }
 475     if (*list_current == next_char) {
 476       // Found a match.
 477       char dummy_char;
 478       NodeReader child_reader;
 479       if (ReaderForListAt(i, &dummy_char, &child_reader) != FIND_NODE)
 480         return 0;
 481       DCHECK(dummy_char == static_cast<char>(next_char));
 482       return child_reader.FindWord(word, affix_indices);
 483     }
 484   }
 485   return 0;
 486 }
 487
 488 NodeReader::FindResult NodeReader::ReaderForListAt(
 489     size_t index,
 490     char* found_char,
 491     NodeReader* result) const {
 492   size_t list_begin = node_offset_ + 1;
 493
 494   if (index >= list_item_count())
 495     return FIND_DONE;
 496
 497   size_t offset;
 498   if (is_list_16()) {
 499     const unsigned char* list_item_begin = bdict_data_ + list_begin + index * 3;
 500     *found_char = static_cast<char>(list_item_begin[0]);
 501
 502     // The children begin right after the list.
 503     size_t children_begin = list_begin + list_item_count() * 3;
 504     offset = children_begin + *reinterpret_cast<const unsigned short*>(
 505         &list_item_begin[1]);
 506   } else {
 507     const unsigned char* list_item_begin = bdict_data_ + list_begin + index * 2;
 508     *found_char = list_item_begin[0];
 509
 510     size_t children_begin = list_begin + list_item_count() * 2;
 511     offset = children_begin + list_item_begin[1];
 512   }
 513
 514   if (offset == 0 || node_offset_ >= bdict_length_) {
 515     is_valid_ = false;
 516     return FIND_DONE;  // Error, should not happen except for corruption.
 517   }
 518
 519   int char_advance = *found_char == 0 ? 0 : 1;  // See ReaderForLookupAt.
 520   *result = NodeReader(bdict_data_, bdict_length_,
 521                        offset, node_depth_ + char_advance);
 522   return FIND_NODE;
 523 }
 524
 525 // WordIterator ----------------------------------------------------------------
 526
 527 struct WordIterator::NodeInfo {
 528   // The current offset is set to -1 so we start iterating at 0 when Advance
 529   // is called.
 530   NodeInfo(const NodeReader& rdr, char add)
 531       : reader(rdr),
 532         addition(add),
 533         cur_offset(-1) {
 534   }
 535
 536   // The reader for this node level.
 537   NodeReader reader;
 538
 539   // The character that this level represents. For the 0th level, this will
 540   // be 0 (since it is the root that represents no characters).
 541   char addition;
 542
 543   // The current index into the reader that we're reading. Combined with the
 544   // |stack_|, this allows us to iterate over the tree in depth-first order.
 545   int cur_offset;
 546 };
 547
 548 WordIterator::WordIterator(const NodeReader& reader) {
 549   NodeInfo info(reader, 0);
 550   stack_.push_back(info);
 551 }
 552
 553 WordIterator::WordIterator(const WordIterator& other) {
 554   operator=(other);
 555 }
 556
 557 WordIterator::~WordIterator() {
 558   // Can't be in the header for the NodeReader destructor.
 559 }
 560
 561 WordIterator& WordIterator::operator=(const WordIterator& other) {
 562   stack_ = other.stack_;
 563   return *this;
 564 }
 565
 566 int WordIterator::Advance(char* output_buffer, size_t output_len,
 567                           int affix_ids[BDict::MAX_AFFIXES_PER_WORD]) {
 568   // In-order tree walker. This uses a loop for fake tail recursion.
 569   while (!stack_.empty()) {
 570     NodeInfo& cur = stack_.back();
 571     cur.cur_offset++;
 572     char cur_char;
 573     NodeReader child_reader;
 574
 575     /*if (cur.reader.is_leaf()) {
 576       child_reader = cur.reader;
 577       cur_char = cur.addition;
 578       stack_.pop_back();
 579       return FoundLeaf(child_reader, cur_char, output_buffer, output_len,
 580                        affix_ids);
 581     }*/
 582
 583     switch (cur.reader.GetChildAt(cur.cur_offset, &cur_char, &child_reader)) {
 584       case NodeReader::FIND_NODE:
 585         // Got a valid child node.
 586         if (child_reader.is_leaf()) {
 587           return FoundLeaf(child_reader, cur_char, output_buffer, output_len,
 588                            affix_ids);
 589         }
 590
 591         // Not a leaf. Add the new node to our stack and try again.
 592         stack_.push_back(NodeInfo(child_reader, cur_char));
 593         break;
 594
 595       case NodeReader::FIND_NOTHING:
 596         // This one is empty, but we're not done. Continue on.
 597         break;
 598
 599       case NodeReader::FIND_DONE:
 600         // No more children at this level, pop the stack and go back one.
 601         stack_.pop_back();
 602     }
 603   }
 604
 605   return false;
 606 }
 607
 608 int WordIterator::FoundLeaf(const NodeReader& reader, char cur_char,
 609                             char* output_buffer, size_t output_len,
 610                             int affix_ids[BDict::MAX_AFFIXES_PER_WORD]) {
 611   // Remember that the first item in the stack is the root and so doesn't count.
 612   int i;
 613   for (i = 0; i < static_cast<int>(stack_.size()) - 1 && i < static_cast<int>(output_len) - 1; i++)
 614     output_buffer[i] = stack_[i + 1].addition;
 615   output_buffer[i++] = cur_char;  // The one we just found.
 616
 617   // Possibly add any extra parts.
 618   size_t additional_string_length = 0;
 619   const char* additional = reinterpret_cast<const char*>(
 620       reader.additional_string_for_leaf());
 621   for (; i < static_cast<int>(output_len) - 1 && additional &&
 622            additional[additional_string_length] != 0;
 623        i++, additional_string_length++)
 624     output_buffer[i] = additional[additional_string_length];
 625   if (additional_string_length)
 626     additional_string_length++;  // Account for the null terminator.
 627   output_buffer[i] = 0;
 628
 629   return reader.FillAffixesForLeafMatch(additional_string_length,
 630                                         affix_ids);
 631 }
 632
 633 // LineIterator ----------------------------------------------------------------
 634
 635 LineIterator::LineIterator(
 636     const unsigned char* bdict_data,
 637     size_t bdict_length,
 638     size_t first_offset)
 639     : bdict_data_(bdict_data),
 640       bdict_length_(bdict_length),
 641       cur_offset_(first_offset) {
 642 }
 643
 644 // Returns true when all data has been read. We're done when we reach a
 645 // double-NULL or a the end of the input (shouldn't happen).
 646 bool LineIterator::IsDone() const {
 647   return cur_offset_ >= bdict_length_ || bdict_data_[cur_offset_] == 0;
 648 }
 649
 650 const char* LineIterator::Advance() {
 651   if (IsDone())
 652     return NULL;
 653
 654   const char* begin = reinterpret_cast<const char*>(&bdict_data_[cur_offset_]);
 655
 656   // Advance over this word to find the end.
 657   while (cur_offset_ < bdict_length_ && bdict_data_[cur_offset_])
 658     cur_offset_++;
 659   cur_offset_++;  // Advance over the NULL terminator.
 660
 661   return begin;
 662 }
 663
 664 bool LineIterator::AdvanceAndCopy(char* buf, size_t buf_len) {
 665   if (IsDone())
 666     return false;
 667
 668   const char* begin = reinterpret_cast<const char*>(&bdict_data_[cur_offset_]);
 669
 670   // Advance over this word to find the end.
 671   size_t i;
 672   for (i = 0;
 673        i < buf_len && cur_offset_ < bdict_length_ && bdict_data_[cur_offset_];
 674        i++, cur_offset_++) {
 675     buf[i] = bdict_data_[cur_offset_];
 676   }
 677   // Handle the NULL terminator.
 678   cur_offset_++;  // Consume in the input
 679   if (i < buf_len)
 680     buf[i] = 0;  // Save in the output.
 681   else
 682     buf[buf_len - 1] = 0;  // Overflow, make sure it's terminated.
 683
 684   return !!buf[0];
 685 }
 686
 687 // ReplacementIterator ---------------------------------------------------------
 688
 689 // Fills pointers to NULL terminated strings into the given output params.
 690 // Returns false if there are no more pairs and nothing was filled in.
 691 bool ReplacementIterator::GetNext(const char** first, const char** second) {
 692   if (IsDone())
 693     return false;
 694   *first = Advance();
 695   *second = Advance();
 696   return *first && *second;
 697 }
 698
 699 // BDictReader -----------------------------------------------------------------
 700
 701 BDictReader::BDictReader()
 702     : bdict_data_(NULL),
 703       bdict_length_(0),
 704       header_(NULL) {
 705 }
 706
 707 bool BDictReader::Init(const unsigned char* bdict_data, size_t bdict_length) {
 708   if (bdict_length < sizeof(BDict::Header))
 709     return false;
 710
 711   // Check header.
 712   header_ = reinterpret_cast<const BDict::Header*>(bdict_data);
 713   if (header_->signature != BDict::SIGNATURE ||
 714       header_->major_version > BDict::MAJOR_VERSION ||
 715       header_->dic_offset > bdict_length)
 716     return false;
 717
 718   // Get the affix header, make sure there is enough room for it.
 719   if (header_->aff_offset + sizeof(BDict::AffHeader) > bdict_length)
 720     return false;
 721   aff_header_ = reinterpret_cast<const BDict::AffHeader*>(
 722       &bdict_data[header_->aff_offset]);
 723
 724   // Make sure there is enough room for the affix group count dword.
 725   if (aff_header_->affix_group_offset > bdict_length - sizeof(uint32))
 726     return false;
 727
 728   // This function is called from SpellCheck::SpellCheckWord(), which blocks
 729   // WebKit. To avoid blocking WebKit for a long time, we do not check the MD5
 730   // digest here. Instead we check the MD5 digest when Chrome finishes
 731   // downloading a dictionary.
 732
 733   // Don't set these until the end. This way, NULL bdict_data_ will indicate
 734   // failure.
 735   bdict_data_ = bdict_data;
 736   bdict_length_ = bdict_length;
 737   return true;
 738 }
 739
 740 int BDictReader::FindWord(
 741     const char* word,
 742     int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const {
 743   if (!bdict_data_ ||
 744       header_->dic_offset >= bdict_length_) {
 745     // When the dictionary is corrupt, we return 0 which means the word is valid
 746     // and has no rules. This means when there is some problem, we'll default
 747     // to no spellchecking rather than marking everything as misspelled.
 748     return 0;
 749   }
 750   NodeReader reader(bdict_data_, bdict_length_, header_->dic_offset, 0);
 751   return reader.FindWord(reinterpret_cast<const unsigned char*>(word),
 752                          affix_indices);
 753 }
 754
 755 LineIterator BDictReader::GetAfLineIterator() const {
 756   if (!bdict_data_ ||
 757       aff_header_->affix_group_offset == 0 ||
 758       aff_header_->affix_group_offset >= bdict_length_)
 759     return LineIterator(bdict_data_, 0, 0);  // Item is empty or invalid.
 760   return LineIterator(bdict_data_, bdict_length_,
 761                       aff_header_->affix_group_offset);
 762 }
 763
 764 LineIterator BDictReader::GetAffixLineIterator() const {
 765   if (!bdict_data_ ||
 766       aff_header_->affix_rule_offset == 0 ||
 767       aff_header_->affix_rule_offset >= bdict_length_)
 768     return LineIterator(bdict_data_, 0, 0);  // Item is empty or invalid.
 769   return LineIterator(bdict_data_, bdict_length_,
 770                       aff_header_->affix_rule_offset);
 771 }
 772
 773 LineIterator BDictReader::GetOtherLineIterator() const {
 774   if (!bdict_data_ ||
 775       aff_header_->other_offset == 0 ||
 776       aff_header_->other_offset >= bdict_length_)
 777     return LineIterator(bdict_data_, 0, 0);  // Item is empty or invalid.
 778   return LineIterator(bdict_data_, bdict_length_,
 779                       aff_header_->other_offset);
 780 }
 781
 782 ReplacementIterator BDictReader::GetReplacementIterator() const {
 783   return ReplacementIterator(bdict_data_, bdict_length_,
 784                              aff_header_->rep_offset);
 785 }
 786
 787
 788 WordIterator BDictReader::GetAllWordIterator() const {
 789   NodeReader reader(bdict_data_, bdict_length_, header_->dic_offset, 0);
 790   return WordIterator(reader);
 791 }
 792
 793 }  // namespace hunspell