Pin Chrome's shortcut to the Win10 Start menu on install and OS upgrade.
[chromium-blink-merge.git] / third_party / hunspell / google / bdict_reader.cc
blobddd3c885ed2cd2916790e4e251c208604ee13379
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "third_party/hunspell/google/bdict_reader.h"
7 #include "base/logging.h"
9 namespace hunspell {
11 // Like the "Visitor" design pattern, this lightweight object provides an
12 // interface around a serialized trie node at the given address in the memory.
13 class NodeReader {
14 public:
15 // Return values for GetChildAt.
16 enum FindResult {
17 // A node is found.
18 FIND_NODE,
20 // There are no more children for this node, no child node is returned.
21 FIND_DONE,
23 // There is no node at this location, but there are more if you continue
24 // iterating. This happens when there is a lookup node with empty entries.
25 FIND_NOTHING
28 // The default constructor makes an invalid reader.
29 NodeReader();
30 NodeReader(const unsigned char* bdict_data, size_t bdict_length,
31 size_t node_offset, int node_depth);
33 // Returns true if the reader is valid. False means you shouldn't use it.
34 bool is_valid() const { return is_valid_; }
36 // Recursively finds the given NULL terminated word.
37 // See BDictReader::FindWord.
38 int FindWord(const unsigned char* word,
39 int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const;
41 // Allows iterating over the children of this node. When it returns
42 // FIND_NODE, |*result| will be populated with the reader for the found node.
43 // The first index is 0. The single character for this node will be placed
44 // into |*found_char|.
45 FindResult GetChildAt(int index, char* found_char, NodeReader* result) const;
47 // Leaf ----------------------------------------------------------------------
49 inline bool is_leaf() const {
50 // If id_byte() sets is_valid_ to false, we need an extra check to avoid
51 // returning true for this type.
52 return (id_byte() & BDict::LEAF_NODE_TYPE_MASK) ==
53 BDict::LEAF_NODE_TYPE_VALUE && is_valid_;
56 // If this is a leaf node with an additional string, this function will return
57 // a pointer to the beginning of the additional string. It will be NULL
58 // terminated. If it is not a leaf or has no additional string, it will return
59 // NULL.
60 inline const unsigned char* additional_string_for_leaf() const {
61 // Leaf nodes with additional strings start with bits "01" in the ID byte.
62 if ((id_byte() & BDict::LEAF_NODE_ADDITIONAL_MASK) ==
63 BDict::LEAF_NODE_ADDITIONAL_VALUE) {
64 if (node_offset_ < (bdict_length_ - 2))
65 return &bdict_data_[node_offset_ + 2]; // Starts after the 2 byte ID.
66 // Otherwise the dictionary is corrupt.
67 is_valid_ = false;
69 return NULL;
72 // Returns the first affix ID corresponding to the given leaf node. The
73 // current node must be a leaf or this will do the wrong thing. There may be
74 // additional affix IDs following the node when leaf_has_following is set,
75 // but this will not handle those.
76 inline int affix_id_for_leaf() const {
77 if (node_offset_ >= bdict_length_ - 2) {
78 is_valid_ = false;
79 return 0;
81 // Take the lowest 6 bits of the first byte, and all 8 bits of the second.
82 return ((bdict_data_[node_offset_ + 0] &
83 BDict::LEAF_NODE_FIRST_BYTE_AFFIX_MASK) << 8) +
84 bdict_data_[node_offset_ + 1];
87 // Returns true if there is a list of additional affix matches following this
88 // leaf node.
89 inline bool leaf_has_following() const {
90 return ((id_byte() & BDict::LEAF_NODE_FOLLOWING_MASK) ==
91 BDict::LEAF_NODE_FOLLOWING_VALUE);
94 // Fills the affix indices into the output array given a matching leaf node.
95 // |additional_bytes| is the number of bytes of the additional string,
96 // including the NULL terminator, following this leaf node. This will be 0 if
97 // there is no additional string.
98 int FillAffixesForLeafMatch(
99 size_t additional_bytes,
100 int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const;
102 // Lookup --------------------------------------------------------------------
104 inline bool is_lookup() const {
105 return (id_byte() & BDict::LOOKUP_NODE_TYPE_MASK) ==
106 BDict::LOOKUP_NODE_TYPE_VALUE;
109 inline bool is_lookup_32() const {
110 return (id_byte() & BDict::LOOKUP_NODE_32BIT_MASK) ==
111 BDict::LOOKUP_NODE_32BIT_VALUE;
114 inline bool lookup_has_0th() const {
115 return (id_byte() & BDict::LOOKUP_NODE_0TH_MASK) ==
116 BDict::LOOKUP_NODE_0TH_VALUE;
119 // Returns the first entry after the lookup table header. When there is a
120 // magic 0th entry, it will be that address.
121 // The caller checks that the result is in-bounds.
122 inline size_t zeroth_entry_offset() const {
123 return node_offset_ + 3;
126 // Returns the index of the first element in the lookup table. This skips any
127 // magic 0th entry.
128 // The caller checks that the result is in-bounds.
129 size_t lookup_table_offset() const {
130 size_t table_offset = zeroth_entry_offset();
131 if (lookup_has_0th())
132 return table_offset + (is_lookup_32() ? 4 : 2);
133 return table_offset;
136 inline int lookup_first_char() const {
137 if (node_offset_ >= bdict_length_ - 1) {
138 is_valid_ = false;
139 return 0;
141 return bdict_data_[node_offset_ + 1];
144 inline int lookup_num_chars() const {
145 if (node_offset_ >= bdict_length_ - 2) {
146 is_valid_ = false;
147 return 0;
149 return bdict_data_[node_offset_ + 2];
152 // Computes a node reader for the magic 0th entry of the table. This assumes
153 // it has a 0th entry. This will always return FOUND_NODE (for compatilibility
154 // with GetChildAt).
155 FindResult ReaderForLookup0th(NodeReader* result) const;
157 // Gets a node reader for the |offset|th element in the table, not counting
158 // the magic 0th element, if any (so passing 0 here will give you the first
159 // element in the regular lookup table). The offset is assumed to be valid.
161 // |child_node_char| is the character value that the child node will
162 // represent. The single character for this node will be placed into
163 // |*found_char|.
164 FindResult ReaderForLookupAt(size_t index, char* found_char,
165 NodeReader* result) const;
167 // List ----------------------------------------------------------------------
169 inline bool is_list() const {
170 return (id_byte() & BDict::LIST_NODE_TYPE_MASK) ==
171 BDict::LIST_NODE_TYPE_VALUE;
174 inline int is_list_16() const {
175 // 16 bit lst nodes have the high 4 bits of 1.
176 return (id_byte() & BDict::LIST_NODE_16BIT_MASK) ==
177 BDict::LIST_NODE_16BIT_VALUE;
180 inline size_t list_item_count() const {
181 // The list count is stored in the low 4 bits of the ID.
182 return id_byte() & BDict::LIST_NODE_COUNT_MASK;
185 // Returns a NodeReader for the list item with the given index. The single
186 // character for this node will be placed into |*found_char|.
187 FindResult ReaderForListAt(size_t index, char* found_char,
188 NodeReader* result) const;
190 private:
191 inline unsigned char id_byte() const {
192 if (!is_valid_)
193 return 0; // Don't continue with a corrupt node.
194 if (node_offset_ >= bdict_length_) {
195 // Return zero if out of bounds; we'll check is_valid_ in caller.
196 is_valid_ = false;
197 return 0;
199 return bdict_data_[node_offset_];
202 // Checks the given leaf node to see if it's a match for the given word.
203 // The parameters and return values are the same as BDictReader::FindWord.
204 int CompareLeafNode(const unsigned char* word,
205 int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const;
207 // Recursive calls used by FindWord to look up child nodes of different types.
208 int FindInLookup(const unsigned char* word,
209 int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const;
210 int FindInList(const unsigned char* word,
211 int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const;
213 // The entire bdict file. This will be NULL if it is invalid.
214 const unsigned char* bdict_data_;
215 size_t bdict_length_;
216 // Points to the end of the file (for length checking convenience).
217 const unsigned char* bdict_end_;
219 // Absolute offset within |bdict_data_| of the beginning of this node.
220 size_t node_offset_;
222 // The character index into the word that this node represents.
223 int node_depth_;
225 // Signals that dictionary corruption was found during node traversal.
226 mutable bool is_valid_;
229 NodeReader::NodeReader()
230 : bdict_data_(NULL),
231 bdict_length_(0),
232 bdict_end_(NULL),
233 node_offset_(0),
234 node_depth_(0),
235 is_valid_(false) {
238 NodeReader::NodeReader(const unsigned char* bdict_data, size_t bdict_length,
239 size_t node_offset, int node_depth)
240 : bdict_data_(bdict_data),
241 bdict_length_(bdict_length),
242 bdict_end_(bdict_data + bdict_length),
243 node_offset_(node_offset),
244 node_depth_(node_depth),
245 is_valid_(bdict_data != NULL && node_offset < bdict_length) {
248 int NodeReader::FindWord(const unsigned char* word,
249 int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const {
250 // Return 0 if the dictionary is corrupt as BDictReader::FindWord() does.
251 if (!bdict_data_ || node_offset_ > bdict_length_)
252 return 0;
254 if (is_leaf())
255 return CompareLeafNode(word, affix_indices);
257 if (is_lookup())
258 return FindInLookup(word, affix_indices);
259 if (is_list())
260 return FindInList(word, affix_indices);
261 return 0; // Corrupt file.
264 NodeReader::FindResult NodeReader::GetChildAt(int index, char* found_char,
265 NodeReader* result) const {
266 if (is_lookup()) {
267 if (lookup_has_0th()) {
268 if (index == 0) {
269 *found_char = 0;
270 return ReaderForLookup0th(result);
272 index--; // Make index relative to the non-0th-element table.
274 return ReaderForLookupAt(index, found_char, result);
276 if (is_list()) {
277 return ReaderForListAt(index, found_char, result);
279 return FIND_DONE;
282 int NodeReader::CompareLeafNode(
283 const unsigned char* word,
284 int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const {
285 // See if there is an additional string.
286 const unsigned char* additional = additional_string_for_leaf();
287 if (!additional) {
288 // No additional string. This means we should have reached the end of the
289 // word to get a match.
290 if (word[node_depth_] != 0)
291 return 0;
292 return FillAffixesForLeafMatch(0, affix_indices);
295 // Check the additional string.
296 int cur = 0;
297 while (&additional[cur] < bdict_end_ && additional[cur]) {
298 if (word[node_depth_ + cur] != additional[cur])
299 return 0; // Not a match.
300 cur++;
303 if (&additional[cur] == bdict_end_) {
304 is_valid_ = false;
305 return 0;
308 // Got to the end of the additional string, the word should also be over for
309 // a match (the same as above).
310 if (word[node_depth_ + cur] != 0)
311 return 0;
312 return FillAffixesForLeafMatch(cur + 1, affix_indices);
315 int NodeReader::FillAffixesForLeafMatch(
316 size_t additional_bytes,
317 int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const {
318 // The first match is easy, it always comes from the affix_id included in the
319 // leaf node.
320 affix_indices[0] = affix_id_for_leaf();
322 if (!leaf_has_following() && affix_indices[0] != BDict::FIRST_AFFIX_IS_UNUSED)
323 return 1; // Common case: no additional affix group IDs.
325 // We may or may not need to ignore that first value we just read, since it
326 // could be a dummy placeholder value. The |list_offset| is the starting
327 // position in the output list to write the rest of the values, which may
328 // overwrite the first value.
329 int list_offset = 1;
330 if (affix_indices[0] == BDict::FIRST_AFFIX_IS_UNUSED)
331 list_offset = 0;
333 // Save the end pointer (accounting for an odd number of bytes).
334 size_t array_start = node_offset_ + additional_bytes + 2;
335 const uint16* const bdict_short_end = reinterpret_cast<const uint16*>(
336 &bdict_data_[((bdict_length_ - array_start) & -2) + array_start]);
337 // Process all remaining matches.
338 const uint16* following_array = reinterpret_cast<const uint16*>(
339 &bdict_data_[array_start]);
340 for (int i = 0; i < BDict::MAX_AFFIXES_PER_WORD - list_offset; i++) {
341 if (&following_array[i] >= bdict_short_end) {
342 is_valid_ = false;
343 return 0;
345 if (following_array[i] == BDict::LEAF_NODE_FOLLOWING_LIST_TERMINATOR)
346 return i + list_offset; // Found the end of the list.
347 affix_indices[i + list_offset] = following_array[i];
349 return BDict::MAX_AFFIXES_PER_WORD;
352 int NodeReader::FindInLookup(
353 const unsigned char* word,
354 int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const {
355 unsigned char next_char = word[node_depth_];
357 NodeReader child_reader;
358 if (next_char == 0 && lookup_has_0th()) {
359 if (ReaderForLookup0th(&child_reader) != FIND_NODE)
360 return 0;
361 } else {
362 // Look up in the regular part of the table.
363 int offset_in_table = static_cast<int>(next_char) - lookup_first_char();
364 if (offset_in_table < 0 || offset_in_table > lookup_num_chars())
365 return 0; // Table can not include this value.
367 char dummy_char;
368 if (ReaderForLookupAt(offset_in_table, &dummy_char, &child_reader) !=
369 FIND_NODE)
370 return 0;
371 DCHECK(dummy_char == static_cast<char>(next_char));
374 if (!child_reader.is_valid())
375 return 0; // Something is messed up.
377 // Now recurse into that child node.
378 return child_reader.FindWord(word, affix_indices);
381 NodeReader::FindResult NodeReader::ReaderForLookup0th(
382 NodeReader* result) const {
383 size_t child_offset;
384 if (is_lookup_32()) {
385 child_offset = *reinterpret_cast<const unsigned int*>(
386 &bdict_data_[zeroth_entry_offset()]);
387 } else {
388 child_offset = *reinterpret_cast<const unsigned short*>(
389 &bdict_data_[zeroth_entry_offset()]);
390 child_offset += node_offset_;
393 // Range check the offset;
394 if (child_offset >= bdict_length_) {
395 is_valid_ = false;
396 return FIND_DONE;
399 // Now recurse into that child node. We don't advance to the next character
400 // here since the 0th element will be a leaf (see ReaderForLookupAt).
401 *result = NodeReader(bdict_data_, bdict_length_, child_offset, node_depth_);
402 return FIND_NODE;
405 NodeReader::FindResult NodeReader::ReaderForLookupAt(
406 size_t index,
407 char* found_char,
408 NodeReader* result) const {
409 const unsigned char* table_begin = &bdict_data_[lookup_table_offset()];
411 if (index >= static_cast<size_t>(lookup_num_chars()) || !is_valid_)
412 return FIND_DONE;
414 size_t child_offset;
415 if (is_lookup_32()) {
416 // Table contains 32-bit absolute offsets.
417 child_offset =
418 reinterpret_cast<const unsigned int*>(table_begin)[index];
419 if (!child_offset)
420 return FIND_NOTHING; // This entry in the table is empty.
421 } else {
422 // Table contains 16-bit offsets relative to the current node.
423 child_offset =
424 reinterpret_cast<const unsigned short*>(table_begin)[index];
425 if (!child_offset)
426 return FIND_NOTHING; // This entry in the table is empty.
427 child_offset += node_offset_;
430 // Range check the offset;
431 if (child_offset >= bdict_length_) {
432 is_valid_ = false;
433 return FIND_DONE; // Error.
436 // This is a bit tricky. When we've just reached the end of a word, the word
437 // itself will be stored in a leaf "node" off of this node. That node, of
438 // course, will want to know that it's the end of the word and so we have to
439 // have it use the same index into the word as we're using at this level.
441 // This happens when there is a word in the dictionary that is a strict
442 // prefix of other words in the dictionary, and so we'll have a non-leaf
443 // node representing the entire word before the ending leaf node.
445 // In all other cases, we want to advance to the next character. Even if the
446 // child node is a leaf, it will have an additional character that it will
447 // want to check.
448 *found_char = static_cast<char>(index + lookup_first_char());
449 if (!is_valid_)
450 return FIND_DONE;
451 int char_advance = *found_char == 0 ? 0 : 1;
453 *result = NodeReader(bdict_data_, bdict_length_,
454 child_offset, node_depth_ + char_advance);
455 return FIND_NODE;
458 int NodeReader::FindInList(
459 const unsigned char* word,
460 int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const {
461 unsigned char next_char = word[node_depth_];
463 // TODO(brettw) replace with binary search.
464 size_t list_count = list_item_count();
465 const unsigned char* list_begin = &bdict_data_[node_offset_ + 1];
467 int bytes_per_index = (is_list_16() ? 3 : 2);
469 for (size_t i = 0; i < list_count; i++) {
470 const unsigned char* list_current = &list_begin[i * bytes_per_index];
471 if (list_current >= bdict_end_) {
472 is_valid_ = false;
473 return 0;
475 if (*list_current == next_char) {
476 // Found a match.
477 char dummy_char;
478 NodeReader child_reader;
479 if (ReaderForListAt(i, &dummy_char, &child_reader) != FIND_NODE)
480 return 0;
481 DCHECK(dummy_char == static_cast<char>(next_char));
482 return child_reader.FindWord(word, affix_indices);
485 return 0;
488 NodeReader::FindResult NodeReader::ReaderForListAt(
489 size_t index,
490 char* found_char,
491 NodeReader* result) const {
492 size_t list_begin = node_offset_ + 1;
494 if (index >= list_item_count())
495 return FIND_DONE;
497 size_t offset;
498 if (is_list_16()) {
499 const unsigned char* list_item_begin = bdict_data_ + list_begin + index * 3;
500 *found_char = static_cast<char>(list_item_begin[0]);
502 // The children begin right after the list.
503 size_t children_begin = list_begin + list_item_count() * 3;
504 offset = children_begin + *reinterpret_cast<const unsigned short*>(
505 &list_item_begin[1]);
506 } else {
507 const unsigned char* list_item_begin = bdict_data_ + list_begin + index * 2;
508 *found_char = list_item_begin[0];
510 size_t children_begin = list_begin + list_item_count() * 2;
511 offset = children_begin + list_item_begin[1];
514 if (offset == 0 || node_offset_ >= bdict_length_) {
515 is_valid_ = false;
516 return FIND_DONE; // Error, should not happen except for corruption.
519 int char_advance = *found_char == 0 ? 0 : 1; // See ReaderForLookupAt.
520 *result = NodeReader(bdict_data_, bdict_length_,
521 offset, node_depth_ + char_advance);
522 return FIND_NODE;
525 // WordIterator ----------------------------------------------------------------
527 struct WordIterator::NodeInfo {
528 // The current offset is set to -1 so we start iterating at 0 when Advance
529 // is called.
530 NodeInfo(const NodeReader& rdr, char add)
531 : reader(rdr),
532 addition(add),
533 cur_offset(-1) {
536 // The reader for this node level.
537 NodeReader reader;
539 // The character that this level represents. For the 0th level, this will
540 // be 0 (since it is the root that represents no characters).
541 char addition;
543 // The current index into the reader that we're reading. Combined with the
544 // |stack_|, this allows us to iterate over the tree in depth-first order.
545 int cur_offset;
548 WordIterator::WordIterator(const NodeReader& reader) {
549 NodeInfo info(reader, 0);
550 stack_.push_back(info);
553 WordIterator::WordIterator(const WordIterator& other) {
554 operator=(other);
557 WordIterator::~WordIterator() {
558 // Can't be in the header for the NodeReader destructor.
561 WordIterator& WordIterator::operator=(const WordIterator& other) {
562 stack_ = other.stack_;
563 return *this;
566 int WordIterator::Advance(char* output_buffer, size_t output_len,
567 int affix_ids[BDict::MAX_AFFIXES_PER_WORD]) {
568 // In-order tree walker. This uses a loop for fake tail recursion.
569 while (!stack_.empty()) {
570 NodeInfo& cur = stack_.back();
571 cur.cur_offset++;
572 char cur_char;
573 NodeReader child_reader;
575 /*if (cur.reader.is_leaf()) {
576 child_reader = cur.reader;
577 cur_char = cur.addition;
578 stack_.pop_back();
579 return FoundLeaf(child_reader, cur_char, output_buffer, output_len,
580 affix_ids);
583 switch (cur.reader.GetChildAt(cur.cur_offset, &cur_char, &child_reader)) {
584 case NodeReader::FIND_NODE:
585 // Got a valid child node.
586 if (child_reader.is_leaf()) {
587 return FoundLeaf(child_reader, cur_char, output_buffer, output_len,
588 affix_ids);
591 // Not a leaf. Add the new node to our stack and try again.
592 stack_.push_back(NodeInfo(child_reader, cur_char));
593 break;
595 case NodeReader::FIND_NOTHING:
596 // This one is empty, but we're not done. Continue on.
597 break;
599 case NodeReader::FIND_DONE:
600 // No more children at this level, pop the stack and go back one.
601 stack_.pop_back();
605 return false;
608 int WordIterator::FoundLeaf(const NodeReader& reader, char cur_char,
609 char* output_buffer, size_t output_len,
610 int affix_ids[BDict::MAX_AFFIXES_PER_WORD]) {
611 // Remember that the first item in the stack is the root and so doesn't count.
612 int i;
613 for (i = 0; i < static_cast<int>(stack_.size()) - 1 && i < static_cast<int>(output_len) - 1; i++)
614 output_buffer[i] = stack_[i + 1].addition;
615 output_buffer[i++] = cur_char; // The one we just found.
617 // Possibly add any extra parts.
618 size_t additional_string_length = 0;
619 const char* additional = reinterpret_cast<const char*>(
620 reader.additional_string_for_leaf());
621 for (; i < static_cast<int>(output_len) - 1 && additional &&
622 additional[additional_string_length] != 0;
623 i++, additional_string_length++)
624 output_buffer[i] = additional[additional_string_length];
625 if (additional_string_length)
626 additional_string_length++; // Account for the null terminator.
627 output_buffer[i] = 0;
629 return reader.FillAffixesForLeafMatch(additional_string_length,
630 affix_ids);
633 // LineIterator ----------------------------------------------------------------
635 LineIterator::LineIterator(
636 const unsigned char* bdict_data,
637 size_t bdict_length,
638 size_t first_offset)
639 : bdict_data_(bdict_data),
640 bdict_length_(bdict_length),
641 cur_offset_(first_offset) {
644 // Returns true when all data has been read. We're done when we reach a
645 // double-NULL or a the end of the input (shouldn't happen).
646 bool LineIterator::IsDone() const {
647 return cur_offset_ >= bdict_length_ || bdict_data_[cur_offset_] == 0;
650 const char* LineIterator::Advance() {
651 if (IsDone())
652 return NULL;
654 const char* begin = reinterpret_cast<const char*>(&bdict_data_[cur_offset_]);
656 // Advance over this word to find the end.
657 while (cur_offset_ < bdict_length_ && bdict_data_[cur_offset_])
658 cur_offset_++;
659 cur_offset_++; // Advance over the NULL terminator.
661 return begin;
664 bool LineIterator::AdvanceAndCopy(char* buf, size_t buf_len) {
665 if (IsDone())
666 return false;
668 const char* begin = reinterpret_cast<const char*>(&bdict_data_[cur_offset_]);
670 // Advance over this word to find the end.
671 size_t i;
672 for (i = 0;
673 i < buf_len && cur_offset_ < bdict_length_ && bdict_data_[cur_offset_];
674 i++, cur_offset_++) {
675 buf[i] = bdict_data_[cur_offset_];
677 // Handle the NULL terminator.
678 cur_offset_++; // Consume in the input
679 if (i < buf_len)
680 buf[i] = 0; // Save in the output.
681 else
682 buf[buf_len - 1] = 0; // Overflow, make sure it's terminated.
684 return !!buf[0];
687 // ReplacementIterator ---------------------------------------------------------
689 // Fills pointers to NULL terminated strings into the given output params.
690 // Returns false if there are no more pairs and nothing was filled in.
691 bool ReplacementIterator::GetNext(const char** first, const char** second) {
692 if (IsDone())
693 return false;
694 *first = Advance();
695 *second = Advance();
696 return *first && *second;
699 // BDictReader -----------------------------------------------------------------
701 BDictReader::BDictReader()
702 : bdict_data_(NULL),
703 bdict_length_(0),
704 header_(NULL) {
707 bool BDictReader::Init(const unsigned char* bdict_data, size_t bdict_length) {
708 if (bdict_length < sizeof(BDict::Header))
709 return false;
711 // Check header.
712 header_ = reinterpret_cast<const BDict::Header*>(bdict_data);
713 if (header_->signature != BDict::SIGNATURE ||
714 header_->major_version > BDict::MAJOR_VERSION ||
715 header_->dic_offset > bdict_length)
716 return false;
718 // Get the affix header, make sure there is enough room for it.
719 if (header_->aff_offset + sizeof(BDict::AffHeader) > bdict_length)
720 return false;
721 aff_header_ = reinterpret_cast<const BDict::AffHeader*>(
722 &bdict_data[header_->aff_offset]);
724 // Make sure there is enough room for the affix group count dword.
725 if (aff_header_->affix_group_offset > bdict_length - sizeof(uint32))
726 return false;
728 // This function is called from SpellCheck::SpellCheckWord(), which blocks
729 // WebKit. To avoid blocking WebKit for a long time, we do not check the MD5
730 // digest here. Instead we check the MD5 digest when Chrome finishes
731 // downloading a dictionary.
733 // Don't set these until the end. This way, NULL bdict_data_ will indicate
734 // failure.
735 bdict_data_ = bdict_data;
736 bdict_length_ = bdict_length;
737 return true;
740 int BDictReader::FindWord(
741 const char* word,
742 int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const {
743 if (!bdict_data_ ||
744 header_->dic_offset >= bdict_length_) {
745 // When the dictionary is corrupt, we return 0 which means the word is valid
746 // and has no rules. This means when there is some problem, we'll default
747 // to no spellchecking rather than marking everything as misspelled.
748 return 0;
750 NodeReader reader(bdict_data_, bdict_length_, header_->dic_offset, 0);
751 return reader.FindWord(reinterpret_cast<const unsigned char*>(word),
752 affix_indices);
755 LineIterator BDictReader::GetAfLineIterator() const {
756 if (!bdict_data_ ||
757 aff_header_->affix_group_offset == 0 ||
758 aff_header_->affix_group_offset >= bdict_length_)
759 return LineIterator(bdict_data_, 0, 0); // Item is empty or invalid.
760 return LineIterator(bdict_data_, bdict_length_,
761 aff_header_->affix_group_offset);
764 LineIterator BDictReader::GetAffixLineIterator() const {
765 if (!bdict_data_ ||
766 aff_header_->affix_rule_offset == 0 ||
767 aff_header_->affix_rule_offset >= bdict_length_)
768 return LineIterator(bdict_data_, 0, 0); // Item is empty or invalid.
769 return LineIterator(bdict_data_, bdict_length_,
770 aff_header_->affix_rule_offset);
773 LineIterator BDictReader::GetOtherLineIterator() const {
774 if (!bdict_data_ ||
775 aff_header_->other_offset == 0 ||
776 aff_header_->other_offset >= bdict_length_)
777 return LineIterator(bdict_data_, 0, 0); // Item is empty or invalid.
778 return LineIterator(bdict_data_, bdict_length_,
779 aff_header_->other_offset);
782 ReplacementIterator BDictReader::GetReplacementIterator() const {
783 return ReplacementIterator(bdict_data_, bdict_length_,
784 aff_header_->rep_offset);
788 WordIterator BDictReader::GetAllWordIterator() const {
789 NodeReader reader(bdict_data_, bdict_length_, header_->dic_offset, 0);
790 return WordIterator(reader);
793 } // namespace hunspell