chrome/tools/convert_dict/convert_dict_unittest.cc

   1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include <map>
   6 #include <string>
   7
   8 #include "base/file_util.h"
   9 #include "base/format_macros.h"
  10 #include "base/i18n/icu_string_conversions.h"
  11 #include "base/strings/stringprintf.h"
  12 #include "base/strings/utf_string_conversions.h"
  13 #include "chrome/tools/convert_dict/aff_reader.h"
  14 #include "chrome/tools/convert_dict/dic_reader.h"
  15 #include "testing/gtest/include/gtest/gtest.h"
  16 #include "third_party/hunspell/google/bdict_reader.h"
  17 #include "third_party/hunspell/google/bdict_writer.h"
  18
  19 namespace {
  20
  21 // Compares the given word list with the serialized trie to make sure they
  22 // are the same.
  23 // (This function is copied from "chrome/tools/convert_dict/convert_dict.cc").
  24 bool VerifyWords(const convert_dict::DicReader::WordList& org_words,
  25                  const std::string& serialized) {
  26   hunspell::BDictReader reader;
  27   EXPECT_TRUE(
  28       reader.Init(reinterpret_cast<const unsigned char*>(serialized.data()),
  29       serialized.size()));
  30
  31   hunspell::WordIterator iter = reader.GetAllWordIterator();
  32
  33   int affix_ids[hunspell::BDict::MAX_AFFIXES_PER_WORD];
  34
  35   static const int kBufSize = 128;
  36   char buf[kBufSize];
  37   for (size_t i = 0; i < org_words.size(); i++) {
  38     SCOPED_TRACE(base::StringPrintf(
  39         "org_words[%" PRIuS "]: %s", i, org_words[i].first.c_str()));
  40
  41     int affix_matches = iter.Advance(buf, kBufSize, affix_ids);
  42     EXPECT_NE(0, affix_matches);
  43     EXPECT_EQ(org_words[i].first, std::string(buf));
  44     EXPECT_EQ(affix_matches, static_cast<int>(org_words[i].second.size()));
  45
  46     // Check the individual affix indices.
  47     for (size_t affix_index = 0; affix_index < org_words[i].second.size();
  48          affix_index++) {
  49       EXPECT_EQ(affix_ids[affix_index], org_words[i].second[affix_index]);
  50     }
  51   }
  52
  53   return true;
  54 }
  55
  56 // Implements the test process used by ConvertDictTest.
  57 // This function encapsulates all complicated operations used by
  58 // ConvertDictTest so we can conceal them from the tests themselves.
  59 // This function consists of the following parts:
  60 // * Creates a dummy affix file and a dictionary file.
  61 // * Reads the dummy files.
  62 // * Creates bdict data.
  63 // * Verify the bdict data.
  64 void RunDictionaryTest(const char* codepage,
  65                        const std::map<base::string16, bool>& word_list) {
  66   // Create an affix data and a dictionary data.
  67   std::string aff_data(base::StringPrintf("SET %s\n", codepage));
  68
  69   std::string dic_data(base::StringPrintf("%" PRIuS "\n", word_list.size()));
  70   for (std::map<base::string16, bool>::const_iterator it = word_list.begin();
  71        it != word_list.end(); ++it) {
  72     std::string encoded_word;
  73     EXPECT_TRUE(UTF16ToCodepage(it->first,
  74                                 codepage,
  75                                 base::OnStringConversionError::FAIL,
  76                                 &encoded_word));
  77     dic_data += encoded_word;
  78     dic_data += "\n";
  79   }
  80
  81   // Create a temporary affix file and a dictionary file from the test data.
  82   base::FilePath aff_file;
  83   base::CreateTemporaryFile(&aff_file);
  84   base::WriteFile(aff_file, aff_data.c_str(), aff_data.length());
  85
  86   base::FilePath dic_file;
  87   base::CreateTemporaryFile(&dic_file);
  88   base::WriteFile(dic_file, dic_data.c_str(), dic_data.length());
  89
  90   {
  91     // Read the above affix file with AffReader and read the dictionary file
  92     // with DicReader, respectively.
  93     convert_dict::AffReader aff_reader(aff_file);
  94     EXPECT_TRUE(aff_reader.Read());
  95
  96     convert_dict::DicReader dic_reader(dic_file);
  97     EXPECT_TRUE(dic_reader.Read(&aff_reader));
  98
  99     // Verify this DicReader includes all the input words.
 100     EXPECT_EQ(word_list.size(), dic_reader.words().size());
 101     for (size_t i = 0; i < dic_reader.words().size(); ++i) {
 102       SCOPED_TRACE(base::StringPrintf("dic_reader.words()[%" PRIuS "]: %s",
 103                                       i, dic_reader.words()[i].first.c_str()));
 104       base::string16 word(base::UTF8ToUTF16(dic_reader.words()[i].first));
 105       EXPECT_TRUE(word_list.find(word) != word_list.end());
 106     }
 107
 108     // Create BDICT data and verify it.
 109     hunspell::BDictWriter writer;
 110     writer.SetComment(aff_reader.comments());
 111     writer.SetAffixRules(aff_reader.affix_rules());
 112     writer.SetAffixGroups(aff_reader.GetAffixGroups());
 113     writer.SetReplacements(aff_reader.replacements());
 114     writer.SetOtherCommands(aff_reader.other_commands());
 115     writer.SetWords(dic_reader.words());
 116
 117     std::string bdict_data = writer.GetBDict();
 118     VerifyWords(dic_reader.words(), bdict_data);
 119     EXPECT_TRUE(hunspell::BDict::Verify(bdict_data.data(), bdict_data.size()));
 120
 121     // Trim the end of this BDICT and verify our verifier tells these trimmed
 122     // BDICTs are corrupted.
 123     for (size_t i = 1; i < bdict_data.size(); ++i) {
 124       SCOPED_TRACE(base::StringPrintf("i = %" PRIuS, i));
 125       EXPECT_FALSE(hunspell::BDict::Verify(bdict_data.data(),
 126                                            bdict_data.size() - i));
 127     }
 128   }
 129
 130   // Deletes the temporary files.
 131   // We need to delete them after the above AffReader and DicReader are deleted
 132   // since they close the input files in their destructors.
 133   base::DeleteFile(aff_file, false);
 134   base::DeleteFile(dic_file, false);
 135 }
 136
 137 }  // namespace
 138
 139 // Tests whether or not our DicReader can read all the input English words
 140 TEST(ConvertDictTest, English) {
 141   const char kCodepage[] = "UTF-8";
 142   const wchar_t* kWords[] = {
 143     L"I",
 144     L"he",
 145     L"she",
 146     L"it",
 147     L"we",
 148     L"you",
 149     L"they",
 150   };
 151
 152   std::map<base::string16, bool> word_list;
 153   for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kWords); ++i)
 154     word_list.insert(
 155         std::make_pair<base::string16, bool>(base::WideToUTF16(kWords[i]),
 156                                              true));
 157
 158   RunDictionaryTest(kCodepage, word_list);
 159 }
 160
 161 // Tests whether or not our DicReader can read all the input Russian words.
 162 TEST(ConvertDictTest, Russian) {
 163   const char kCodepage[] = "KOI8-R";
 164   const wchar_t* kWords[] = {
 165     L"\x044f",
 166     L"\x0442\x044b",
 167     L"\x043e\x043d",
 168     L"\x043e\x043d\x0430",
 169     L"\x043e\x043d\x043e",
 170     L"\x043c\x044b",
 171     L"\x0432\x044b",
 172     L"\x043e\x043d\x0438",
 173   };
 174
 175   std::map<base::string16, bool> word_list;
 176   for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kWords); ++i)
 177     word_list.insert(
 178         std::make_pair<base::string16, bool>(base::WideToUTF16(kWords[i]),
 179                                              true));
 180
 181   RunDictionaryTest(kCodepage, word_list);
 182 }
 183
 184 // Tests whether or not our DicReader can read all the input Hungarian words.
 185 TEST(ConvertDictTest, Hungarian) {
 186   const char kCodepage[] = "ISO8859-2";
 187   const wchar_t* kWords[] = {
 188     L"\x00e9\x006e",
 189     L"\x0074\x0065",
 190     L"\x0151",
 191     L"\x00f6\x006e",
 192     L"\x006d\x0061\x0067\x0061",
 193     L"\x006d\x0069",
 194     L"\x0074\x0069",
 195     L"\x0151\x006b",
 196     L"\x00f6\x006e\x00f6\x006b",
 197     L"\x006d\x0061\x0067\x0075\x006b",
 198   };
 199
 200   std::map<base::string16, bool> word_list;
 201   for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kWords); ++i)
 202     word_list.insert(
 203         std::make_pair<base::string16, bool>(base::WideToUTF16(kWords[i]),
 204                                              true));
 205
 206   RunDictionaryTest(kCodepage, word_list);
 207 }