chrome/tools/convert_dict/aff_reader.cc

   1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "chrome/tools/convert_dict/aff_reader.h"
   6
   7 #include <algorithm>
   8
   9 #include "base/file_util.h"
  10 #include "base/i18n/icu_string_conversions.h"
  11 #include "base/stringprintf.h"
  12 #include "base/string_split.h"
  13 #include "base/utf_string_conversions.h"
  14 #include "chrome/tools/convert_dict/hunspell_reader.h"
  15
  16 namespace convert_dict {
  17
  18 namespace {
  19
  20 // Returns true if the given line begins with the given case-sensitive
  21 // NULL-terminated ASCII string.
  22 bool StringBeginsWith(const std::string& str, const char* with) {
  23   size_t cur = 0;
  24   while (cur < str.size() && with[cur] != 0) {
  25     if (str[cur] != with[cur])
  26       return false;
  27     cur++;
  28   }
  29   return with[cur] == 0;
  30 }
  31
  32 // Collapses runs of spaces to only one space.
  33 void CollapseDuplicateSpaces(std::string* str) {
  34   int prev_space = false;
  35   for (size_t i = 0; i < str->length(); i++) {
  36     if ((*str)[i] == ' ') {
  37       if (prev_space) {
  38         str->erase(str->begin() + i);
  39         i--;
  40       }
  41       prev_space = true;
  42     } else {
  43       prev_space = false;
  44     }
  45   }
  46 }
  47
  48 }  // namespace
  49
  50 AffReader::AffReader(const FilePath& path)
  51     : has_indexed_affixes_(false) {
  52   file_ = file_util::OpenFile(path, "r");
  53
  54   // Default to Latin1 in case the file doesn't specify it.
  55   encoding_ = "ISO8859-1";
  56 }
  57
  58 AffReader::~AffReader() {
  59   if (file_)
  60     file_util::CloseFile(file_);
  61 }
  62
  63 bool AffReader::Read() {
  64   if (!file_)
  65     return false;
  66
  67   // TODO(brettw) handle byte order mark.
  68
  69   bool got_command = false;
  70   bool got_first_af = false;
  71   bool got_first_rep = false;
  72
  73   has_indexed_affixes_ = false;
  74
  75   while (!feof(file_)) {
  76     std::string line = ReadLine(file_);
  77
  78     // Save comment lines before any commands.
  79     if (!got_command && !line.empty() && line[0] == '#') {
  80       intro_comment_.append(line);
  81       intro_comment_.push_back('\n');
  82       continue;
  83     }
  84
  85     StripComment(&line);
  86     if (line.empty())
  87       continue;
  88     got_command = true;
  89
  90     if (StringBeginsWith(line, "SET ")) {
  91       // Character set encoding.
  92       encoding_ = line.substr(4);
  93       TrimLine(&encoding_);
  94     } else if (StringBeginsWith(line, "AF ")) {
  95       // Affix. The first one is the number of ones following which we don't
  96       // bother with.
  97       has_indexed_affixes_ = true;
  98       if (got_first_af) {
  99         std::string group(line.substr(3));
 100         AddAffixGroup(&group);
 101       } else {
 102         got_first_af = true;
 103       }
 104     } else if (StringBeginsWith(line, "SFX ") ||
 105                StringBeginsWith(line, "PFX ")) {
 106       AddAffix(&line);
 107     } else if (StringBeginsWith(line, "REP ")) {
 108       // The first rep line is the number of ones following which we don't
 109       // bother with.
 110       if (got_first_rep) {
 111         std::string replacement(line.substr(4));
 112         AddReplacement(&replacement);
 113       } else {
 114         got_first_rep = true;
 115       }
 116     } else if (StringBeginsWith(line, "TRY ") ||
 117                StringBeginsWith(line, "MAP ")) {
 118       HandleEncodedCommand(line);
 119     } else if (StringBeginsWith(line, "IGNORE ")) {
 120       printf("We don't support the IGNORE command yet. This would change how "
 121         "we would insert things in our lookup table.\n");
 122       exit(1);
 123     } else if (StringBeginsWith(line, "COMPLEXPREFIXES ")) {
 124       printf("We don't support the COMPLEXPREFIXES command yet. This would "
 125         "mean we have to insert words backwards as well (I think)\n");
 126       exit(1);
 127     } else {
 128       // All other commands get stored in the other commands list.
 129       HandleRawCommand(line);
 130     }
 131   }
 132
 133   return true;
 134 }
 135
 136 bool AffReader::EncodingToUTF8(const std::string& encoded,
 137                                std::string* utf8) const {
 138   std::wstring wide_word;
 139   if (!base::CodepageToWide(encoded, encoding(),
 140                             base::OnStringConversionError::FAIL, &wide_word))
 141     return false;
 142   *utf8 = WideToUTF8(wide_word);
 143   return true;
 144 }
 145
 146 int AffReader::GetAFIndexForAFString(const std::string& af_string) {
 147   std::map<std::string, int>::iterator found = affix_groups_.find(af_string);
 148   if (found != affix_groups_.end())
 149     return found->second;
 150   std::string my_string(af_string);
 151   return AddAffixGroup(&my_string);
 152 }
 153
 154 // We convert the data from our map to an indexed list, and also prefix each
 155 // line with "AF" for the parser to read later.
 156 std::vector<std::string> AffReader::GetAffixGroups() const {
 157   int max_id = 0;
 158   for (std::map<std::string, int>::const_iterator i = affix_groups_.begin();
 159        i != affix_groups_.end(); ++i) {
 160     if (i->second > max_id)
 161       max_id = i->second;
 162   }
 163
 164   std::vector<std::string> ret;
 165
 166   ret.resize(max_id);
 167   for (std::map<std::string, int>::const_iterator i = affix_groups_.begin();
 168        i != affix_groups_.end(); ++i) {
 169     // Convert the indices into 1-based.
 170     ret[i->second - 1] = std::string("AF ") + i->first;
 171   }
 172
 173   return ret;
 174 }
 175
 176 int AffReader::AddAffixGroup(std::string* rule) {
 177   TrimLine(rule);
 178
 179   // We use the 1-based index of the rule. This matches the way Hunspell
 180   // refers to the numbers.
 181   int affix_id = static_cast<int>(affix_groups_.size()) + 1;
 182   affix_groups_.insert(std::make_pair(*rule, affix_id));
 183   return affix_id;
 184 }
 185
 186 void AffReader::AddAffix(std::string* rule) {
 187   TrimLine(rule);
 188   CollapseDuplicateSpaces(rule);
 189
 190   // These lines have two forms:
 191   //   AFX D Y 4       <- First line, lists how many affixes for "D" there are.
 192   //   AFX D   0 d e   <- Following lines.
 193   // We want to ensure the two last groups on the last line are encoded in
 194   // UTF-8, and we want to make sure that the affix identifier "D" is *not*
 195   // encoded, since that's basically an 8-bit identifier.
 196
 197   // Count to the third space. Everything after that will be re-encoded. This
 198   // will re-encode the number on the first line, but that will be a NOP. If
 199   // there are not that many groups, we won't reencode it, but pass it through.
 200   int found_spaces = 0;
 201   std::string token;
 202   for (size_t i = 0; i < rule->length(); i++) {
 203     if ((*rule)[i] == ' ') {
 204       found_spaces++;
 205       if (found_spaces == 3) {
 206         size_t part_start = i;
 207         std::string part;
 208         if (token[0] != 'Y' && token[0] != 'N') {
 209           // This token represents a stripping prefix or suffix, which is
 210           // either a length or a string to be replaced.
 211           // We also reencode them to UTF-8.
 212           part_start = i - token.length();
 213         }
 214         part = rule->substr(part_start);  // From here to end.
 215
 216         if (part.find('-') != std::string::npos) {
 217           // This rule has a morph rule used by old Hungarian dictionaries.
 218           // When a line has a morph rule, its format becomes as listed below.
 219           //   AFX D   0 d e - M
 220           // To make hunspell work more happily, replace this morph rule with
 221           // a compound flag as listed below.
 222           //   AFX D   0 d/M e
 223           std::vector<std::string> tokens;
 224           base::SplitString(part, ' ', &tokens);
 225           if (tokens.size() >= 5) {
 226             part = base::StringPrintf("%s %s/%s %s",
 227                                       tokens[0].c_str(),
 228                                       tokens[1].c_str(),
 229                                       tokens[4].c_str(),
 230                                       tokens[2].c_str());
 231           }
 232         }
 233
 234         size_t slash_index = part.find('/');
 235         if (slash_index != std::string::npos && !has_indexed_affixes()) {
 236           // This can also have a rule string associated with it following a
 237           // slash. For example:
 238           //    PFX P   0 foo/Y  .
 239           // The "Y" is a flag. For example, the aff file might have a line:
 240           //    COMPOUNDFLAG Y
 241           // so that means that this prefix would be a compound one.
 242           //
 243           // It expects these rules to use the same alias rules as the .dic
 244           // file. We've forced it to use aliases, which is a numerical index
 245           // instead of these character flags, and this needs to be consistent.
 246
 247           std::string before_flags = part.substr(0, slash_index + 1);
 248
 249           // After the slash are both the flags, then whitespace, then the part
 250           // that tells us what to strip.
 251           std::vector<std::string> after_slash;
 252           base::SplitString(part.substr(slash_index + 1), ' ', &after_slash);
 253           if (after_slash.size() == 0) {
 254             printf("ERROR: Found 0 terms after slash in affix rule '%s', "
 255                       "but need at least 2.\n",
 256                    part.c_str());
 257             return;
 258           }
 259           if (after_slash.size() == 1) {
 260             printf("WARNING: Found 1 term after slash in affix rule '%s', "
 261                       "but expected at least 2. Adding '.'.\n",
 262                    part.c_str());
 263             after_slash.push_back(".");
 264           }
 265           // Note that we may get a third term here which is the morphological
 266           // description of this rule. This happens in the tests only, so we can
 267           // just ignore it.
 268
 269           part = base::StringPrintf("%s%d %s",
 270                                     before_flags.c_str(),
 271                                     GetAFIndexForAFString(after_slash[0]),
 272                                     after_slash[1].c_str());
 273         }
 274
 275         // Reencode from here
 276         std::string reencoded;
 277         if (!EncodingToUTF8(part, &reencoded)) {
 278           printf("ERROR: Cannot encode affix rule part '%s' to utf8.\n",
 279                  part.c_str());
 280           break;
 281         }
 282
 283         *rule = rule->substr(0, part_start) + reencoded;
 284         break;
 285       }
 286       token.clear();
 287     } else {
 288       token.push_back((*rule)[i]);
 289     }
 290   }
 291
 292   affix_rules_.push_back(*rule);
 293 }
 294
 295 void AffReader::AddReplacement(std::string* rule) {
 296   TrimLine(rule);
 297   CollapseDuplicateSpaces(rule);
 298
 299   std::string utf8rule;
 300   if (!EncodingToUTF8(*rule, &utf8rule)) {
 301     printf("ERROR: Cannot encode replacement rule '%s' to utf8.\n",
 302            rule->c_str());
 303     return;
 304   }
 305
 306   // The first space separates key and value.
 307   size_t space_index = utf8rule.find(' ');
 308   if (space_index == std::string::npos) {
 309     printf("ERROR: Did not find a space in '%s'.\n", utf8rule.c_str());
 310     return;
 311   }
 312   std::vector<std::string> split;
 313   split.push_back(utf8rule.substr(0, space_index));
 314   split.push_back(utf8rule.substr(space_index + 1));
 315
 316   // Underscores are used to represent spaces in most aff files
 317   // (since the line is parsed on spaces).
 318   std::replace(split[0].begin(), split[0].end(), '_', ' ');
 319   std::replace(split[1].begin(), split[1].end(), '_', ' ');
 320
 321   replacements_.push_back(std::make_pair(split[0], split[1]));
 322 }
 323
 324 void AffReader::HandleRawCommand(const std::string& line) {
 325   other_commands_.push_back(line);
 326 }
 327
 328 void AffReader::HandleEncodedCommand(const std::string& line) {
 329   std::string utf8;
 330   if (!EncodingToUTF8(line, &utf8)) {
 331     printf("ERROR: Cannot encode command '%s' to utf8.\n", line.c_str());
 332     return;
 333   }
 334   other_commands_.push_back(utf8);
 335 }
 336
 337 }  // namespace convert_dict