net/tools/tld_cleanup/tld_cleanup_util.cc

   1 // Copyright 2013 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "net/tools/tld_cleanup/tld_cleanup_util.h"
   6
   7 #include "base/file_util.h"
   8 #include "base/logging.h"
   9 #include "base/strings/string_number_conversions.h"
  10 #include "base/strings/string_util.h"
  11 #include "url/gurl.h"
  12 #include "url/url_parse.h"
  13
  14 namespace {
  15
  16 const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS===";
  17 const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS===";
  18
  19 const int kExceptionRule = 1;
  20 const int kWildcardRule = 2;
  21 const int kPrivateRule = 4;
  22 }
  23
  24 namespace net {
  25 namespace tld_cleanup {
  26
  27 // Writes the list of domain rules contained in the 'rules' set to the
  28 // 'outfile', with each rule terminated by a LF.  The file must already have
  29 // been created with write access.
  30 bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) {
  31   std::string data;
  32   data.append("%{\n"
  33               "// Copyright 2012 The Chromium Authors. All rights reserved.\n"
  34               "// Use of this source code is governed by a BSD-style license "
  35               "that can be\n"
  36               "// found in the LICENSE file.\n\n"
  37               "// This file is generated by net/tools/tld_cleanup/.\n"
  38               "// DO NOT MANUALLY EDIT!\n"
  39               "%}\n"
  40               "struct DomainRule {\n"
  41               "  int name_offset;\n"
  42               "  int type;  // flags: 1: exception, 2: wildcard, 4: private\n"
  43               "};\n"
  44               "%%\n");
  45
  46   for (RuleMap::const_iterator i = rules.begin(); i != rules.end(); ++i) {
  47     data.append(i->first);
  48     data.append(", ");
  49     int type = 0;
  50     if (i->second.exception) {
  51       type = kExceptionRule;
  52     } else if (i->second.wildcard) {
  53       type = kWildcardRule;
  54     }
  55     if (i->second.is_private) {
  56       type += kPrivateRule;
  57     }
  58     data.append(base::IntToString(type));
  59     data.append("\n");
  60   }
  61
  62   data.append("%%\n");
  63
  64   int written = base::WriteFile(outfile,
  65                                      data.data(),
  66                                      static_cast<int>(data.size()));
  67
  68   return written == static_cast<int>(data.size());
  69 }
  70
  71 // Adjusts the rule to a standard form: removes single extraneous dots and
  72 // canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as
  73 // valid; logs a warning and returns kWarning if it is probably invalid; and
  74 // logs an error and returns kError if the rule is (almost) certainly invalid.
  75 NormalizeResult NormalizeRule(std::string* domain, Rule* rule) {
  76   NormalizeResult result = kSuccess;
  77
  78   // Strip single leading and trailing dots.
  79   if (domain->at(0) == '.')
  80     domain->erase(0, 1);
  81   if (domain->empty()) {
  82     LOG(WARNING) << "Ignoring empty rule";
  83     return kWarning;
  84   }
  85   if (domain->at(domain->size() - 1) == '.')
  86     domain->erase(domain->size() - 1, 1);
  87   if (domain->empty()) {
  88     LOG(WARNING) << "Ignoring empty rule";
  89     return kWarning;
  90   }
  91
  92   // Allow single leading '*.' or '!', saved here so it's not canonicalized.
  93   size_t start_offset = 0;
  94   if (domain->at(0) == '!') {
  95     domain->erase(0, 1);
  96     rule->exception = true;
  97   } else if (domain->find("*.") == 0) {
  98     domain->erase(0, 2);
  99     rule->wildcard = true;
 100   }
 101   if (domain->empty()) {
 102     LOG(WARNING) << "Ignoring empty rule";
 103     return kWarning;
 104   }
 105
 106   // Warn about additional '*.' or '!'.
 107   if (domain->find("*.", start_offset) != std::string::npos ||
 108       domain->find('!', start_offset) != std::string::npos) {
 109     LOG(WARNING) << "Keeping probably invalid rule: " << *domain;
 110     result = kWarning;
 111   }
 112
 113   // Make a GURL and normalize it, then get the host back out.
 114   std::string url = "http://";
 115   url.append(*domain);
 116   GURL gurl(url);
 117   const std::string& spec = gurl.possibly_invalid_spec();
 118   url::Component host = gurl.parsed_for_possibly_invalid_spec().host;
 119   if (host.len < 0) {
 120     LOG(ERROR) << "Ignoring rule that couldn't be normalized: " << *domain;
 121     return kError;
 122   }
 123   if (!gurl.is_valid()) {
 124     LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *domain;
 125     result = kWarning;
 126   }
 127   domain->assign(spec.substr(host.begin, host.len));
 128
 129   return result;
 130 }
 131
 132 NormalizeResult NormalizeDataToRuleMap(const std::string data,
 133                                        RuleMap* rules) {
 134   CHECK(rules);
 135   // We do a lot of string assignment during parsing, but simplicity is more
 136   // important than performance here.
 137   std::string domain;
 138   NormalizeResult result = kSuccess;
 139   size_t line_start = 0;
 140   size_t line_end = 0;
 141   bool is_private = false;
 142   RuleMap extra_rules;
 143   int begin_private_length = arraysize(kBeginPrivateDomainsComment) - 1;
 144   int end_private_length = arraysize(kEndPrivateDomainsComment) - 1;
 145   while (line_start < data.size()) {
 146     if (line_start + begin_private_length < data.size() &&
 147         !data.compare(line_start, begin_private_length,
 148                       kBeginPrivateDomainsComment)) {
 149       is_private = true;
 150       line_end = line_start + begin_private_length;
 151     } else if (line_start + end_private_length < data.size() &&
 152         !data.compare(line_start, end_private_length,
 153                       kEndPrivateDomainsComment)) {
 154       is_private = false;
 155       line_end = line_start + end_private_length;
 156     } else if (line_start + 1 < data.size() &&
 157         data[line_start] == '/' &&
 158         data[line_start + 1] == '/') {
 159       // Skip comments.
 160       line_end = data.find_first_of("\r\n", line_start);
 161       if (line_end == std::string::npos)
 162         line_end = data.size();
 163     } else {
 164       // Truncate at first whitespace.
 165       line_end = data.find_first_of("\r\n \t", line_start);
 166       if (line_end == std::string::npos)
 167         line_end = data.size();
 168       domain.assign(data.data(), line_start, line_end - line_start);
 169
 170       Rule rule;
 171       rule.wildcard = false;
 172       rule.exception = false;
 173       rule.is_private = is_private;
 174       NormalizeResult new_result = NormalizeRule(&domain, &rule);
 175       if (new_result != kError) {
 176         // Check the existing rules to make sure we don't have an exception and
 177         // wildcard for the same rule, or that the same domain is listed as both
 178         // private and not private. If we did, we'd have to update our
 179         // parsing code to handle this case.
 180         CHECK(rules->find(domain) == rules->end())
 181             << "Duplicate rule found for " << domain;
 182
 183         (*rules)[domain] = rule;
 184         // Add true TLD for multi-level rules.  We don't add them right now, in
 185         // case there's an exception or wild card that either exists or might be
 186         // added in a later iteration.  In those cases, there's no need to add
 187         // it and it would just slow down parsing the data.
 188         size_t tld_start = domain.find_last_of('.');
 189         if (tld_start != std::string::npos && tld_start + 1 < domain.size()) {
 190           std::string extra_rule_domain = domain.substr(tld_start + 1);
 191           RuleMap::const_iterator iter = extra_rules.find(extra_rule_domain);
 192           Rule extra_rule;
 193           extra_rule.exception = false;
 194           extra_rule.wildcard = false;
 195           if (iter == extra_rules.end()) {
 196             extra_rule.is_private = is_private;
 197           } else {
 198             // A rule already exists, so we ensure that if any of the entries is
 199             // not private the result should be that the entry is not private.
 200             // An example is .au which is not listed as a real TLD, but only
 201             // lists second-level domains such as com.au. Subdomains of .au
 202             // (eg. blogspot.com.au) are also listed in the private section,
 203             // which is processed later, so this ensures that the real TLD
 204             // (eg. .au) is listed as public.
 205             extra_rule.is_private = is_private && iter->second.is_private;
 206           }
 207           extra_rules[extra_rule_domain] = extra_rule;
 208         }
 209       }
 210       result = std::max(result, new_result);
 211     }
 212
 213     // Find beginning of next non-empty line.
 214     line_start = data.find_first_of("\r\n", line_end);
 215     if (line_start == std::string::npos)
 216       line_start = data.size();
 217     line_start = data.find_first_not_of("\r\n", line_start);
 218     if (line_start == std::string::npos)
 219       line_start = data.size();
 220   }
 221
 222   for (RuleMap::const_iterator iter = extra_rules.begin();
 223        iter != extra_rules.end();
 224        ++iter) {
 225     if (rules->find(iter->first) == rules->end()) {
 226       (*rules)[iter->first] = iter->second;
 227     }
 228   }
 229
 230   return result;
 231 }
 232
 233 NormalizeResult NormalizeFile(const base::FilePath& in_filename,
 234                               const base::FilePath& out_filename) {
 235   RuleMap rules;
 236   std::string data;
 237   if (!base::ReadFileToString(in_filename, &data)) {
 238     LOG(ERROR) << "Unable to read file";
 239     // We return success since we've already reported the error.
 240     return kSuccess;
 241   }
 242
 243   NormalizeResult result = NormalizeDataToRuleMap(data, &rules);
 244
 245   if (!WriteRules(rules, out_filename)) {
 246     LOG(ERROR) << "Error(s) writing output file";
 247     result = kError;
 248   }
 249
 250   return result;
 251 }
 252
 253
 254 }  // namespace tld_cleanup
 255 }  // namespace net