Merge Chromium + Blink git repositories
[chromium-blink-merge.git] / net / tools / tld_cleanup / tld_cleanup_util.cc
blobf4d93ac0b1744afe135ea1c6e2799fc1cc9a5083
1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "net/tools/tld_cleanup/tld_cleanup_util.h"
7 #include "base/files/file_util.h"
8 #include "base/logging.h"
9 #include "base/strings/string_number_conversions.h"
10 #include "base/strings/string_util.h"
11 #include "url/gurl.h"
12 #include "url/third_party/mozilla/url_parse.h"
14 namespace {
// Marker comments in the input data that open and close the section holding
// private domains. Rules parsed between the two markers are flagged private.
const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS===";
const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS===";

// Flag values written to the "type" column of the generated rule table.
// Each flag occupies its own bit so that the private flag can be combined
// with either the exception flag or the wildcard flag.
const int kExceptionRule = 1 << 0;
const int kWildcardRule = 1 << 1;
const int kPrivateRule = 1 << 2;
24 namespace net {
25 namespace tld_cleanup {
27 // Writes the list of domain rules contained in the 'rules' set to the
28 // 'outfile', with each rule terminated by a LF. The file must already have
29 // been created with write access.
30 bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) {
31 std::string data;
32 data.append("%{\n"
33 "// Copyright 2012 The Chromium Authors. All rights reserved.\n"
34 "// Use of this source code is governed by a BSD-style license "
35 "that can be\n"
36 "// found in the LICENSE file.\n\n"
37 "// This file is generated by net/tools/tld_cleanup/.\n"
38 "// DO NOT MANUALLY EDIT!\n"
39 "%}\n"
40 "struct DomainRule {\n"
41 " int name_offset;\n"
42 " int type; // flags: 1: exception, 2: wildcard, 4: private\n"
43 "};\n"
44 "%%\n");
46 for (RuleMap::const_iterator i = rules.begin(); i != rules.end(); ++i) {
47 data.append(i->first);
48 data.append(", ");
49 int type = 0;
50 if (i->second.exception) {
51 type = kExceptionRule;
52 } else if (i->second.wildcard) {
53 type = kWildcardRule;
55 if (i->second.is_private) {
56 type += kPrivateRule;
58 data.append(base::IntToString(type));
59 data.append("\n");
62 data.append("%%\n");
64 int written = base::WriteFile(outfile,
65 data.data(),
66 static_cast<int>(data.size()));
68 return written == static_cast<int>(data.size());
71 // Adjusts the rule to a standard form: removes single extraneous dots and
72 // canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as
73 // valid; logs a warning and returns kWarning if it is probably invalid; and
74 // logs an error and returns kError if the rule is (almost) certainly invalid.
75 NormalizeResult NormalizeRule(std::string* domain, Rule* rule) {
76 NormalizeResult result = kSuccess;
78 // Strip single leading and trailing dots.
79 if (domain->at(0) == '.')
80 domain->erase(0, 1);
81 if (domain->empty()) {
82 LOG(WARNING) << "Ignoring empty rule";
83 return kWarning;
85 if (domain->at(domain->size() - 1) == '.')
86 domain->erase(domain->size() - 1, 1);
87 if (domain->empty()) {
88 LOG(WARNING) << "Ignoring empty rule";
89 return kWarning;
92 // Allow single leading '*.' or '!', saved here so it's not canonicalized.
93 size_t start_offset = 0;
94 if (domain->at(0) == '!') {
95 domain->erase(0, 1);
96 rule->exception = true;
97 } else if (domain->find("*.") == 0) {
98 domain->erase(0, 2);
99 rule->wildcard = true;
101 if (domain->empty()) {
102 LOG(WARNING) << "Ignoring empty rule";
103 return kWarning;
106 // Warn about additional '*.' or '!'.
107 if (domain->find("*.", start_offset) != std::string::npos ||
108 domain->find('!', start_offset) != std::string::npos) {
109 LOG(WARNING) << "Keeping probably invalid rule: " << *domain;
110 result = kWarning;
113 // Make a GURL and normalize it, then get the host back out.
114 std::string url = "http://";
115 url.append(*domain);
116 GURL gurl(url);
117 const std::string& spec = gurl.possibly_invalid_spec();
118 url::Component host = gurl.parsed_for_possibly_invalid_spec().host;
119 if (host.len < 0) {
120 LOG(ERROR) << "Ignoring rule that couldn't be normalized: " << *domain;
121 return kError;
123 if (!gurl.is_valid()) {
124 LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *domain;
125 result = kWarning;
127 domain->assign(spec.substr(host.begin, host.len));
129 return result;
132 NormalizeResult NormalizeDataToRuleMap(const std::string data,
133 RuleMap* rules) {
134 CHECK(rules);
135 // We do a lot of string assignment during parsing, but simplicity is more
136 // important than performance here.
137 std::string domain;
138 NormalizeResult result = kSuccess;
139 size_t line_start = 0;
140 size_t line_end = 0;
141 bool is_private = false;
142 RuleMap extra_rules;
143 int begin_private_length = arraysize(kBeginPrivateDomainsComment) - 1;
144 int end_private_length = arraysize(kEndPrivateDomainsComment) - 1;
145 while (line_start < data.size()) {
146 if (line_start + begin_private_length < data.size() &&
147 !data.compare(line_start, begin_private_length,
148 kBeginPrivateDomainsComment)) {
149 is_private = true;
150 line_end = line_start + begin_private_length;
151 } else if (line_start + end_private_length < data.size() &&
152 !data.compare(line_start, end_private_length,
153 kEndPrivateDomainsComment)) {
154 is_private = false;
155 line_end = line_start + end_private_length;
156 } else if (line_start + 1 < data.size() &&
157 data[line_start] == '/' &&
158 data[line_start + 1] == '/') {
159 // Skip comments.
160 line_end = data.find_first_of("\r\n", line_start);
161 if (line_end == std::string::npos)
162 line_end = data.size();
163 } else {
164 // Truncate at first whitespace.
165 line_end = data.find_first_of("\r\n \t", line_start);
166 if (line_end == std::string::npos)
167 line_end = data.size();
168 domain.assign(data.data(), line_start, line_end - line_start);
170 Rule rule;
171 rule.wildcard = false;
172 rule.exception = false;
173 rule.is_private = is_private;
174 NormalizeResult new_result = NormalizeRule(&domain, &rule);
175 if (new_result != kError) {
176 // Check the existing rules to make sure we don't have an exception and
177 // wildcard for the same rule, or that the same domain is listed as both
178 // private and not private. If we did, we'd have to update our
179 // parsing code to handle this case.
180 CHECK(rules->find(domain) == rules->end())
181 << "Duplicate rule found for " << domain;
183 (*rules)[domain] = rule;
184 // Add true TLD for multi-level rules. We don't add them right now, in
185 // case there's an exception or wild card that either exists or might be
186 // added in a later iteration. In those cases, there's no need to add
187 // it and it would just slow down parsing the data.
188 size_t tld_start = domain.find_last_of('.');
189 if (tld_start != std::string::npos && tld_start + 1 < domain.size()) {
190 std::string extra_rule_domain = domain.substr(tld_start + 1);
191 RuleMap::const_iterator iter = extra_rules.find(extra_rule_domain);
192 Rule extra_rule;
193 extra_rule.exception = false;
194 extra_rule.wildcard = false;
195 if (iter == extra_rules.end()) {
196 extra_rule.is_private = is_private;
197 } else {
198 // A rule already exists, so we ensure that if any of the entries is
199 // not private the result should be that the entry is not private.
200 // An example is .au which is not listed as a real TLD, but only
201 // lists second-level domains such as com.au. Subdomains of .au
202 // (eg. blogspot.com.au) are also listed in the private section,
203 // which is processed later, so this ensures that the real TLD
204 // (eg. .au) is listed as public.
205 extra_rule.is_private = is_private && iter->second.is_private;
207 extra_rules[extra_rule_domain] = extra_rule;
210 result = std::max(result, new_result);
213 // Find beginning of next non-empty line.
214 line_start = data.find_first_of("\r\n", line_end);
215 if (line_start == std::string::npos)
216 line_start = data.size();
217 line_start = data.find_first_not_of("\r\n", line_start);
218 if (line_start == std::string::npos)
219 line_start = data.size();
222 for (RuleMap::const_iterator iter = extra_rules.begin();
223 iter != extra_rules.end();
224 ++iter) {
225 if (rules->find(iter->first) == rules->end()) {
226 (*rules)[iter->first] = iter->second;
230 return result;
233 NormalizeResult NormalizeFile(const base::FilePath& in_filename,
234 const base::FilePath& out_filename) {
235 RuleMap rules;
236 std::string data;
237 if (!base::ReadFileToString(in_filename, &data)) {
238 LOG(ERROR) << "Unable to read file";
239 // We return success since we've already reported the error.
240 return kSuccess;
243 NormalizeResult result = NormalizeDataToRuleMap(data, &rules);
245 if (!WriteRules(rules, out_filename)) {
246 LOG(ERROR) << "Error(s) writing output file";
247 result = kError;
250 return result;
254 } // namespace tld_cleanup
255 } // namespace net