1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "net/tools/tld_cleanup/tld_cleanup_util.h"
7 #include "base/files/file_util.h"
8 #include "base/logging.h"
9 #include "base/strings/string_number_conversions.h"
10 #include "base/strings/string_util.h"
12 #include "url/third_party/mozilla/url_parse.h"
// Marker comments that open and close the private-domains section of the
// input rule list.
const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS===";
const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS===";

// Bit flags describing a rule (1: exception, 2: wildcard, 4: private).
const int kExceptionRule = 1 << 0;
const int kWildcardRule = 1 << 1;
const int kPrivateRule = 1 << 2;
25 namespace tld_cleanup
{
27 // Writes the list of domain rules contained in the 'rules' set to the
28 // 'outfile', with each rule terminated by a LF. The file must already have
29 // been created with write access.
//
// For every rule the rule name (the map key) and an integer type built from
// the rule's exception / wildcard / private flags are appended to an
// in-memory buffer, which is then written with base::WriteFile().
// Returns true only if the byte count reported by base::WriteFile() equals
// the size of that buffer.
//
// NOTE(review): some statements of this function (for example the
// declarations of |data| and |type|) are not visible in this listing; the
// comments below describe only the statements that are.
30 bool WriteRules(const RuleMap
& rules
, const base::FilePath
& outfile
) {
// Fixed text placed at the top of the generated file: license notice,
// "generated file" warning and the DomainRule struct declaration.
33 "// Copyright 2012 The Chromium Authors. All rights reserved.\n"
34 "// Use of this source code is governed by a BSD-style license "
36 "// found in the LICENSE file.\n\n"
37 "// This file is generated by net/tools/tld_cleanup/.\n"
38 "// DO NOT MANUALLY EDIT!\n"
40 "struct DomainRule {\n"
42 " int type; // flags: 1: exception, 2: wildcard, 4: private\n"
// Emit one entry per rule, in the map's iteration order.
46 for (RuleMap::const_iterator i
= rules
.begin(); i
!= rules
.end(); ++i
) {
// The entry starts with the rule's name (the map key).
47 data
.append(i
->first
);
// Work out the type flags. The exception test comes first, so the wildcard
// branch is only considered for rules that are not exceptions.
50 if (i
->second
.exception
) {
51 type
= kExceptionRule
;
52 } else if (i
->second
.wildcard
) {
// Private rules are checked separately from the exception/wildcard chain.
55 if (i
->second
.is_private
) {
// Append the numeric type after the name.
58 data
.append(base::IntToString(type
));
// Write the whole buffer in one call; the length is narrowed to int for
// base::WriteFile().
64 int written
= base::WriteFile(outfile
,
66 static_cast<int>(data
.size()));
// A short or failed write is reported as failure.
68 return written
== static_cast<int>(data
.size());
71 // Adjusts the rule to a standard form: removes single extraneous dots and
72 // canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as
73 // valid; logs a warning and returns kWarning if it is probably invalid; and
74 // logs an error and returns kError if the rule is (almost) certainly invalid.
75 NormalizeResult
NormalizeRule(std::string
* domain
, Rule
* rule
) {
76 NormalizeResult result
= kSuccess
;
78 // Strip single leading and trailing dots.
79 if (domain
->at(0) == '.')
81 if (domain
->empty()) {
82 LOG(WARNING
) << "Ignoring empty rule";
85 if (domain
->at(domain
->size() - 1) == '.')
86 domain
->erase(domain
->size() - 1, 1);
87 if (domain
->empty()) {
88 LOG(WARNING
) << "Ignoring empty rule";
92 // Allow single leading '*.' or '!', saved here so it's not canonicalized.
93 size_t start_offset
= 0;
94 if (domain
->at(0) == '!') {
96 rule
->exception
= true;
97 } else if (domain
->find("*.") == 0) {
99 rule
->wildcard
= true;
101 if (domain
->empty()) {
102 LOG(WARNING
) << "Ignoring empty rule";
106 // Warn about additional '*.' or '!'.
107 if (domain
->find("*.", start_offset
) != std::string::npos
||
108 domain
->find('!', start_offset
) != std::string::npos
) {
109 LOG(WARNING
) << "Keeping probably invalid rule: " << *domain
;
113 // Make a GURL and normalize it, then get the host back out.
114 std::string url
= "http://";
117 const std::string
& spec
= gurl
.possibly_invalid_spec();
118 url::Component host
= gurl
.parsed_for_possibly_invalid_spec().host
;
120 LOG(ERROR
) << "Ignoring rule that couldn't be normalized: " << *domain
;
123 if (!gurl
.is_valid()) {
124 LOG(WARNING
) << "Keeping rule that GURL says is invalid: " << *domain
;
127 domain
->assign(spec
.substr(host
.begin
, host
.len
));
132 NormalizeResult
NormalizeDataToRuleMap(const std::string data
,
135 // We do a lot of string assignment during parsing, but simplicity is more
136 // important than performance here.
138 NormalizeResult result
= kSuccess
;
139 size_t line_start
= 0;
141 bool is_private
= false;
143 int begin_private_length
= arraysize(kBeginPrivateDomainsComment
) - 1;
144 int end_private_length
= arraysize(kEndPrivateDomainsComment
) - 1;
145 while (line_start
< data
.size()) {
146 if (line_start
+ begin_private_length
< data
.size() &&
147 !data
.compare(line_start
, begin_private_length
,
148 kBeginPrivateDomainsComment
)) {
150 line_end
= line_start
+ begin_private_length
;
151 } else if (line_start
+ end_private_length
< data
.size() &&
152 !data
.compare(line_start
, end_private_length
,
153 kEndPrivateDomainsComment
)) {
155 line_end
= line_start
+ end_private_length
;
156 } else if (line_start
+ 1 < data
.size() &&
157 data
[line_start
] == '/' &&
158 data
[line_start
+ 1] == '/') {
160 line_end
= data
.find_first_of("\r\n", line_start
);
161 if (line_end
== std::string::npos
)
162 line_end
= data
.size();
164 // Truncate at first whitespace.
165 line_end
= data
.find_first_of("\r\n \t", line_start
);
166 if (line_end
== std::string::npos
)
167 line_end
= data
.size();
168 domain
.assign(data
.data(), line_start
, line_end
- line_start
);
171 rule
.wildcard
= false;
172 rule
.exception
= false;
173 rule
.is_private
= is_private
;
174 NormalizeResult new_result
= NormalizeRule(&domain
, &rule
);
175 if (new_result
!= kError
) {
176 // Check the existing rules to make sure we don't have an exception and
177 // wildcard for the same rule, or that the same domain is listed as both
178 // private and not private. If we did, we'd have to update our
179 // parsing code to handle this case.
180 CHECK(rules
->find(domain
) == rules
->end())
181 << "Duplicate rule found for " << domain
;
183 (*rules
)[domain
] = rule
;
184 // Add true TLD for multi-level rules. We don't add them right now, in
185 // case there's an exception or wild card that either exists or might be
186 // added in a later iteration. In those cases, there's no need to add
187 // it and it would just slow down parsing the data.
188 size_t tld_start
= domain
.find_last_of('.');
189 if (tld_start
!= std::string::npos
&& tld_start
+ 1 < domain
.size()) {
190 std::string extra_rule_domain
= domain
.substr(tld_start
+ 1);
191 RuleMap::const_iterator iter
= extra_rules
.find(extra_rule_domain
);
193 extra_rule
.exception
= false;
194 extra_rule
.wildcard
= false;
195 if (iter
== extra_rules
.end()) {
196 extra_rule
.is_private
= is_private
;
198 // A rule already exists, so we ensure that if any of the entries is
199 // not private the result should be that the entry is not private.
200 // An example is .au which is not listed as a real TLD, but only
201 // lists second-level domains such as com.au. Subdomains of .au
202 // (eg. blogspot.com.au) are also listed in the private section,
203 // which is processed later, so this ensures that the real TLD
204 // (eg. .au) is listed as public.
205 extra_rule
.is_private
= is_private
&& iter
->second
.is_private
;
207 extra_rules
[extra_rule_domain
] = extra_rule
;
210 result
= std::max(result
, new_result
);
213 // Find beginning of next non-empty line.
214 line_start
= data
.find_first_of("\r\n", line_end
);
215 if (line_start
== std::string::npos
)
216 line_start
= data
.size();
217 line_start
= data
.find_first_not_of("\r\n", line_start
);
218 if (line_start
== std::string::npos
)
219 line_start
= data
.size();
222 for (RuleMap::const_iterator iter
= extra_rules
.begin();
223 iter
!= extra_rules
.end();
225 if (rules
->find(iter
->first
) == rules
->end()) {
226 (*rules
)[iter
->first
] = iter
->second
;
// Reads the rule list from |in_filename|, normalizes it with
// NormalizeDataToRuleMap() and writes the resulting rules to |out_filename|
// with WriteRules(). Read and write failures are logged with LOG(ERROR).
//
// NOTE(review): the declarations of |data| and |rules| and the return
// statements are not visible in this listing. Per the comment in the body, a
// read failure is reported as success - confirm the remaining return values
// against the full source.
233 NormalizeResult
NormalizeFile(const base::FilePath
& in_filename
,
234 const base::FilePath
& out_filename
) {
// Load the whole input file into memory.
237 if (!base::ReadFileToString(in_filename
, &data
)) {
238 LOG(ERROR
) << "Unable to read file";
239 // We return success since we've already reported the error.
// Parse and normalize every rule found in the input.
243 NormalizeResult result
= NormalizeDataToRuleMap(data
, &rules
);
// Emit the normalized rules; a failed write is logged as an error.
245 if (!WriteRules(rules
, out_filename
)) {
246 LOG(ERROR
) << "Error(s) writing output file";
254 } // namespace tld_cleanup