1 # This Source Code Form is subject to the terms of the Mozilla Public
2 # License, v. 2.0. If a copy of the MPL was not distributed with this
3 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
10 from make_dafsa
import words_to_bin
, words_to_cxx
13 Processes a file containing effective TLD data. See the following URL for a
14 description of effective TLDs and of the file format that this script
15 processes (although for the latter you're better off just reading this file's
18 http://wiki.mozilla.org/Gecko:Effective_TLD_Service
22 def getEffectiveTLDs(path
):
23 file = codecs
.open(path
, "r", "UTF-8")
26 # line always contains a line terminator unless the file is empty
30 # comment, empty, or superfluous line for explicitness purposes
31 if line
.startswith("//") or not line
.strip():
33 line
= re
.split(r
"[ \t\n]", line
, 1)[0]
34 entry
= EffectiveTLDEntry(line
)
35 domain
= entry
.domain()
36 assert domain
not in domains
, "repeating domain %s makes no sense" % domain
41 def _normalizeHostname(domain
):
43 Normalizes the given domain, component by component. ASCII components are
44 lowercased, while non-ASCII components are processed using the ToASCII
48 def convertLabel(label
):
51 return encodings
.idna
.ToASCII(label
).decode("utf-8")
53 return ".".join(map(convertLabel
, domain
.split(".")))
57 "True if s consists entirely of ASCII characters, false otherwise."
64 class EffectiveTLDEntry
:
66 Stores an entry in an effective-TLD name file.
72 def __init__(self
, line
):
74 Creates a TLD entry from a line of data, which must have been stripped of
77 if line
.startswith("!"):
78 self
._exception
= True
80 elif line
.startswith("*."):
85 self
._domain
= _normalizeHostname(domain
)
88 "The domain this represents."
92 "True if this entry's domain does not denote an effective TLD."
93 return self
._exception
96 "True if this entry represents a class of effective TLDs."
105 def main(output
, effective_tld_filename
, output_format
="cxx"):
107 effective_tld_filename is the effective TLD file to parse.
108 Based on the output format, either a C++ array of a binary representation
109 of a DAFSA representing the eTLD file is then printed to standard output
110 or a binary file is written to disk.
115 Maps the flags to the DAFSA's enum types.
126 make_dafsa expects lines of the form "<domain_name><enum_value>"
128 for etld
in getEffectiveTLDs(effective_tld_filename
):
129 yield "%s%d" % (etld
.domain(), typeEnum(etld
))
131 """ words_to_bin() returns a bytes while words_to_cxx() returns string """
132 if output_format
== "bin":
133 output
.write(words_to_bin(dafsa_words()))
135 output
.write(words_to_cxx(dafsa_words()))
if __name__ == "__main__":
    # This program can output the DAFSA in two formats:
    # as C++ code that will be included and compiled at build time
    # or as a binary file that will be published in Remote Settings.
    #
    # Flags for format options:
    #   (no flag) -> "cxx": C++ array [default]
    #   --bin     -> "bin": binary output
    output_format = "bin" if "--bin" in sys.argv else "cxx"

    # Take the first argument that is not the --bin flag as the input path,
    # so the flag may appear before or after the file name.
    positional = [arg for arg in sys.argv[1:] if arg != "--bin"]
    if not positional:
        sys.exit("usage: %s [--bin] <effective_tld_file>" % sys.argv[0])

    # In "bin" mode main() writes the bytes returned by words_to_bin().  A
    # text-mode sys.stdout rejects bytes, so pass its underlying binary
    # buffer when one exists; "cxx" mode writes str and keeps the text stream.
    if output_format == "bin":
        stream = getattr(sys.stdout, "buffer", sys.stdout)
    else:
        stream = sys.stdout

    main(stream, positional[0], output_format=output_format)