1 # -*- coding: utf-8 ; test-case-name: bridgedb.test.test_parse_descriptors ; -*-
2 #_____________________________________________________________________________
4 # This file is part of BridgeDB, a Tor bridge distribution system.
6 # :authors: Isis Lovecruft 0xA3ADB67A2CDB8B35 <isis@torproject.org>
7 # please also see AUTHORS file
8 # :copyright: (c) 2007-2017, The Tor Project, Inc.
9 # (c) 2014-2017, Isis Lovecruft
10 # :license: see LICENSE for licensing information
11 #_____________________________________________________________________________
"""Parsers for Tor Bridge descriptors, including ``bridge-networkstatus``
documents, ``bridge-server-descriptor``s, and ``bridge-extrainfo``
descriptors.

.. py:module:: bridgedb.parse.descriptors
    :synopsis: Parsers for Tor Bridge descriptors.

bridgedb.parse.descriptors
===========================
::

  DescriptorWarning - Raised when we parse a very odd descriptor.
  deduplicate - Deduplicate a container of descriptors, keeping only the newest
                descriptor for each router.
  parseNetworkStatusFile - Parse a bridge-networkstatus document generated and
                           given to us by the BridgeAuthority.
  parseServerDescriptorsFile - Parse a file containing
                               bridge-server-descriptors.
  parseExtraInfoFiles - Parse (multiple) file(s) containing bridge-extrainfo
                        descriptors.
"""
from __future__ import print_function

import datetime
import logging
import os
import shutil

from stem import ProtocolError
from stem.descriptor import parse_file
from stem.descriptor.router_status_entry import _parse_file as _parseNSFile
from stem.descriptor.router_status_entry import RouterStatusEntryV3

from bridgedb import safelog
from bridgedb.parse.nickname import InvalidRouterNickname
class DescriptorWarning(Warning):
    """Warning category raised when a very odd descriptor is parsed."""
56 def _copyUnparseableDescriptorFile(filename
):
57 """Save a copy of the bad descriptor file for later debugging.
59 If the old filename was ``'descriptors/cached-extrainfo.new'``, then the
60 name of the copy will be something like
61 ``'descriptors/2014-11-05-01:57:23_cached-extrainfo.new.unparseable'``.
63 :param str filename: The path to the unparseable descriptor file that we
64 should save a copy of.
66 :returns: ``True`` if a copy of the file was saved successfully, and
69 timestamp
= datetime
.datetime
.now()
70 timestamp
= timestamp
.isoformat(sep
=chr(0x2d))
71 timestamp
= timestamp
.rsplit('.', 1)[0]
73 path
, sep
, fname
= filename
.rpartition(os
.path
.sep
)
74 newfilename
= "%s%s%s_%s%sunparseable" % (path
, sep
, timestamp
,
75 fname
, os
.path
.extsep
)
77 logging
.info(("Unparseable descriptor file '%s' will be copied to '%s' "
78 "for debugging.") % (filename
, newfilename
))
81 shutil
.copyfile(filename
, newfilename
)
82 except Exception as error
: # pragma: no cover
83 logging
.error(("Could not save copy of unparseable descriptor file "
84 "in '%s': %s") % (newfilename
, str(error
)))
87 logging
.debug(("Successfully finished saving a copy of an unparseable "
def parseNetworkStatusFile(filename, validate=True, skipAnnotations=True,
                           descriptorClass=RouterStatusEntryV3):
    """Parse a file which contains an ``@type bridge-networkstatus`` document.

    See :trac:`12254` for why networkstatus-bridges documents don't look
    anything like the networkstatus v2 documents that they are purported to
    look like. They are missing all headers, and the entire footer (including
    authority signatures).

    :param str filename: The location of the file containing bridge
        networkstatus descriptors.
    :param bool validate: Passed along to Stem's parsers. If ``True``, the
        descriptors will raise exceptions if they do not meet some definition
        of correctness.
    :param bool skipAnnotations: If ``True``, skip parsing everything before the
        first line which starts with ``r `` (the first router line). If the
        file contains no such line, nothing is handed to the parser.
    :param descriptorClass: A class (probably from
        :mod:`stem.descriptor.router_status_entry`, i.e.
        :class:`stem.descriptor.router_status_entry.RouterStatusEntryV2` or
        :class:`stem.descriptor.router_status_entry.RouterStatusEntryV3`)
        which Stem will parse each descriptor it reads from **filename** into.
    :raises InvalidRouterNickname: if one of the routers in the networkstatus
        file had a nickname which does not conform to Tor's nickname
        specification.
    :raises ValueError: if the contents of a descriptor are malformed and
        **validate** is ``True``.
    :raises IOError: if the file at **filename** can't be read.
    :rtype: list
    :returns: A list of
        :class:`stem.descriptor.router_status_entry.RouterStatusEntry`.
    """
    routers = []

    logging.info("Parsing networkstatus file: %s", filename)
    with open(filename, 'rb') as fh:
        position = fh.tell()
        if skipAnnotations:
            # Walk forward line by line, remembering the offset at which the
            # first router ("r ") line begins.
            while True:
                line = fh.readline()
                # readline() returns b'' at EOF; without this check a file
                # with no "r " line would loop forever.
                if not line or line.startswith(b'r '):
                    break
                position = fh.tell()
        logging.debug("Skipping %d bytes of networkstatus file.", position)
        # Rewind to the start of the first router line (the loop above has
        # already consumed it).
        fh.seek(position)
        document = _parseNSFile(fh, validate, entry_class=descriptorClass)

        # The parser is consumed while the file is still open, so it can keep
        # reading from the handle.
        try:
            routers.extend(list(document))
        except ValueError as error:
            # Stem reports bad nicknames as a plain ValueError; translate
            # those into our more specific exception type.
            if "nickname isn't valid" in str(error):
                raise InvalidRouterNickname(str(error))
            raise ValueError(str(error))

    logging.info("Closed networkstatus file: %s", filename)

    return routers
def parseServerDescriptorsFile(filename, validate=True):
    """Read **filename** and parse the ``@type bridge-server-descriptor``
    documents it should contain.

    .. note:: Stem is told that these are ``@type server-descriptor``, **not**
        ``@type bridge-server-descriptor``. See :trac:`11257`.

    :param str filename: The file to parse descriptors from.
    :param bool validate: Whether or not to validate descriptor
        contents. (default: ``True``)
    :rtype: list
    :returns: A list of
        :class:`stem.descriptor.server_descriptor.RelayDescriptor`s.
    """
    message = "Parsing server descriptors with Stem: %s" % filename
    logging.info(message)

    # The descriptor type is hard-coded to the relay flavour on purpose; see
    # the note in the docstring.
    parsed = parse_file(filename, 'server-descriptor 1.0', validate=validate)
    return [descriptor for descriptor in parsed]
def deduplicate(descriptors, statistics=False):
    """Deduplicate some descriptors, returning only the newest for each router.

    Descriptors are grouped by their ``fingerprint`` attribute, and within each
    group the one with the greatest ``published`` value is kept. If several
    descriptors of one router share the greatest ``published`` value, the one
    which appears last in **descriptors** is kept.

    .. note:: NOTE(review): earlier documentation promised that routers with
        two descriptors carrying the **same** published timestamp would have
        their fingerprint logged. Nothing in this function does that.

    :param list descriptors: A list of
        :class:`stem.descriptor.server_descriptor.RelayDescriptor`,
        :class:`stem.descriptor.extrainfo_descriptor.BridgeExtraInfoDescriptor`,
        or :class:`stem.descriptor.router_status_entry.RouterStatusEntry`.
    :param bool statistics: If ``True``, log some extra statistics about the
        number of duplicates.
    :rtype: dict
    :returns: A dictionary mapping router fingerprints to their newest
        available descriptor.
    """
    # Group every descriptor under the fingerprint of its router.
    duplicates = {}
    for descriptor in descriptors:
        duplicates.setdefault(descriptor.fingerprint, []).append(descriptor)

    newest = {}
    for fingerprint, dupes in duplicates.items():
        # Oldest first. The sort is stable, so pop() takes the newest
        # descriptor and leaves only the superseded ones in ``dupes``.
        dupes.sort(key=lambda x: x.published)
        newest[fingerprint] = dupes.pop()

    if statistics:
        # (count, fingerprint) tuples, so that plain tuple ordering ranks by
        # the number of duplicates. Sorted descending so that the "Top N"
        # report really lists the bridges with the most duplicates, and
        # restricted to bridges which actually had a superseded descriptor.
        totals = sorted(((len(dupes), fingerprint)
                         for fingerprint, dupes in duplicates.items()
                         if dupes), reverse=True)
        total = sum(subtotal for (subtotal, _) in totals)
        bridges = len(totals)
        top = min(10, bridges)
        logging.info("Number of bridges with duplicates: %5d", bridges)
        logging.info("Total duplicate descriptors: %5d", total)
        logging.info("Bridges with the most duplicates (Top %d):", top)
        for rank, (subtotal, bridge) in enumerate(totals[:top], 1):
            logging.info(" #%d %s: %d duplicates", rank, bridge, subtotal)

    logging.info("Descriptor deduplication finished.")

    return newest
def parseExtraInfoFiles(*filenames, **kwargs):
    """Open **filenames** and parse any ``@type bridge-extrainfo-descriptor``
    contained within.

    .. warning:: This function will *not* check that the ``router-signature``
        at the end of the extrainfo descriptor is valid. See
        ``bridgedb.bridges.Bridge._verifyExtraInfoSignature`` for a method for
        checking the signature. The signature cannot be checked here, because
        to do so, we would need the latest, valid, corresponding
        ``signing-key`` for the Bridge.

    .. note:: This function will call :func:`deduplicate` to deduplicate the
        extrainfo descriptors parsed from all **filenames**.

    If Stem raises a ``ValueError`` or ``ProtocolError`` while a file is being
    parsed, the error is logged, a debugging copy of that file is saved, and
    parsing continues with the next file. Descriptors which were parsed from
    that file before the error occurred are kept.

    :kwargs validate: If there is a ``'validate'`` keyword argument, its value
        decides the ``validate`` argument given to Stem's ``parse_file``.
        Validation is enabled unless the value is exactly ``False``; it
        defaults to ``True``. Any other keyword arguments are ignored.
    :rtype: dict
    :returns: A dictionary mapping bridge fingerprints to their corresponding,
        deduplicated
        :class:`stem.descriptor.extrainfo_descriptor.RelayExtraInfoDescriptor`.
    """
    descriptors = []

    # The ``stem.descriptor.extrainfo_descriptor.BridgeExtraInfoDescriptor``
    # class (with ``descriptorType = 'bridge-extra-info 1.1``) is unsuitable
    # for our purposes for the following reasons:
    #
    #   1. It expects a ``router-digest`` line, which is only present in
    #      sanitised bridge extrainfo descriptors.
    #
    #   2. It doesn't check the ``router-signature`` (nor does it expect there
    #      to be a signature).
    descriptorType = 'extra-info 1.0'

    # Only the literal ``False`` switches validation off; a missing key or any
    # other value (including falsy ones such as 0 or None) leaves it on.
    validate = kwargs.get('validate', True) is not False

    for filename in filenames:
        logging.info("Parsing %s descriptors in %s..."
                     % (descriptorType, filename))

        document = parse_file(filename, descriptorType, validate=validate)

        try:
            # Append one at a time so that everything parsed before a failure
            # is still kept.
            for router in document:
                descriptors.append(router)
        except (ValueError, ProtocolError) as error:
            logging.error(
                ("Stem exception while parsing extrainfo descriptor from "
                 "file '%s':\n%s") % (filename, str(error)))
            _copyUnparseableDescriptorFile(filename)

    routers = deduplicate(descriptors)
    return routers