Parse bridge blocking info from SQL database.
[tor-bridgedb.git] / bridgedb / parse / descriptors.py
blobf6dee838ea89dea0a857bf2cd01fba35dd645f2b
1 # -*- coding: utf-8 ; test-case-name: bridgedb.test.test_parse_descriptors ; -*-
2 #_____________________________________________________________________________
4 # This file is part of BridgeDB, a Tor bridge distribution system.
6 # :authors: Isis Lovecruft 0xA3ADB67A2CDB8B35 <isis@torproject.org>
7 # please also see AUTHORS file
8 # :copyright: (c) 2007-2017, The Tor Project, Inc.
9 # (c) 2014-2017, Isis Lovecruft
10 # :license: see LICENSE for licensing information
11 #_____________________________________________________________________________
13 """Parsers for Tor Bridge descriptors, including ``bridge-networkstatus``
14 documents, ``bridge-server-descriptor``s, and ``bridge-extrainfo``
15 descriptors.
17 .. py:module:: bridgedb.parse.descriptors
18 :synopsis: Parsers for Tor Bridge descriptors.
20 bridgedb.parse.descriptors
21 ===========================
24 DescriptorWarning - Raised when we parse a very odd descriptor.
25 deduplicate - Deduplicate a container of descriptors, keeping only the newest
26 descriptor for each router.
27 parseNetworkStatusFile - Parse a bridge-networkstatus document generated and
28 given to us by the BridgeAuthority.
29 parseServerDescriptorsFile - Parse a file containing
30 bridge-server-descriptors.
31 parseExtraInfoFiles - Parse (multiple) file(s) containing bridge-extrainfo
32 descriptors.
34 """
36 from __future__ import print_function
38 import datetime
39 import logging
40 import os
41 import shutil
43 from stem import ProtocolError
44 from stem.descriptor import parse_file
45 from stem.descriptor.router_status_entry import _parse_file as _parseNSFile
46 from stem.descriptor.router_status_entry import RouterStatusEntryV3
48 from bridgedb import safelog
49 from bridgedb.parse.nickname import InvalidRouterNickname
class DescriptorWarning(Warning):
    """Warning category for descriptors which parse but look very odd."""
def _copyUnparseableDescriptorFile(filename):
    """Keep a timestamped copy of a descriptor file which failed to parse.

    The copy is placed next to the original.  Its name is the original
    basename, prefixed with the current local time (to the second) and an
    underscore, and suffixed with ``.unparseable``.  For example,
    ``'descriptors/cached-extrainfo.new'`` is copied to something like
    ``'descriptors/2014-11-05-01:57:23_cached-extrainfo.new.unparseable'``.

    :param str filename: The path to the unparseable descriptor file that
        should be copied.
    :rtype: bool
    :returns: ``True`` if the copy was written, ``False`` if copying raised
        any exception (the exception is logged, not propagated).
    """
    # Dropping the microseconds up front yields ``YYYY-MM-DD-HH:MM:SS``.
    now = datetime.datetime.now().replace(microsecond=0)
    stamp = now.isoformat(sep='-')

    # Split off the last path component so that the timestamp can be
    # inserted directly in front of the file's basename.
    directory, separator, basename = filename.rpartition(os.path.sep)
    newfilename = "%s%s%s_%s%sunparseable" % (
        directory, separator, stamp, basename, os.path.extsep)

    logging.info(
        "Unparseable descriptor file '%s' will be copied to '%s' for debugging."
        % (filename, newfilename))

    try:
        shutil.copyfile(filename, newfilename)
    except Exception as error:  # pragma: no cover
        # Saving the copy is best-effort; report the failure to the caller
        # through the return value rather than by raising.
        logging.error(
            "Could not save copy of unparseable descriptor file in '%s': %s"
            % (newfilename, str(error)))
        return False

    logging.debug(
        "Successfully finished saving a copy of an unparseable descriptor file.")
    return True
def parseNetworkStatusFile(filename, validate=True, skipAnnotations=True,
                           descriptorClass=RouterStatusEntryV3):
    """Parse a file which contains an ``@type bridge-networkstatus`` document.

    See :trac:`12254` for why networkstatus-bridges documents don't look
    anything like the networkstatus v2 documents that they are purported to
    look like. They are missing all headers, and the entire footer (including
    authority signatures).

    :param str filename: The location of the file containing bridge
        networkstatus descriptors.
    :param bool validate: Passed along to Stem's parsers. If ``True``, the
        descriptors will raise exceptions if they do not meet some definition
        of correctness.
    :param bool skipAnnotations: If ``True``, skip parsing everything before
        the first ``r`` line.  If the file contains no ``r`` line at all,
        everything is skipped and an empty list is returned.
    :param descriptorClass: A class (probably from
        :mod:`stem.descriptor.router_status_entry`, i.e.
        :class:`stem.descriptor.router_status_entry.RouterStatusEntryV2` or
        :class:`stem.descriptor.router_status_entry.RouterStatusEntryV3`)
        which Stem will parse each descriptor it reads from **filename** into.
    :raises InvalidRouterNickname: if one of the routers in the networkstatus
        file had a nickname which does not conform to Tor's nickname
        specification.
    :raises ValueError: if the contents of a descriptor are malformed and
        **validate** is ``True``.
    :raises IOError: if the file at **filename** can't be read.
    :rtype: list
    :returns: A list of
        :class:`stem.descriptor.router_status_entry.RouterStatusEntry`.
    """
    routers = []

    logging.info("Parsing networkstatus file: %s" % filename)
    with open(filename, 'rb') as fh:
        position = fh.tell()
        if skipAnnotations:
            # Move ``position`` past every line which precedes the first
            # ``r`` line.  ``readline()`` returns an empty bytestring at EOF,
            # so we must also stop there: otherwise a file without any ``r``
            # line (e.g. an empty file) would keep this loop spinning forever.
            while True:
                line = fh.readline()
                if not line or line.startswith(b'r '):
                    break
                position = fh.tell()
        logging.debug("Skipping %d bytes of networkstatus file." % position)
        fh.seek(position)
        document = _parseNSFile(fh, validate, entry_class=descriptorClass)

        # Exhaust the parser while the file is still open (presumably Stem
        # reads from ``fh`` lazily as ``document`` is iterated).
        try:
            routers.extend(list(document))
        except ValueError as error:
            if "nickname isn't valid" in str(error):
                raise InvalidRouterNickname(str(error))
            # Any other validation problem is passed on unchanged, keeping
            # the original traceback.
            raise

    logging.info("Closed networkstatus file: %s" % filename)

    return routers
def parseServerDescriptorsFile(filename, validate=True):
    """Parse the ``@type bridge-server-descriptor`` documents in **filename**.

    .. note:: Stem is deliberately told that the file contains
        ``@type server-descriptor`` documents, **not**
        ``@type bridge-server-descriptor`` ones.  See :trac:`11257`.

    :param str filename: The file to parse descriptors from.
    :param bool validate: Forwarded to Stem as its ``validate`` argument,
        i.e. whether or not to validate descriptor contents.
        (default: ``True``)
    :rtype: list
    :returns: A list of
        :class:`stem.descriptor.server_descriptor.RelayDescriptor`s.
    """
    logging.info("Parsing server descriptors with Stem: %s" % filename)
    # Collect everything Stem hands back into a list before returning.
    return list(parse_file(filename, 'server-descriptor 1.0',
                           validate=validate))
def deduplicate(descriptors, statistics=False):
    """Deduplicate some descriptors, returning only the newest for each router.

    Descriptors are grouped by their ``fingerprint`` attribute, and for each
    fingerprint the descriptor with the greatest ``published`` value is kept.

    .. note:: If several descriptors for the same router share the greatest
        ``published`` timestamp, the one which occurs last in **descriptors**
        is kept.  That situation is not reported separately in the logs.

    :param list descriptors: A list of
        :class:`stem.descriptor.server_descriptor.RelayDescriptor`,
        :class:`stem.descriptor.extrainfo_descriptor.BridgeExtraInfoDescriptor`,
        or :class:`stem.descriptor.router_status_entry.RouterStatusEntry`.
        Only the ``fingerprint`` and ``published`` attributes are used.
    :param bool statistics: If ``True``, log some extra statistics about the
        number of duplicates: how many bridges had any, how many superseded
        descriptors there were in total, and the (up to ten) bridges with
        the most of them.
    :rtype: dict
    :returns: A dictionary mapping router fingerprints to their newest
        available descriptor.
    """
    newest = {}
    # Maps each fingerprint to the number of its descriptors which are *not*
    # the one kept in ``newest``.
    superseded = {}

    for descriptor in descriptors:
        fingerprint = descriptor.fingerprint
        if fingerprint not in newest:
            newest[fingerprint] = descriptor
            superseded[fingerprint] = 0
            continue
        superseded[fingerprint] += 1
        # ``>=`` rather than ``>``: on equal timestamps the later descriptor
        # in the input wins.
        if descriptor.published >= newest[fingerprint].published:
            newest[fingerprint] = descriptor

    if statistics:
        # Only bridges which really had duplicates are of interest, ordered
        # so that those with the most duplicates come first.
        ranked = sorted(
            [(count, fingerprint)
             for fingerprint, count in superseded.items() if count],
            reverse=True)
        total = sum(count for count, _ in ranked)
        bridges = len(ranked)
        top = min(10, bridges)
        logging.info("Number of bridges with duplicates: %5d" % bridges)
        logging.info("Total duplicate descriptors: %5d" % total)
        logging.info("Bridges with the most duplicates (Top %d):" % top)
        for rank, (count, fingerprint) in enumerate(ranked[:top], 1):
            logging.info(" #%d %s: %d duplicates" % (rank, fingerprint, count))

    logging.info("Descriptor deduplication finished.")

    return newest
def parseExtraInfoFiles(*filenames, **kwargs):
    """Parse the ``@type bridge-extrainfo-descriptor``s found in **filenames**.

    Each file is handed to Stem in turn.  If Stem raises a :exc:`ValueError`
    or :exc:`stem.ProtocolError` while a file is being read, the error is
    logged, a copy of that file is saved for debugging, and parsing
    continues with the next file.  Descriptors which were read from that
    file before the error occurred are kept.

    .. warning:: This function will *not* check that the ``router-signature``
        at the end of the extrainfo descriptor is valid. See
        ``bridgedb.bridges.Bridge._verifyExtraInfoSignature`` for a method for
        checking the signature. The signature cannot be checked here, because
        to do so, we would need the latest, valid, corresponding
        ``signing-key`` for the Bridge.

    .. note:: This function will call :func:`deduplicate` to deduplicate the
        extrainfo descriptors parsed from all **filenames**.

    :kwargs validate: Forwarded to Stem as its ``validate`` argument.
        Validation is only turned off when ``validate`` is given and is
        exactly ``False``; in every other case it defaults to ``True``.
    :rtype: dict
    :returns: A dictionary mapping bridge fingerprints to their corresponding,
        deduplicated
        :class:`stem.descriptor.extrainfo_descriptor.RelayExtraInfoDescriptor`.
    """
    # The ``stem.descriptor.extrainfo_descriptor.BridgeExtraInfoDescriptor``
    # class (with ``descriptorType = 'bridge-extra-info 1.1``) is unsuitable
    # for our purposes for the following reasons:
    #
    #   1. It expects a ``router-digest`` line, which is only present in
    #      sanitised bridge extrainfo descriptors.
    #
    #   2. It doesn't check the ``router-signature`` (nor does it expect there
    #      to be a signature).
    descriptorType = 'extra-info 1.0'

    # Anything other than a literal ``False`` leaves validation switched on.
    validate = kwargs.get('validate', True) is not False

    parsed = []

    for path in filenames:
        logging.info("Parsing %s descriptors in %s..."
                     % (descriptorType, path))

        entries = parse_file(path, descriptorType, validate=validate)

        try:
            # Append one at a time so that everything read before a parsing
            # error is retained.
            for entry in entries:
                parsed.append(entry)
        except (ValueError, ProtocolError) as error:
            logging.error(
                "Stem exception while parsing extrainfo descriptor from file '%s':\n%s"
                % (path, str(error)))
            _copyUnparseableDescriptorFile(path)

    return deduplicate(parsed)