Make getCaptchaImage return (bytes, str).
[tor-bridgedb.git] / bridgedb / proxy.py
blob654b40c1c93bbc6cf920b2701937eceb83d2d2be
1 # -*- coding: utf-8 -*-
3 # This file is part of BridgeDB, a Tor bridge distribution system.
5 # :authors: Isis Lovecruft 0xA3ADB67A2CDB8B35 <isis@torproject.org>
6 # please also see AUTHORS file
7 # :copyright: (c) 2013-2017 Isis Lovecruft
8 # (c) 2007-2017, The Tor Project, Inc.
9 # (c) 2007-2017, all entities within the AUTHORS file
10 # :license: 3-clause BSD, see included LICENSE for information
12 """Classes for finding and managing lists of open proxies."""
14 from __future__ import print_function
15 from collections import MutableSet
16 from functools import update_wrapper
17 from functools import wraps
19 import ipaddr
20 import logging
21 import os
22 import time
24 from twisted.internet import defer
25 from twisted.internet import protocol
26 from twisted.internet import reactor
27 from twisted.internet import utils as txutils
28 from bridgedb.runner import find
29 from bridgedb.parse.addr import isIPAddress
def downloadTorExits(proxyList, ipaddress, port=443, protocol=None):
    """Spawn the exit-list script and hand its results to **proxyList**.

    The script is asked for the Tor exit relays which allow their clients to
    exit to the given **ipaddress** and **port**.

    :param proxyList: The :class:`ProxySet` instance whose
        ``replaceExitRelays`` method is attached as the success callback.
    :param str ipaddress: The IP address passed to the script's ``-a`` option.
    :param int port: The port passed to the script's ``-p`` option. (See
        https://check.torproject.org/cgi-bin/TorBulkExitList.py.)
    :type protocol: :api:`twisted.internet.protocol.Protocol`
    :param protocol: A zero-argument callable returning the protocol to use;
        when ``None``, an :class:`~bridgedb.proxy.ExitListProtocol` is
        created. This parameter is mainly meant for use in testing, and
        should not be changed.
    :rtype: :class:`~twisted.internet.defer.Deferred`
    :returns: The protocol's ``deferred``, with
        ``proxyList.replaceExitRelays`` as its callback and
        :func:`logging.exception` as its errback.
    """
    # NOTE: inside this function the ``protocol`` parameter shadows the
    # module-level ``twisted.internet.protocol`` import.
    if protocol is None:
        proto = ExitListProtocol()
    else:
        proto = protocol()

    script = proto.script
    args = [script, '--stdout', '-a', ipaddress, '-p', str(port)]

    finished = proto.deferred
    finished.addCallback(proxyList.replaceExitRelays)
    finished.addErrback(logging.exception)

    # The child process is started with an empty environment.
    reactor.spawnProcess(proto, script, args=args, env={})
    return proto.deferred
def loadProxiesFromFile(filename, proxySet=None, removeStale=False):
    """Load proxy IP addresses from a file.

    :param str filename: A filename whose path can be either absolute or
        relative to the current working directory. The file should contain the
        IP addresses of various open proxies, one per line, for example::

            11.11.11.11
            22.22.22.22
            123.45.67.89

    :type proxySet: None or :class:`~bridgedb.proxy.ProxySet`.
    :param proxySet: If given, load the addresses read from the file into
        this ``ProxySet``, tagging each of them with **filename**.
    :param bool removeStale: If ``True``, remove proxies from the **proxySet**
        which were not listed in the file and whose tag is **filename**.
        (default: ``False``)
    :rtype: list
    :returns: A list of all the proxies which were read from the file and
        accepted. Any error raised while reading is logged as a warning, in
        which case the addresses collected so far are returned.
    """
    logging.info("Reloading proxy lists from file %s" % filename)

    addresses = []

    # We have to check the instance because, if the ProxySet was newly
    # created, it will likely be empty, causing it to evaluate to False:
    haveProxySet = isinstance(proxySet, ProxySet)

    if haveProxySet:
        # Snapshot the previous contents so stale entries can be found below.
        oldProxySet = proxySet.copy()

    try:
        with open(filename, 'r') as proxyFile:
            for line in proxyFile:
                line = line.strip()
                if haveProxySet:
                    # ProxySet.add() will validate the IP address
                    if proxySet.add(line, tag=filename):
                        addresses.append(line)
                else:
                    ip = isIPAddress(line)
                    if ip:
                        addresses.append(ip)
    except Exception as error:
        # Deliberately best-effort: a bad or missing file must not abort the
        # reload, so the problem is only logged.
        logging.warning("Error while reading a proxy list file: %s" % str(error))

    if haveProxySet and removeStale:
        stale = list(oldProxySet.difference(addresses))

        for ip in stale:
            # Only remove entries which this same file contributed.
            if proxySet.getTag(ip) == filename:
                logging.info("Removing stale IP %s from proxy list." % ip)
                proxySet.remove(ip)
            else:
                logging.info("Tag %s didn't match %s"
                             % (proxySet.getTag(ip), filename))

    return addresses
class ProxySet(MutableSet):
    """A :class:`collections.MutableSet` for storing validated IP addresses.

    Addresses are kept in a ``set`` (``_proxies``), and a parallel ``dict``
    (``_proxydict``) maps each address to its tag.

    .. inheritance-diagram:: ProxySet
        :parts: 1
    """
    #: A tag to apply to IP addresses within this ``ProxySet`` which are known
    #: Tor exit relays.
    _exitTag = 'exit_relay'

    # The ``dict()`` default is only ever read by addProxies(), never mutated.
    def __init__(self, proxies=dict()):
        """Initialise a ``ProxySet``.

        :type proxies: A tuple, list, dict, or set.
        :param proxies: Optionally, initialise with an iterable, ``proxies``.
            For each ``item`` in that iterable, ``item`` must either:
                1. be a string or int representing an IP address, or,
                2. be another iterable, whose first item satisfies #1.
        """
        super(ProxySet, self).__init__()
        self._proxydict = dict()
        self._proxies = set()
        self.addProxies(proxies)

    @property
    def proxies(self):
        """All proxies in this set, regardless of tags."""
        return list(self._proxies)

    @property
    def exitRelays(self):
        """Get all proxies in this ``ProxySet`` tagged as Tor exit relays.

        :rtype: set
        :returns: A set of all known Tor exit relays which are contained
            within this :class:`~bridgedb.proxy.ProxySet`.
        """
        return self.getAllWithTag(self._exitTag)

    def __add__(self, ip=None, tag=None):
        """Add an **ip** to this set, with an optional **tag**.

        The **ip** is only added if it passes the checks in
        :func:`~bridgedb.parse.addr.isIPAddress`.

        :type ip: str or int
        :param ip: The IP address to add.
        :param tag: An optional value to link to **ip**. If not given (or
            falsy), it will be a timestamp (seconds since epoch, as a float)
            taken when **ip** is stored.
        :rtype: bool
        :returns: ``True`` if **ip** was stored in this set; ``False``
            otherwise.
        """
        ip = isIPAddress(ip)
        if ip:
            # NOTE(review): ``set(ip)`` is built by iterating over **ip**
            # itself. If isIPAddress() returns a string, that is a set of
            # single characters which can never intersect the stored
            # addresses, so this check would always pass and re-adding an
            # address would overwrite its tag. Left unchanged because callers
            # may rely on that -- confirm against isIPAddress().
            if self._proxies.isdisjoint(set(ip)):
                logging.debug("Adding %s to proxy list..." % ip)
                self._proxies.add(ip)
                self._proxydict[ip] = tag if tag else time.time()
                return True
        return False

    def __radd__(self, *args, **kwargs):
        """Reflected form of :meth:`__add__`; returns its result."""
        return self.__add__(*args, **kwargs)

    def __contains__(self, ip):
        """x.__contains__(y) <==> y in x.

        :type ip: str or int
        :param ip: The IP address to check.
        :rtype: boolean
        :returns: True if ``ip`` is in this set; False otherwise.
        """
        # Look up whatever isIPAddress() returns for **ip**, rather than the
        # raw argument.
        return isIPAddress(ip) in self._proxies

    def __sub__(self, ip):
        """Entirely remove **ip** from this set.

        This has no effect if **ip** has no tag entry in this set.

        :type ip: str or int
        :param ip: The IP address to remove.
        """
        if ip in self._proxydict:
            del self._proxydict[ip]
            self._proxies.discard(ip)

    def __rsub__(self, *args, **kwargs):
        """Reflected subtraction is not supported.

        Returning ``NotImplemented`` makes ``other - proxySet`` fail with a
        :exc:`TypeError`.
        """
        return NotImplemented

    def _getErrorMessage(self, x=None, y=None):
        """Make an error message describing how this class works."""
        message = """\nParameter 'proxies' must be one of:
    - a {1} of {0}
    - a {2} of {0}
    - a {3}, whose keys are {0} (the values can be anything)
    - a {4} of {1}s, whose first object in each {1} must be a {0}
    - a {4} of {0}
""".format(type(''), type(()), type([]), type({}), type(set(())))
        end = "You gave: a {0}".format(type(y))
        end += " of {0}".format(type(x))
        return os.linesep.join((message, end))

    def addProxies(self, proxies, tag=None):
        """Add proxies to this set.

        This calls :func:`add` for each item in the iterable **proxies**.

        :type proxies: A tuple, list, dict, or set.
        :param proxies: An iterable. For each ``item`` in that iterable,
            ``item`` must either:
                1. be a string or int representing an IP address, or,
                2. be another iterable, whose first item satisfies #1.
            If **proxies** is a dict, its keys are the addresses and its
            values are their tags.
        :keyword tag: An optional value to link to all untagged
            **proxies**. If ``None``, it will be a timestamp (seconds since
            epoch, as a float) for when the proxy was added to this set.
        :raises ValueError: if **proxies**, or one of its items, is not of
            one of the supported shapes.
        """
        if isinstance(proxies, dict):
            for ip, value in proxies.items():
                self.add(ip, value)
            return

        try:
            for x in proxies:
                if isinstance(x, (tuple, list, set)):
                    if len(x) == 2:
                        self.add(x[0], x[1])
                    elif len(x) == 1:
                        # Unwrap the single element; next(iter()) also works
                        # for sets, which cannot be indexed.
                        self.add(next(iter(x)), tag)
                    else:
                        raise ValueError(self._getErrorMessage(x, proxies))
                elif isinstance(x, (str, int)):
                    self.add(x, tag)
                else:
                    raise ValueError(self._getErrorMessage(x, proxies))
        except TypeError:
            raise ValueError(self._getErrorMessage(proxies, None))

    # wraps() copies the name and docstring of addProxies() onto this method.
    @wraps(addProxies)
    def addExitRelays(self, relays):
        logging.info("Loading exit relays into proxy list...")
        for relay in relays:
            self.add(relay, self._exitTag)

    def replaceExitRelays(self, relays):
        """Discard every proxy tagged as an exit relay, then add **relays**.

        :param relays: An iterable of IP addresses; each is added with the
            exit relay tag.
        """
        existingExitRelays = self.getAllWithTag(self._exitTag)
        logging.debug("Replacing %d existing with %d new exit relays." %
                      (len(existingExitRelays), len(relays)))

        for relay in existingExitRelays:
            self.discard(relay)

        self.addExitRelays(relays)

    def getTag(self, ip):
        """Get the tag for an **ip** in this ``ProxySet``, if available.

        :type ip: str or int
        :param ip: The IP address to obtain the tag for.
        :rtype: ``None`` or str or int
        :returns: The tag for that **ip**, iff **ip** exists in this
            ``ProxySet`` and it has a tag.
        """
        return self._proxydict.get(ip)

    def getAllWithTag(self, tag):
        """Get all proxies in this ``ProxySet`` with a given tag.

        :param str tag: A tag to search for.
        :rtype: set
        :returns: A set of all proxies which are contained within this
            :class:`~bridgedb.proxy.ProxySet` which are also tagged with
            **tag**.
        """
        return {ip for ip, value in self._proxydict.items() if value == tag}

    def firstSeen(self, ip):
        """Get the timestamp when **ip** was first seen, if available.

        :type ip: str or int
        :param ip: The IP address to obtain a timestamp for.
        :rtype: float or None
        :returns: The tag of **ip** if that tag is a float (i.e. a timestamp
            in seconds since epoch). Otherwise, returns None.
        """
        when = self.getTag(ip)
        if isinstance(when, float):
            return when

    def isExitRelay(self, ip):
        """Check if ``ip`` is a known Tor exit relay.

        :type ip: str or int
        :param ip: The IP address to check.
        :rtype: boolean
        :returns: True if ``ip`` is a known Tor exit relay; False otherwise.
        """
        if self.getTag(ip) == self._exitTag:
            return True
        return False

    def replaceProxyList(self, proxies, tag=None):
        """Clear everything and add all ``proxies``.

        Any exception raised while doing so is logged as an error rather
        than propagated.

        :type proxies: A tuple, list, dict, or set.
        :param proxies: An iterable. For each ``item`` in that iterable,
            ``item`` must either:
                1. be a string or int representing an IP address, or,
                2. be another iterable, whose first item satisfies #1.
        :param tag: Passed on to :meth:`addProxies`.
        """
        try:
            self.clear()
            self.addProxies(proxies, tag=tag)
        except Exception as error:
            logging.error(str(error))

    # Only the name and docstring are copied onto the wrappers below.
    _assigned = ('__name__', '__doc__')

    @wraps(MutableSet._hash)
    def __hash__(self): return self._hash()
    def __iter__(self): return self._proxies.__iter__()
    def __len__(self): return len(self._proxydict)
    def __repr__(self): return type('')(self.proxies)
    def __str__(self): return os.linesep.join(self.proxies)
    update_wrapper(__iter__, set.__iter__, _assigned)
    update_wrapper(__len__, len, _assigned)
    update_wrapper(__repr__, repr, _assigned)
    update_wrapper(__str__, str, _assigned)

    # Named aliases for the operator methods defined above.
    def add(self, ip, tag=None): return self.__add__(ip, tag)
    def copy(self): return self.__class__(self._proxydict.copy())
    def contains(self, ip): return self.__contains__(ip)
    def discard(self, ip): return self.__sub__(ip)
    def remove(self, other): return self.__sub__(other)
    update_wrapper(add, __add__)
    update_wrapper(copy, __init__)
    update_wrapper(contains, __contains__)
    update_wrapper(discard, __sub__)
    update_wrapper(remove, __sub__)

    # Set algebra is delegated to the underlying ``set`` of addresses, so
    # these return plain sets (or booleans), not ``ProxySet`` instances.
    def difference(self, other): return self._proxies.difference(other)
    def issubset(self, other): return self._proxies.issubset(other)
    def issuperset(self, other): return self._proxies.issuperset(other)
    def intersection(self, other): return self._proxies.intersection(other)
    def symmetric_difference(self, other): return self._proxies.symmetric_difference(other)
    def union(self, other): return self._proxies.union(other)
    update_wrapper(difference, set.difference, _assigned)
    update_wrapper(issubset, set.issubset, _assigned)
    update_wrapper(issuperset, set.issuperset, _assigned)
    update_wrapper(intersection, set.intersection, _assigned)
    update_wrapper(symmetric_difference, set.symmetric_difference, _assigned)
    update_wrapper(union, set.union, _assigned)
class ExitListProtocol(protocol.ProcessProtocol):
    """A :class:`~twisted.internet.protocol.Protocol` for ``get-exit-list``.

    :attr boolean connected: True if our ``transport`` is connected.

    :type transport: An implementer of
        :interface:`twisted.internet.interface.IProcessTransport`.
    :attr transport: If :func:`twisted.internet.reactor.spawnProcess` is
        called with an instance of this class as its ``protocol``, then
        :func:`~twisted.internet.reactor.spawnProcess` will return this
        ``transport``.
    """

    def __init__(self):
        """Create a protocol for downloading a list of current Tor exit relays.

        :type exitlist: :class:`ProxySet`
        :ivar exitlist: A :class:`~collections.MutableSet` containing the IP
            addresses of known Tor exit relays which can reach our public IP
            address.
        :ivar list data: A list containing one decoded string for each chunk
            of data received on stdout from the ``transport``.
        :ivar deferred: A deferred which will callback with a list of the
            addresses in ``exitlist`` when the process has ended.
        :ivar string script: The pathname of the script to run, as located
            by :func:`bridgedb.runner.find`.
        """
        self.data = []
        self.script = find('get-tor-exits')
        self.exitlist = ProxySet()
        self.deferred = defer.Deferred()

    def childConnectionLost(self, childFD):
        """See :func:`t.i.protocol.ProcessProtocol.childConnectionLost`."""
        protocol.ProcessProtocol.childConnectionLost(self, childFD)

    def connectionMade(self):
        """Called when a connection is made.

        Nothing is ever written to the child process, so its stdin is
        closed straight away.
        """
        logging.debug("ExitListProtocol: Connection made with remote server")
        self.transport.closeStdin()

    def errReceived(self, data):
        """Some data was received from stderr."""
        # The get-exit-list script uses twisted.python.log to log to stderr:
        for line in data.splitlines():  # pragma: no cover
            logging.debug(line)

    def outReceived(self, data):
        """Some data was received from stdout."""
        self.data.append(data.decode("utf-8"))

    def outConnectionLost(self):
        """This will be called when stdout is closed."""
        logging.debug("Finished downloading list of Tor exit relays.")
        self.transport.loseConnection()
        self.parseData()

    def parseData(self):
        """Parse all data received so far into our
        :class:`<bridgedb.proxy.ProxySet> exitlist`.

        Blank lines and lines starting with ``#`` are skipped. Parsing stops
        at the first line starting with ``<``. Lines which are not IP
        addresses are collected and reported in a single warning.
        """
        unparseable = []

        data = ''.join(self.data).split('\n')

        for line in data:
            # Keep the stripped result, so surrounding whitespace and any
            # trailing '\r' are really removed before the checks below.
            line = line.strip()
            if not line:
                continue
            # If it reached an errorpage, then we grabbed raw HTML that starts
            # with an HTML tag:
            if line.startswith('<'):
                break
            if line.startswith('#'):
                continue
            ip = isIPAddress(line)
            if ip:
                logging.info("Discovered Tor exit relay: %s" % ip)
                self.exitlist.add(ip)
            else:
                logging.debug("Got exitlist line that wasn't an IP: %s" % line)
                unparseable.append(line)

        if unparseable:
            logging.warning(("There were unparseable lines in the downloaded "
                             "list of Tor exit relays: %r") % unparseable)

    def processEnded(self, reason):
        """Called when the child process exits and all file descriptors
        associated with it have been closed.

        :type reason: :class:`twisted.python.failure.Failure`
        """
        self.transport.loseConnection()
        if reason.value.exitCode != 0:  # pragma: no cover
            logging.debug(reason.getTraceback())
            logging.error("There was an error downloading Tor exit list: %s"
                          % reason.value)
        else:
            logging.info("Finished processing list of Tor exit relays.")
        logging.debug("Transferring exit list to storage...")
        # Avoid triggering the deferred twice, e.g. on processExited():
        if not self.deferred.called:
            self.deferred.callback(list(self.exitlist.proxies))

    def processExited(self, reason):
        """This will be called when the subprocess exits.

        :type reason: :class:`twisted.python.failure.Failure`
        """
        logging.debug("%s exited with status code %d"
                      % (self.script, reason.value.exitCode))