1 # -*- coding: utf-8 -*-
3 # This file is part of BridgeDB, a Tor bridge distribution system.
5 # :authors: Isis Lovecruft 0xA3ADB67A2CDB8B35 <isis@torproject.org>
6 # please also see AUTHORS file
7 # :copyright: (c) 2013-2017 Isis Lovecruft
8 # (c) 2007-2017, The Tor Project, Inc.
9 # (c) 2007-2017, all entities within the AUTHORS file
10 # :license: 3-clause BSD, see included LICENSE for information
12 """Classes for finding and managing lists of open proxies."""
from __future__ import print_function

import logging
import os
import time

try:
    from collections.abc import MutableSet
except ImportError:  # Python 2: the ABCs live directly in ``collections``.
    from collections import MutableSet
from functools import update_wrapper
from functools import wraps

from twisted.internet import defer
from twisted.internet import protocol
from twisted.internet import reactor
from twisted.internet import utils as txutils

from bridgedb.runner import find
from bridgedb.parse.addr import isIPAddress
def downloadTorExits(proxyList, ipaddress, port=443, protocol=None):
    """Run a script which downloads a list of Tor exit relays which allow their
    clients to exit to the given **ipaddress** and **port**.

    The script is spawned as a child process via the Twisted reactor.  Once
    the protocol's deferred fires, its result is handed to
    ``proxyList.replaceExitRelays``; failures are handed to
    ``logging.exception``.

    :param proxyList: The :class:`ProxySet` instance from :mod:`bridgedb.main`.
    :param str ipaddress: The IP address that each Tor exit relay should be
        capable of connecting to for clients, as specified by its ExitPolicy.
    :param int port: The port corresponding to the above **ipaddress** that
        each Tor exit relay should allow clients to exit to. (See
        https://check.torproject.org/cgi-bin/TorBulkExitList.py.)
    :type protocol: :api:`twisted.internet.protocol.Protocol`
    :param protocol: A :class:`~bridgedb.proxy.ExitListProtocol`, or any other
        :api:`~twisted.internet.protocol.Protocol` implementation for
        processing the results of a process which downloads a list of Tor exit
        relays.  It is called with no arguments, and the resulting object must
        provide ``script`` and ``deferred`` attributes.  This parameter is
        mainly meant for use in testing.
    :rtype: :class:`~twisted.internet.defer.Deferred`
    :returns: The protocol's deferred, with the ``replaceExitRelays``
        callback and the logging errback already attached.
    """
    # NOTE: the ``protocol`` parameter shadows the ``twisted.internet.protocol``
    # module inside this function; only the parameter is used here.
    proto = ExitListProtocol() if protocol is None else protocol()
    args = [proto.script, '--stdout', '-a', ipaddress, '-p', str(port)]

    proto.deferred.addCallback(proxyList.replaceExitRelays)
    proto.deferred.addErrback(logging.exception)

    # The child process is started with an empty environment.
    reactor.spawnProcess(proto, proto.script, args=args, env={})
    return proto.deferred
def loadProxiesFromFile(filename, proxySet=None, removeStale=False):
    """Load proxy IP addresses from a file.

    :param str filename: A filename whose path can be either absolute or
        relative to the current working directory. The file should contain the
        IP addresses of various open proxies, one per line, for example::

            11.11.11.11
            22.22.22.22
            123.45.67.89

    :type proxySet: None or :class:`~bridgedb.proxy.ProxySet`.
    :param proxySet: If given, load the addresses read from the file into
        this ``ProxySet``, tagging each one with **filename**.
    :param bool removeStale: If ``True``, remove proxies from the **proxySet**
        which were not listed in the file.  Only proxies whose tag equals
        **filename** are removed; others are logged and left in place.
    :rtype: list
    :returns: A list of all the proxies listed in the file (regardless of
        whether they were added or removed).  Errors while reading the file
        are logged, and whatever was collected so far is returned.
    """
    logging.info("Reloading proxy lists from file %s" % filename)

    addresses = []

    # We have to check the instance because, if the ProxySet was newly
    # created, it will likely be empty, causing it to evaluate to False:
    haveProxySet = isinstance(proxySet, ProxySet)
    if haveProxySet:
        # Snapshot taken before loading, used below to work out which
        # previously known proxies are no longer listed.
        oldProxySet = proxySet.copy()

    try:
        with open(filename, 'r') as proxyFile:
            for line in proxyFile:
                line = line.strip()
                if haveProxySet:
                    # ProxySet.add() will validate the IP address
                    if proxySet.add(line, tag=filename):
                        addresses.append(line)
                else:
                    ip = isIPAddress(line)
                    if ip:
                        addresses.append(ip)
    except Exception as error:
        # Best effort: a broken or missing list must not abort the reload.
        logging.warning("Error while reading a proxy list file: %s"
                        % str(error))

    if haveProxySet and removeStale:
        for ip in oldProxySet.difference(addresses):
            if proxySet.getTag(ip) == filename:
                logging.info("Removing stale IP %s from proxy list." % ip)
                proxySet.remove(ip)
            else:
                logging.info("Tag %s didn't match %s"
                             % (proxySet.getTag(ip), filename))

    return addresses
class ProxySet(MutableSet):
    """A :class:`collections.MutableSet` for storing validated IP addresses.

    The addresses themselves are kept in a ``set`` (``_proxies``); each
    address is also mapped to a tag in a ``dict`` (``_proxydict``).  The tag
    is either a caller-supplied value or the timestamp at which the address
    was added.

    .. inheritance-diagram:: ProxySet
    """
    #: A tag to apply to IP addresses within this ``ProxySet`` which are known
    #: to be Tor exit relays.
    _exitTag = 'exit_relay'

    def __init__(self, proxies=None):
        """Initialise a ``ProxySet``.

        :type proxies: A tuple, list, dict, or set.
        :param proxies: Optionally, initialise with an iterable, ``proxies``.
            For each ``item`` in that iterable, ``item`` must either:
                1. be a string or int representing an IP address, or,
                2. be another iterable, whose first item satisfies #1.
            If ``None`` (the default), the set starts out empty.
        """
        super(ProxySet, self).__init__()
        self._proxydict = dict()
        self._proxies = set()
        if proxies is not None:
            self.addProxies(proxies)

    @property
    def proxies(self):
        """All proxies in this set, regardless of tags."""
        return list(self._proxies)

    @property
    def exitRelays(self):
        """Get all proxies in this ``ProxySet`` tagged as Tor exit relays.

        :rtype: set
        :returns: A set of all known Tor exit relays which are contained
            within this :class:`~bridgedb.proxy.ProxySet`.
        """
        return self.getAllWithTag(self._exitTag)

    def __add__(self, ip=None, tag=None):
        """Add an **ip** to this set, with an optional **tag**.

        The **ip** is only added if it passes the checks in
        :func:`~bridgedb.parse.addr.isIPAddress`; the value returned by that
        function is what gets stored.

        :param ip: The IP address to add.
        :param tag: An optional value to link to **ip**. If not given (or
            falsy), it will be a timestamp (seconds since epoch, as a float)
            taken when **ip** is added.
        :rtype: bool
        :returns: ``True`` if **ip** was stored; ``False`` otherwise.
        """
        ip = isIPAddress(ip)
        if not ip:
            return False

        # NOTE(review): if isIPAddress() returns the address as a string
        # (TODO confirm), ``set(ip)`` is the set of its characters, which
        # never intersects the stored addresses.  The branch is then always
        # taken and re-adding a known address overwrites its tag.  That
        # long-standing behaviour is kept on purpose, since callers may rely
        # on re-tagging.
        if self._proxies.isdisjoint(set(ip)):
            logging.debug("Adding %s to proxy list..." % ip)
            self._proxies.add(ip)
            self._proxydict[ip] = tag if tag else time.time()
            return True
        return False

    def __radd__(self, *args, **kwargs):
        """Reflected form of :meth:`__add__`; the result is discarded."""
        self.__add__(*args, **kwargs)

    def __contains__(self, ip):
        """x.__contains__(y) <==> y in x.

        :param ip: The IP address to check.
        :rtype: bool
        :returns: True if ``ip`` is in this set; False otherwise.
        """
        # The lookup uses whatever isIPAddress() returns, which is also what
        # __add__() stores.
        return isIPAddress(ip) in self._proxies

    def __sub__(self, ip):
        """Entirely remove **ip** from this set.

        Both the address and its tag are dropped.  Nothing happens (and no
        exception is raised) if **ip** is not present.

        :param ip: The IP address to remove.
        """
        self._proxydict.pop(ip, None)
        self._proxies.discard(ip)

    def __rsub__(self, *args, **kwargs):
        """Reflected subtraction is not supported.

        Returning ``NotImplemented`` makes ``other - proxySet`` end in the
        interpreter's usual ``TypeError``.
        """
        return NotImplemented

    def _getErrorMessage(self, x=None, y=None):
        """Make an error message describing how this class works.

        :param x: The offending item; its type is reported.
        :param y: The container that was given; its type is reported.
        :rtype: str
        """
        message = """\nParameter 'proxies' must be one of:
    - a {1} of {0}
    - a {2} of {0}
    - a {3}, whose keys are {0} (the values can be anything)
    - a {4} of {1}s, whose first object in each {1} must be a {0}
    - a {4} of {0}
        """.format(type(''), type(()), type([]), type({}), type(set(())))
        end = "You gave: a {0}".format(type(y))
        end += " of {0}".format(type(x))
        return os.linesep.join((message, end))

    def addProxies(self, proxies, tag=None):
        """Add proxies to this set.

        This calls :func:`add` for each item in the iterable **proxies**.

        :type proxies: A tuple, list, dict, or set.
        :param proxies: An iterable. For each ``item`` in that iterable,
            ``item`` must either:
                1. be a string or int representing an IP address, or,
                2. be another iterable, whose first item satisfies #1.
            For a dict, the keys are the addresses and the values their tags.
            A two-item tuple or list is taken as ``(ip, tag)``.
        :keyword tag: An optional value to link to all untagged
            **proxies**. If ``None``, it will be a timestamp (seconds since
            epoch, as a float) for when the proxy was added to this set.
        :raises ValueError: if **proxies** is not iterable, or contains an
            item of an unsupported type or shape.
        """
        if isinstance(proxies, dict):
            for ip, value in proxies.items():
                self.add(ip, value)
            return

        try:
            for x in proxies:
                if isinstance(x, (tuple, list, set)):
                    if len(x) == 1:
                        # Unpack the lone element (this also works for
                        # sets, which cannot be indexed).
                        (ip,) = x
                        self.add(ip, tag)
                    elif len(x) == 2 and not isinstance(x, set):
                        self.add(x[0], x[1])
                    else:
                        # Wrong length, or a two-item set whose ordering
                        # (and thus which item is the address) is undefined.
                        raise ValueError(self._getErrorMessage(x, proxies))
                elif isinstance(x, (str, int)):
                    self.add(x, tag)
                else:
                    raise ValueError(self._getErrorMessage(x, proxies))
        except TypeError:
            # Typically: ``proxies`` was not iterable at all.
            raise ValueError(self._getErrorMessage(proxies, None))

    def addExitRelays(self, relays):
        """Add every address in **relays**, tagged as a Tor exit relay.

        :param relays: An iterable of IP addresses.
        """
        logging.info("Loading exit relays into proxy list...")
        for relay in relays:
            self.add(relay, self._exitTag)

    def replaceExitRelays(self, relays):
        """Drop all currently known exit relays, then add **relays**.

        :param relays: A sized iterable of IP addresses (``len()`` is used
            for the log message).
        """
        existingExitRelays = self.getAllWithTag(self._exitTag)
        logging.debug("Replacing %d existing with %d new exit relays." %
                      (len(existingExitRelays), len(relays)))

        for relay in existingExitRelays:
            self.discard(relay)

        self.addExitRelays(relays)

    def getTag(self, ip):
        """Get the tag for an **ip** in this ``ProxySet``, if available.

        :param ip: The IP address to obtain the tag for.
        :returns: The tag for that **ip**, iff **ip** exists in this
            ``ProxySet`` and it has a tag; otherwise ``None``.
        """
        return self._proxydict.get(ip)

    def getAllWithTag(self, tag):
        """Get all proxies in this ``ProxySet`` with a given tag.

        :param str tag: A tag to search for.
        :rtype: set
        :returns: A set of all proxies which are contained within this
            :class:`~bridgedb.proxy.ProxySet` which are also tagged with
            **tag**.
        """
        return set(key for key, value in self._proxydict.items()
                   if value == tag)

    def firstSeen(self, ip):
        """Get the timestamp stored as the tag of **ip**, if available.

        :param ip: The IP address to obtain a timestamp for.
        :rtype: float or None
        :returns: The timestamp (in seconds since epoch) if the tag of **ip**
            is a float.  Otherwise, returns None.
        """
        when = self.getTag(ip)
        if isinstance(when, float):
            return when
        return None

    def isExitRelay(self, ip):
        """Check if ``ip`` is a known Tor exit relay.

        :param ip: The IP address to check.
        :rtype: bool
        :returns: True if ``ip`` is a known Tor exit relay; False otherwise.
        """
        return self.getTag(ip) == self._exitTag

    def replaceProxyList(self, proxies, tag=None):
        """Clear everything and add all ``proxies``.

        Any exception raised while doing so is logged, not propagated.

        :type proxies: A tuple, list, dict, or set.
        :param proxies: An iterable. For each ``item`` in that iterable,
            ``item`` must either:
                1. be a string or int representing an IP address, or,
                2. be another iterable, whose first item satisfies #1.
        :param tag: Passed through to :meth:`addProxies`.
        """
        try:
            self.clear()
            self.addProxies(proxies, tag=tag)
        except Exception as error:
            logging.error(str(error))

    # Attributes copied by the update_wrapper() calls below which borrow
    # names and docstrings from builtins.
    _assigned = ('__name__', '__doc__')

    @wraps(MutableSet._hash)
    def __hash__(self):
        return self._hash()

    def __iter__(self):
        return self._proxies.__iter__()

    def __len__(self):
        return len(self._proxydict)

    def __repr__(self):
        return type('')(self.proxies)

    def __str__(self):
        return os.linesep.join(self.proxies)

    update_wrapper(__iter__, set.__iter__, _assigned)
    update_wrapper(__len__, len, _assigned)
    update_wrapper(__repr__, repr, _assigned)
    update_wrapper(__str__, str, _assigned)

    # Named aliases for the operator methods above; each takes over the
    # metadata (name, docstring) of the method it forwards to.
    def add(self, ip, tag=None):
        return self.__add__(ip, tag)

    def copy(self):
        return self.__class__(self._proxydict.copy())

    def contains(self, ip):
        return self.__contains__(ip)

    def discard(self, ip):
        return self.__sub__(ip)

    def remove(self, other):
        return self.__sub__(other)

    update_wrapper(add, __add__)
    update_wrapper(copy, __init__)
    update_wrapper(contains, __contains__)
    update_wrapper(discard, __sub__)
    update_wrapper(remove, __sub__)

    # Set algebra is delegated to the underlying ``set`` of addresses, so
    # these return plain ``set`` objects and carry no tags.
    def difference(self, other):
        return self._proxies.difference(other)

    def issubset(self, other):
        return self._proxies.issubset(other)

    def issuperset(self, other):
        return self._proxies.issuperset(other)

    def intersection(self, other):
        return self._proxies.intersection(other)

    def symmetric_difference(self, other):
        return self._proxies.symmetric_difference(other)

    def union(self, other):
        return self._proxies.union(other)

    update_wrapper(difference, set.difference, _assigned)
    update_wrapper(issubset, set.issubset, _assigned)
    update_wrapper(issuperset, set.issuperset, _assigned)
    update_wrapper(intersection, set.intersection, _assigned)
    update_wrapper(symmetric_difference, set.symmetric_difference, _assigned)
    update_wrapper(union, set.union, _assigned)
class ExitListProtocol(protocol.ProcessProtocol):
    """A :class:`~twisted.internet.protocol.ProcessProtocol` for the
    ``get-tor-exits`` script.

    :attr boolean connected: True if our ``transport`` is connected.

    :type transport: An implementer of
        :interface:`twisted.internet.interface.IProcessTransport`.
    :attr transport: If :func:`twisted.internet.reactor.spawnProcess` is
        called with an instance of this class as its ``protocol``, then
        :func:`~twisted.internet.reactor.spawnProcess` will return this
        ``transport``.
    """

    def __init__(self):
        """Create a protocol for downloading a list of current Tor exit relays.

        :type exitlist: :class:`ProxySet`
        :ivar exitlist: A :class:`~collections.MutableSet` containing the IP
            addresses of known Tor exit relays, filled in by
            :meth:`parseData`.
        :ivar list data: A list containing one decoded string for each chunk
            of data received on the child's stdout.
        :ivar deferred: A deferred which will callback with the contents of
            ``exitlist`` (as a list) when the process has ended.
        :ivar string script: The pathname of the script to run, as located
            by :func:`bridgedb.runner.find`.
        """
        self.data = []
        self.script = find('get-tor-exits')
        self.exitlist = ProxySet()
        self.deferred = defer.Deferred()

    def childConnectionLost(self, childFD):
        """See :func:`t.i.protocol.ProcessProtocol.childConnectionLost`."""
        protocol.ProcessProtocol.childConnectionLost(self, childFD)

    def connectionMade(self):
        """Called when the connection to the child process is made.

        Nothing is ever sent to the script, so its stdin is closed right
        away.
        """
        logging.debug("ExitListProtocol: Connection made with remote server")
        self.transport.closeStdin()

    def errReceived(self, data):
        """Some data was received from stderr."""
        # The get-exit-list script uses twisted.python.log to log to stderr:
        for line in data.splitlines():  # pragma: no cover
            logging.debug(line)

    def outReceived(self, data):
        """Some data was received from stdout.

        :param bytes data: A chunk of output; it is decoded as UTF-8 and
            buffered until :meth:`parseData` runs.
        """
        self.data.append(data.decode("utf-8"))

    def outConnectionLost(self):
        """This will be called when stdout is closed."""
        logging.debug("Finished downloading list of Tor exit relays.")
        self.transport.loseConnection()
        # All output has arrived; turn it into the exit list.
        self.parseData()

    def parseData(self):
        """Parse all data received so far into our
        :class:`<bridgedb.proxy.ProxySet> exitlist`.

        Empty lines and lines starting with ``#`` are skipped.  Parsing stops
        at the first line starting with ``<``.  Lines which are not IP
        addresses are collected and reported in a single warning.
        """
        unparseable = []

        for line in ''.join(self.data).split('\n'):
            line = line.strip()
            if not line:
                continue
            # If it reached an errorpage, then we grabbed raw HTML that starts
            # with an HTML tag:
            if line.startswith('<'):
                break
            if line.startswith('#'):
                continue

            ip = isIPAddress(line)
            if ip:
                logging.info("Discovered Tor exit relay: %s" % ip)
                self.exitlist.add(ip)
            else:
                logging.debug("Got exitlist line that wasn't an IP: %s" % line)
                unparseable.append(line)

        if unparseable:
            logging.warning(("There were unparseable lines in the downloaded "
                             "list of Tor exit relays: %r") % unparseable)

    def processEnded(self, reason):
        """Called when the child process exits and all file descriptors
        associated with it have been closed.

        Fires ``deferred`` with the collected exit list, unless it has
        already been fired.

        :type reason: :class:`twisted.python.failure.Failure`
        :param reason: ``reason.value.exitCode`` is used to tell whether the
            script succeeded.
        """
        self.transport.loseConnection()
        if reason.value.exitCode != 0:  # pragma: no cover
            logging.debug(reason.getTraceback())
            logging.error("There was an error downloading Tor exit list: %s"
                          % reason.value)
        else:
            logging.info("Finished processing list of Tor exit relays.")
        logging.debug("Transferring exit list to storage...")
        # Avoid triggering the deferred twice, e.g. on processExited():
        if not self.deferred.called:
            self.deferred.callback(list(self.exitlist.proxies))

    def processExited(self, reason):
        """This will be called when the subprocess exits.

        :type reason: :class:`twisted.python.failure.Failure`
        :param reason: ``reason.value.exitCode`` is logged together with the
            script's path.
        """
        logging.debug("%s exited with status code %d"
                      % (self.script, reason.value.exitCode))