Parse bridge blocking info from SQL database.
[tor-bridgedb.git] / bridgedb / parse / addr.py
blob1d59c9a7244ac2e3c8e11b045deb5d458a8e84fb
1 # -*- coding: utf-8 -*-
3 # This file is part of BridgeDB, a Tor bridge distribution system.
5 # :authors: Isis Lovecruft 0xA3ADB67A2CDB8B35 <isis@torproject.org>
6 # please also see AUTHORS file
7 # :copyright: (c) 2013 Isis Lovecruft
8 # (c) 2007-2013, The Tor Project, Inc.
9 # (c) 2007-2013, all entities within the AUTHORS file
10 # :license: 3-clause BSD, see included LICENSE for information
12 """Utilities for parsing IP and email addresses.
14 .. py:module:: bridgedb.parse.addr
15 :synopsis: Parsers for finding and validating IP addresses, email
16 addresses, and port ranges.
19 bridgedb.parse.addr
20 ===================
24 parse.addr
25 | |_ extractEmailAddress() - Validate a :rfc:`2822` email address.
26 | |_ isIPAddress() - Check if an arbitrary string is an IP address.
27 | |_ isIPv4() - Check if an arbitrary string is an IPv4 address.
28 | |_ isIPv6() - Check if an arbitrary string is an IPv6 address.
29 | \_ isValidIP() - Check that an IP address is valid.
31 |_ PortList - A container class for validated port ranges.
36 How address validity is determined
37 ----------------------------------
39 The following terms define addresses which are **not** valid. All other
40 addresses are taken to be valid.
43 Private IP Address Ranges
44 ^^^^^^^^^^^^^^^^^^^^^^^^^
45 .. glossary::
47 Private Address
48 These address ranges are reserved by IANA for private intranets, and not
49 routable to the Internet::
50 10.0.0.0 - 10.255.255.255 (10.0.0.0/8)
51 172.16.0.0 - 172.31.255.255 (172.16.0.0/12)
52 192.168.0.0 - 192.168.255.255 (192.168.0.0/16)
53 For additional information, see :rfc:`1918`.
56 Reserved and Special Use Addresses
57 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
58 .. glossary::
60 Unspecified Address
61 Default Route
62 Current network (only valid as source address). See :rfc:`1122`. An
63 **Unspecified Address** in the context of firewalls means "all addresses
64 of the local machine". In a routing context, it is usually termed the
65 **Default Route**, and it means the default route (to "the rest of" the
66 internet). See :rfc:`1700`.
67 For example::
68 0.0.0.0/8
69 ::/128
71 Loopback Address
72 Reserved for loopback and IPC on the localhost. See :rfc:`1122`.
73 Example::
74 127.0.0.0
76 Localhost Address
77 Loopback IP addresses (refers to self). See :rfc:`5735`.
78 Examples include::
79 127.0.0.1 - 127.255.255.254 (127.0.0.0/8)
80 ::1
82 Link-Local Address
83 These are the link-local blocks, used for communication between hosts on
84 a single link. See :rfc:`3927`.
85 Examples::
86 169.254.0.0/16
87 fe80::/64
89 Multicast Address
90 Reserved for multicast addresses. See :rfc:`3171`.
91 For example::
92 224.0.0.0 - 239.255.255.255 (224.0.0.0/4)
94 Private Address
95 Reserved for private networks. See :rfc:`1918`.
96 Some examples include::
97 10.0.0.0/8
98 172.16.0.0/12
99 192.168.0.0/16
101 Reserved Address
102 Reserved (former Class E network). See :rfc:`1700`, :rfc:`3232`, and
103 :rfc:`5735`. The one exception to this rule is the :term:`Limited
104 Broadcast Address`, ``255.255.255.255`` for which packets at the IP
105 layer are not forwarded to the public internet. For example::
106 240.0.0.0 - 255.255.255.255 (240.0.0.0/4)
108 Limited Broadcast Address
109 Limited broadcast address (limited to all other nodes on the LAN). See
110 :rfc:`919`. For IPv4, ``255`` in any part of the IP is reserved for
111 broadcast addressing to the local LAN, e.g.::
112 255.255.255.255
115 .. warning:: The :mod:`ipaddr` module (as of version 2.1.10) does not
116 understand the following reserved_ addresses:
117 .. _reserved: https://tools.ietf.org/html/rfc5735#page-4
119 .. glossary::
121 Reserved Address (Protocol Assignments)
122 Reserved for IETF protocol assignments. See :rfc:`5735`.
123 Example::
124 192.0.0.0/24
126 Reserved Address (6to4 Relay Anycast)
127 IPv6 to IPv4 relay. See :rfc:`3068`.
128 Example::
129 192.88.99.0/24
131 Reserved Address (Network Benchmark)
132 Network benchmark tests. See :rfc:`2544`.
133 Example::
134 198.18.0.0/15
136 Reserved Address (TEST-NET-1)
137 Reserved for use in documentation and example code. It is often used in
138 conjunction with domain names ``example.com`` or ``example.net`` in
139 vendor and protocol documentation. See :rfc:`1166`.
140 For example::
141 192.0.2.0/24
143 Reserved Address (TEST-NET-2)
144 TEST-NET-2. See :rfc:`5737`.
145 Example::
146 198.51.100.0/24
148 Reserved Address (TEST-NET-3)
149 TEST-NET-3. See :rfc:`5737`.
150 Example::
151 203.0.113.0/24
153 Shared Address Space
154 See :rfc:`6598`.
155 Example::
156 100.64.0.0/10
158 Site-Local Address
159 Unique Local Address
160 Similar uses to :term:`Limited Broadcast Address`. For IPv6, everything
161 becomes convoluted_ and complicated_, and then redefined_. See
162 :rfc:`4193`, :rfc:`3879`, and :rfc:`3513`. The
163 :meth:`ipaddr.IPAddress.is_site_local` method *only* checks to see if
164 the address is a **Unique Local Address** vis-à-vis :rfc:`3513` §2.5.6,
165 e.g.::
166 ff00::0/8
167 fec0::/10
171 .. _convoluted: https://en.wikipedia.org/wiki/IPv6_address#Multicast_addresses
172 .. _complicated: https://en.wikipedia.org/wiki/IPv6_address#IPv6_address_scopes
173 .. _redefined: https://en.wikipedia.org/wiki/Unique_local_address
176 from __future__ import absolute_import
177 from __future__ import print_function
178 from __future__ import unicode_literals
180 import logging
181 import re
183 import ipaddr
#: RFC 2822 permits all of these special characters within email addresses:
#:     ASPECIAL = '!#$%&*+-/=?^_`{|}~' + "\\\'"
#: ...but these are the only ones we're confident that we handle correctly:
#:     ASPECIAL = '-_+/=_~'
ASPECIAL = '-_+/=_~'
#: Character class matching a single atom character: any word character, or
#: one of the (backslash-escaped) characters listed in ``ASPECIAL``.
ACHAR = r'[\w%s]' % "".join(["\\" + special for special in ASPECIAL])
#: One or more runs of atom characters, separated by single dots.
DOTATOM = r'{0}+(?:\.{0}+)*'.format(ACHAR)
#: One or more runs of word characters, separated by single dots.
DOMAIN = r'\w+(?:\.\w+)*'
#: A local part and a domain joined by ``@``; each half is a capture group.
ADDRSPEC = '(' + DOTATOM + r')\@(' + DOMAIN + ')'

# Compiled regexp matching any type and amount of whitespace:
SPACE_PAT = re.compile(r'\s+')
# Compiled regexp matching RFC 2822 email address strings:
ADDRSPEC_PAT = re.compile(ADDRSPEC)
class BadEmail(Exception):
    """Raised when an email address could not be parsed or validated.

    :ivar email: The email address which was handed to the constructor.
    """

    def __init__(self, msg, email):
        super(BadEmail, self).__init__(msg)
        self.email = email
class InvalidPort(ValueError):
    """A port number was given which is not a valid port."""
class UnsupportedDomain(ValueError):
    """An email address was received from a domain we do not support."""
def canonicalizeEmailDomain(domain, domainmap):
    """Map the domain of an email address onto its permitted canonical name.

    :param str domain: The domain portion of an email address. If it is a
        byte string it is first decoded as UTF-8. It is then looked up in
        **domainmap** to decide whether it is one of the domains allowed to
        email requests for bridges to the
        :class:`~bridgedb.distributors.email.distributor.EmailDistributor`.
    :param dict domainmap: A map of permitted alternate domains (in lowercase)
        to their canonical domain names (in lowercase). This can be configured
        with the ``EMAIL_DOMAIN_MAP`` option in ``bridgedb.conf``, for
        example::

            EMAIL_DOMAIN_MAP = {'mail.google.com': 'gmail.com',
                                'googlemail.com': 'gmail.com'}

    :raises UnsupportedDomain: if **domain** has no (non-empty) entry in
        **domainmap**, or if **domainmap** has no ``get`` method.
    :rtype: str
    :returns: The canonical domain name for the email address.
    """
    if isinstance(domain, bytes):
        domain = domain.decode('utf-8')

    try:
        canonical = domainmap.get(domain)
    except AttributeError:
        # A misconfigured (non-dict) map permits nothing at all.
        canonical = None
        logging.debug("Got non-dict for 'domainmap' parameter: %r" % domainmap)

    if canonical:
        return canonical

    raise UnsupportedDomain("Domain not permitted: %s" % domain)
def extractEmailAddress(emailaddr):
    """Pull the local and domain parts out of an email address.

    The address may have come from, for example, a ``From:`` or ``Sender:``
    email header. It is parsed loosely according to :rfc:`2822`; only the
    following form is accepted::

        LOCAL_PART := DOTATOM
        DOMAIN     := DOTATOM
        ADDRSPEC   := LOCAL_PART "@" DOMAIN

    In particular, obs-local-part, obs-domain, comment, and obs-FWS are all
    disallowed. Other forms exist, but none of the incoming services we
    recognize support them.

    :param emailaddr: An email address to validate.
    :raises BadEmail: if the **emailaddr** couldn't be validated or parsed.
    :returns: A tuple of the mail local part and the domain::

        (LOCAL_PART, DOMAIN)
    """
    try:
        # Collapse every run of whitespace into a single space.
        candidate = SPACE_PAT.sub(' ', emailaddr).strip()
    except TypeError as error:
        logging.debug(error)
        raise BadEmail("Can't extract address from object type %r!"
                       % type(emailaddr), emailaddr)

    # This only copes with usual-form addresses and raises BadEmail on weird
    # ones. That's okay: we only get those when people are trying to fool us.
    if '<' in candidate:
        # Start from the _last_ '<', so that quoting tricks earlier in the
        # header don't need any special handling.
        tail = candidate[candidate.rindex('<'):]
        bracketed = re.search(r'<([^>]*)>', tail)
        if bracketed is None:
            raise BadEmail("Couldn't extract address spec", emailaddr)
        candidate = bracketed.group(1)

    # What remains is a putative addr-spec; drop any leftover spaces.
    parsed = ADDRSPEC_PAT.match(candidate.replace(" ", ""))
    if parsed is None:
        raise BadEmail("Bad address spec format", emailaddr)

    return parsed.group(1), parsed.group(2)
def isIPAddress(ip, compressed=True):
    """Check that an arbitrary value is an IP address, and a valid one.

    :type ip: str or int
    :param ip: The IP address to check.
    :param boolean compressed: If True, return a string representing the
        compressed form of the address. Otherwise, return an
        :class:`ipaddr.IPAddress` instance.
    :rtype: A :class:`ipaddr.IPAddress`, or a string, or False
    :returns: The IP, as a string or a class, if it passed the checks made
        by :func:`isValidIP`. Otherwise, returns False.
    """
    try:
        address = ipaddr.IPAddress(ip)
    except ValueError:
        return False

    if not isValidIP(address):
        return False

    return address.compressed if compressed else address
def isIPv(version, ip):
    """Check if **ip** is a certain **version** (IPv4 or IPv6).

    .. warning:: Do *not* put any calls to the logging module in this
        function, or else an infinite recursion will occur when the call is
        made, due to the log :class:`~logging.Filter`\\s in
        :mod:`~bridgedb.safelog` using this function to validate matches
        from the regular expression for IP addresses.

    :param integer version: The IPv[4|6] version to check; should be either
        ``4`` or ``6``. The value is handed to :func:`ipaddr.IPAddress`
        unchanged. (NOTE(review): how :mod:`ipaddr` treats any other value
        is not visible from here; confirm against its documentation.)
    :param ip: The IP address to check. May be any type which
        :class:`ipaddr.IPAddress` will accept.
    :rtype: boolean
    :returns: ``True`` if :func:`ipaddr.IPAddress` accepted **ip** for the
        requested **version**; ``False`` if it raised any exception.
    """
    try:
        ipaddr.IPAddress(ip, version=version)
    except Exception:
        # Deliberately broad and silent: ``Exception`` already covers
        # ``ipaddr.AddressValueError``, and nothing may be logged from here
        # (see the warning above).
        return False
    return True
def isIPv4(ip):
    """Report whether an address is an IPv4 address.

    .. attention:: This does *not* check validity. See :func:`isValidIP`.

    :type ip: str or int
    :param ip: The IP address to check.
    :rtype: boolean
    :returns: True if the address is an IPv4 address.
    """
    return isIPv(version=4, ip=ip)
def isIPv6(ip):
    """Report whether an address is an IPv6 address.

    .. attention:: This does *not* check validity. See :func:`isValidIP`.

    :type ip: str or int
    :param ip: The IP address to check.
    :rtype: boolean
    :returns: True if the address is an IPv6 address.
    """
    return isIPv(version=6, ip=ip)
def isValidIP(ip):
    """Check that an IP (v4 or v6) is valid.

    The IP address, **ip**, must not be any of the following:

      * A :term:`Link-Local Address`,
      * A :term:`Loopback Address` or :term:`Localhost Address`,
      * A :term:`Multicast Address`,
      * An :term:`Unspecified Address` or :term:`Default Route`,
      * Any other :term:`Private Address`, or address within a privately
        allocated space, such as the IANA-reserved
        :term:`Shared Address Space`.

    If it is an IPv6 address, it also must not be:

      * A :term:`Site-Local Address` or an :term:`Unique Local Address`.

    If it is an IPv4 address, it also must not be:

      * A :term:`Reserved Address`.

    >>> from bridgedb.parse.addr import isValidIP
    >>> isValidIP('1.2.3.4')
    True
    >>> isValidIP('1.2.3.255')
    True
    >>> isValidIP('1.2.3.256')
    False
    >>> isValidIP('1')
    False
    >>> isValidIP('1.2.3')
    False
    >>> isValidIP('xyzzy')
    False

    :type ip: An :class:`ipaddr.IPAddress`, :class:`ipaddr.IPv4Address`,
        :class:`ipaddr.IPv6Address`, or str
    :param ip: An IP address. If it is not already an address object, it
        will be converted with :func:`ipaddr.IPAddress`.
    :rtype: boolean
    :returns: ``True``, if **ip** passes the checks; False otherwise.
    """
    # (attribute on the address object, reason logged when it is true)
    checks = (
        ('is_link_local', 'link local'),
        ('is_loopback', 'loopback'),
        ('is_multicast', 'multicast'),
        ('is_private', 'private'),
        ('is_unspecified', 'unspecified'),
    )
    reasons = []

    try:
        # Convert anything which doesn't already look like an address object.
        # Testing for ``str`` alone is not enough: this module enables
        # ``unicode_literals``, so on Python 2 text strings are ``unicode``
        # and would otherwise hit an AttributeError on the checks below.
        if not hasattr(ip, 'is_unspecified'):
            ip = ipaddr.IPAddress(ip)

        reasons.extend(reason for attr, reason in checks if getattr(ip, attr))

        if (ip.version == 6) and ip.is_site_local:
            reasons.append('site local')
        elif (ip.version == 4) and ip.is_reserved:
            reasons.append('reserved')
    except ValueError:
        reasons.append('cannot convert to ip')

    if reasons:
        logging.debug("IP address %r is invalid! Reason(s): %s",
                      ip, ', '.join(reasons))
        return False
    return True
def isLoopback(ip):
    """Determine if ``ip`` is a loopback or localhost address (127.0.0.1/8).

    :type ip: ``str`` or ``ipaddr.IPAddress``
    :param ip: An IP address. If it is not already an address object, it
        will be converted with :func:`ipaddr.IPAddress`.
    :rtype: bool
    :returns: ``True`` if the IP was a loopback or localhost address;
        ``False`` otherwise, including when **ip** could not be parsed.
    """
    try:
        # Convert anything which doesn't already look like an address object.
        # Testing for ``str`` alone would skip ``unicode`` text on Python 2
        # (this module enables ``unicode_literals``), and the resulting
        # AttributeError would make a real loopback address report False.
        if not hasattr(ip, 'is_loopback'):
            ip = ipaddr.IPAddress(ip)
        return bool(ip.is_loopback)
    except Exception:
        # Best effort: an unparseable address is simply "not loopback".
        logging.debug("Error attempting to parse IP address: %s", ip)

    return False
def normalizeEmail(emailaddr, domainmap, domainrules, ignorePlus=True):
    """Normalise an email address according to the processing rules for its
    canonical originating domain.

    The email address, **emailaddr**, is lowercased, parsed and validated
    with :func:`extractEmailAddress`, and then checked via
    :func:`canonicalizeEmailDomain` that it originated from one of the
    domains allowed to email requests for bridges to the
    :class:`~bridgedb.distributors.email.distributor.EmailDistributor`.

    :param str emailaddr: An email address to normalise.
    :param dict domainmap: A map of permitted alternate domains (in lowercase)
        to their canonical domain names (in lowercase). This can be configured
        with the ``EMAIL_DOMAIN_MAP`` option in ``bridgedb.conf``, for
        example::

            EMAIL_DOMAIN_MAP = {'mail.google.com': 'gmail.com',
                                'googlemail.com': 'gmail.com'}

    :param dict domainrules: A mapping of canonical permitted domain names to
        a list of rules which should be applied to processing them, for
        example::

            EMAIL_DOMAIN_RULES = {'gmail.com': ["ignore_dots", "dkim"]}

        Currently, ``"ignore_dots"`` means that all ``"."`` characters will be
        removed from the local part of the validated email address.

    :param bool ignorePlus: If ``True``, assume that
        ``blackhole+kerr@torproject.org`` is an alias for
        ``blackhole@torproject.org``, and remove everything from the first
        ``'+'`` character onwards in the local part.

    :raises UnsupportedDomain: if the email address originated from a domain
        that we do not explicitly support.
    :raises BadEmail: if the email address could not be parsed or validated,
        including when **emailaddr** is not a string at all.
    :rtype: str
    :returns: The validated, normalised email address. The canonical domain
        is only used to select the rules to apply; the returned address keeps
        the (lowercased) domain it was sent from.
    """
    try:
        emailaddr = emailaddr.lower()
    except AttributeError:
        # Non-string input (e.g. ``None``) is a bad address, which this
        # function documents as BadEmail; don't leak an AttributeError.
        raise BadEmail("Can't normalise address from object type %r!"
                       % type(emailaddr), emailaddr)

    localpart, domain = extractEmailAddress(emailaddr)
    canonical = canonicalizeEmailDomain(domain, domainmap)

    if ignorePlus:
        # Keep only what precedes the first '+', if there is one.
        localpart = localpart.partition('+')[0]

    if 'ignore_dots' in domainrules.get(canonical, []):
        localpart = localpart.replace(".", "")

    return "%s@%s" % (localpart, domain)
class PortList(object):
    """A container class for validated port ranges.

    From torspec.git/dir-spec.txt §2.3:

      |  portspec ::= "*" | port | port "-" port
      |  port ::= an integer between 1 and 65535, inclusive.
      |
      |    [Some implementations incorrectly generate ports with value 0.
      |     Implementations SHOULD accept this, and SHOULD NOT generate it.
      |     Connections to port 0 are never permitted.]

    This class only accepts integers, comma-separated strings of integers,
    and other ``PortList`` instances. ``"*"``, ``port "-" port`` ranges, and
    port ``0`` are all rejected with :exc:`InvalidPort`.

    :ivar set ports: All ports which have been added to this ``PortList``.
    """

    #: The maximum number of allowed ports per IP address.
    PORTSPEC_LEN = 16

    def __init__(self, *args, **kwargs):
        """Create a :class:`~bridgedb.parse.addr.PortList`.

        :param args: Each should be an int, a comma-separated string of
            ints, or another ``PortList``. Keyword arguments are accepted
            but unused.
        :raises: InvalidPort, if one of ``args`` doesn't match ``port`` as
            defined above.
        """
        self.ports = set()
        self.add(*args)

    def _sanitycheck(self, port):
        """Check that ``port`` is in the range [1, 65535] inclusive.

        :raises: InvalidPort, if ``port`` doesn't match ``port`` as defined
            in the excerpt from torspec above.
        :rtype: int
        :returns: The **port**, if no exceptions were raised.
        """
        if (not isinstance(port, int)) or not (0 < port <= 65535):
            raise InvalidPort("%s is not a valid port number!" % port)
        return port

    def __contains__(self, port):
        """Determine whether ``port`` is already in this ``PortList``.

        :returns: True if ``port`` is in this ``PortList``; False otherwise.
        """
        return port in self.ports

    def add(self, *args):
        """Add a port (or ports) to this ``PortList``.

        :param args: Each should be an int, a comma-separated string of ints
            (of which only the first :attr:`PORTSPEC_LEN` are used), or
            another ``PortList``. Arguments of any other type are ignored.
        :raises: InvalidPort, if one of ``args`` doesn't match ``port`` as
            defined above.
        """
        # ``str`` alone misses ``unicode`` text on Python 2; on Python 3 both
        # entries are simply ``str``.
        string_types = (str, type(u''))

        for arg in args:
            if isinstance(arg, PortList):
                # Unpack, so that every port is re-validated as an int.
                # Passing the ports as a single list would match none of the
                # branches below and silently add nothing.
                self.add(*arg.ports)
                continue

            validated = []
            try:
                if isinstance(arg, string_types):
                    # Every item is parsed before truncating, so garbage
                    # anywhere in the string is still rejected.
                    requested = set([int(p) for p in arg.split(',')]
                                    [:self.PORTSPEC_LEN])
                    validated.extend(self._sanitycheck(p) for p in requested)
                elif isinstance(arg, int):
                    validated.append(self._sanitycheck(arg))
            except ValueError:
                # InvalidPort is a ValueError too, so failures are re-raised
                # naming the whole offending argument.
                raise InvalidPort("%s is not a valid port number!" % arg)

            self.ports.update(validated)

    def __iter__(self):
        """Iterate through all ports in this PortList."""
        return iter(self.ports)

    def __str__(self):
        """Returns a pretty string representation of this PortList."""
        return ','.join(['%s' % port for port in self.ports])

    def __repr__(self):
        """Returns a raw depiction of this PortList."""
        return "PortList('%s')" % self.__str__()

    def __len__(self):
        """Returns the total number of ports in this PortList."""
        return len(self.ports)

    def __getitem__(self, port):
        """Get the port stored at position ``port`` in this PortList.

        The ports are kept in a set, so the position of any given port is
        arbitrary.

        :param int port: A positional index into this PortList (*not* a port
            number to look up).
        :raises: IndexError, if the index ``port`` is out of range.
        :rtype: integer
        :returns: The port found at that index.
        """
        return list(self.ports)[port]