bridgedb/metrics.py

   1 # -*- coding: utf-8 ; test-case-name: bridgedb.test.test_metrics ; -*-
   2 # _____________________________________________________________________________
   3 #
   4 # This file is part of BridgeDB, a Tor bridge distribution system.
   5 #
   6 # :authors: please see included AUTHORS file
   7 # :copyright: (c) 2019, The Tor Project, Inc.
   8 #             (c) 2019, Philipp Winter
   9 # :license: see LICENSE for licensing information
  10 # _____________________________________________________________________________
  11
  12 """API for keeping track of BridgeDB metrics, e.g., the demand for bridges
  13 over time.
  14 """
  15
  16 import logging
  17 import ipaddr
  18 import operator
  19 import json
  20 import datetime
  21
  22 from bridgedb import geo
  23 from bridgedb.distributors.common.http import getClientIP
  24 from bridgedb.distributors.email import request
  25
  26 from twisted.mail.smtp import Address
  27
# Our data structure to keep track of exit relays.  The variable is of type
# bridgedb.proxy.ProxySet.  It stays None until setProxies() is called.  We
# reserve a special country code (determined by PROXY_CC below) for exit
# relays and other proxies.
PROXIES = None

# Our custom country code for IP addresses that we couldn't map to a country.
# This can happen for private IP addresses or if our geo-location provider has
# no mapping.
UNKNOWN_CC = "??"

# Our custom country code for IP addresses that are proxies, e.g., Tor exit
# relays.  The code "zz" is free for assignment for user needs as specified
# here: <https://en.wikipedia.org/w/index.php?title=ISO_3166-1_alpha-2&oldid=906611218#Decoding_table>
# Metrics.createKey() lower-cases country codes, so the upper-case value below
# shows up as "zz" in the time series keys.
PROXY_CC = "ZZ"

# We use BIN_SIZE to reduce the granularity of our counters.  We round up
# numbers to the next multiple of BIN_SIZE, e.g., 28 is rounded up to:
# 10 * 3 = 30.  Numbers that already are a multiple of BIN_SIZE stay as they
# are.
BIN_SIZE = 10

# The prefix length that we use to keep track of the number of unique subnets
# we have seen HTTPS requests from.
SUBNET_CTR_PREFIX_LEN = 20

# All of the pluggable transports BridgeDB currently supports.  It stays None
# until setSupportedTransports() is called.
SUPPORTED_TRANSPORTS = None

# Version number for our metrics format.  We increment the version if our
# format changes.  export() writes it as the "bridgedb-metrics-version" line.
METRICS_VERSION = 1
  58
  59
  60 def setProxies(proxies):
  61     """Set the given proxies.
  62
  63     :type proxies: :class:`~bridgedb.proxy.ProxySet`
  64     :param proxies: The container for the IP addresses of any currently
  65         known open proxies.
  66     """
  67     logging.debug("Setting %d proxies." % len(proxies))
  68     global PROXIES
  69     PROXIES = proxies
  70
  71
  72 def setSupportedTransports(supportedTransports):
  73     """Set the given supported transports.
  74
  75     :param dict supportedTransports: The transport types that BridgeDB
  76         currently supports.
  77     """
  78
  79     logging.debug("Setting %d supported transports." %
  80                   len(supportedTransports))
  81     global SUPPORTED_TRANSPORTS
  82     SUPPORTED_TRANSPORTS = supportedTransports
  83
  84
  85 def isBridgeTypeSupported(bridgeType):
  86     """Return `True' or `False' depending on if the given bridge type is
  87     supported.
  88
  89     :param str bridgeType: The bridge type, e.g., "vanilla" or "obfs4".
  90     """
  91
  92     if SUPPORTED_TRANSPORTS is None:
  93         logging.error("Bug: Variable SUPPORTED_TRANSPORTS is None.")
  94         return False
  95
  96     # Note that "vanilla" isn't a transport protocol (in fact, it's the absence
  97     # of a transport), which is why it isn't in SUPPORTED_TRANSPORTS.
  98     return (bridgeType in SUPPORTED_TRANSPORTS) or (bridgeType == "vanilla")
  99
 100
 101 def export(fh, measurementInterval):
 102     """Export metrics by writing them to the given file handle.
 103
 104     :param file fh: The file handle to which we're writing our metrics.
 105     :param int measurementInterval: The number of seconds after which we rotate
 106         and dump our metrics.
 107     """
 108
 109     httpsMetrix = HTTPSMetrics()
 110     emailMetrix = EmailMetrics()
 111     moatMetrix = MoatMetrics()
 112
 113     # Rotate our metrics.
 114     httpsMetrix.rotate()
 115     emailMetrix.rotate()
 116     moatMetrix.rotate()
 117
 118     numProxies = len(PROXIES) if PROXIES is not None else 0
 119     if numProxies == 0:
 120         logging.error("Metrics module doesn't have any proxies.")
 121     else:
 122         logging.debug("Metrics module knows about %d proxies." % numProxies)
 123
 124     now = datetime.datetime.utcnow()
 125     fh.write("bridgedb-metrics-end %s (%d s)\n" % (
 126              now.strftime("%Y-%m-%d %H:%M:%S"),
 127              measurementInterval))
 128     fh.write("bridgedb-metrics-version %d\n" % METRICS_VERSION)
 129
 130     httpsLines = httpsMetrix.getMetrics()
 131     for line in httpsLines:
 132         fh.write("bridgedb-metric-count %s\n" % line)
 133
 134     moatLines = moatMetrix.getMetrics()
 135     for line in moatLines:
 136         fh.write("bridgedb-metric-count %s\n" % line)
 137
 138     emailLines = emailMetrix.getMetrics()
 139     for line in emailLines:
 140         fh.write("bridgedb-metric-count %s\n" % line)
 141
 142
 143 def resolveCountryCode(ipAddr):
 144     """Return the country code of the given IP address.
 145
 146     :param str ipAddr: The IP address to resolve.
 147
 148     :rtype: str
 149     :returns: A two-letter country code.
 150     """
 151
 152     if ipAddr is None:
 153         logging.warning("Given IP address was None.  Using %s as country "
 154                         "code." % UNKNOWN_CC)
 155         return UNKNOWN_CC
 156
 157     if PROXIES is None:
 158         logging.warning("Proxies are not yet set.")
 159     elif ipAddr in PROXIES:
 160         return PROXY_CC
 161
 162     countryCode = geo.getCountryCode(ipaddr.IPAddress(ipAddr))
 163
 164     # countryCode may be None if GeoIP is unable to map an IP address to a
 165     # country.
 166     return UNKNOWN_CC if countryCode is None else countryCode
 167
 168
 169 class Singleton(type):
 170     _instances = {}
 171
 172     def __call__(cls, *args, **kwargs):
 173         if cls not in cls._instances:
 174             cls._instances[cls] = super(Singleton, cls).__call__(*args,
 175                                                                  **kwargs)
 176         return cls._instances[cls]
 177
 178     def clear(cls):
 179         """Drop the instance (necessary for unit tests)."""
 180         try:
 181             del cls._instances[cls]
 182         except KeyError:
 183             pass
 184
 185
 186 class Metrics(metaclass=Singleton):
 187     """Base class representing metrics.
 188
 189     This class provides functionality that our three distribution mechanisms
 190     share.
 191     """
 192
 193     def __init__(self, binSize=BIN_SIZE):
 194         logging.debug("Instantiating metrics class.")
 195         self.binSize = binSize
 196
 197         # Metrics cover a 24 hour period.  To that end, we're maintaining two
 198         # data structures: our "hot" metrics are currently being populated
 199         # while our "cold" metrics are finished, and valid for 24 hours.  After
 200         # that, our hot metrics turn into cold metrics, and we start over.
 201         self.hotMetrics = dict()
 202         self.coldMetrics = dict()
 203
 204     def rotate(self):
 205         """Rotate our metrics."""
 206
 207         self.coldMetrics = self.hotMetrics
 208         self.hotMetrics = dict()
 209
 210     def findAnomaly(self, request):
 211         anomaly = "none"
 212
 213         # TODO: Inspect email for traces of bots, Sherlock Homes-style!
 214         # See <https://bugs.torproject.org/9316#comment:19> for the rationale.
 215         # All classes that inherit from Metrics() should implement this method.
 216
 217         return anomaly
 218
 219     def getMetrics(self):
 220         """Get our sanitized current metrics, one per line.
 221
 222         Metrics are of the form:
 223
 224             [
 225              "moat.obfs4.us.success.none 10",
 226              "https.vanilla.de.success.none 30",
 227              ...
 228             ]
 229
 230         :rtype: list
 231         :returns: A list of metric lines.
 232         """
 233         lines = []
 234         for key, value in self.coldMetrics.items():
 235             # Round up our value to the nearest multiple of self.binSize to
 236             # reduce the accuracy of our real values.
 237             if (value % self.binSize) > 0:
 238                 value += self.binSize - (value % self.binSize)
 239             lines.append("%s %d" % (key, value))
 240         return lines
 241
 242     def set(self, key, value):
 243         """Set the given key to the given value.
 244
 245         :param str key: The time series key.
 246         :param int value: The time series value.
 247         """
 248         self.hotMetrics[key] = value
 249
 250     def inc(self, key):
 251         """Increment the given key.
 252
 253         :param str key: The time series key.
 254         """
 255         if key in self.hotMetrics:
 256             self.hotMetrics[key] += 1
 257         else:
 258             self.set(key, 1)
 259
 260     def createKey(self, distMechanism, bridgeType, countryOrProvider,
 261                   success, anomaly):
 262         """Create and return a time series key.
 263
 264         :param str distMechanism: A string representing our distribution
 265             mechanism, e.g., "https".
 266         :param str bridgeType: A string representing the requested bridge
 267             type, e.g., "vanilla" or "obfs4".
 268         :param str countryOrProvider: A string representing the client's
 269             two-letter country code or email provider, e.g., "it" or
 270             "yahoo.com".
 271         :param bool success: ``True`` if the request was successful and
 272             BridgeDB handed out a bridge; ``False`` otherwise.
 273         :param str anomaly: ``None`` if the request was not anomalous and hence
 274             believed to have come from a real user; otherwise a string
 275             representing the type of anomaly.
 276         :rtype: str
 277         :returns: A key that uniquely identifies the given metrics
 278             combinations.
 279         """
 280
 281         if isinstance(countryOrProvider, bytes):
 282             countryOrProvider = countryOrProvider.decode('utf-8')
 283
 284         countryOrProvider = countryOrProvider.lower()
 285         bridgeType = bridgeType.lower()
 286         success = "success" if success else "fail"
 287
 288         key = "%s.%s.%s.%s.%s" % (distMechanism, bridgeType,
 289                                   countryOrProvider, success, anomaly)
 290
 291         return key
 292
 293
 294 class HTTPSMetrics(Metrics):
 295
 296     def __init__(self):
 297         super(HTTPSMetrics, self).__init__()
 298
 299         # Maps subnets (e.g., "1.2.0.0/16") to the number of times we've seen
 300         # requests from the given subnet.
 301         self.subnetCounter = dict()
 302         self.keyPrefix = "https"
 303
 304     def getTopNSubnets(self, n=10):
 305
 306         sortedByNum = sorted(self.subnetCounter.items(),
 307                              key=operator.itemgetter(1),
 308                              reverse=True)
 309         return sortedByNum[:n]
 310
 311     def _recordHTTPSRequest(self, request, success):
 312
 313         logging.debug("HTTPS request has user agent: %s" %
 314                       request.requestHeaders.getRawHeaders("User-Agent"))
 315
 316         # Pull the client's IP address out of the request and convert it to a
 317         # two-letter country code.
 318         ipAddr = getClientIP(request,
 319                              useForwardedHeader=True,
 320                              skipLoopback=False)
 321         self.updateSubnetCounter(ipAddr)
 322         countryCode = resolveCountryCode(ipAddr)
 323
 324         transports = request.args.get("transport", list())
 325         if len(transports) > 1:
 326             logging.warning("Expected a maximum of one transport but %d are "
 327                             "given." % len(transports))
 328
 329         if len(transports) == 0:
 330             bridgeType = "vanilla"
 331         elif transports[0] == "" or transports[0] == "0":
 332             bridgeType = "vanilla"
 333         else:
 334             bridgeType = transports[0]
 335
 336         # BridgeDB's HTTPS interface exposes transport types as a drop down
 337         # menu but users can still request anything by manipulating HTTP
 338         # parameters.
 339         if not isBridgeTypeSupported(bridgeType):
 340             logging.warning("User requested unsupported transport type %s "
 341                             "over HTTPS." % bridgeType)
 342             return
 343
 344         logging.debug("Recording %svalid HTTPS request for %s from %s (%s)." %
 345                       ("" if success else "in",
 346                        bridgeType, ipAddr, countryCode))
 347
 348         # Now update our metrics.
 349         key = self.createKey(self.keyPrefix, bridgeType, countryCode,
 350                              success, self.findAnomaly(request))
 351         self.inc(key)
 352
 353     def recordValidHTTPSRequest(self, request):
 354         self._recordHTTPSRequest(request, True)
 355
 356     def recordInvalidHTTPSRequest(self, request):
 357         self._recordHTTPSRequest(request, False)
 358
 359     def updateSubnetCounter(self, ipAddr):
 360
 361         if ipAddr is None:
 362             return
 363
 364         nw = ipaddr.IPNetwork(ipAddr + "/" + str(SUBNET_CTR_PREFIX_LEN),
 365                               strict=False)
 366         subnet = nw.network.compressed
 367         logging.debug("Updating subnet counter with %s" % subnet)
 368
 369         num = self.subnetCounter.get(subnet, 0)
 370         self.subnetCounter[subnet] = num + 1
 371
 372
 373 class EmailMetrics(Metrics):
 374
 375     def __init__(self):
 376         super(EmailMetrics, self).__init__()
 377         self.keyPrefix = "email"
 378
 379     def _recordEmailRequest(self, smtpAutoresp, success):
 380
 381         emailAddrs = smtpAutoresp.getMailTo()
 382         if len(emailAddrs) == 0:
 383             # This is just for unit tests.
 384             emailAddr = Address("foo@gmail.com")
 385         else:
 386             emailAddr = emailAddrs[0]
 387
 388         # Get the requested transport protocol.
 389         br = request.determineBridgeRequestOptions( smtpAutoresp.incoming.lines)
 390         bridgeType = "vanilla" if not len(br.transports) else br.transports[0]
 391
 392         # Over email, transports are requested by typing them.  Typos happen
 393         # and users can request anything, really.
 394         if not isBridgeTypeSupported(bridgeType):
 395             logging.warning("User requested unsupported transport type %s "
 396                             "over email." % bridgeType)
 397             return
 398
 399         logging.debug("Recording %svalid email request for %s from %s." %
 400                       ("" if success else "in", bridgeType, emailAddr))
 401         sld = emailAddr.domain.split(b".")[0]
 402
 403         # Now update our metrics.
 404         key = self.createKey(self.keyPrefix, bridgeType, sld, success,
 405                              self.findAnomaly(request))
 406         self.inc(key)
 407
 408     def recordValidEmailRequest(self, smtpAutoresp):
 409         self._recordEmailRequest(smtpAutoresp, True)
 410
 411     def recordInvalidEmailRequest(self, smtpAutoresp):
 412         self._recordEmailRequest(smtpAutoresp, False)
 413
 414
 415 class MoatMetrics(Metrics):
 416
 417     def __init__(self):
 418         super(MoatMetrics, self).__init__()
 419         self.keyPrefix = "moat"
 420
 421     def _recordMoatRequest(self, request, success):
 422
 423         logging.debug("Moat request has user agent: %s" %
 424                       request.requestHeaders.getRawHeaders("User-Agent"))
 425
 426         ipAddr = getClientIP(request,
 427                              useForwardedHeader=True,
 428                              skipLoopback=False)
 429         countryCode = resolveCountryCode(ipAddr)
 430
 431         try:
 432             encodedClientData = request.content.read()
 433             clientData = json.loads(encodedClientData)["data"][0]
 434             transport = clientData["transport"]
 435             bridgeType = "vanilla" if not len(transport) else transport
 436         except Exception as err:
 437             logging.warning("Could not decode request: %s" % err)
 438             return
 439
 440         if not isBridgeTypeSupported(bridgeType):
 441             logging.warning("User requested unsupported transport type %s "
 442                             "over moat." % bridgeType)
 443             return
 444
 445         logging.debug("Recording %svalid moat request for %s from %s (%s)." %
 446                       ("" if success else "in",
 447                        bridgeType, ipAddr, countryCode))
 448
 449         # Now update our metrics.
 450         key = self.createKey(self.keyPrefix, bridgeType,
 451                              countryCode, success, self.findAnomaly(request))
 452         self.inc(key)
 453
 454     def recordValidMoatRequest(self, request):
 455         self._recordMoatRequest(request, True)
 456
 457     def recordInvalidMoatRequest(self, request):
 458         self._recordMoatRequest(request, False)