bridgedb/metrics.py

   1 # -*- coding: utf-8 ; test-case-name: bridgedb.test.test_metrics ; -*-
   2 # _____________________________________________________________________________
   3 #
   4 # This file is part of BridgeDB, a Tor bridge distribution system.
   5 #
   6 # :authors: please see included AUTHORS file
   7 # :copyright: (c) 2019, The Tor Project, Inc.
   8 #             (c) 2019, Philipp Winter
   9 # :license: see LICENSE for licensing information
  10 # _____________________________________________________________________________
  11
  12 """API for keeping track of BridgeDB metrics, e.g., the demand for bridges
  13 over time.
  14 """
  15
  16 import logging
  17 import ipaddr
  18 import operator
  19 import json
  20 import datetime
  21
  22 from bridgedb import geo
  23 from bridgedb.distributors.common.http import getClientIP
  24 from bridgedb.distributors.email import request
  25 from bridgedb.distributors.email.distributor import EmailRequestedHelp
  26
  27 from twisted.mail.smtp import Address
  28
  29 # Our data structure to keep track of exit relays.  The variable is of type
  30 # bridgedb.proxy.ProxySet.  We reserve a special country code (determined by
  31 # PROXY_CC below) for exit relays and other proxies.
  32 PROXIES = None
  33
  34 # Our custom country code for IP addresses that we couldn't map to a country.
  35 # This can happen for private IP addresses or if our geo-location provider has
  36 # no mapping.
  37 UNKNOWN_CC = "??"
  38
  39 # Our custom country code for IP addresses that are proxies, e.g., Tor exit
  40 # relays.  The code "zz" is free for assignment for user needs as specified
  41 # here: <https://en.wikipedia.org/w/index.php?title=ISO_3166-1_alpha-2&oldid=906611218#Decoding_table>
  42 PROXY_CC = "ZZ"
  43
  44 # We use BIN_SIZE to reduce the granularity of our counters.  We round up
  45 # numbers to the next multiple of BIN_SIZE, e.g., 28 is rounded up to:
  46 # 10 * 3 = 30.
  47 BIN_SIZE = 10
  48
  49 # The prefix length that we use to keep track of the number of unique subnets
  50 # we have seen HTTPS requests from.
  51 SUBNET_CTR_PREFIX_LEN = 20
  52
  53 # All of the pluggable transports BridgeDB currently supports.
  54 SUPPORTED_TRANSPORTS = None
  55
  56 # Version number for our metrics format.  We increment the version if our
  57 # format changes.
  58 METRICS_VERSION = 1
  59
  60
  61 def setProxies(proxies):
  62     """Set the given proxies.
  63
  64     :type proxies: :class:`~bridgedb.proxy.ProxySet`
  65     :param proxies: The container for the IP addresses of any currently
  66         known open proxies.
  67     """
  68     logging.debug("Setting %d proxies." % len(proxies))
  69     global PROXIES
  70     PROXIES = proxies
  71
  72
  73 def setSupportedTransports(supportedTransports):
  74     """Set the given supported transports.
  75
  76     :param dict supportedTransports: The transport types that BridgeDB
  77         currently supports.
  78     """
  79
  80     logging.debug("Setting %d supported transports." %
  81                   len(supportedTransports))
  82     global SUPPORTED_TRANSPORTS
  83     SUPPORTED_TRANSPORTS = supportedTransports
  84
  85
  86 def isBridgeTypeSupported(bridgeType):
  87     """Return `True' or `False' depending on if the given bridge type is
  88     supported.
  89
  90     :param str bridgeType: The bridge type, e.g., "vanilla" or "obfs4".
  91     """
  92
  93     if SUPPORTED_TRANSPORTS is None:
  94         logging.error("Bug: Variable SUPPORTED_TRANSPORTS is None.")
  95         return False
  96
  97     # Note that "vanilla" isn't a transport protocol (in fact, it's the absence
  98     # of a transport), which is why it isn't in SUPPORTED_TRANSPORTS.
  99     return (bridgeType in SUPPORTED_TRANSPORTS) or (bridgeType == "vanilla")
 100
 101
 102 def export(fh, measurementInterval):
 103     """Export metrics by writing them to the given file handle.
 104
 105     :param file fh: The file handle to which we're writing our metrics.
 106     :param int measurementInterval: The number of seconds after which we rotate
 107         and dump our metrics.
 108     """
 109
 110     httpsMetrix = HTTPSMetrics()
 111     emailMetrix = EmailMetrics()
 112     moatMetrix = MoatMetrics()
 113
 114     # Rotate our metrics.
 115     httpsMetrix.rotate()
 116     emailMetrix.rotate()
 117     moatMetrix.rotate()
 118
 119     numProxies = len(PROXIES) if PROXIES is not None else 0
 120     if numProxies == 0:
 121         logging.error("Metrics module doesn't have any proxies.")
 122     else:
 123         logging.debug("Metrics module knows about %d proxies." % numProxies)
 124
 125     now = datetime.datetime.utcnow()
 126     fh.write("bridgedb-metrics-end %s (%d s)\n" % (
 127              now.strftime("%Y-%m-%d %H:%M:%S"),
 128              measurementInterval))
 129     fh.write("bridgedb-metrics-version %d\n" % METRICS_VERSION)
 130
 131     httpsLines = httpsMetrix.getMetrics()
 132     for line in httpsLines:
 133         fh.write("bridgedb-metric-count %s\n" % line)
 134
 135     moatLines = moatMetrix.getMetrics()
 136     for line in moatLines:
 137         fh.write("bridgedb-metric-count %s\n" % line)
 138
 139     emailLines = emailMetrix.getMetrics()
 140     for line in emailLines:
 141         fh.write("bridgedb-metric-count %s\n" % line)
 142
 143
 144 def resolveCountryCode(ipAddr):
 145     """Return the country code of the given IP address.
 146
 147     :param str ipAddr: The IP address to resolve.
 148
 149     :rtype: str
 150     :returns: A two-letter country code.
 151     """
 152
 153     if ipAddr is None:
 154         logging.warning("Given IP address was None.  Using %s as country "
 155                         "code." % UNKNOWN_CC)
 156         return UNKNOWN_CC
 157
 158     if PROXIES is None:
 159         logging.warning("Proxies are not yet set.")
 160     elif ipAddr in PROXIES:
 161         return PROXY_CC
 162
 163     countryCode = geo.getCountryCode(ipaddr.IPAddress(ipAddr))
 164
 165     # countryCode may be None if GeoIP is unable to map an IP address to a
 166     # country.
 167     return UNKNOWN_CC if countryCode is None else countryCode
 168
 169
 170 class Singleton(type):
 171     _instances = {}
 172
 173     def __call__(cls, *args, **kwargs):
 174         if cls not in cls._instances:
 175             cls._instances[cls] = super(Singleton, cls).__call__(*args,
 176                                                                  **kwargs)
 177         return cls._instances[cls]
 178
 179     def clear(cls):
 180         """Drop the instance (necessary for unit tests)."""
 181         try:
 182             del cls._instances[cls]
 183         except KeyError:
 184             pass
 185
 186
 187 class Metrics(metaclass=Singleton):
 188     """Base class representing metrics.
 189
 190     This class provides functionality that our three distribution mechanisms
 191     share.
 192     """
 193
 194     def __init__(self, binSize=BIN_SIZE):
 195         logging.debug("Instantiating metrics class.")
 196         self.binSize = binSize
 197
 198         # Metrics cover a 24 hour period.  To that end, we're maintaining two
 199         # data structures: our "hot" metrics are currently being populated
 200         # while our "cold" metrics are finished, and valid for 24 hours.  After
 201         # that, our hot metrics turn into cold metrics, and we start over.
 202         self.hotMetrics = dict()
 203         self.coldMetrics = dict()
 204
 205     def rotate(self):
 206         """Rotate our metrics."""
 207
 208         self.coldMetrics = self.hotMetrics
 209         self.hotMetrics = dict()
 210
 211     def findAnomaly(self, request):
 212         anomaly = "none"
 213
 214         # TODO: Inspect email for traces of bots, Sherlock Homes-style!
 215         # See <https://bugs.torproject.org/9316#comment:19> for the rationale.
 216         # All classes that inherit from Metrics() should implement this method.
 217
 218         return anomaly
 219
 220     def getMetrics(self):
 221         """Get our sanitized current metrics, one per line.
 222
 223         Metrics are of the form:
 224
 225             [
 226              "moat.obfs4.us.success.none 10",
 227              "https.vanilla.de.success.none 30",
 228              ...
 229             ]
 230
 231         :rtype: list
 232         :returns: A list of metric lines.
 233         """
 234         lines = []
 235         for key, value in self.coldMetrics.items():
 236             # Round up our value to the nearest multiple of self.binSize to
 237             # reduce the accuracy of our real values.
 238             if (value % self.binSize) > 0:
 239                 value += self.binSize - (value % self.binSize)
 240             lines.append("%s %d" % (key, value))
 241         return lines
 242
 243     def set(self, key, value):
 244         """Set the given key to the given value.
 245
 246         :param str key: The time series key.
 247         :param int value: The time series value.
 248         """
 249         self.hotMetrics[key] = value
 250
 251     def inc(self, key):
 252         """Increment the given key.
 253
 254         :param str key: The time series key.
 255         """
 256         if key in self.hotMetrics:
 257             self.hotMetrics[key] += 1
 258         else:
 259             self.set(key, 1)
 260
 261     def createKey(self, distMechanism, bridgeType, countryOrProvider,
 262                   success, anomaly):
 263         """Create and return a time series key.
 264
 265         :param str distMechanism: A string representing our distribution
 266             mechanism, e.g., "https".
 267         :param str bridgeType: A string representing the requested bridge
 268             type, e.g., "vanilla" or "obfs4".
 269         :param str countryOrProvider: A string representing the client's
 270             two-letter country code or email provider, e.g., "it" or
 271             "yahoo.com".
 272         :param bool success: ``True`` if the request was successful and
 273             BridgeDB handed out a bridge; ``False`` otherwise.
 274         :param str anomaly: ``None`` if the request was not anomalous and hence
 275             believed to have come from a real user; otherwise a string
 276             representing the type of anomaly.
 277         :rtype: str
 278         :returns: A key that uniquely identifies the given metrics
 279             combinations.
 280         """
 281
 282         if isinstance(countryOrProvider, bytes):
 283             countryOrProvider = countryOrProvider.decode('utf-8')
 284
 285         countryOrProvider = countryOrProvider.lower()
 286         bridgeType = bridgeType.lower()
 287         success = "success" if success else "fail"
 288
 289         key = "%s.%s.%s.%s.%s" % (distMechanism, bridgeType,
 290                                   countryOrProvider, success, anomaly)
 291
 292         return key
 293
 294
 295 class HTTPSMetrics(Metrics):
 296
 297     def __init__(self):
 298         super(HTTPSMetrics, self).__init__()
 299
 300         # Maps subnets (e.g., "1.2.0.0/16") to the number of times we've seen
 301         # requests from the given subnet.
 302         self.subnetCounter = dict()
 303         self.keyPrefix = "https"
 304
 305     def getTopNSubnets(self, n=10):
 306
 307         sortedByNum = sorted(self.subnetCounter.items(),
 308                              key=operator.itemgetter(1),
 309                              reverse=True)
 310         return sortedByNum[:n]
 311
 312     def _recordHTTPSRequest(self, request, success):
 313
 314         logging.debug("HTTPS request has user agent: %s" %
 315                       request.requestHeaders.getRawHeaders("User-Agent"))
 316
 317         # Pull the client's IP address out of the request and convert it to a
 318         # two-letter country code.
 319         ipAddr = getClientIP(request,
 320                              useForwardedHeader=True,
 321                              skipLoopback=False)
 322         self.updateSubnetCounter(ipAddr)
 323         countryCode = resolveCountryCode(ipAddr)
 324
 325         transports = request.args.get("transport", list())
 326         if len(transports) > 1:
 327             logging.warning("Expected a maximum of one transport but %d are "
 328                             "given." % len(transports))
 329
 330         if len(transports) == 0:
 331             bridgeType = "vanilla"
 332         elif transports[0] == "" or transports[0] == "0":
 333             bridgeType = "vanilla"
 334         else:
 335             bridgeType = transports[0]
 336
 337         # BridgeDB's HTTPS interface exposes transport types as a drop down
 338         # menu but users can still request anything by manipulating HTTP
 339         # parameters.
 340         if not isBridgeTypeSupported(bridgeType):
 341             logging.warning("User requested unsupported transport type %s "
 342                             "over HTTPS." % bridgeType)
 343             return
 344
 345         logging.debug("Recording %svalid HTTPS request for %s from %s (%s)." %
 346                       ("" if success else "in",
 347                        bridgeType, ipAddr, countryCode))
 348
 349         # Now update our metrics.
 350         key = self.createKey(self.keyPrefix, bridgeType, countryCode,
 351                              success, self.findAnomaly(request))
 352         self.inc(key)
 353
 354     def recordValidHTTPSRequest(self, request):
 355         self._recordHTTPSRequest(request, True)
 356
 357     def recordInvalidHTTPSRequest(self, request):
 358         self._recordHTTPSRequest(request, False)
 359
 360     def updateSubnetCounter(self, ipAddr):
 361
 362         if ipAddr is None:
 363             return
 364
 365         nw = ipaddr.IPNetwork(ipAddr + "/" + str(SUBNET_CTR_PREFIX_LEN),
 366                               strict=False)
 367         subnet = nw.network.compressed
 368         logging.debug("Updating subnet counter with %s" % subnet)
 369
 370         num = self.subnetCounter.get(subnet, 0)
 371         self.subnetCounter[subnet] = num + 1
 372
 373
 374 class EmailMetrics(Metrics):
 375
 376     def __init__(self):
 377         super(EmailMetrics, self).__init__()
 378         self.keyPrefix = "email"
 379
 380     def _recordEmailRequest(self, smtpAutoresp, success):
 381
 382         emailAddrs = smtpAutoresp.getMailTo()
 383         if len(emailAddrs) == 0:
 384             # This is just for unit tests.
 385             emailAddr = Address("foo@gmail.com")
 386         else:
 387             emailAddr = emailAddrs[0]
 388
 389         # Get the requested transport protocol.
 390         try:
 391             br = request.determineBridgeRequestOptions(
 392                     smtpAutoresp.incoming.lines)
 393         except EmailRequestedHelp:
 394             return
 395         bridgeType = "vanilla" if not len(br.transports) else br.transports[0]
 396
 397         # Over email, transports are requested by typing them.  Typos happen
 398         # and users can request anything, really.
 399         if not isBridgeTypeSupported(bridgeType):
 400             logging.warning("User requested unsupported transport type %s "
 401                             "over email." % bridgeType)
 402             return
 403
 404         logging.debug("Recording %svalid email request for %s from %s." %
 405                       ("" if success else "in", bridgeType, emailAddr))
 406         sld = emailAddr.domain.split(b".")[0]
 407
 408         # Now update our metrics.
 409         key = self.createKey(self.keyPrefix, bridgeType, sld, success,
 410                              self.findAnomaly(request))
 411         self.inc(key)
 412
 413     def recordValidEmailRequest(self, smtpAutoresp):
 414         self._recordEmailRequest(smtpAutoresp, True)
 415
 416     def recordInvalidEmailRequest(self, smtpAutoresp):
 417         self._recordEmailRequest(smtpAutoresp, False)
 418
 419
 420 class MoatMetrics(Metrics):
 421
 422     def __init__(self):
 423         super(MoatMetrics, self).__init__()
 424         self.keyPrefix = "moat"
 425
 426     def _recordMoatRequest(self, request, success):
 427
 428         logging.debug("Moat request has user agent: %s" %
 429                       request.requestHeaders.getRawHeaders("User-Agent"))
 430
 431         ipAddr = getClientIP(request,
 432                              useForwardedHeader=True,
 433                              skipLoopback=False)
 434         countryCode = resolveCountryCode(ipAddr)
 435
 436         try:
 437             encodedClientData = request.content.read()
 438             clientData = json.loads(encodedClientData)["data"][0]
 439             transport = clientData["transport"]
 440             bridgeType = "vanilla" if not len(transport) else transport
 441         except Exception as err:
 442             logging.warning("Could not decode request: %s" % err)
 443             return
 444
 445         if not isBridgeTypeSupported(bridgeType):
 446             logging.warning("User requested unsupported transport type %s "
 447                             "over moat." % bridgeType)
 448             return
 449
 450         logging.debug("Recording %svalid moat request for %s from %s (%s)." %
 451                       ("" if success else "in",
 452                        bridgeType, ipAddr, countryCode))
 453
 454         # Now update our metrics.
 455         key = self.createKey(self.keyPrefix, bridgeType,
 456                              countryCode, success, self.findAnomaly(request))
 457         self.inc(key)
 458
 459     def recordValidMoatRequest(self, request):
 460         self._recordMoatRequest(request, True)
 461
 462     def recordInvalidMoatRequest(self, request):
 463         self._recordMoatRequest(request, False)