Bump version number to 0.10.1.
[tor-bridgedb.git] / bridgedb / metrics.py
blob48713f03f83d5c45cb8b2ea7d897ee4d3fb80949
1 # -*- coding: utf-8 ; test-case-name: bridgedb.test.test_metrics ; -*-
2 # _____________________________________________________________________________
4 # This file is part of BridgeDB, a Tor bridge distribution system.
6 # :authors: please see included AUTHORS file
7 # :copyright: (c) 2019, The Tor Project, Inc.
8 # (c) 2019, Philipp Winter
9 # :license: see LICENSE for licensing information
10 # _____________________________________________________________________________
12 """API for keeping track of BridgeDB metrics, e.g., the demand for bridges
13 over time.
14 """
16 import logging
17 import ipaddr
18 import operator
19 import json
20 import datetime
22 from bridgedb import geo
23 from bridgedb.distributors.common.http import getClientIP
24 from bridgedb.distributors.email import request
26 from twisted.mail.smtp import Address
# Container holding the IP addresses of exit relays and other proxies. It is
# of type bridgedb.proxy.ProxySet and remains None until setProxies() has
# been called. Requests coming from these addresses are filed under the
# special country code PROXY_CC defined below.
PROXIES = None

# Placeholder country code for IP addresses that we could not map to a
# country, e.g., private IP addresses or addresses that our geo-location
# provider knows nothing about.
UNKNOWN_CC = "??"

# Placeholder country code for IP addresses that belong to proxies, e.g., Tor
# exit relays.  ISO 3166-1 alpha-2 leaves the code "zz" free for user
# assignment, see:
# <https://en.wikipedia.org/w/index.php?title=ISO_3166-1_alpha-2&oldid=906611218#Decoding_table>
PROXY_CC = "ZZ"

# Counters are never exported verbatim.  To reduce their granularity, every
# value is rounded up to the next multiple of BIN_SIZE, e.g., 28 becomes
# 10 * 3 = 30.
BIN_SIZE = 10

# Prefix length of the subnets that we use when counting how many HTTPS
# requests we have seen per subnet.
SUBNET_CTR_PREFIX_LEN = 20

# The pluggable transports that BridgeDB currently supports.  Remains None
# until setSupportedTransports() has been called.
SUPPORTED_TRANSPORTS = None

# Version number of our metrics format.  It is incremented whenever the
# format changes.
METRICS_VERSION = 1
def setProxies(proxies):
    """Store the given proxies in the module-level ``PROXIES`` variable.

    :type proxies: :class:`~bridgedb.proxy.ProxySet`
    :param proxies: The container for the IP addresses of any currently
        known open proxies.
    """
    global PROXIES

    numProxies = len(proxies)
    logging.debug("Setting %d proxies." % numProxies)
    PROXIES = proxies
def setSupportedTransports(supportedTransports):
    """Store the given transports in the module-level
    ``SUPPORTED_TRANSPORTS`` variable.

    :param dict supportedTransports: The transport types that BridgeDB
        currently supports.
    """
    global SUPPORTED_TRANSPORTS

    numTransports = len(supportedTransports)
    logging.debug("Setting %d supported transports." % numTransports)
    SUPPORTED_TRANSPORTS = supportedTransports
def isBridgeTypeSupported(bridgeType):
    """Tell whether BridgeDB supports the given bridge type.

    :param str bridgeType: The bridge type, e.g., "vanilla" or "obfs4".
    :rtype: bool
    :returns: ``True`` if the bridge type is "vanilla" or one of the
        transports in ``SUPPORTED_TRANSPORTS``; ``False`` otherwise, which
        includes the case that ``SUPPORTED_TRANSPORTS`` was never set.
    """
    if SUPPORTED_TRANSPORTS is not None:
        # "vanilla" is the absence of a transport protocol rather than a
        # transport itself, so it is not listed in SUPPORTED_TRANSPORTS and
        # needs to be checked separately.
        isTransport = bridgeType in SUPPORTED_TRANSPORTS
        return isTransport or (bridgeType == "vanilla")

    logging.error("Bug: Variable SUPPORTED_TRANSPORTS is None.")
    return False
def export(fh, measurementInterval):
    """Export metrics by writing them to the given file handle.

    The metrics of all three distribution mechanisms are rotated first, so
    the lines written are the counters that were collected up to this call.

    :param file fh: The file handle to which we're writing our metrics.
    :param int measurementInterval: The number of seconds after which we rotate
        and dump our metrics.
    """
    # The metrics classes use the Singleton metaclass, so these calls hand us
    # the shared, already populated instances.
    httpsMetrix = HTTPSMetrics()
    emailMetrix = EmailMetrics()
    moatMetrix = MoatMetrics()

    # Rotate our metrics.
    for metrix in (httpsMetrix, emailMetrix, moatMetrix):
        metrix.rotate()

    numProxies = len(PROXIES) if PROXIES is not None else 0
    if numProxies == 0:
        logging.error("Metrics module doesn't have any proxies.")
    else:
        logging.debug("Metrics module knows about %d proxies." % numProxies)

    # datetime.utcnow() is deprecated as of Python 3.12; an aware UTC
    # timestamp renders identically with the format string below.
    now = datetime.datetime.now(datetime.timezone.utc)
    fh.write("bridgedb-metrics-end %s (%d s)\n" % (
        now.strftime("%Y-%m-%d %H:%M:%S"),
        measurementInterval))
    fh.write("bridgedb-metrics-version %d\n" % METRICS_VERSION)

    # The order (HTTPS, moat, email) determines the order of the lines in the
    # exported file.
    for metrix in (httpsMetrix, moatMetrix, emailMetrix):
        for line in metrix.getMetrics():
            fh.write("bridgedb-metric-count %s\n" % line)
def resolveCountryCode(ipAddr):
    """Map the given IP address to a country code.

    :param str ipAddr: The IP address to resolve.

    :rtype: str
    :returns: ``UNKNOWN_CC`` if the address is ``None`` or cannot be mapped
        to a country, ``PROXY_CC`` if the address is in ``PROXIES``, and
        otherwise whatever :func:`geo.getCountryCode` returns for it.
    """
    if ipAddr is None:
        logging.warning("Given IP address was None. Using %s as country "
                        "code." % UNKNOWN_CC)
        return UNKNOWN_CC

    if PROXIES is not None:
        if ipAddr in PROXIES:
            return PROXY_CC
    else:
        # Without a proxy list we cannot single out proxies; we still try to
        # geo-locate the address below.
        logging.warning("Proxies are not yet set.")

    countryCode = geo.getCountryCode(ipaddr.IPAddress(ipAddr))

    # The lookup yields None if GeoIP is unable to map the IP address to a
    # country.
    if countryCode is None:
        return UNKNOWN_CC
    return countryCode
class Singleton(type):
    """Metaclass that hands out at most one instance per class.

    The instances are kept in the ``_instances`` dictionary, keyed by class.
    """

    _instances = {}

    def __call__(cls, *args, **kwargs):
        """Return the instance of ``cls``, creating it on first use.

        The arguments are only used when the instance is created; on later
        calls they are ignored and the stored instance is returned.
        """
        if cls in cls._instances:
            return cls._instances[cls]

        cls._instances[cls] = super(Singleton, cls).__call__(*args,
                                                             **kwargs)
        return cls._instances[cls]

    def clear(cls):
        """Drop the instance (necessary for unit tests)."""
        # Nothing to do if the class was never instantiated.
        cls._instances.pop(cls, None)
class Metrics(metaclass=Singleton):
    """Base class representing metrics.

    It bundles the bookkeeping that our three distribution mechanisms have
    in common: incrementing counters, rotating them, and returning them in
    binned form.
    """

    def __init__(self, binSize=BIN_SIZE):
        """Create empty hot and cold metrics.

        :param int binSize: Exported counters are rounded up to the next
            multiple of this number.
        """
        logging.debug("Instantiating metrics class.")
        self.binSize = binSize

        # Metrics cover a 24 hour period.  We keep two dictionaries for that:
        # the "hot" metrics are the ones being populated right now, and the
        # "cold" metrics are finished and valid for 24 hours.  On rotation,
        # the hot metrics become the cold ones and we start over with empty
        # hot metrics.
        self.hotMetrics = {}
        self.coldMetrics = {}

    def rotate(self):
        """Rotate our metrics: hot metrics turn cold, and we start over."""
        self.coldMetrics, self.hotMetrics = self.hotMetrics, dict()

    def findAnomaly(self, request):
        """Return the type of anomaly of the given request.

        This base implementation does not inspect the request and always
        returns the string "none".
        """
        # TODO: Inspect email for traces of bots, Sherlock Homes-style!
        # See <https://bugs.torproject.org/9316#comment:19> for the rationale.
        # All classes that inherit from Metrics() should implement this method.
        return "none"

    def getMetrics(self):
        """Get our sanitized cold metrics, one per line.

        Metrics are of the form:

        ::
            [
                "moat.obfs4.us.success.none 10",
                "https.vanilla.de.success.none 30",
                ...
            ]

        :rtype: list
        :returns: A list of metric lines.
        """
        lines = []
        for key, count in self.coldMetrics.items():
            # Round the counter up to the next multiple of self.binSize to
            # reduce the accuracy of our real values.
            remainder = count % self.binSize
            if remainder > 0:
                count = count + (self.binSize - remainder)
            lines.append("%s %d" % (key, count))
        return lines

    def set(self, key, value):
        """Set the given key to the given value in our hot metrics.

        :param str key: The time series key.
        :param int value: The time series value.
        """
        self.hotMetrics[key] = value

    def inc(self, key):
        """Increment the given key; a key we haven't seen yet starts at 1.

        :param str key: The time series key.
        """
        if key not in self.hotMetrics:
            self.set(key, 1)
            return
        self.hotMetrics[key] += 1

    def createKey(self, distMechanism, bridgeType, countryOrProvider,
                  success, anomaly):
        """Create and return a time series key.

        :param str distMechanism: A string representing our distribution
            mechanism, e.g., "https".
        :param str bridgeType: A string representing the requested bridge
            type, e.g., "vanilla" or "obfs4".  It is lowercased.
        :param countryOrProvider: A string (or UTF-8 encoded bytes)
            representing the client's two-letter country code or email
            provider, e.g., "it" or "yahoo.com".  It is lowercased.
        :param bool success: ``True`` if the request was successful and
            BridgeDB handed out a bridge; ``False`` otherwise.
        :param str anomaly: A string representing the type of anomaly, as
            returned by :meth:`findAnomaly`, e.g., "none".
        :rtype: str
        :returns: A key of the form
            "<mechanism>.<type>.<country or provider>.<success|fail>.<anomaly>".
        """
        origin = countryOrProvider
        if isinstance(origin, bytes):
            origin = origin.decode('utf-8')
        origin = origin.lower()

        bridgeType = bridgeType.lower()

        if success:
            outcome = "success"
        else:
            outcome = "fail"

        return "%s.%s.%s.%s.%s" % (distMechanism, bridgeType, origin,
                                   outcome, anomaly)
class HTTPSMetrics(Metrics):
    """Metrics for bridge requests that arrive over HTTPS."""

    def __init__(self):
        super(HTTPSMetrics, self).__init__()

        # Maps the network address of a subnet to the number of requests
        # that we've seen from that subnet.  Subnets have a prefix length of
        # SUBNET_CTR_PREFIX_LEN.
        self.subnetCounter = {}
        self.keyPrefix = "https"

    def getTopNSubnets(self, n=10):
        """Return the subnets that sent the most requests.

        :param int n: The maximum number of subnets to return.
        :rtype: list
        :returns: Up to ``n`` tuples of the form (subnet, number of
            requests), sorted by the number of requests in descending order.
        """
        byCount = operator.itemgetter(1)
        ranking = sorted(self.subnetCounter.items(), key=byCount,
                         reverse=True)
        return ranking[:n]

    def _recordHTTPSRequest(self, request, success):
        """Count the given HTTPS request.

        Requests for unsupported bridge types are logged but not counted.

        :param request: The incoming request.  We read its "User-Agent"
            header, its client IP address, and its "transport" argument.
        :param bool success: ``True`` if the request was valid; ``False``
            otherwise.
        """
        userAgent = request.requestHeaders.getRawHeaders("User-Agent")
        logging.debug("HTTPS request has user agent: %s" % userAgent)

        # Pull the client's IP address out of the request and convert it to a
        # two-letter country code.
        ipAddr = getClientIP(request,
                             useForwardedHeader=True,
                             skipLoopback=False)
        self.updateSubnetCounter(ipAddr)
        countryCode = resolveCountryCode(ipAddr)

        transports = request.args.get("transport", list())
        numTransports = len(transports)
        if numTransports > 1:
            logging.warning("Expected a maximum of one transport but %d are "
                            "given." % numTransports)

        # No transport, an empty transport, and "0" all stand for vanilla
        # bridges.
        bridgeType = "vanilla"
        if numTransports != 0:
            first = transports[0]
            if not (first == "" or first == "0"):
                bridgeType = first

        # BridgeDB's HTTPS interface exposes transport types as a drop down
        # menu but users can still request anything by manipulating HTTP
        # parameters.
        if not isBridgeTypeSupported(bridgeType):
            logging.warning("User requested unsupported transport type %s "
                            "over HTTPS." % bridgeType)
            return

        validity = "" if success else "in"
        logging.debug("Recording %svalid HTTPS request for %s from %s (%s)." %
                      (validity, bridgeType, ipAddr, countryCode))

        # Now update our metrics.
        anomaly = self.findAnomaly(request)
        self.inc(self.createKey(self.keyPrefix, bridgeType, countryCode,
                                success, anomaly))

    def recordValidHTTPSRequest(self, request):
        """Count the given request as a valid HTTPS request."""
        self._recordHTTPSRequest(request, True)

    def recordInvalidHTTPSRequest(self, request):
        """Count the given request as an invalid HTTPS request."""
        self._recordHTTPSRequest(request, False)

    def updateSubnetCounter(self, ipAddr):
        """Increment the counter of the subnet that the given address is in.

        :param str ipAddr: The client's IP address; ``None`` is ignored.
        """
        if ipAddr is None:
            return

        # strict=False is passed so that an address with host bits set is
        # accepted -- NOTE(review): confirm against the ipaddr documentation.
        cidr = "/".join((ipAddr, str(SUBNET_CTR_PREFIX_LEN)))
        subnet = ipaddr.IPNetwork(cidr, strict=False).network.compressed
        logging.debug("Updating subnet counter with %s" % subnet)

        self.subnetCounter[subnet] = self.subnetCounter.get(subnet, 0) + 1
class EmailMetrics(Metrics):
    """Metrics for bridge requests that arrive over email."""

    def __init__(self):
        super(EmailMetrics, self).__init__()
        self.keyPrefix = "email"

    def _recordEmailRequest(self, smtpAutoresp, success):
        """Count the given email request.

        Requests for unsupported bridge types are logged but not counted.

        :param smtpAutoresp: The autoresponder handling the incoming email.
            We read its recipient list via ``getMailTo()`` and the lines of
            the email via ``incoming.lines``.
        :param bool success: ``True`` if the request was valid; ``False``
            otherwise.
        """
        emailAddrs = smtpAutoresp.getMailTo()
        if len(emailAddrs) == 0:
            # This is just for unit tests.
            emailAddr = Address("foo@gmail.com")
        else:
            emailAddr = emailAddrs[0]

        # Get the requested transport protocol.
        br = request.determineBridgeRequestOptions(
            smtpAutoresp.incoming.lines)
        bridgeType = "vanilla" if not len(br.transports) else br.transports[0]

        # Over email, transports are requested by typing them.  Typos happen
        # and users can request anything, really.
        if not isBridgeTypeSupported(bridgeType):
            logging.warning("User requested unsupported transport type %s "
                            "over email." % bridgeType)
            return

        logging.debug("Recording %svalid email request for %s from %s." %
                      ("" if success else "in", bridgeType, emailAddr))

        # Only the first dot-separated label of the domain is used, e.g.,
        # "gmail" for "gmail.com".  The label is a bytes object, which
        # createKey() decodes.
        sld = emailAddr.domain.split(b".")[0]

        # Now update our metrics.  We hand the autoresponder to findAnomaly();
        # the name `request' refers to the imported
        # bridgedb.distributors.email.request module in this scope and not to
        # the email that we are processing.
        key = self.createKey(self.keyPrefix, bridgeType, sld, success,
                             self.findAnomaly(smtpAutoresp))
        self.inc(key)

    def recordValidEmailRequest(self, smtpAutoresp):
        """Count the given request as a valid email request."""
        self._recordEmailRequest(smtpAutoresp, True)

    def recordInvalidEmailRequest(self, smtpAutoresp):
        """Count the given request as an invalid email request."""
        self._recordEmailRequest(smtpAutoresp, False)
class MoatMetrics(Metrics):
    """Metrics for bridge requests that arrive over moat."""

    def __init__(self):
        super(MoatMetrics, self).__init__()
        self.keyPrefix = "moat"

    def _recordMoatRequest(self, request, success):
        """Count the given moat request.

        Requests that cannot be decoded and requests for unsupported bridge
        types are logged but not counted.

        :param request: The incoming request.  We read its "User-Agent"
            header, its client IP address, and its JSON-encoded content.
        :param bool success: ``True`` if the request was valid; ``False``
            otherwise.
        """
        userAgent = request.requestHeaders.getRawHeaders("User-Agent")
        logging.debug("Moat request has user agent: %s" % userAgent)

        ipAddr = getClientIP(request,
                             useForwardedHeader=True,
                             skipLoopback=False)
        countryCode = resolveCountryCode(ipAddr)

        # The requested transport is at ["data"][0]["transport"] of the JSON
        # document in the request's body.  An empty transport stands for
        # vanilla bridges.  Any failure to get at the transport means that we
        # don't count the request.
        try:
            clientData = json.loads(request.content.read())
            transport = clientData["data"][0]["transport"]
            bridgeType = transport if len(transport) else "vanilla"
        except Exception as err:
            logging.warning("Could not decode request: %s" % err)
            return

        if not isBridgeTypeSupported(bridgeType):
            logging.warning("User requested unsupported transport type %s "
                            "over moat." % bridgeType)
            return

        validity = "" if success else "in"
        logging.debug("Recording %svalid moat request for %s from %s (%s)." %
                      (validity, bridgeType, ipAddr, countryCode))

        # Now update our metrics.
        anomaly = self.findAnomaly(request)
        self.inc(self.createKey(self.keyPrefix, bridgeType,
                                countryCode, success, anomaly))

    def recordValidMoatRequest(self, request):
        """Count the given request as a valid moat request."""
        self._recordMoatRequest(request, True)

    def recordInvalidMoatRequest(self, request):
        """Count the given request as an invalid moat request."""
        self._recordMoatRequest(request, False)