Make getCaptchaImage return (bytes, str).
[tor-bridgedb.git] / bridgedb / metrics.py
blobbb888d9acae07ffa390b849729a1d30ab9b73485
1 # -*- coding: utf-8 ; test-case-name: bridgedb.test.test_metrics ; -*-
2 # _____________________________________________________________________________
4 # This file is part of BridgeDB, a Tor bridge distribution system.
6 # :authors: please see included AUTHORS file
7 # :copyright: (c) 2019, The Tor Project, Inc.
8 # (c) 2019, Philipp Winter
9 # :license: see LICENSE for licensing information
10 # _____________________________________________________________________________
12 """API for keeping track of BridgeDB metrics, e.g., the demand for bridges
13 over time.
14 """
16 import logging
17 import ipaddr
18 import operator
19 import json
20 import datetime
22 from bridgedb import geo
23 from bridgedb.distributors.common.http import getClientIP
24 from bridgedb.distributors.email import request
25 from bridgedb.distributors.email.distributor import EmailRequestedHelp
27 from twisted.mail.smtp import Address
# Container holding the IP addresses of exit relays and other proxies.  Once
# set, it is of type bridgedb.proxy.ProxySet.  Addresses contained in it are
# reported under a dedicated country code (see PROXY_CC below).
PROXIES = None

# Country code that we report whenever an IP address could not be mapped to a
# country, e.g., because the address is private or because our geo-location
# provider has no mapping for it.
UNKNOWN_CC = "??"

# Country code that we report for proxies, e.g., Tor exit relays.  The code
# "zz" is free for assignment for user needs as specified here:
# <https://en.wikipedia.org/w/index.php?title=ISO_3166-1_alpha-2&oldid=906611218#Decoding_table>
PROXY_CC = "ZZ"

# BIN_SIZE reduces the granularity of our counters: numbers are rounded up to
# the next multiple of BIN_SIZE, e.g., 28 becomes 10 * 3 = 30.
BIN_SIZE = 10

# Prefix length used when counting the number of unique subnets from which we
# have seen HTTPS requests.
SUBNET_CTR_PREFIX_LEN = 20

# All pluggable transports that BridgeDB currently supports.
SUPPORTED_TRANSPORTS = None

# Version number of our metrics format; incremented whenever the format
# changes.
METRICS_VERSION = 1
def setProxies(proxies):
    """Store the given proxies in the module-level ``PROXIES`` variable.

    :type proxies: :class:`~bridgedb.proxy.ProxySet`
    :param proxies: The container for the IP addresses of any currently
        known open proxies.
    """
    global PROXIES

    numProxies = len(proxies)
    logging.debug("Setting %d proxies." % numProxies)
    PROXIES = proxies
def setSupportedTransports(supportedTransports):
    """Store the given transports in the module-level
    ``SUPPORTED_TRANSPORTS`` variable.

    :param dict supportedTransports: The transport types that BridgeDB
        currently supports.
    """
    global SUPPORTED_TRANSPORTS

    numTransports = len(supportedTransports)
    logging.debug("Setting %d supported transports." % numTransports)
    SUPPORTED_TRANSPORTS = supportedTransports
def isBridgeTypeSupported(bridgeType):
    """Tell whether BridgeDB supports the given bridge type.

    :param str bridgeType: The bridge type, e.g., "vanilla" or "obfs4".
    :rtype: bool
    :returns: ``True`` if the bridge type is "vanilla" or one of the
        supported transports; ``False`` otherwise, and also whenever the
        supported transports have not been set yet.
    """
    if SUPPORTED_TRANSPORTS is None:
        logging.error("Bug: Variable SUPPORTED_TRANSPORTS is None.")
        return False

    if bridgeType in SUPPORTED_TRANSPORTS:
        return True

    # "vanilla" is the absence of a transport protocol rather than a transport
    # itself, so it is never listed in SUPPORTED_TRANSPORTS.
    return bridgeType == "vanilla"
def export(fh, measurementInterval):
    """Export metrics by writing them to the given file handle.

    The HTTPS, email, and moat metrics are rotated first.  We then write a
    header consisting of the end of the measurement interval (in UTC), the
    interval's length and our metrics format version, followed by one
    "bridgedb-metric-count" line per metric.

    :param file fh: The file handle to which we're writing our metrics.
    :param int measurementInterval: The number of seconds after which we rotate
        and dump our metrics.
    """

    httpsMetrix = HTTPSMetrics()
    emailMetrix = EmailMetrics()
    moatMetrix = MoatMetrics()

    # Rotate our metrics.
    httpsMetrix.rotate()
    emailMetrix.rotate()
    moatMetrix.rotate()

    numProxies = len(PROXIES) if PROXIES is not None else 0
    if numProxies == 0:
        logging.error("Metrics module doesn't have any proxies.")
    else:
        logging.debug("Metrics module knows about %d proxies." % numProxies)

    # datetime.utcnow() is deprecated as of Python 3.12.  An aware UTC
    # timestamp yields the exact same string for the format used below.
    now = datetime.datetime.now(datetime.timezone.utc)
    fh.write("bridgedb-metrics-end %s (%d s)\n" % (
        now.strftime("%Y-%m-%d %H:%M:%S"),
        measurementInterval))
    fh.write("bridgedb-metrics-version %d\n" % METRICS_VERSION)

    # The order of the sections in the output is HTTPS, moat, email.
    for metrix in (httpsMetrix, moatMetrix, emailMetrix):
        for line in metrix.getMetrics():
            fh.write("bridgedb-metric-count %s\n" % line)
def resolveCountryCode(ipAddr):
    """Map the given IP address to a country code.

    :param str ipAddr: The IP address to resolve.

    :rtype: str
    :returns: ``UNKNOWN_CC`` if the address is ``None`` or cannot be mapped
        to a country, ``PROXY_CC`` if the address is a known proxy, and the
        country code determined by our geo-location lookup otherwise.
    """
    if ipAddr is None:
        logging.warning("Given IP address was None. Using %s as country "
                        "code." % UNKNOWN_CC)
        return UNKNOWN_CC

    if PROXIES is not None:
        if ipAddr in PROXIES:
            return PROXY_CC
    else:
        logging.warning("Proxies are not yet set.")

    resolved = geo.getCountryCode(ipaddr.IPAddress(ipAddr))

    # GeoIP hands us None if it is unable to map an IP address to a country.
    if resolved is None:
        return UNKNOWN_CC
    return resolved
class Singleton(type):
    """Metaclass that creates at most one instance per class.

    The first call of a class creates and caches its instance; every later
    call returns the cached instance and ignores the given arguments.
    """

    # Maps a class to its one and only instance.
    _instances = {}

    def __call__(cls, *args, **kwargs):
        """Return the cached instance of ``cls``, creating it if needed."""
        if cls in cls._instances:
            return cls._instances[cls]

        cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs)
        return cls._instances[cls]

    def clear(cls):
        """Drop the instance (necessary for unit tests)."""
        cls._instances.pop(cls, None)
class Metrics(metaclass=Singleton):
    """Common base class for our metrics.

    It bundles the functionality that our three distribution mechanisms
    share.
    """

    def __init__(self, binSize=BIN_SIZE):
        """Set up empty hot and cold metrics.

        :param int binSize: Reported values are rounded up to the next
            multiple of this number.
        """
        logging.debug("Instantiating metrics class.")
        self.binSize = binSize

        # Metrics cover a 24 hour period, which is why we keep two data
        # structures.  The "hot" metrics are the ones we are populating right
        # now; the "cold" metrics are finished, and valid for 24 hours.  On
        # rotation, the hot metrics become the cold ones and we start over.
        self.hotMetrics = {}
        self.coldMetrics = {}

    def rotate(self):
        """Turn our hot metrics into cold metrics and start over."""

        self.coldMetrics, self.hotMetrics = self.hotMetrics, {}

    def findAnomaly(self, request):
        """Return the type of anomaly of the given request.

        For now, no request is inspected and the result is always "none".

        :param request: The request to inspect (currently unused).
        :rtype: str
        :returns: The string "none".
        """
        # TODO: Inspect email for traces of bots, Sherlock Homes-style!
        # See <https://bugs.torproject.org/9316#comment:19> for the rationale.
        # All classes that inherit from Metrics() should implement this method.

        return "none"

    def getMetrics(self):
        """Get our sanitized cold metrics, one per line.

        Metrics are of the form:

            [
                "moat.obfs4.us.success.none 10",
                "https.vanilla.de.success.none 30",
                ...
            ]

        :rtype: list
        :returns: A list of metric lines.
        """
        binSize = self.binSize

        def roundUp(count):
            # Round up to the next multiple of binSize to reduce the accuracy
            # of our real values; multiples of binSize stay as they are.
            remainder = count % binSize
            if remainder > 0:
                return count + (binSize - remainder)
            return count

        return ["%s %d" % (key, roundUp(count))
                for key, count in self.coldMetrics.items()]

    def set(self, key, value):
        """Set the given key of our hot metrics to the given value.

        :param str key: The time series key.
        :param int value: The time series value.
        """
        self.hotMetrics[key] = value

    def inc(self, key):
        """Increment the given key of our hot metrics by one.

        A key that we haven't seen yet starts out at one.

        :param str key: The time series key.
        """
        if key not in self.hotMetrics:
            self.set(key, 1)
        else:
            self.hotMetrics[key] += 1

    def createKey(self, distMechanism, bridgeType, countryOrProvider,
                  success, anomaly):
        """Create and return a time series key.

        The country or provider as well as the bridge type are lower-cased.

        :param str distMechanism: A string representing our distribution
            mechanism, e.g., "https".
        :param str bridgeType: A string representing the requested bridge
            type, e.g., "vanilla" or "obfs4".
        :type countryOrProvider: str or bytes
        :param countryOrProvider: A string representing the client's
            two-letter country code or email provider, e.g., "it" or
            "yahoo.com".  Bytes are decoded as UTF-8.
        :param bool success: ``True`` if the request was successful and
            BridgeDB handed out a bridge; ``False`` otherwise.
        :param str anomaly: A string representing the type of anomaly, as
            returned by :meth:`findAnomaly`, e.g., "none" if the request is
            believed to have come from a real user.
        :rtype: str
        :returns: A key of the form
            "<mechanism>.<type>.<country or provider>.<success|fail>.<anomaly>".
        """
        where = countryOrProvider
        if isinstance(where, bytes):
            where = where.decode('utf-8')
        where = where.lower()

        what = bridgeType.lower()
        outcome = "success" if success else "fail"

        return "%s.%s.%s.%s.%s" % (distMechanism, what, where, outcome,
                                   anomaly)
class HTTPSMetrics(Metrics):
    """Metrics for bridge requests that we see over HTTPS."""

    def __init__(self):
        super(HTTPSMetrics, self).__init__()

        self.keyPrefix = "https"
        # Maps subnets (e.g., "1.2.0.0/16") to the number of times we've seen
        # requests from the given subnet.
        self.subnetCounter = dict()

    def getTopNSubnets(self, n=10):
        """Return the subnets that sent us the most requests.

        :param int n: The maximum number of subnets to return.
        :rtype: list
        :returns: Up to ``n`` tuples of the form (subnet, number of
            requests), sorted by the number of requests in descending order.
        """
        ranking = sorted(self.subnetCounter.items(),
                         key=lambda entry: entry[1],
                         reverse=True)
        return ranking[:n]

    def _recordHTTPSRequest(self, request, success):
        """Record the given HTTPS request in our metrics.

        Requests for unsupported transports are logged but not recorded.

        :param request: The HTTPS request; we use its ``requestHeaders`` and
            its ``args``.
        :param bool success: ``True`` if the request was valid; ``False``
            otherwise.
        """
        userAgent = request.requestHeaders.getRawHeaders("User-Agent")
        logging.debug("HTTPS request has user agent: %s" % userAgent)

        # Determine the client's IP address, count its subnet, and map the
        # address to a two-letter country code.
        ipAddr = getClientIP(request,
                             useForwardedHeader=True,
                             skipLoopback=False)
        self.updateSubnetCounter(ipAddr)
        countryCode = resolveCountryCode(ipAddr)

        transports = request.args.get("transport", list())
        numTransports = len(transports)
        if numTransports > 1:
            logging.warning("Expected a maximum of one transport but %d are "
                            "given." % numTransports)

        # No transport, an empty transport and "0" all mean vanilla bridges.
        if not transports or transports[0] in ("", "0"):
            bridgeType = "vanilla"
        else:
            bridgeType = transports[0]

        # The HTTPS interface offers transport types in a drop down menu, but
        # by manipulating HTTP parameters users can still ask for anything.
        if not isBridgeTypeSupported(bridgeType):
            logging.warning("User requested unsupported transport type %s "
                            "over HTTPS." % bridgeType)
            return

        validity = "" if success else "in"
        logging.debug("Recording %svalid HTTPS request for %s from %s (%s)." %
                      (validity, bridgeType, ipAddr, countryCode))

        # Finally, update our metrics.
        self.inc(self.createKey(self.keyPrefix, bridgeType, countryCode,
                                success, self.findAnomaly(request)))

    def recordValidHTTPSRequest(self, request):
        """Record the given HTTPS request as valid."""
        self._recordHTTPSRequest(request, True)

    def recordInvalidHTTPSRequest(self, request):
        """Record the given HTTPS request as invalid."""
        self._recordHTTPSRequest(request, False)

    def updateSubnetCounter(self, ipAddr):
        """Count a request for the subnet that the given IP address is in.

        The subnet is determined by ``SUBNET_CTR_PREFIX_LEN``.

        :param str ipAddr: The client's IP address; ``None`` is ignored.
        """
        if ipAddr is None:
            return

        cidr = "/".join((ipAddr, str(SUBNET_CTR_PREFIX_LEN)))
        subnet = ipaddr.IPNetwork(cidr, strict=False).network.compressed
        logging.debug("Updating subnet counter with %s" % subnet)

        self.subnetCounter[subnet] = self.subnetCounter.get(subnet, 0) + 1
class EmailMetrics(Metrics):
    """Metrics for bridge requests that we see over email."""

    def __init__(self):
        super(EmailMetrics, self).__init__()
        self.keyPrefix = "email"

    def _recordEmailRequest(self, smtpAutoresp, success):
        """Record the given email request in our metrics.

        Requests for help and requests for unsupported transports are not
        recorded.

        :param smtpAutoresp: The object that handles the email; we use its
            ``getMailTo()`` method and its ``incoming.lines``.
        :param bool success: ``True`` if the request was valid; ``False``
            otherwise.
        """
        emailAddrs = smtpAutoresp.getMailTo()
        if len(emailAddrs) == 0:
            # This is just for unit tests.
            emailAddr = Address("foo@gmail.com")
        else:
            emailAddr = emailAddrs[0]

        # Get the requested transport protocol.
        try:
            br = request.determineBridgeRequestOptions(
                smtpAutoresp.incoming.lines)
        except EmailRequestedHelp:
            return
        bridgeType = "vanilla" if not len(br.transports) else br.transports[0]

        # Over email, transports are requested by typing them.  Typos happen
        # and users can request anything, really.
        if not isBridgeTypeSupported(bridgeType):
            logging.warning("User requested unsupported transport type %s "
                            "over email." % bridgeType)
            return

        logging.debug("Recording %svalid email request for %s from %s." %
                      ("" if success else "in", bridgeType, emailAddr))
        # NOTE(review): this is the first label of the domain (e.g., b"mail"
        # for b"mail.example.com"), which is not necessarily the second-level
        # domain that the variable name suggests -- confirm this is intended.
        sld = emailAddr.domain.split(b".")[0]

        # Now update our metrics.  In this module, the name `request' refers
        # to the imported bridgedb.distributors.email.request module, so we
        # hand the autoresponder itself to findAnomaly() instead.
        key = self.createKey(self.keyPrefix, bridgeType, sld, success,
                             self.findAnomaly(smtpAutoresp))
        self.inc(key)

    def recordValidEmailRequest(self, smtpAutoresp):
        """Record the given email request as valid."""
        self._recordEmailRequest(smtpAutoresp, True)

    def recordInvalidEmailRequest(self, smtpAutoresp):
        """Record the given email request as invalid."""
        self._recordEmailRequest(smtpAutoresp, False)
class MoatMetrics(Metrics):
    """Metrics for bridge requests that we see over moat."""

    def __init__(self):
        super(MoatMetrics, self).__init__()
        self.keyPrefix = "moat"

    def _recordMoatRequest(self, request, success):
        """Record the given moat request in our metrics.

        Requests that we cannot decode and requests for unsupported
        transports are logged but not recorded.

        :param request: The moat request; we use its ``requestHeaders`` and
            read the JSON-encoded client data from its ``content``.
        :param bool success: ``True`` if the request was valid; ``False``
            otherwise.
        """
        logging.debug("Moat request has user agent: %s" %
                      request.requestHeaders.getRawHeaders("User-Agent"))

        ipAddr = getClientIP(request,
                             useForwardedHeader=True,
                             skipLoopback=False)
        countryCode = resolveCountryCode(ipAddr)

        # The client data is untrusted, so decoding is deliberately
        # best-effort: whatever goes wrong, we log it and skip the request.
        # NOTE(review): read() advances the content stream; callers that need
        # the body afterwards presumably have to rewind it -- confirm.
        try:
            encodedClientData = request.content.read()
            clientData = json.loads(encodedClientData)["data"][0]
            transport = clientData["transport"]
            if not len(transport):
                bridgeType = "vanilla"
            elif isinstance(transport, str):
                bridgeType = transport
            else:
                # A non-empty JSON array or object is unhashable and would
                # make the membership test in isBridgeTypeSupported() raise a
                # TypeError if SUPPORTED_TRANSPORTS is a dict.
                raise TypeError("transport is of type %s instead of str" %
                                type(transport).__name__)
        except Exception as err:
            logging.warning("Could not decode request: %s" % err)
            return

        if not isBridgeTypeSupported(bridgeType):
            logging.warning("User requested unsupported transport type %s "
                            "over moat." % bridgeType)
            return

        logging.debug("Recording %svalid moat request for %s from %s (%s)." %
                      ("" if success else "in",
                       bridgeType, ipAddr, countryCode))

        # Now update our metrics.
        key = self.createKey(self.keyPrefix, bridgeType,
                             countryCode, success, self.findAnomaly(request))
        self.inc(key)

    def recordValidMoatRequest(self, request):
        """Record the given moat request as valid."""
        self._recordMoatRequest(request, True)

    def recordInvalidMoatRequest(self, request):
        """Record the given moat request as invalid."""
        self._recordMoatRequest(request, False)