Add a bash script to make graphs (#2394).
[tor-metrics-tasks/delber.git] / task-2718 / detector.py
blob716bbdf99dbea47e0703ef2202ecabac6d885755
1 ## Copyright (c) 2011 George Danezis <gdane@microsoft.com>
2 ##
3 ## All rights reserved.
4 ##
5 ## Redistribution and use in source and binary forms, with or without
6 ## modification, are permitted (subject to the limitations in the
7 ## disclaimer below) provided that the following conditions are met:
8 ##
9 ## * Redistributions of source code must retain the above copyright
10 ## notice, this list of conditions and the following disclaimer.
12 ## * Redistributions in binary form must reproduce the above copyright
13 ## notice, this list of conditions and the following disclaimer in the
14 ## documentation and/or other materials provided with the
15 ## distribution.
17 ## * Neither the name of <Owner Organization> nor the names of its
18 ## contributors may be used to endorse or promote products derived
19 ## from this software without specific prior written permission.
21 ## NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
22 ## GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT
23 ## HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
24 ## WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
25 ## MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
26 ## DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
27 ## LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
28 ## CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
29 ## SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
30 ## BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
31 ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
32 ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
33 ## IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 ## (Clear BSD license: http://labs.metacarta.com/license-explanation.html#license)
37 ## This script reads a .csv file of the number of Tor users and finds
38 ## anomalies that might be indicative of censorship.
40 # Dep: matplotlib
41 from pylab import *
42 import matplotlib
44 # Dep: numpy
45 import numpy
47 # Dep: scipy
48 import scipy.stats
49 from scipy.stats.distributions import norm
50 from scipy.stats.distributions import poisson
52 # Std lib
53 from datetime import date
54 from datetime import timedelta
55 import os.path
# Abbreviated weekday names, Monday first.
days = "Mon Tue Wed Thu Fri Sat Sun".split()
59 # read the .csv file
class torstatstore:
    """In-memory store of per-country daily user counts read from a .csv file.

    The first line of the file is a comma-separated header.  The column
    named "date" holds the day (parsed from the YYYY, MM and DD character
    positions of a "YYYY-MM-DD"-shaped value); every other column holds an
    integer or the literal "NA" for a missing value.

    Attributes set by the constructor:
      store          dict mapping (column name, row index) to the parsed
                     value: a datetime.date, an int, or None for "NA".
                     Row indices are consecutive over the usable rows.
      all_dates      every calendar day from date_min to date_max inclusive
      country_codes  list of column names taken from the header line
      MAX_INDEX      number of usable data rows stored
      date_min       earliest date found in the file
      date_max       latest date found in the file
    """

    def __init__(self, file_name):
        """Read and parse file_name.

        Blank lines are skipped silently; lines whose date cannot be parsed
        are reported on stdout and skipped.  Raises ValueError when the
        header has no "date" column or when no usable data row is found.
        """
        store = {}
        dates = []
        # open() inside "with" replaces the Python-2-only file() builtin and
        # guarantees the handle is closed even if parsing raises.
        with open(file_name) as f:
            country_codes = f.readline().strip().split(",")
            if "date" not in country_codes:
                raise ValueError("%s has no 'date' column" % file_name)
            for i, line in enumerate(f):
                line = line.strip()
                if not line:
                    continue
                row = self._parse_row(country_codes, line.split(","), i)
                if row is None:
                    continue
                # Index by the number of rows kept so far, so a skipped
                # line never leaves a hole in the store.
                index = len(dates)
                for ccode, value in row.items():
                    store[(ccode, index)] = value
                dates.append(row["date"])

        if not dates:
            raise ValueError("%s contains no usable data rows" % file_name)

        # min and max
        date_min = min(dates)
        date_max = max(dates)
        span = (date_max - date_min).days
        all_dates = [date_min + timedelta(days=n) for n in range(span + 1)]

        # Save for later
        self.store = store
        self.all_dates = all_dates
        self.country_codes = country_codes
        self.MAX_INDEX = len(dates)
        self.date_min = date_min
        self.date_max = date_max

    def _parse_row(self, country_codes, fields, line_index):
        """Convert one split data line into a {column: value} dict.

        Columns missing from a short line are stored as None.  Returns None
        (after printing a diagnostic that names the zero-based data line)
        when the date is missing or malformed.
        """
        row = dict.fromkeys(country_codes)
        for ccode, val in zip(country_codes, fields):
            if ccode == "date":
                try:
                    year, month, day = int(val[:4]), int(val[5:7]), int(val[8:10])
                    row[ccode] = date(year, month, day)
                except ValueError as e:
                    print("Parsing error (ignoring line %s):" % line_index)
                    print("%s %s" % (val, e))
                    return None
            elif val != "NA":
                row[ccode] = int(val)
        if row["date"] is None:
            print("Parsing error (ignoring line %s):" % line_index)
            print("missing date")
            return None
        return row

    def get_country_series(self, ccode):
        """Return the values of column ccode, one entry per day in all_dates.

        Days that have no row in the file, and "NA" cells, yield None.
        """
        assert ccode in self.country_codes
        by_date = dict((self.store[("date", i)], self.store[(ccode, i)])
                       for i in range(self.MAX_INDEX))
        return [by_date.get(d) for d in self.all_dates]

    def get_largest(self, number):
        """Return up to `number` (value, column) pairs, largest value first.

        Values come from the last stored row; the "all", "??" and "date"
        columns are excluded.
        """
        exclude = set(["all", "??", "date"])
        last = self.MAX_INDEX - 1
        ranked = [(self.store[(c, last)], c)
                  for c in self.country_codes if c not in exclude]
        # Missing values (None) rank below every number.  The explicit key
        # keeps that ordering on Python 3, where None and int do not compare.
        ranked.sort(key=lambda item: (item[0] is not None, item[0] or 0, item[1]),
                    reverse=True)
        return ranked[:number]

    def get_largest_locations(self, number):
        """Return {column: series} for the `number` largest columns."""
        return dict((ccode, self.get_country_series(ccode))
                    for _, ccode in self.get_largest(number))
131 # Computes the difference between today and a number of days in the past
def n_day_rel(series, days):
    """Return, for every position, the ratio of the value to the one `days` earlier.

    The result has the same length as `series`.  An entry is None when the
    current value is None, when there is no entry `days` positions back, or
    when that earlier entry is None or zero.
    """
    ratios = []
    for idx, current in enumerate(series):
        if current is None or idx - days < 0:
            ratios.append(None)
            continue
        earlier = series[idx - days]
        if earlier is None or earlier == 0:
            ratios.append(None)
        else:
            # float() forces true division for integer counts.
            ratios.append(float(current) / earlier)
    return ratios
145 # Main model: computes the expected min / max range of number of users
def make_tendencies_minmax(l, INTERVAL = 1):
    """Estimate, per day, the expected range of the INTERVAL-day change ratio.

    `l` maps a country code to its series of user counts; all series are
    expected to have the same length.  For each position the ratios
    "value / value INTERVAL days earlier" are collected across all
    countries.  Ratios further than four inter-quartile distances from the
    median are discarded, a normal distribution is fitted to the rest, and
    its 0.0001 and 0.9999 quantiles become that day's lower and upper bound.

    Returns (minx, maxx), two lists with one entry per position.  An entry
    is None when fewer than 8 ratios are available (before or after the
    outlier filter).  An empty `l` yields two empty lists.
    """
    lminus1 = dict((ccode, n_day_rel(l[ccode], INTERVAL)) for ccode in l)
    if not lminus1:
        return [], []

    # Any series gives the length; next(iter(...)) works on Python 2 and 3,
    # unlike indexing the result of keys().
    n_points = len(next(iter(lminus1.values())))

    minx = []
    maxx = []
    for i in range(n_points):
        vals = sorted(rel[i] for rel in lminus1.values() if rel[i] is not None)
        if len(vals) >= 8:
            # Floor division keeps the indices integral on Python 3.
            median = vals[len(vals) // 2]
            q1 = vals[len(vals) // 4]
            q2 = vals[(3 * len(vals)) // 4]
            qd = q2 - q1
            vals = [v for v in vals if median - qd*4 < v and v < median + qd*4]
        if len(vals) < 8:
            # Too few samples to fit a distribution for this day.
            minx.append(None)
            maxx.append(None)
            continue
        mu, sigma = norm.fit(vals)
        maxx.append(norm.ppf(0.9999, mu, sigma))
        minx.append(norm.ppf(1 - 0.9999, mu, sigma))
    return minx, maxx
177 # Makes pretty plots
def raw_plot(series, minc, maxc, labels, xtitle):
    """Plot a series with its expected band and save the figure as a PNG.

    series  -- values to plot, one per label; None marks a missing value
    minc    -- lower bound per label, or None where no prediction exists
    maxc    -- upper bound per label, or None where no prediction exists
    labels  -- x-axis values, same length as series
    xtitle  -- 3-tuple (output file name, plot title, legend text); the
               third element is accepted but not used

    Values below the band are marked as "Downturns", values above it as
    "Upturns".  The figure is written to the file name given in xtitle and
    then closed.
    """
    assert len(xtitle) == 3
    fname, stitle, slegend = xtitle

    font = {'family' : 'Bitstream Vera Sans',
            'weight' : 'normal',
            'size'   : 8}
    matplotlib.rc('font', **font)

    # Ignore missing values when scaling the y axis: comparing None with a
    # number raises on Python 3, and an all-None series has no maximum.
    known = [v for v in series if v is not None]
    if known:
        top = max(known)
        ylim( (-top*0.1, top*1.1) )
    plot(labels, series, linewidth=1.0, label="Users")

    wherefill = []
    for mm, mx in zip(minc, maxc):
        wherefill.append(not (mm is None and mx is None))
        # Test for the "no prediction" case first so that None < None is
        # never evaluated.
        assert (mm is None and mx is None) or mm < mx

    fill_between(labels, minc, maxc, where=wherefill, color="gray", label="Prediction")

    vdown = []
    vup = []
    for i, v in enumerate(series):
        if v is not None and minc[i] is not None and v < minc[i]:
            vdown.append(v)
            vup.append(None)
        elif v is not None and maxc[i] is not None and v > maxc[i]:
            vdown.append(None)
            vup.append(v)
        else:
            vup.append(None)
            vdown.append(None)

    plot(labels, vdown, 'o', ms=10, lw=2, alpha=0.5, mfc='orange', label="Downturns")
    plot(labels, vup, 'o', ms=10, lw=2, alpha=0.5, mfc='green', label="Upturns")

    legend(loc=2)

    xlabel('Time (days)')
    ylabel('Users')
    title(stitle)
    grid(True)
    F = gcf()

    F.set_size_inches(10,5)
    F.savefig(fname, format="png", dpi = (150))
    close()
def absolute_plot(series, minc, maxc, labels,INTERVAL, xtitle):
    """Turn relative bounds into absolute user-count bounds and plot them.

    For every position that has a value, a non-zero value INTERVAL positions
    earlier and both relative bounds, the absolute bounds are

        minc[i] * poisson.ppf(1 - 0.9999, series[i - INTERVAL])
        maxc[i] * poisson.ppf(0.9999,     series[i - INTERVAL])

    All other positions get None for both bounds.  The result is passed to
    raw_plot together with series, labels and xtitle.  Raises AssertionError
    if a computed lower bound is not below its upper bound.
    """
    in_minc = []
    in_maxc = []
    for i in range(len(series)):
        if (i > 0 and i - INTERVAL >= 0
                and series[i] is not None
                and series[i-INTERVAL] is not None
                and series[i-INTERVAL] != 0
                and minc[i] is not None
                and maxc[i] is not None):
            lower = minc[i] * poisson.ppf(1-0.9999, series[i-INTERVAL])
            upper = maxc[i] * poisson.ppf(0.9999, series[i-INTERVAL])
            if not lower < upper:
                # Dump the inputs before the assertion below fires.
                print("%s %s %s %s %s" % (lower, upper, series[i-INTERVAL], minc[i], maxc[i]))
            assert lower < upper
            in_minc.append(lower)
            in_maxc.append(upper)
        else:
            in_minc.append(None)
            in_maxc.append(None)
    raw_plot(series, in_minc, in_maxc, labels, xtitle)
240 # Censorship score by jurisdiction
def censor_score(series, minc, maxc, INTERVAL):
    """Count how often a series leaves its expected band.

    A position is scored only when it has a value, a non-zero value INTERVAL
    positions earlier and both relative bounds.  The bounds are scaled by
    the 0.0001 / 0.9999 Poisson quantiles of the earlier value.

    Returns (downscore, upscore): the number of positions below the lower
    bound and the number above the upper bound.
    """
    below = 0
    above = 0
    for idx, users in enumerate(series):
        if not (idx > 0 and idx - INTERVAL >= 0):
            continue
        if series[idx] is None:
            continue
        reference = series[idx - INTERVAL]
        if reference is None or reference == 0:
            continue
        if minc[idx] is None or maxc[idx] is None:
            continue
        lower = minc[idx] * poisson.ppf(1-0.9999, reference)
        upper = maxc[idx] * poisson.ppf(0.9999, reference)
        if users < lower:
            below += 1
        if users > upper:
            above += 1
    return below, above
def plot_target(tss, TARGET, xtitle, minx, maxx, DAYS=365, INTERV = 7):
    """Plot the last DAYS entries of one column against the expected band.

    tss     -- data store providing get_country_series() and all_dates
    TARGET  -- column (country code) to plot
    xtitle  -- 3-tuple forwarded unchanged to absolute_plot
    minx, maxx -- relative lower / upper bounds, aligned with tss.all_dates
    DAYS    -- number of trailing entries to plot
    INTERV  -- look-back distance forwarded to absolute_plot
    """
    ctarget = tss.get_country_series(TARGET)
    # The unused n_day_rel() call from the original was dropped; its result
    # was never read.
    absolute_plot(ctarget[-DAYS:], minx[-DAYS:], maxx[-DAYS:],
                  tss.all_dates[-DAYS:], INTERV, xtitle = xtitle)
258 ## Make a league table of censorship + nice graphs
def plot_all(tss, minx, maxx, INTERV, DAYS=None, rdir="img"):
    """Score the largest columns, write a summary and plot every downturn.

    tss     -- data store providing get_largest(), get_country_series() and
               all_dates
    minx, maxx -- relative lower / upper bounds, aligned with tss.all_dates
    INTERV  -- look-back distance used for scoring and plotting
    DAYS    -- number of trailing entries to examine (default 6*31)
    rdir    -- existing directory that receives summary.txt and the PNGs

    Prints an error and returns without doing anything else when rdir is
    not an existing directory.  Only columns with at least one downturn are
    listed in the summary and plotted.
    """
    import sys  # local import: only needed for the progress dots

    rdir = os.path.realpath(rdir)
    if not os.path.isdir(rdir):
        print("ERROR: %s does not exist or is not a directory." % rdir)
        return

    if DAYS is None:
        DAYS = 6*31

    # "with" closes the summary even if scoring or plotting raises.
    with open(os.path.join(rdir, "summary.txt"), "w") as summary_file:
        scores = []
        for num, li in tss.get_largest(200):
            sys.stdout.write(". ")
            ds, us = censor_score(tss.get_country_series(li)[-DAYS:],
                                  minx[-DAYS:], maxx[-DAYS:], INTERV)
            scores.append((ds, num, us, li))
        # Most downturns first.  The key ranks a missing count (None) below
        # any number, so the sort also works on Python 3.
        scores.sort(key=lambda sc: (sc[0], sc[1] is not None, sc[1] or 0, sc[2], sc[3]),
                    reverse=True)

        # Slice first, so a history shorter than DAYS does not raise.
        window = tss.all_dates[-DAYS:]
        s = "\n=======================\n"
        s+= "Report for %s to %s\n" % (window[0], window[-1])
        s+= "=======================\n"
        print(s)
        summary_file.write(s)

        for a, nx, b, c in scores:
            if a > 0:
                s = "%s -- down: %2d (up: %2d affected: %s)" % (c, a, b, nx)
                print(s)
                summary_file.write(s + "\n")
                xtitle = (os.path.join(rdir, "%03d-%s-censor.png" % (a,c)),
                          "Tor report for %s -- down: %2d (up: %2d affected: %s)" % (c, a, b, nx),
                          "")
                plot_target(tss, c, xtitle, minx, maxx, DAYS, INTERV)
def write_all(tss, minc, maxc, INTERVAL=7, file_name="direct-users-ranges.csv"):
    """Write the expected absolute user range of every column to a .csv file.

    tss       -- data store providing country_codes, get_country_series()
                 and all_dates
    minc, maxc -- relative lower / upper bounds, aligned with tss.all_dates
    INTERVAL  -- look-back distance used to scale the bounds
    file_name -- output path (defaults to the name used by the original)

    The file starts with the header "date,country,minusers,maxusers" and
    gets one line for every (date, column) pair that has a value, a non-zero
    value INTERVAL positions earlier and both relative bounds.  The "all",
    "??" and "date" columns are skipped.  Raises AssertionError if a
    computed lower bound is not below its upper bound.
    """
    import sys  # local import: only needed for the progress dots

    exclude = set(["all", "??", "date"])
    # "with" closes the output even when the assertion below fires.
    with open(file_name, "w") as ranges_file:
        ranges_file.write("date,country,minusers,maxusers\n")
        for c in tss.country_codes:
            if c in exclude:
                continue
            sys.stdout.write(". ")
            series = tss.get_country_series(c)
            for i in range(len(series)):
                if (i > 0 and i - INTERVAL >= 0
                        and series[i] is not None
                        and series[i-INTERVAL] is not None
                        and series[i-INTERVAL] != 0
                        and minc[i] is not None
                        and maxc[i] is not None):
                    minv = minc[i] * poisson.ppf(1-0.9999, series[i-INTERVAL])
                    maxv = maxc[i] * poisson.ppf(0.9999, series[i-INTERVAL])
                    if not minv < maxv:
                        # Dump the inputs before the assertion below fires.
                        print("%s %s %s %s %s" % (minv, maxv, series[i-INTERVAL], minc[i], maxc[i]))
                    assert minv < maxv
                    ranges_file.write("%s,%s,%s,%s\n" % (tss.all_dates[i], c, minv, maxv))
def main():
    """Run the whole pipeline: load the data, fit the model, plot and export."""
    # Change these to customize script
    csv_path = "direct-users.csv"
    graph_dir = "img"
    interval = 7
    window = 6 * 31

    stats = torstatstore(csv_path)
    # The model is fitted on the 50 largest locations.
    top_locations = stats.get_largest_locations(50)
    lower, upper = make_tendencies_minmax(top_locations, interval)
    plot_all(stats, lower, upper, interval, window, rdir=graph_dir)
    write_all(stats, lower, upper, interval)


if __name__ == "__main__":
    main()