1 ## Copyright (c) 2011 George Danezis <gdane@microsoft.com>
3 ## All rights reserved.
5 ## Redistribution and use in source and binary forms, with or without
6 ## modification, are permitted (subject to the limitations in the
7 ## disclaimer below) provided that the following conditions are met:
9 ## * Redistributions of source code must retain the above copyright
10 ## notice, this list of conditions and the following disclaimer.
12 ## * Redistributions in binary form must reproduce the above copyright
13 ## notice, this list of conditions and the following disclaimer in the
14 ## documentation and/or other materials provided with the
17 ## * Neither the name of <Owner Organization> nor the names of its
18 ## contributors may be used to endorse or promote products derived
19 ## from this software without specific prior written permission.
21 ## NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
22 ## GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT
23 ## HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
24 ## WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
25 ## MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
26 ## DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
27 ## LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
28 ## CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
29 ## SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
30 ## BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
31 ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
32 ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
33 ## IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 ## (Clear BSD license: http://labs.metacarta.com/license-explanation.html#license)
37 ## This script reads a .csv file of the number of Tor users and finds
38 ## anomalies that might be indicative of censorship.
49 from scipy
.stats
.distributions
import norm
50 from scipy
.stats
.distributions
import poisson
53 from datetime
import date
54 from datetime
import timedelta
# Three-letter weekday abbreviations in Monday-to-Sunday order.
days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
# Constructor: parses a comma-separated file into a table keyed by
# (column name, row index) and stores the result on the instance.
#
# What the visible statements do:
#   * the first line read from `f` is split on "," and becomes `country_codes`
#     (the column names);
#   * every following line (row index `i`) is split on "," and paired with the
#     column names; each value ends up in `store[(ccode, i)]`;
#   * one branch parses the value as a YYYY-MM-DD string (characters 0-3, 5-6
#     and 8-9) into a `datetime.date`; another branch converts it with `int`;
#   * `date_min` / `date_max` are the "date" entries of the first and the last
#     row read;
#   * `all_dates`, `country_codes`, `MAX_INDEX`, `date_min` and `date_max` are
#     saved as attributes.
#
# NOTE(review): this listing is incomplete.  The statements that open `f`,
# create `store`, `MAX_INDEX` and `all_dates`, the conditions selecting the
# date / int branches, the try/except around the date parsing and the
# assignment of `self.store` (read by the other methods) are not visible here.
# Presumably the date branch applies to the "date" column and `all_dates` is
# filled one day at a time using `dt` -- confirm against the complete file.
# NOTE(review): `print "..."` is Python 2 statement syntax.
61 def __init__(self
, file_name
):
63 country_codes
= f
.readline()
64 country_codes
= country_codes
.strip().split(",")
68 for i
, line
in enumerate(f
):
70 line_parsed
= line
.strip().split(",")
71 for j
, (ccode
, val
) in enumerate(zip(country_codes
,line_parsed
)):
75 year
, month
, day
= int(val
[:4]), int(val
[5:7]), int(val
[8:10])
76 processed_val
= date(year
, month
, day
)
78 print "Parsing error (ignoring line %s):" % j
83 processed_val
= int(val
)
84 store
[(ccode
, i
)] = processed_val
87 date_min
= store
[("date", 0)]
88 date_max
= store
[("date", i
)]
92 dt
= timedelta(days
=1)
99 self
.all_dates
= all_dates
100 self
.country_codes
= country_codes
101 self
.MAX_INDEX
= MAX_INDEX
102 self
.date_min
= date_min
103 self
.date_max
= date_max
def get_country_series(self, ccode):
    """Return the values stored for column ``ccode``, one per calendar day.

    The result is a list aligned with ``self.all_dates``: element ``k`` is the
    value recorded for ``ccode`` on ``self.all_dates[k]``, or ``None`` when no
    stored row carries that date.

    ``ccode`` must be one of ``self.country_codes`` (checked with ``assert``).
    """
    assert ccode in self.country_codes
    # Start with every day unset so that days missing from the input stay None.
    by_date = dict.fromkeys(self.all_dates)
    for i in range(self.MAX_INDEX):
        by_date[self.store[("date", i)]] = self.store[(ccode, i)]
    return [by_date[d] for d in self.all_dates]
def get_largest(self, number):
    """Return up to ``number`` ``(value, ccode)`` pairs, largest value first.

    The value of a column is the one stored for the last row
    (index ``self.MAX_INDEX - 1``).  The aggregate / non-country columns
    "all", "??" and "date" are left out.

    NOTE(review): the descending sort and the truncation to ``number`` were
    restored from the function name and from callers that unpack
    ``(value, ccode)`` pairs -- confirm against the complete file.
    """
    exclude = set(["all", "??", "date"])
    last_row = self.MAX_INDEX - 1
    ranked = [(self.store[(c, last_row)], c)
              for c in self.country_codes if c not in exclude]
    # Missing values (None) rank below every number.  Python 2 ordered them
    # that way implicitly; the explicit key keeps this working on Python 3,
    # where None cannot be compared with an int.
    ranked.sort(key=lambda pair: (pair[0] is not None, pair), reverse=True)
    return ranked[:number]
def get_largest_locations(self, number):
    """Return ``{ccode: series}`` for the ``number`` largest locations.

    The locations come from ``self.get_largest(number)``; each series is the
    list produced by ``self.get_country_series(ccode)``.
    """
    largest = self.get_largest(number)
    # The slice repeats the limit so the result never exceeds `number`
    # entries, whatever get_largest hands back.
    return {ccode: self.get_country_series(ccode)
            for _, ccode in largest[:number]}
131 # Computes the difference between today and a number of days in the past
def n_day_rel(series, days):
    """Return, for each position, the ratio to the value ``days`` steps earlier.

    ``series`` is a sequence of numbers in which ``None`` marks a missing
    value.  The result has the same length; element ``i`` is
    ``float(series[i]) / series[i - days]``, or ``None`` when either value is
    missing, when there is no element ``days`` steps back, or when the earlier
    value is 0 (the ratio would be undefined).

    Note: the parameter name ``days`` shadows the module-level ``days`` list
    inside this function.
    """
    rel = []
    for i, current in enumerate(series):
        if current is None:
            rel.append(None)
            continue
        # Test the index first: a negative index would silently wrap around
        # to the end of the series.
        if i - days < 0 or series[i - days] is None or series[i - days] == 0:
            rel.append(None)
        else:
            rel.append(float(current) / series[i - days])
    return rel
145 # Main model: computes the expected min / max range of number of users
# Builds, per position in the series, a lower and an upper bound for the
# INTERVAL-step ratio computed by n_day_rel.
#
# What the visible statements do:
#   * `lminus1` maps every key of `l` to n_day_rel(l[key], INTERVAL);
#   * `c` is one of those ratio lists and only supplies the number of
#     positions to iterate over;
#   * for each position `i`, `vals` collects the non-None ratios of all keys;
#   * `median`, `q1` and `q2` are the elements at 1/2, 1/4 and 3/4 of
#     len(vals); `vals` is then reduced to the values strictly inside
#     median +/- 4 * qd;
#   * a normal distribution is fitted to what remains (norm.fit) and its
#     0.9999 and 1 - 0.9999 quantiles are appended to `maxx` and `minx`;
#     the fitted (mu, sigma) pair is appended to `dists`.
#
# NOTE(review): this listing is incomplete.  The initialisation of `dists`,
# `minx` and `maxx`, the sorting of `vals` (presumably done before the
# positional lookups), the definition of `qd` (presumably derived from q1 and
# q2), any guard for positions with too few values and the return statement
# are not visible here -- confirm against the complete file.
# NOTE(review): `lminus1.keys()[0]` and the `/` divisions used as list indices
# rely on Python 2 semantics (indexable keys(), integer division).
146 def make_tendencies_minmax(l
, INTERVAL
= 1):
147 lminus1
= dict([(ccode
, n_day_rel(l
[ccode
], INTERVAL
)) for ccode
in l
])
148 c
= lminus1
[lminus1
.keys()[0]]
152 for i
in range(len(c
)):
153 vals
= [lminus1
[ccode
][i
] for ccode
in lminus1
.keys() if lminus1
[ccode
][i
] != None]
160 median
= vals
[len(vals
)/2]
161 q1
= vals
[len(vals
)/4]
162 q2
= vals
[(3*len(vals
))/4]
164 vals
= [v
for v
in vals
if median
- qd
*4 < v
and v
< median
+ qd
*4]
170 mu
, signma
= norm
.fit(vals
)
171 dists
+= [(mu
, signma
)]
172 maxx
+= [norm
.ppf(0.9999, mu
, signma
)]
173 minx
+= [norm
.ppf(1 - 0.9999, mu
, signma
)]
174 ## print minx[-1], maxx[-1]
# Draws one series together with its expected range and saves it as a PNG.
#
# What the visible statements do:
#   * `xtitle` must have exactly three items, unpacked as
#     (fname, stitle, slegend); `fname` is the path handed to savefig;
#   * the font is configured through matplotlib.rc and the y axis is limited
#     to -10% .. 110% of max(series);
#   * `series` is plotted against `labels` with the legend label "Users";
#   * `wherefill` gets one boolean per (minc, maxc) pair, False only when both
#     bounds are None; every pair must satisfy min < max unless both are None;
#   * the band between `minc` and `maxc` is filled in gray ("Prediction");
#   * points below `minc[i]` / above `maxc[i]` are singled out, and `vdown` /
#     `vup` are plotted as orange "Downturns" and green "Upturns" markers;
#   * the figure is resized to 10 x 5 inches and written as PNG at 150 dpi.
#
# NOTE(review): this listing is incomplete.  The rest of the `font` dict, the
# creation of `wherefill`, `vdown`, `vup` and `F`, the bodies of the
# if/elif that classify each point, and any title/legend calls (`stitle` and
# `slegend` are unpacked but not used in the visible lines) are not shown --
# confirm against the complete file.
# NOTE(review): ylim, plot, fill_between and xlabel are used unqualified;
# presumably they come from a star import that is not visible in this listing.
178 def raw_plot(series
, minc
, maxc
, labels
, xtitle
):
179 assert len(xtitle
) == 3
180 fname
, stitle
, slegend
= xtitle
182 font
= {'family' : 'Bitstream Vera Sans',
185 matplotlib
.rc('font', **font
)
187 ylim( (-max(series
)*0.1, max(series
)*1.1) )
188 plot(labels
, series
, linewidth
=1.0, label
="Users")
191 for mm
,mx
in zip(minc
, maxc
):
192 wherefill
+= [not (mm
== None and mx
== None)]
193 assert mm
< mx
or (mm
== None and mx
== None)
195 fill_between(labels
, minc
, maxc
, where
=wherefill
, color
="gray", label
="Prediction")
199 for i
,v
in enumerate(series
):
200 if minc
[i
] != None and v
< minc
[i
]:
203 elif maxc
[i
] != None and v
> maxc
[i
]:
210 plot(labels
, vdown
, 'o', ms
=10, lw
=2, alpha
=0.5, mfc
='orange', label
="Downturns")
211 plot(labels
, vup
, 'o', ms
=10, lw
=2, alpha
=0.5, mfc
='green', label
="Upturns")
215 xlabel('Time (days)')
221 F
.set_size_inches(10,5)
222 F
.savefig(fname
, format
="png", dpi
= (150))
def absolute_plot(series, minc, maxc, labels, INTERVAL, xtitle):
    """Turn relative bounds into absolute ones and plot them with the series.

    For every position ``i`` whose value, whose value ``INTERVAL`` steps
    earlier and whose relative bounds ``minc[i]`` / ``maxc[i]`` are all
    available (and whose earlier value is non-zero), the absolute bounds are

        minc[i] * poisson.ppf(1 - 0.9999, series[i - INTERVAL])
        maxc[i] * poisson.ppf(0.9999,     series[i - INTERVAL])

    Every other position gets ``None`` for both bounds, so the bound lists
    stay aligned with ``series`` and ``labels``.  The result is drawn with
    ``raw_plot(series, lower, upper, labels, xtitle)``.

    Raises AssertionError if a computed lower bound is not strictly below its
    upper bound; the offending numbers are printed first.
    """
    in_minc = []
    in_maxc = []
    for i, value in enumerate(series):
        usable = (i > 0 and i - INTERVAL >= 0
                  and value is not None
                  and series[i - INTERVAL] is not None
                  and series[i - INTERVAL] != 0
                  and minc[i] is not None
                  and maxc[i] is not None)
        if not usable:
            # Keep the bound lists the same length as the series.
            in_minc.append(None)
            in_maxc.append(None)
            continue

        reference = series[i - INTERVAL]
        lower = minc[i] * poisson.ppf(1 - 0.9999, reference)
        upper = maxc[i] * poisson.ppf(0.9999, reference)
        if not lower < upper:
            # Dump the inputs before the assertion below fires.
            print("%s %s %s %s %s" % (lower, upper, reference, minc[i], maxc[i]))
        assert lower < upper
        in_minc.append(lower)
        in_maxc.append(upper)

    raw_plot(series, in_minc, in_maxc, labels, xtitle)
240 # Censorship score by jurisdiction
def censor_score(series, minc, maxc, INTERVAL):
    """Count how often ``series`` leaves its expected range.

    A position ``i`` is scored only when ``series[i]``,
    ``series[i - INTERVAL]``, ``minc[i]`` and ``maxc[i]`` are all available
    and the earlier value is non-zero.  Its expected range is

        minc[i] * poisson.ppf(1 - 0.9999, series[i - INTERVAL])
        maxc[i] * poisson.ppf(0.9999,     series[i - INTERVAL])

    Returns a tuple ``(downscore, upscore)``: the number of scored positions
    whose value lies below the lower bound and above the upper bound.
    """
    downscore = 0
    upscore = 0
    for i, value in enumerate(series):
        usable = (i > 0 and i - INTERVAL >= 0
                  and value is not None
                  and series[i - INTERVAL] is not None
                  and series[i - INTERVAL] != 0
                  and minc[i] is not None
                  and maxc[i] is not None)
        if not usable:
            continue
        reference = series[i - INTERVAL]
        lower = minc[i] * poisson.ppf(1 - 0.9999, reference)
        upper = maxc[i] * poisson.ppf(0.9999, reference)
        if value < lower:
            downscore += 1
        if value > upper:
            upscore += 1
    return downscore, upscore
def plot_target(tss, TARGET, xtitle, minx, maxx, DAYS=365, INTERV = 7):
    """Plot the last ``DAYS`` entries of one location against its bounds.

    Fetches the series of ``TARGET`` with ``tss.get_country_series`` and
    passes the trailing ``DAYS`` entries of the series, of ``minx``, of
    ``maxx`` and of ``tss.all_dates`` to ``absolute_plot``, together with
    ``INTERV`` and ``xtitle``.
    """
    ctarget = tss.get_country_series(TARGET)
    # The relative series that used to be computed here was never read, so
    # the call has been dropped.
    absolute_plot(ctarget[-DAYS:], minx[-DAYS:], maxx[-DAYS:],
                  tss.all_dates[-DAYS:], INTERV, xtitle=xtitle)
258 ## Make a league table of censorship + nice graphs
# Scores the largest locations, writes a text summary and plots each of them.
#
# What the visible statements do:
#   * `rdir` is resolved with os.path.realpath; an error message is printed
#     when it does not exist or is not a directory;
#   * "summary.txt" inside `rdir` is opened for writing;
#   * `tss.get_largest(200)` supplies the candidate locations;
#   * censor_score is evaluated on the last DAYS entries of a location's
#     series and of `minx` / `maxx`; the results are collected in `scores` as
#     (down score, num, up score, location) tuples;
#   * a report header covering tss.all_dates[-DAYS] .. tss.all_dates[-1] is
#     written to the summary file;
#   * for every entry of `scores` a summary line is written and plot_target is
#     called with an output path of the form "<down>-<code>-censor.png" in
#     `rdir` (down score zero-padded to three digits).
#
# NOTE(review): this listing is incomplete.  The action taken after the error
# message, the creation of `scores`, the loop header that binds `num` and
# `li`, any filtering or sorting of `scores`, printing of `s` and the closing
# of `summary_file` are not visible here -- confirm against the complete file.
# NOTE(review): with the default DAYS=None the expressions `-DAYS` below raise
# TypeError, so callers must pass an integer.
# NOTE(review): `print "..."` and the `file(...)` builtin are Python 2 only.
259 def plot_all(tss
, minx
, maxx
, INTERV
, DAYS
=None, rdir
="img"):
260 rdir
= os
.path
.realpath(rdir
)
261 if not os
.path
.exists(rdir
) or not os
.path
.isdir(rdir
):
262 print "ERROR: %s does not exist or is not a directory." % rdir
265 summary_file
= file(os
.path
.join(rdir
, "summary.txt"), "w")
270 s
= tss
.get_largest(200)
274 ds
,us
= censor_score(tss
.get_country_series(li
)[-DAYS
:], minx
[-DAYS
:], maxx
[-DAYS
:], INTERV
)
276 scores
+= [(ds
,num
, us
, li
)]
279 s
= "\n=======================\n"
280 s
+= "Report for %s to %s\n" % (tss
.all_dates
[-DAYS
], tss
.all_dates
[-1])
281 s
+= "=======================\n"
283 summary_file
.write(s
)
284 for a
,nx
, b
,c
in scores
:
286 s
= "%s -- down: %2d (up: %2d affected: %s)" % (c
, a
, b
, nx
)
288 summary_file
.write(s
+ "\n")
289 xtitle
= (os
.path
.join(rdir
, "%03d-%s-censor.png" % (a
,c
)), "Tor report for %s -- down: %2d (up: %2d affected: %s)" % (c
, a
, b
, nx
),"")
290 plot_target(tss
, c
,xtitle
, minx
, maxx
, DAYS
, INTERV
)
def write_all(tss, minc, maxc, INTERVAL=7):
    """Write the expected user range of every location to a CSV file.

    Creates (or overwrites) ``direct-users-ranges.csv`` in the current
    directory with the header ``date,country,minusers,maxusers`` followed by
    one row per location and position for which a range can be computed.  The
    columns "all", "??" and "date" are skipped.

    A range exists for position ``i`` when ``series[i]``,
    ``series[i - INTERVAL]``, ``minc[i]`` and ``maxc[i]`` are all available
    and the earlier value is non-zero; it is

        minc[i] * poisson.ppf(1 - 0.9999, series[i - INTERVAL])
        maxc[i] * poisson.ppf(0.9999,     series[i - INTERVAL])

    Raises AssertionError if a lower bound is not strictly below its upper
    bound; the offending numbers are printed first.

    NOTE(review): the skipping of the excluded columns and the min < max
    check were restored from the otherwise unused ``exclude`` set and from the
    matching check in absolute_plot -- confirm against the complete file.
    """
    exclude = set(["all", "??", "date"])
    # `with` guarantees the file is closed even when the assertion fires.
    with open("direct-users-ranges.csv", "w") as ranges_file:
        ranges_file.write("date,country,minusers,maxusers\n")
        for c in tss.country_codes:
            if c in exclude:
                continue
            series = tss.get_country_series(c)
            for i, value in enumerate(series):
                usable = (i > 0 and i - INTERVAL >= 0
                          and value is not None
                          and series[i - INTERVAL] is not None
                          and series[i - INTERVAL] != 0
                          and minc[i] is not None
                          and maxc[i] is not None)
                if not usable:
                    continue
                reference = series[i - INTERVAL]
                minv = minc[i] * poisson.ppf(1 - 0.9999, reference)
                maxv = maxc[i] * poisson.ppf(0.9999, reference)
                if not minv < maxv:
                    print("%s %s %s %s %s"
                          % (minv, maxv, reference, minc[i], maxc[i]))
                assert minv < maxv
                ranges_file.write("%s,%s,%s,%s\n"
                                  % (tss.all_dates[i], c, minv, maxv))
# Script driver.  The visible statements load CSV_FILE into a torstatstore,
# take the 50 largest locations, derive the min/max tendencies from them with
# make_tendencies_minmax, produce the plots and summary with plot_all and
# finally write the ranges file with write_all.
#
# NOTE(review): this listing is incomplete.  The enclosing function header (if
# any), the definitions of GRAPH_DIR, INTERV and DAYS, and the body of the
# `if __name__ == "__main__":` guard are not visible here -- confirm against
# the complete file.
313 # Change these to customize script
314 CSV_FILE
= "direct-users.csv"
319 tss
= torstatstore(CSV_FILE
)
320 l
= tss
.get_largest_locations(50)
321 minx
, maxx
= make_tendencies_minmax(l
, INTERV
)
322 plot_all(tss
, minx
, maxx
, INTERV
, DAYS
, rdir
=GRAPH_DIR
)
323 write_all(tss
, minx
, maxx
, INTERV
)
325 if __name__
== "__main__":