Update criteria for partial/full IPv6 support.
[tor-metrics-tasks.git] / task-6232 / pyentropy.py
blobe6d32147b8fa3e2bf8bb3fbf04ea447f6fdbd273
1 """
2 Usage - python pyentropy.py -h
3 Output - A CSV file of the format (without newlines):
4 <valid-after>,
5 <entropy for all nodes>,
6 <max entropy for all nodes>,
7 <entropy for exit nodes>,
8 <max entropy for exit nodes>,
9 <entropy for guard nodes>,
10 <max entropy for guard nodes>,
11 <entropy for countries>,
12 <max entropy for countries>,
13 <entropy for AS>,
14 <max entropy for AS>
15 rsync -arz --delete metrics.torproject.org::metrics-recent/relay-descriptors/consensuses in
16 """
18 import sys
19 import math
20 import os
21 import pygeoip
22 import StringIO
23 import stem.descriptor
25 from optparse import OptionParser
26 from binascii import b2a_hex, a2b_base64, a2b_hex
27 from stem.descriptor.server_descriptor import RelayDescriptor, BridgeDescriptor
class Router:
    """A single relay entry of a consensus and the data gathered for it.

    Instances start out empty; the ``add_*`` methods fill in the attributes
    from the split fields of the matching consensus lines.  The methods rely
    on the module-level globals ``gi_db``, ``as_db`` and ``options`` that
    the script's ``__main__`` section sets up.
    """

    def __init__(self):
        self.bandwidth = None      # set by add_weights()
        self.advertised_bw = None  # set by add_router_info()
        self.country = None        # set by add_router_info()
        self.as_no = None          # set by add_router_info()
        self.is_exit = None        # becomes True in add_flags(), else stays None
        self.is_guard = None       # becomes True in add_flags(), else stays None

    def add_router_info(self, values):
        """Record descriptor-derived and GeoIP-derived data.

        ``values`` holds the whitespace-split fields that follow the 'r'
        keyword.  Index 2 is used as the base64 descriptor digest and
        index 5 as the IP address (presumably the dir-spec 'r' line layout;
        verify against the consensus format in use).
        """
        # The digest is stored without base64 padding; re-append one '='
        # so that a2b_base64 accepts it.
        hex_digest = b2a_hex(a2b_base64(values[2] + "="))
        self.advertised_bw = self.get_advertised_bw(hex_digest)
        ip = values[5]
        self.country = gi_db.country_code_by_addr(ip)
        self.as_no = self.get_as_details(ip)

    def add_weights(self, values):
        """Store the integer after '=' in the first field of a 'w' line."""
        self.bandwidth = int(values[0].split('=')[1])

    def add_flags(self, values):
        """Set is_exit / is_guard from the flags of an 's' line.

        A relay counts as an exit only when it has the "Exit" flag and
        does not have the "BadExit" flag.
        """
        if "Exit" in values and "BadExit" not in values:
            self.is_exit = True
        if "Guard" in values:
            self.is_guard = True

    def get_as_details(self, ip):
        """Return the first token of the AS database's org string for *ip*.

        Returns "" when the lookup fails or yields nothing usable.
        """
        try:
            value = as_db.org_by_addr(str(ip)).split()
            return value[0]
        except Exception:
            # Best-effort lookup: unknown addresses (None result, empty
            # string) or database errors simply mean "no AS known".
            # Unlike a bare ``except:``, Ctrl-C and SystemExit propagate.
            return ""

    def get_advertised_bw(self, hex_digest):
        """Return the advertised bandwidth from the relay's server descriptor.

        The descriptor is read from the file named *hex_digest* inside
        ``options.server_desc``.  The result is the minimum of the
        descriptor's average, burst and observed bandwidth, or 0 when the
        descriptor cannot be read or parsed.
        """
        try:
            # os.path.join works whether or not the configured directory
            # ends with a path separator.
            with open(os.path.join(options.server_desc, hex_digest)) as f:
                data = f.read()

            desc_iter = stem.descriptor.server_descriptor.parse_file(StringIO.StringIO(data))
            desc_entries = list(desc_iter)
            desc = desc_entries[0]
            return min(desc.average_bandwidth, desc.burst_bandwidth, desc.observed_bandwidth)
        except Exception:
            # Missing or malformed descriptors are expected; treat them as
            # "no advertised bandwidth" but let Ctrl-C / SystemExit through.
            return 0
def parse_bw_weights(values):
    """Parse the fields of a 'bandwidth-weights' line into a dict.

    ``values`` is an iterable of "Name=Number" strings.  Each number is
    divided by 10000 (presumably the consensus weight scale; confirm
    against dir-spec) and stored as a float under its name.

    Returns the dict, or None if any field is malformed or ``values`` is
    not an iterable of strings.
    """
    data = {}
    try:
        for item in values:
            key, raw = item.split("=")
            data[key] = float(raw) / 10000
    except (AttributeError, TypeError, ValueError):
        # Malformed input: missing/extra '=', non-numeric value, or a
        # non-string field.  Other exceptions (e.g. KeyboardInterrupt)
        # are deliberately not swallowed.
        return None
    return data
def _shannon_entropy(weights):
    """Return the Shannon entropy in bits of the distribution over *weights*.

    Each weight is normalised by the sum of all weights.  Entries with
    probability 0 contribute nothing.  An empty or all-zero input yields
    0.0 instead of dividing by zero.
    """
    total = sum(weights)
    if total == 0:
        return 0.0
    total = float(total)
    entropy = 0.0
    for weight in weights:
        p = float(weight) / total
        if p != 0:
            entropy += -(p * math.log(p, 2))
    return entropy


def _max_entropy(count):
    """Return log2(count), the entropy of a uniform choice among *count* values.

    Returns 0.0 for count <= 0, where math.log would raise ValueError.
    """
    if count <= 0:
        return 0.0
    return math.log(count, 2)


def run(file_name):
    """Compute the entropy statistics for one consensus file.

    Parses the 'r', 's', 'w', 'valid-after' and 'bandwidth-weights' lines
    of *file_name*, then computes the bandwidth-weighted entropy and the
    maximum possible entropy for all relays, exits, guards, countries and
    ASes.

    Returns a comma-separated string in the order
    valid-after, entropy, max, exit entropy, max, guard entropy, max,
    country entropy, max, AS entropy, max.
    Returns None when the file has no routers, no 'valid-after' line, or
    a total bandwidth of zero.
    """
    routers = []
    router = None
    valid_after = None
    # Position weights default to 1 if the consensus has no (usable)
    # bandwidth-weights line.
    Wed, Wee, Wgd, Wgg = 1, 1, 1, 1

    # parse consensus
    with open(file_name, 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # blank line: nothing to dispatch on
                continue
            key, values = fields[0], fields[1:]
            if key == 'r':
                router = Router()
                routers.append(router)
                router.add_router_info(values)
            elif key == 's' and router is not None:
                router.add_flags(values)
            elif key == 'w' and router is not None:
                router.add_weights(values)
            elif key == 'valid-after':
                valid_after = ' '.join(values)
            elif key == 'bandwidth-weights':
                data = parse_bw_weights(values)
                try:
                    Wed = data['Wed']
                    Wee = data['Wee']
                    Wgd = data['Wgd']
                    Wgg = data['Wgg']
                except (KeyError, TypeError):
                    # data is None (unparsable line) or lacks a weight;
                    # keep whatever has been assigned so far.
                    pass

    if not routers or valid_after is None:
        return None

    # Collect the per-router weights of every distribution in router order.
    # Routers without a (non-zero) consensus bandwidth are left out of all
    # distributions.
    all_bw, guard_bw, exit_bw = [], [], []
    bw_countries, bw_as = {}, {}
    for router in routers:
        bandwidth = router.bandwidth
        if not bandwidth:
            continue
        all_bw.append(bandwidth)
        if router.is_guard and router.is_exit:
            guard_bw.append(Wgd * bandwidth)
            exit_bw.append(Wed * bandwidth)
        elif router.is_guard:
            guard_bw.append(Wgg * bandwidth)
        elif router.is_exit:
            exit_bw.append(Wee * bandwidth)
        bw_countries[router.country] = bw_countries.get(router.country, 0) + bandwidth
        bw_as[router.as_no] = bw_as.get(router.as_no, 0) + bandwidth

    if sum(all_bw) == 0:
        return None

    entropy = _shannon_entropy(all_bw)
    entropy_exit = _shannon_entropy(exit_bw)
    entropy_guard = _shannon_entropy(guard_bw)
    entropy_country = _shannon_entropy(list(bw_countries.values()))
    entropy_as = _shannon_entropy(list(bw_as.values()))

    # Entropy of uniform distribution of 'n' possible values: log(n)
    # NOTE: the overall maximum counts every parsed router, including those
    # skipped above for lacking bandwidth, as the original code did.
    max_entropy = _max_entropy(len(routers))
    max_entropy_guard = _max_entropy(len(guard_bw))
    max_entropy_exit = _max_entropy(len(exit_bw))
    max_entropy_country = _max_entropy(len(bw_countries))
    max_entropy_as = _max_entropy(len(bw_as))

    return ",".join([valid_after,
                     str(entropy),
                     str(max_entropy),
                     str(entropy_exit),
                     str(max_entropy_exit),
                     str(entropy_guard),
                     str(max_entropy_guard),
                     str(entropy_country),
                     str(max_entropy_country),
                     str(entropy_as),
                     str(max_entropy_as)])
def parse_args():
    """Build the command-line parser and return the parsed options.

    Positional arguments are accepted by optparse but ignored.
    """
    parser = OptionParser("Usage - python pyentropy.py [options]")

    # (flags, keyword settings) for every supported option, in help order.
    option_table = (
        (("-g", "--geoip"),
         dict(dest="gi_db", default="GeoIP.dat",
              help="Input GeoIP database")),
        (("-a", "--as"),
         dict(dest="as_db", default="GeoIPASNum.dat",
              help="Input AS GeoIP database")),
        (("-s", "--server_desc"),
         dict(dest="server_desc",
              default="data/relay-descriptors/server-descriptors/",
              help="Server descriptors directory")),
        (("-o", "--output"),
         dict(dest="output", default="entropy.csv",
              help="Output filename")),
        (("-c", "--consensus"),
         dict(dest="consensus", default="in/consensus",
              help="Input consensus dir")),
    )
    for flags, settings in option_table:
        parser.add_option(*flags, **settings)

    parsed_options, _positional = parser.parse_args()
    return parsed_options
if __name__ == "__main__":
    # options, gi_db and as_db are module-level globals read by Router.
    options = parse_args()
    gi_db = pygeoip.GeoIP(options.gi_db)
    as_db = pygeoip.GeoIP(options.as_db)

    with open(options.output, 'w') as f:
        # os.listdir() returns entries in arbitrary order; sort them so the
        # CSV rows come out in a stable order from run to run.
        for file_name in sorted(os.listdir(options.consensus)):
            path = os.path.join(options.consensus, file_name)
            if not os.path.isfile(path):
                # Skip subdirectories and other non-regular entries, which
                # run() cannot open.
                continue
            string = run(path)
            if string:
                f.write("%s\n" % (string))
223 f.write("%s\n" % (string))