Add graphing code for bandwidth by version (13634).
[tor-metrics-tasks.git] / task-2680 / ProcessSanitizedBridges.java
blobc3ab6c8dc3d43b5cf8eecaff0a4e99567ca29ac9
1 import java.io.*;
2 import java.text.*;
3 import java.util.*;
4 import org.apache.commons.codec.binary.*;
6 public class ProcessSanitizedBridges {
7 public static void main(String[] args) throws IOException,
8 ParseException {
10 /* Validate command-line arguments. */
11 if (args.length != 1 || !new File(args[0]).exists()) {
12 System.out.println("Usage: java ProcessSanitizedBridges <dir>");
13 System.exit(1);
16 /* Find all files that we should parse. Somewhat fragile, but should
17 * work. */
18 System.out.println("Creating list of files we should parse.");
19 SortedMap<String, File> statuses = new TreeMap<String, File>();
20 SortedMap<String, File> serverDescriptors =
21 new TreeMap<String, File>();
22 SortedMap<String, File> extraInfoDescriptors =
23 new TreeMap<String, File>();
24 Stack<File> files = new Stack<File>();
25 files.add(new File(args[0]));
26 while (!files.isEmpty()) {
27 File file = files.pop();
28 String path = file.getAbsolutePath();
29 String filename = file.getName();
30 if (file.isDirectory()) {
31 files.addAll(Arrays.asList(file.listFiles()));
32 } else if (path.contains("statuses")) {
33 statuses.put(filename, file);
34 } else if (path.contains("server-descriptors")) {
35 serverDescriptors.put(filename, file);
36 } else if (path.contains("extra-infos")) {
37 extraInfoDescriptors.put(filename, file);
40 System.out.println("We found\n " + statuses.size() + " statuses,\n "
41 + serverDescriptors.size() + " server descriptors, and\n "
42 + extraInfoDescriptors.size() + " extra-info descriptors.");
44 /* Parse statuses. */
45 if (!statuses.isEmpty()) {
46 System.out.println("Parsing statuses.");
47 List<String> knownFlags = new ArrayList<String>(Arrays.asList(
48 ("Authority,BadExit,BadDirectory,Exit,Fast,Guard,Named,Stable,"
49 + "Running,Valid,V2Dir").split(",")));
50 BufferedWriter bw = new BufferedWriter(new FileWriter(
51 "statuses.csv"));
52 bw.write("status,fingerprint,descriptor,published,address,orport,"
53 + "dirport");
54 for (String knownFlag : knownFlags) {
55 bw.write("," + knownFlag.toLowerCase());
57 bw.write("\n");
58 int parsedStatuses = 0, totalStatuses = statuses.size(),
59 writtenOutputLines = 1;
60 long started = System.currentTimeMillis();
61 for (File file : statuses.values()) {
62 String filename = file.getName();
63 if (filename.length() != ("20110101-000703-"
64 + "4A0CCD2DDC7995083D73F5D667100C8A5831F16D").length()) {
65 System.out.println("Status filename has wrong length: '"
66 + filename + "' Please check. Exiting.");
67 System.exit(1);
69 String statusDateTime = filename.substring(0, 4) + "-"
70 + filename.substring(4, 6) + "-" + filename.substring(6, 8)
71 + " " + filename.substring(9, 11) + ":"
72 + filename.substring(11, 13) + ":"
73 + filename.substring(13, 15);
74 BufferedReader br = new BufferedReader(new FileReader(file));
75 String line;
76 while ((line = br.readLine()) != null) {
77 if (line.startsWith("r ")) {
78 String[] parts = line.split(" ");
79 if (parts.length != 9) {
80 System.out.println("r line doesn't have the correct number "
81 + "of entries: '" + line + "'. Please check. Exiting.");
82 System.exit(1);
84 String fingerprint = Hex.encodeHexString(Base64.decodeBase64(
85 parts[2] + "="));
86 String descriptor = Hex.encodeHexString(Base64.decodeBase64(
87 parts[3] + "="));
88 String published = parts[4] + " " + parts[5];
89 String address = parts[6];
90 String orPort = parts[7];
91 String dirPort = parts[8];
92 bw.write(statusDateTime + "," + fingerprint + "," + descriptor
93 + "," + published + "," + address + "," + orPort + ","
94 + dirPort);
95 } else if (line.equals("s") || line.startsWith("s ")) {
96 String flags = line.substring(1);
97 for (String flag : knownFlags) {
98 if (flags.contains(" " + flag)) {
99 bw.write(",TRUE");
100 } else {
101 bw.write(",FALSE");
104 bw.write("\n");
105 writtenOutputLines++;
108 br.close();
109 parsedStatuses++;
110 if (parsedStatuses % (totalStatuses / 10) == 0) {
111 double fractionDone = (double) (parsedStatuses) /
112 (double) totalStatuses;
113 double fractionLeft = 1.0D - fractionDone;
114 long now = System.currentTimeMillis();
115 double millisLeft = ((double) (now - started)) * fractionLeft /
116 fractionDone;
117 long secondsLeft = (long) millisLeft / 1000L;
118 System.out.println(" " + (parsedStatuses / (totalStatuses
119 / 10)) + "0% done, " + secondsLeft + " seconds left.");
122 bw.close();
123 System.out.println("Parsed " + parsedStatuses + " statuses and "
124 + "wrote " + writtenOutputLines + " lines to statuses.csv.");
127 /* Parse server descriptors and extra-info descriptors. */
128 if (!serverDescriptors.isEmpty()) {
129 System.out.println("Parsing server descriptors and extra-info "
130 + "descriptors.");
131 List<String> knownCountries = new ArrayList<String>(Arrays.asList(
132 ("?? A1 A2 AD AE AF AG AI AL AM AN AO AP AQ AR AS AT AU AW AX "
133 + "AZ BA BB BD BE BF BG BH BI BJ BM BN BO BR BS BT BV BW BY BZ "
134 + "CA CD CF CG CH CI CK CL CM CN CO CR CS CU CV CY CZ DE DJ DK "
135 + "DM DO DZ EC EE EG ER ES ET EU FI FJ FK FM FO FR GA GB GD GE "
136 + "GF GG GH GI GL GM GN GP GQ GR GT GU GW GY HK HN HR HT HU ID "
137 + "IE IL IM IN IO IQ IR IS IT JE JM JO JP KE KG KH KI KM KN KP "
138 + "KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD ME MF "
139 + "MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC "
140 + "NE NF NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PR "
141 + "PS PT PW PY QA RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK "
142 + "SL SM SN SO SR ST SV SY SZ TC TD TG TH TJ TK TL TM TN TO TR "
143 + "TT TV TW TZ UA UG UM US UY UZ VA VC VE VG VI VN VU WF WS YE "
144 + "YT ZA ZM ZW").toLowerCase().split(" ")));
145 BufferedWriter bw = new BufferedWriter(new FileWriter(
146 "descriptors.csv"));
147 bw.write("descriptor,fingerprint,published,address,orport,dirport,"
148 + "version,platform,uptime,bridgestatsend,bridgestatsseconds");
149 for (String country : knownCountries) {
150 bw.write("," + country);
152 bw.write(",bridgestatscountries,bridgestatstotal\n");
153 int parsedServerDescriptors = 0, parsedExtraInfoDescriptors = 0,
154 parsedGeoipStats = 0, skippedGeoipStats = 0,
155 parsedBridgeStats = 0,
156 totalServerDescriptors = serverDescriptors.size(),
157 writtenOutputLines = 1;
158 SimpleDateFormat timeFormat = new SimpleDateFormat(
159 "yyyy-MM-dd HH:mm:ss");
160 timeFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
161 long started = System.currentTimeMillis();
162 for (File file : serverDescriptors.values()) {
163 String filename = file.getName();
164 BufferedReader br = new BufferedReader(new FileReader(file));
165 String line, fingerprint = null, published = null, address = null,
166 orPort = null, dirPort = null, version = null,
167 platform = null, uptime = null, extraInfoDigest = null,
168 bridgeStatsEnd = null, bridgeStatsSeconds = null;
169 SortedMap<String, String> bridgeStatsIps =
170 new TreeMap<String, String>();
171 long bridgeStatsTotal = 0L;
172 while ((line = br.readLine()) != null) {
173 if (line.startsWith("opt ")) {
174 line = line.substring(4);
176 if (line.startsWith("router ")) {
177 String[] parts = line.split(" ");
178 address = parts[2];
179 orPort = parts[3];
180 dirPort = parts[4];
181 } else if (line.startsWith("platform ")) {
182 version = line.split(" ")[2];
183 platform = line.substring(line.indexOf("on ")
184 + "on ".length());
185 if (platform.contains("Windows")) {
186 platform = "Windows";
187 } else if (platform.contains("Linux")) {
188 platform = "Linux";
189 } else if (platform.contains("Darwin")) {
190 platform = "Mac OS X";
191 } else if (platform.contains("BSD")) {
192 platform = "*BSD";
193 } else {
194 platform = "Other";
196 } else if (line.startsWith("published ")) {
197 String[] parts = line.split(" ");
198 published = parts[1] + " " + parts[2];
199 } else if (line.startsWith("fingerprint ")) {
200 fingerprint = line.substring("fingerprint".length()).
201 replaceAll(" ", "").toLowerCase();
202 } else if (line.startsWith("uptime ")) {
203 uptime = line.split(" ")[1];
204 } else if (line.startsWith("extra-info-digest ")) {
205 extraInfoDigest = line.substring("extra-info-digest ".
206 length()).toLowerCase();
207 if (extraInfoDescriptors.containsKey(extraInfoDigest)) {
208 parsedExtraInfoDescriptors++;
209 BufferedReader br2 = new BufferedReader(new FileReader(
210 extraInfoDescriptors.get(extraInfoDigest)));
211 String geoipStartTime = null, bridgeStatsEndLine = null;
212 while ((line = br2.readLine()) != null) {
213 if (line.startsWith("geoip-start-time ")) {
214 geoipStartTime = line.substring("geoip-start-time ".
215 length());
216 } else if (line.startsWith("geoip-client-origins ") &&
217 line.split(" ").length > 1 && published != null &&
218 geoipStartTime != null) {
219 if (version.startsWith("0.2.2.")) {
220 skippedGeoipStats++;
221 } else {
222 parsedGeoipStats++;
223 bridgeStatsEnd = published;
224 bridgeStatsSeconds = "" +
225 + (timeFormat.parse(published).getTime()
226 - timeFormat.parse(geoipStartTime).getTime())
227 / 1000L;
228 for (String pair : line.split(" ")[1].split(",")) {
229 String country = pair.substring(0, 2);
230 String ips = pair.substring(3);
231 bridgeStatsIps.put(country, ips);
232 bridgeStatsTotal += Long.parseLong(ips);
235 } else if (line.startsWith("bridge-stats-end ")) {
236 bridgeStatsEndLine = line;
237 } else if (line.startsWith("bridge-ips ") &&
238 line.length() > "bridge-ips ".length() &&
239 bridgeStatsEndLine != null) {
240 parsedBridgeStats++;
241 String[] parts = bridgeStatsEndLine.split(" ");
242 bridgeStatsEnd = parts[1] + " " + parts[2];
243 bridgeStatsSeconds = parts[3].substring(1);
244 for (String pair : line.split(" ")[1].split(",")) {
245 String country = pair.substring(0, 2);
246 String ips = pair.substring(3);
247 bridgeStatsIps.put(country, ips);
248 bridgeStatsTotal += Long.parseLong(ips);
252 br2.close();
256 br.close();
257 if (fingerprint == null || published == null || address == null ||
258 orPort == null || dirPort == null || version == null ||
259 platform == null || uptime == null) {
260 System.out.println("Server descriptor " + filename + " is "
261 + "missing critical information. Please check. Exiting.");
262 System.exit(1);
264 bw.write(filename + "," + fingerprint + "," + published + ","
265 + address + "," + orPort + "," + dirPort + "," + version + ","
266 + platform + "," + uptime);
267 if (bridgeStatsEnd != null) {
268 bw.write("," + bridgeStatsEnd + "," + bridgeStatsSeconds);
269 int bridgeStatsCountries = bridgeStatsIps.size();
270 for (String country : knownCountries) {
271 if (bridgeStatsIps.containsKey(country)) {
272 bw.write("," + bridgeStatsIps.remove(country));
273 } else {
274 bw.write(",0");
277 if (!bridgeStatsIps.isEmpty()) {
278 StringBuilder message = new StringBuilder();
279 for (String country : bridgeStatsIps.keySet()) {
280 message.append(", " + country);
282 System.out.println("Unknown " + (bridgeStatsIps.size() == 1 ?
283 "country" : "countries") + " " + message.toString().
284 substring(2) + " in extra-info descriptor "
285 + extraInfoDigest + ". Please check. Exiting.");
286 System.exit(1);
288 bw.write("," + bridgeStatsCountries + "," + bridgeStatsTotal
289 + "\n");
290 } else {
291 bw.write(",NA,NA");
292 for (String country : knownCountries) {
293 bw.write(",NA");
295 bw.write(",NA,NA\n");
297 writtenOutputLines++;
298 parsedServerDescriptors++;
299 if (parsedServerDescriptors % (totalServerDescriptors / 100)
300 == 0) {
301 double fractionDone = (double) (parsedServerDescriptors) /
302 (double) totalServerDescriptors;
303 double fractionLeft = 1.0D - fractionDone;
304 long now = System.currentTimeMillis();
305 double millisLeft = ((double) (now - started)) * fractionLeft /
306 fractionDone;
307 long secondsLeft = (long) millisLeft / 1000L;
308 System.out.println(" " + (parsedServerDescriptors /
309 (totalServerDescriptors / 100)) + "% done, " + secondsLeft
310 + " seconds left.");
313 bw.close();
314 System.out.println("Parsed " + parsedServerDescriptors + " server "
315 + "descriptors and " + parsedExtraInfoDescriptors
316 + " extra-info descriptors.\nParsed " + parsedGeoipStats
317 + " geoip-stats and " + parsedBridgeStats + " bridge-stats.\n"
318 + "Skipped " + skippedGeoipStats + " broken geoip-stats of "
319 + "0.2.2.x bridges.\nWrote " + writtenOutputLines + " to "
320 + "descriptors.csv.");
323 /* This is it. */
324 System.out.println("Terminating.");