text
[mlfp.git] / matlab / analyze-clusters.py
blobcb29bb14685ab6bfe93b33f4e4a4ba09cd191a4f
1 import os
2 import re
3 import sys
class ClusterInfo(object):
    """Remember the prior value associated with each cluster id.

    Derives from ``object`` so the class behaves the same way (new-style)
    under both Python 2 and Python 3.
    """

    def __init__(self):
        # Maps cluster id -> most recently recorded prior for that cluster.
        self.priors = {}

    def HandleCluster(self, cluster, prior):
        """Record ``prior`` for ``cluster``, replacing any earlier value."""
        self.priors[cluster] = prior

    def Prior(self, cluster):
        """Return the prior recorded for ``cluster``.

        Raises:
            KeyError: if no prior was ever recorded for ``cluster``.
        """
        return self.priors[cluster]
class ProgramClusters(object):
    """Count how often each cluster id has been added.

    Attributes are kept as plain public fields:
      clusters -- dict mapping cluster id -> number of times it was added
      total    -- total number of AddCluster() calls
    """

    def __init__(self):
        self.clusters = {}
        self.total = 0

    def AddCluster(self, cluster):
        """Register one more occurrence of ``cluster``."""
        # dict.get() replaces dict.has_key(), which no longer exists in
        # Python 3; the resulting counts are identical.
        self.clusters[cluster] = self.clusters.get(cluster, 0) + 1
        self.total += 1

    def Summary(self):
        """Return ``(best_cluster, best_count, total)``.

        ``best_cluster`` is the cluster id with the highest count and
        ``best_count`` is that count.  When several clusters share the
        highest count, the one that comes first in dict iteration order is
        reported.  If nothing has been added yet the result is
        ``(-1, 0, 0)``.
        """
        if not self.clusters:
            # Nothing recorded: keep the sentinel values callers expect.
            return (-1, 0, self.total)
        # max() returns the first maximal key it encounters, which matches
        # a scan that only replaces the best on a strictly greater count.
        best_cluster = max(self.clusters, key=self.clusters.get)
        return (best_cluster, self.clusters[best_cluster], self.total)
def main(argv):
    """Summarise cluster assignments per program and print one row each.

    ``argv[1]`` names a text file of comma-separated records.  For every
    record:
      * the first field is a path; the program id is the first run of
        characters in its base name that are not ``_``, a digit or ``.``;
      * the second field is an integer cluster id;
      * the last field is a float prior for that cluster.

    For each program one line is printed containing the program id, its most
    frequent cluster, that cluster's count, the count as a fraction of the
    program's records, and the prior last recorded for that cluster.  The
    fields are joined with `` & `` and the line ends with ``\\\\``
    (presumably rows for a LaTeX table -- confirm with the consumer).

    Raises:
        ValueError: if a record's file name yields no program id, or if the
            cluster / prior fields are not numeric.
    """
    program_clusters_map = {}
    cluster_info = ClusterInfo()
    clusters_re = re.compile(r'[^_\d\.]+')

    # The context manager guarantees the file is closed even if a record
    # fails to parse.
    with open(argv[1], 'r') as infile:
        for line_number, line in enumerate(infile, 1):
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            fields = line.split(',')
            base = os.path.basename(fields[0])
            m = clusters_re.search(base)
            if m is None:
                raise ValueError(
                    'line %d: cannot derive a program id from %r'
                    % (line_number, base))
            program_id = m.group(0)
            if program_id not in program_clusters_map:
                program_clusters_map[program_id] = ProgramClusters()
            cluster = int(fields[1])
            prior = float(fields[-1])
            program_clusters_map[program_id].AddCluster(cluster)
            cluster_info.HandleCluster(cluster, prior)

    for program in program_clusters_map:
        (best_cluster, count, total) = program_clusters_map[program].Summary()
        # A single parenthesised argument prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print('%s & %d & %d & %.2f & %.2f \\\\' % (
            program, best_cluster, count, count * 1.0 / total,
            cluster_info.Prior(best_cluster)))
# Run only when executed as a script: hand the raw command-line argument
# list (including the program name at index 0) to main().
if __name__ == '__main__':
    main(sys.argv)