[safe browsing] Remove stale BINHASH code.
[chromium-blink-merge.git] / chrome / tools / history-viz.py
blobfccbb313bda84a97ae99ef3fcc25df7a0efa33d5
1 #!/usr/bin/env python
2 # Copyright (c) 2011 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
6 """Process a History database and dump a .dot file suitable for GraphViz.
8 This is useful for debugging history redirect flows.
10 An example run of this program:
11 python /src/history-viz.py History > foo.dot
12 /c/Program\ Files/Graphviz2.18/bin/dot -Tpng foo.dot -o foo.png
13 """
15 import struct
16 import subprocess
17 import sys
18 import urlparse
21 # Some transition types, copied from page_transition_types.h.
22 TRANS_TYPES = {
23 0: 'link',
24 1: 'typed',
25 2: 'most-visited',
26 3: 'auto subframe',
27 7: 'form',
31 class URL(object):
32 """Represents a broken-down URL from our most visited database."""
34 def __init__(self, id, url):
35 """Initialize a new URL object. |id| is the database id of the URL."""
36 self.id = id
37 self.url = url
38 scheme, loc, path, query, fragment = urlparse.urlsplit(url)
39 if scheme == 'http':
40 scheme = '' # Shorten for display purposes.
41 if len(scheme) > 0:
42 scheme += '://'
43 self.host = scheme + loc
44 self.path = path
46 extra = ''
47 if len(query) > 0:
48 extra += '?' + query
49 if len(fragment) > 0 or url.find('#') > 0:
50 extra += '#' + fragment
51 self.extra = extra
53 def PrettyPrint(self, include_host=True, include_path=True):
54 """Pretty-print this URL in a form more suitable for the graph.
56 This will elide very long paths and potentially puts newlines between parts
57 of long components. include_host and include_path determine whether to
58 include the host and path in the output.
60 Returns: the pretty-printed string."""
61 MAX_LEN = 30 # Maximum length of a line in the output.
62 parts = []
63 if include_host:
64 parts.append(self.host)
65 if include_path:
66 parts.append(self.path)
67 parts.append(self.extra)
68 lines = []
69 line = ''
70 for part in parts:
71 if len(part) > MAX_LEN:
72 part = part[0:MAX_LEN-3] + '...'
73 if len(line)+len(part) > MAX_LEN:
74 lines.append(line)
75 line = ''
76 line += part
77 if len(line) > 0:
78 lines.append(line)
79 return '\n'.join(lines)
82 class Edge(object):
83 """Represents an edge in the history graph, connecting two pages.
85 If a link is traversed twice, it is one Edge with two entries in
86 the .transitions array."""
87 def __init__(self, src, dst):
88 self.src = src
89 self.dst = dst
90 self.transitions = []
92 def Transitions(self):
93 """Return a dictionary mapping transition type -> occurences."""
94 all = {}
95 for trans in self.transitions:
96 all[trans] = all.get(trans, 0) + 1
97 # We currently don't use the chain type.
98 # TODO(evanm): make this a command-line option.
99 # if trans & 0x30000000 != 0:
100 # chain = ''
101 # if trans & 0x10000000:
102 # chain = 'start'
103 # if trans & 0x20000000:
104 # if len(chain) == 0:
105 # chain = 'end'
106 # else:
107 # chain = ''
108 # if len(chain) > 0:
109 # edge['chain'] = chain
110 return all
113 def ClusterBy(objs, pred):
114 """Group a list of objects by a predicate.
116 Given a list of objects and a predicate over the objects, return a
117 dictionary mapping pred(obj) -> all objs with the same pred(obj)."""
118 clusters = {}
119 for obj in objs:
120 cluster = pred(obj)
121 clusters[cluster] = clusters.get(cluster, [])
122 clusters[cluster].append(obj)
123 return clusters
126 def EscapeDot(string):
127 """Escape a string suitable for embedding in a graphviz graph."""
128 # TODO(evanm): this is likely not sufficient.
129 return string.replace('\n', '\\n')
132 class SQLite(object):
133 """Trivial interface to executing SQLite queries.
134 Spawns a new process with each call."""
135 def __init__(self, file=None):
136 self.file = file
138 def Run(self, sql):
139 """Execute |sql|, yielding each row of results as an array."""
140 subproc = subprocess.Popen(['sqlite', self.file],
141 stdin=subprocess.PIPE,
142 stdout=subprocess.PIPE)
143 subproc.stdin.write('.mode tabs\n')
144 subproc.stdin.write(sql + ';')
145 subproc.stdin.close()
146 for line in subproc.stdout:
147 row = line.strip().split('\t')
148 yield row
151 def LoadHistory(filename):
152 db = SQLite(filename)
154 urls = {} # Map of urlid => url.
155 urls['0'] = URL('0', 'start') # Node name '0' is our special 'start' node.
156 for id, url in db.Run('SELECT id, url FROM urls'):
157 urls[id] = URL(id, url)
159 visiturlids = {} # Map of visitid => urlid.
160 visiturlids['0'] = '0' # '0' is our special 'start' node.
161 edges = {} # Map of urlid->urlid->Edge.
162 for src, dst, url, trans in db.Run('SELECT from_visit, id, url, transition '
163 'FROM visits ORDER BY id'):
164 visiturlids[dst] = url
165 src = visiturlids[src]
166 dst = visiturlids[dst]
167 edges[src] = edges.get(src, {})
168 edge = edges[src][dst] = edges[src].get(dst, Edge(src, dst))
169 # SQLite outputs transition values as signed integers, but they're really
170 # a bitfield. Below does "unsigned trans = static_cast<unsigned>(trans)".
171 trans = struct.unpack('I', struct.pack('i', int(trans)))[0]
172 edge.transitions.append(trans)
174 return urls, edges
177 def main():
178 urls, edges = LoadHistory(sys.argv[1])
179 print 'digraph G {'
180 print ' graph [rankdir=LR]' # Display left to right.
181 print ' node [shape=box]' # Display nodes as boxes.
182 print ' subgraph { rank=source; 0 [label="start"] }'
184 # Output all the nodes within graph clusters.
185 hosts = ClusterBy(urls.values(), lambda url: url.host)
186 for i, (host, urls) in enumerate(hosts.items()):
187 # Cluster all URLs under this host if it has more than one entry.
188 host_clustered = len(urls) > 1
189 if host_clustered:
190 print 'subgraph clusterhost%d {' % i
191 print ' label="%s"' % host
192 paths = ClusterBy(urls, lambda url: url.path)
193 for j, (path, urls) in enumerate(paths.items()):
194 # Cluster all URLs under this host if it has more than one entry.
195 path_clustered = host_clustered and len(urls) > 1
196 if path_clustered:
197 print ' subgraph cluster%d%d {' % (i, j)
198 print ' label="%s"' % path
199 for url in urls:
200 if url.id == '0': continue # We already output the special start node.
201 pretty = url.PrettyPrint(include_host=not host_clustered,
202 include_path=not path_clustered)
203 print ' %s [label="%s"]' % (url.id, EscapeDot(pretty))
204 if path_clustered:
205 print ' }'
206 if host_clustered:
207 print '}'
209 # Output all the edges between nodes.
210 for src, dsts in edges.items():
211 for dst, edge in dsts.items():
212 # Gather up all the transitions into the label.
213 label = [] # Label for the edge.
214 transitions = edge.Transitions()
215 for trans, count in transitions.items():
216 text = ''
217 if count > 1:
218 text = '%dx ' % count
219 base_type = trans & 0xFF
220 redir = (trans & 0xC0000000) != 0
221 start = (trans & 0x10000000) != 0
222 end = (trans & 0x20000000) != 0
223 if start or end:
224 if start:
225 text += '<'
226 if end:
227 text += '>'
228 text += ' '
229 if redir:
230 text += 'R '
231 text += TRANS_TYPES.get(base_type, 'trans%d' % base_type)
232 label.append(text)
233 if len(label) == 0:
234 continue
236 edgeattrs = [] # Graphviz attributes for the edge.
237 # If the edge is from the start and the transitions are fishy, make it
238 # display as a dotted line.
239 if src == '0' and len(transitions.keys()) == 1 and transitions.has_key(0):
240 edgeattrs.append('style=dashed')
241 if len(label) > 0:
242 edgeattrs.append('label="%s"' % EscapeDot('\n'.join(label)))
244 out = '%s -> %s' % (src, dst)
245 if len(edgeattrs) > 0:
246 out += ' [%s]' % ','.join(edgeattrs)
247 print out
248 print '}'
249 return 0
252 if __name__ == '__main__':
253 sys.exit(main())