#!/usr/bin/env python
# -*- Python -*-
"""find-fix.py: produce a find/fix report for Subversion's IZ database

For simple text summary:
     find-fix.py query-set-1.tsv YYYY-MM-DD YYYY-MM-DD
Statistics will be printed for bugs found or fixed within the
time frame.

For gnuplot presentation:
     find-fix.py query-set-1.tsv outfile
Gnuplot provides its own way to select date ranges.

Either way, get a query-set-1.tsv from:
     http://subversion.tigris.org/iz-data/query-set-1.tsv  (updated nightly)
See http://subversion.tigris.org/iz-data/README for more info on that file.

For more usage info on this script:
     find-fix.py --help
"""
_version = "$Revision$"
# This can be run over the data file found at:
#   http://subversion.tigris.org/iz-data/query-set-1.tsv
import getopt
try:
  my_getopt = getopt.gnu_getopt
except AttributeError:
  my_getopt = getopt.getopt
import operator
import os
import os.path
import pydoc
import re
import string
import sys
import time

me = os.path.basename(sys.argv[0])
# Long options and their usage strings; "=" means it takes an argument.
# To get a list suitable for getopt, just do
#
#   [x[0] for x in long_opts]
#
# Make sure to sacrifice a lamb to Guido for each element of the list.
long_opts = [
  ["milestones=", """Optional, milestones NOT to report on
                     (one or more of Beta, 1.0, Post-1.0, cvs2svn-1.0,
                     cvs2svn-opt, inapplicable)"""],
  ["update",      """Optional, update the statistics first."""],
  ["doc",         """Optional, print pydocs."""],
  ["help",        """Optional, print usage (this text)."""],
  ["verbose",     """Optional, print more progress messages."""],
  ]

help = 0
verbose = 0
update = 0

DATA_FILE = "http://subversion.tigris.org/iz-data/query-set-1.tsv"
ONE_WEEK = 7 * 24 * 60 * 60
_types = []
_milestone_filter = []

noncore_milestone_filter = [
  'Post-1.0',
  '1.1',
  'cvs2svn-1.0',
  'cvs2svn-opt',
  'inapplicable',
  'no milestone',
  ]

one_point_oh_milestone_filter = noncore_milestone_filter + []

beta_milestone_filter = one_point_oh_milestone_filter + ['1.0']
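# (The "+ []" and "+ ['1.0']" above build fresh lists, so mutating one
# filter never changes the others.  The 1.0 filter currently matches the
# noncore set exactly; beta additionally hides the '1.0' milestone.)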
_types = [
  'DEFECT',
  'TASK',
  'FEATURE',
  'ENHANCEMENT',
  'PATCH',
  ]
def main():
  """Report bug find/fix rate statistics for Subversion."""

  global verbose
  global update
  global _types
  global _milestone_filter
  global noncore_milestone_filter

  try:
    opts, args = my_getopt(sys.argv[1:], "", [x[0] for x in long_opts])
  except getopt.GetoptError, e:
    sys.stderr.write("Error: %s\n" % e.msg)
    shortusage()
    sys.stderr.write("%s --help for options.\n" % me)
    sys.exit(1)

  for opt, arg in opts:
    if opt == "--help":
      usage()
      sys.exit(0)
    elif opt == "--verbose":
      verbose = 1
    elif opt == "--milestones":
      for mstone in string.split(arg, ","):
        if mstone == "noncore":
          _milestone_filter = noncore_milestone_filter
        elif mstone == "beta":
          _milestone_filter = beta_milestone_filter
        elif mstone == "one":
          _milestone_filter = one_point_oh_milestone_filter
        elif mstone[0] == '-':
          # a leading '-' removes a milestone from the current filter
          if mstone[1:] in _milestone_filter:
            spot = _milestone_filter.index(mstone[1:])
            _milestone_filter = _milestone_filter[:spot] \
                                + _milestone_filter[(spot+1):]
        else:
          _milestone_filter += [mstone]

    elif opt == "--update":
      update = 1
    elif opt == "--doc":
      pydoc.doc(pydoc.importfile(sys.argv[0]))
      sys.exit(0)

  if len(_milestone_filter) == 0:
    _milestone_filter = noncore_milestone_filter

  if verbose:
    sys.stderr.write("%s: Filtering out milestones %s.\n"
                     % (me, string.join(_milestone_filter, ", ")))

  if len(args) == 2:
    if verbose:
      sys.stderr.write("%s: Generating gnuplot data.\n" % me)
    if update:
      if verbose:
        sys.stderr.write("%s: Updating %s from %s.\n"
                         % (me, args[0], DATA_FILE))
      # fetch with curl; fall back to wget if curl fails
      if os.system("curl " + DATA_FILE + " > " + args[0]):
        os.system("wget -O " + args[0] + " " + DATA_FILE)
    plot(args[0], args[1])

  elif len(args) == 3:
    if verbose:
      sys.stderr.write("%s: Generating summary from %s to %s.\n"
                       % (me, args[1], args[2]))
    if update:
      if verbose:
        sys.stderr.write("%s: Updating %s from %s.\n"
                         % (me, args[0], DATA_FILE))
      # fetch with curl; fall back to wget if curl fails
      if os.system("curl " + DATA_FILE + " > " + args[0]):
        os.system("wget -O " + args[0] + " " + DATA_FILE)

    try:
      t_start = parse_time(args[1] + " 00:00:00")
    except ValueError:
      sys.stderr.write('%s: ERROR: bad time value: %s\n' % (me, args[1]))
      sys.exit(1)

    try:
      t_end = parse_time(args[2] + " 00:00:00")
    except ValueError:
      sys.stderr.write('%s: ERROR: bad time value: %s\n' % (me, args[2]))
      sys.exit(1)

    summary(args[0], t_start, t_end)
  else:
    usage()

  sys.exit(0)
def summary(datafile, d_start, d_end):
  "Prints a summary of activity within a specified date range."

  data = load_data(datafile)

  # activity during the requested period
  found, fixed, inval, dup, other = extract(data, 1, d_start, d_end)

  # activity from the beginning of time to the end of the request,
  # used to compute the number remaining open
  # XXX It would be faster to change extract() to collect this in one
  # pass, but we don't presently have enough data, nor use this
  # enough, to justify that rework.
  fromzerofound, fromzerofixed, fromzeroinval, fromzerodup, fromzeroother \
      = extract(data, 1, 0, d_end)

  alltypes_found = alltypes_fixed = alltypes_inval = alltypes_dup \
      = alltypes_other = alltypes_rem = 0
  for t in _types:
    fromzerorem_t = fromzerofound[t] \
                    - (fromzerofixed[t] + fromzeroinval[t] + fromzerodup[t]
                       + fromzeroother[t])
    print '%12s: found=%3d fixed=%3d inval=%3d dup=%3d ' \
          'other=%3d remain=%3d' \
          % (t, found[t], fixed[t], inval[t], dup[t], other[t], fromzerorem_t)
    alltypes_found = alltypes_found + found[t]
    alltypes_fixed = alltypes_fixed + fixed[t]
    alltypes_inval = alltypes_inval + inval[t]
    alltypes_dup = alltypes_dup + dup[t]
    alltypes_other = alltypes_other + other[t]
    alltypes_rem = alltypes_rem + fromzerorem_t

  print '-' * 77
  print '%12s: found=%3d fixed=%3d inval=%3d dup=%3d ' \
        'other=%3d remain=%3d' \
        % ('totals', alltypes_found, alltypes_fixed, alltypes_inval,
           alltypes_dup, alltypes_other, alltypes_rem)
  # print '%12s find/fix ratio: %g%%' \
  #       % (" "*12, (alltypes_found*100.0/(alltypes_fixed
  #          + alltypes_inval + alltypes_dup + alltypes_other)))
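# A sample summary() output line, from the format string above (the values
# here are illustrative only; real numbers come from the data file):
#
#       DEFECT: found= 10 fixed=  7 inval=  1 dup=  2 other=  0 remain= 42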
def plot(datafile, outbase):
  "Generates data files intended for use by gnuplot."

  global _types

  data = load_data(datafile)

  t_min = 1L<<32
  for issue in data:
    if issue.created < t_min:
      t_min = issue.created

  # break the time up into a tuple, then back up to Sunday
  t_start = time.localtime(t_min)
  t_start = time.mktime((t_start[0], t_start[1], t_start[2] - t_start[6] - 1,
                         0, 0, 0, 0, 0, 0))
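  # (tm_wday is 0 for Monday, so tm_mday - tm_wday - 1 is the preceding
  # Sunday; mktime() normalizes a zero or negative day-of-month for us.)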
  plots = { }
  for t in _types:
    # for each issue type, we will record per-week stats, compute a moving
    # average of the find/fix delta, and track the number of open issues
    plots[t] = [ [ ], MovingAverage(), 0 ]

  week = 0
  for date in range(int(t_start), int(time.time()), ONE_WEEK):
    ### this is quite inefficient, as we could just sort by date, but
    ### I'm being lazy
    found, fixed = extract(data, None, date, date + ONE_WEEK - 1)

    for t in _types:
      per_week, avg, open_issues = plots[t]
      delta = found[t] - fixed[t]
      per_week.append((week, date,
                       found[t], -fixed[t], avg.add(delta), open_issues))
      plots[t][2] = open_issues + delta

    week = week + 1

  for t in _types:
    week_data = plots[t][0]
    write_file(week_data, outbase, t, 'found', 2)
    write_file(week_data, outbase, t, 'fixed', 3)
    write_file(week_data, outbase, t, 'avg', 4)
    write_file(week_data, outbase, t, 'open', 5)
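# A minimal gnuplot sketch for the files plot() writes (file names follow
# write_file() below; 'out' stands in for whatever outbase you passed --
# this invocation is an illustration, not part of the original tooling):
#
#   gnuplot> plot 'out.found.DEFECT' with lines, 'out.fixed.DEFECT' with lines
#
# Column 1 is the week index, column 2 the per-week value; the trailing
# "# <date>" on each data line is a gnuplot comment and is ignored.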
def write_file(week_data, base, type, tag, idx):
  f = open('%s.%s.%s' % (base, tag, type), 'w')
  for info in week_data:
    f.write('%s %s # %s\n' % (info[0], info[idx], time.ctime(info[1])))
  f.close()
class MovingAverage:
  "Helper class to compute moving averages."

  def __init__(self, n=4):
    self.n = n
    self.data = [ 0 ] * n

  def add(self, value):
    self.data.pop(0)
    self.data.append(float(value) / self.n)
    return self.avg()

  def avg(self):
    return reduce(operator.add, self.data)
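# A quick sanity check of MovingAverage (a hypothetical helper, never called
# by the script): a single spike of 4 holds the 4-sample average at 1.0
# until it ages out of the window.
def _moving_average_example():
  ma = MovingAverage(n=4)
  return [ma.add(v) for v in (4, 0, 0, 0, 0)]  # => [1.0, 1.0, 1.0, 1.0, 0.0]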
def extract(data, details, d_start, d_end):
  """Extract found/fixed counts for each issue type within the date range.

  If DETAILS is false, then return two dictionaries:

    found, fixed

  ...each mapping issue types to the number of issues of that type
  found or fixed respectively.

  If DETAILS is true, return five dictionaries:

    found, fixed, invalid, duplicate, other

  The first is still the found issues, but the other four break down
  the resolution into 'FIXED', 'INVALID', 'DUPLICATE', and a grab-bag
  category for 'WORKSFORME', 'LATER', 'REMIND', and 'WONTFIX'."""

  global _types
  global _milestone_filter

  found = { }
  fixed = { }
  invalid = { }
  duplicate = { }
  other = { }  # "WORKSFORME", "LATER", "REMIND", and "WONTFIX"

  for t in _types:
    found[t] = fixed[t] = invalid[t] = duplicate[t] = other[t] = 0

  for issue in data:
    # filter out disrespected milestones
    if issue.milestone in _milestone_filter:
      continue

    # record the found/fixed counts
    if d_start <= issue.created <= d_end:
      found[issue.type] = found[issue.type] + 1
    if d_start <= issue.resolved <= d_end:
      if details:
        if issue.resolution == "FIXED":
          fixed[issue.type] = fixed[issue.type] + 1
        elif issue.resolution == "INVALID":
          invalid[issue.type] = invalid[issue.type] + 1
        elif issue.resolution == "DUPLICATE":
          duplicate[issue.type] = duplicate[issue.type] + 1
        else:
          other[issue.type] = other[issue.type] + 1
      else:
        fixed[issue.type] = fixed[issue.type] + 1

  if details:
    return found, fixed, invalid, duplicate, other
  else:
    return found, fixed
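# A hypothetical usage sketch (assumes query-set-1.tsv has been downloaded;
# not called by the script): weekly found/fixed DEFECT counts for the first
# week of 2004.
def _extract_example():
  data = load_data('query-set-1.tsv')
  found, fixed = extract(data, None,
                         parse_time('2004-01-01 00:00:00'),
                         parse_time('2004-01-08 00:00:00'))
  return found['DEFECT'], fixed['DEFECT']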
def load_data(datafile):
  "Return a list of Issue objects for the specified data file."
  return map(Issue, open(datafile).readlines())
class Issue:
  "Represents a single issue from the exported IssueZilla data."

  def __init__(self, line):
    row = string.split(string.strip(line), '\t')

    self.id = int(row[0])
    self.type = row[1]
    self.reporter = row[2]
    if row[3] == 'NULL':
      self.assigned = None
    else:
      self.assigned = row[3]
    self.milestone = row[4]
    self.created = parse_time(row[5])
    self.resolution = row[7]
    if not self.resolution:
      # If the resolution is empty, then force the resolved date to None.
      # When an issue is reopened, there will still be activity showing
      # a "RESOLVED", thus we get a resolved date. But we simply want to
      # ignore that date.
      self.resolved = None
    else:
      self.resolved = parse_time(row[6])
    self.summary = row[8]
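# For reference, the tab-separated columns Issue.__init__ expects, derived
# from the parsing above:
#
#   0:id  1:type  2:reporter  3:assigned  4:milestone
#   5:created  6:resolved  7:resolution  8:summary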
parse_time_re = re.compile('([0-9]{4})-([0-9]{2})-([0-9]{2}) '
                           '([0-9]{2}):([0-9]{2}):([0-9]{2})')

def parse_time(t):
  "Convert an exported MySQL timestamp into seconds since the epoch."

  global parse_time_re

  if t == 'NULL':
    return None
  try:
    matches = parse_time_re.match(t)
    return time.mktime((int(matches.group(1)),
                        int(matches.group(2)),
                        int(matches.group(3)),
                        int(matches.group(4)),
                        int(matches.group(5)),
                        int(matches.group(6)),
                        0, 0, -1))
  except ValueError:
    sys.stderr.write('ERROR: bad time value: %s\n' % t)
    sys.exit(1)
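# Example: parse_time('2004-06-01 12:00:00') returns the local-time epoch
# seconds for noon on 2004-06-01, and parse_time('NULL') returns None.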
def shortusage():
  print pydoc.synopsis(sys.argv[0])
  print """
For simple text summary:
   find-fix.py [options] query-set-1.tsv YYYY-MM-DD YYYY-MM-DD

For gnuplot presentation:
   find-fix.py [options] query-set-1.tsv outfile
"""
def usage():
  shortusage()
  for x in long_opts:
    padding_limit = 18
    if x[0][-1:] == '=':
      print "  --" + x[0][:-1],
      padding_limit = 19
    else:
      print "  --" + x[0],
    print (' ' * (padding_limit - len(x[0]))), x[1]
  print '''
Option keywords may be abbreviated to any unique prefix.
Most options require "=xxx" arguments.
Option order is not important.'''
if __name__ == '__main__':
  main()