3 # See usage() for details, or run with --help option.
5 # .-------------------------------------------------.
6 # | "An ad hoc format deserves an ad hoc parser." |
7 # `-------------------------------------------------'
9 # Some Subversion project log messages include parseable data to help
10 # track who's contributing what. The exact syntax is described in
11 # hacking.html#crediting, but here's an example, indented by three
12 # spaces, i.e., the "Patch by:" starts at the beginning of a line:
14 # Patch by: David Anderson <david.anderson@calixo.net>
15 # <justin@erenkrantz.com>
17 # (I wrote the regression tests.)
18 # Found by: Phineas T. Phinder <phtph@ph1nderz.com>
19 # Suggested by: Snosbig Q. Ptermione <sqptermione@example.com>
20 # Review by: Justin Erenkrantz <justin@erenkrantz.com>
22 # (They caught an off-by-one error in the main loop.)
24 # This is a pathological example, but it shows all the things we might
25 # need to parse. We need to:
27 # - Detect the officially-approved "WORD by: " fields.
28 # - Grab every name (one per line) in each field.
29 # - Handle names in various formats, unifying where possible.
30 # - Expand "me" to the committer name for this revision.
31 # - Associate a parenthetical aside following a field with that field.
33 # NOTES: You might be wondering, why not take 'svn log --xml' input?
34 # Well, that would be the Right Thing to do, but in practice this was
35 # a lot easier to whip up for straight 'svn log' output. I'd have no
36 # objection to it being rewritten to take XML input.
43 my_getopt
= getopt
.gnu_getopt
44 except AttributeError:
45 my_getopt
= getopt
.getopt
46 from urllib
import quote
as url_encode
48 # Pretend we have true booleans on older python versions
55 # Warnings and errors start with these strings. They are typically
56 # followed by a colon and a space, as in "%s: " ==> "WARNING: ".
# Label that starts a non-fatal diagnostic message.
warning_prefix = 'WARNING'
# Label that starts a fatal diagnostic message.
error_prefix = 'ERROR'
60 def complain(msg
, fatal
=False):
61 """Print MSG as a warning, or if FATAL is true, print it as an error
66 sys
.stderr
.write(prefix
+ msg
+ '\n')
def html_spam_guard(addr, entities_only=False):
    """Return a spam-protected version of email ADDR that renders the
    same in HTML as the original address.

    Every character of ADDR is replaced by its decimal numeric character
    reference.  By default each reference is additionally wrapped in its
    own <span> element; if ENTITIES_ONLY is true, the <span> wrappers are
    left out and only the bare references are emitted.

    An empty ADDR yields an empty string."""
    # Pick the per-character template once instead of defining two
    # near-identical nested helper functions.
    if entities_only:
        template = "&#%d;"
    else:
        template = "<span>&#%d;</span>"
    return "".join([template % ord(ch) for ch in addr])
def escape_html(str):
    """Return an HTML-escaped version of STR.

    '&', '<' and '>' are replaced by the corresponding named entities."""
    # '&' has to be handled first; otherwise the ampersands introduced
    # by the '<' and '>' replacements would themselves be escaped.
    return str.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')


# Matches an email address enclosed in *escaped* angle brackets, i.e. the
# form "<user@host>" takes after escape_html() has been applied.  The
# address itself may not contain '&', so a match cannot run across
# another entity.
_spam_guard_in_html_block_re = re.compile(r'&lt;([^&]*@[^&]*)&gt;')


def _spam_guard_in_html_block_func(m):
    """Return the replacement for match M: the same escaped brackets
    around the address after it has been through html_spam_guard()."""
    return "&lt;%s&gt;" % html_spam_guard(m.group(1))


def spam_guard_in_html_block(str):
    """Take a block of HTML data, and run html_spam_guard() on parts of it.

    STR is expected to be HTML-escaped already; every bracketed email
    address found in it is obfuscated, the rest is returned unchanged."""
    return _spam_guard_in_html_block_re.sub(_spam_guard_in_html_block_func,
                                            str)
def html_header(title, page_heading=None):
    """Return the opening part of an XHTML page as a string.

    TITLE goes into the <title> element and PAGE_HEADING into the
    top-level <h1>; when PAGE_HEADING is not given, TITLE is used for
    both.  TITLE and PAGE_HEADING parameters are expected to already be
    HTML-escaped if needed."""
    if not page_heading:
        page_heading = title
    return ''.join([
        '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"\n',
        ' "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\n',
        '<html><head>\n',
        '<meta http-equiv="Content-Type"',
        ' content="text/html; charset=UTF-8" />\n',
        '<title>%s</title>\n' % title,
        '</head>\n\n',
        # The CSS property for the foreground colour is 'color';
        # 'text-color' does not exist and is ignored by browsers.
        '<body style="color: black; background-color: white">\n\n',
        '<h1 style="text-align: center">%s</h1>\n\n' % page_heading,
    ])


def html_footer():
    """Return the markup that closes a page opened with html_header()."""
    return '\n</body>\n</html>\n'
121 # Map contributor names to contributor instances, so that there
122 # exists exactly one instance associated with a given name.
123 # Fold names with email addresses. That is, if we see someone
124 # listed first with just an email address, but later with a real
125 # name and that same email address together, we create only one
126 # instance, and store it under both the email and the real name.
127 all_contributors
= { }
129 # See __hash__() for why this is necessary.
132 def __init__(self
, username
, real_name
, email
):
133 """Instantiate a contributor. Don't use this to generate a
134 Contributor for an external caller, though, use .get() instead."""
135 self
.real_name
= real_name
136 self
.username
= username
138 self
.is_committer
= False # Assume not until hear otherwise.
139 self
.is_full_committer
= False # Assume not until hear otherwise.
140 # Map verbs (e.g., "Patch", "Suggested", "Review") to lists of
141 # LogMessage objects. For example, the log messages stored under
142 # "Patch" represent all the revisions for which this contributor
143 # contributed a patch.
144 self
.activities
= { }
146 self
.unique_hash_value
= Contributor
.hash_value
147 Contributor
.hash_value
+= 1
149 def add_activity(self
, field_name
, log
):
150 """Record that this contributor was active in FIELD_NAME in LOG."""
151 logs
= self
.activities
.get(field_name
)
154 self
.activities
[field_name
] = logs
158 def get(username
, real_name
, email
):
159 """If this contributor is already registered, just return it;
160 otherwise, register it then return it. Hint: use parse() to
161 generate the arguments."""
163 for key
in username
, real_name
, email
:
164 if key
and Contributor
.all_contributors
.has_key(key
):
165 c
= Contributor
.all_contributors
[key
]
167 # If we didn't get a Contributor, create one now.
169 c
= Contributor(username
, real_name
, email
)
170 # If we know identifying information that the Contributor lacks,
171 # then give it to the Contributor now.
174 c
.username
= username
175 Contributor
.all_contributors
[username
] = c
178 c
.real_name
= real_name
179 Contributor
.all_contributors
[real_name
] = c
183 Contributor
.all_contributors
[email
] = c
184 # This Contributor has never been in better shape; return it.
186 get
= staticmethod(get
)
189 """Return a contribution score for this contributor."""
190 # Right now we count a patch as 2, anything else as 1.
192 for activity
in self
.activities
.keys():
193 if activity
== 'Patch':
194 score
+= len(self
.activities
[activity
]) * 2
196 score
+= len(self
.activities
[activity
])
200 """Return a contribution score HTML string for this contributor."""
203 for activity
in self
.activities
.keys():
204 if activity
== 'Patch':
205 patch_score
+= len(self
.activities
[activity
])
207 other_score
+= len(self
.activities
[activity
])
210 elif patch_score
== 1:
211 patch_str
= "1 patch"
213 patch_str
= "%d patches" % patch_score
216 elif other_score
== 1:
217 other_str
= "1 non-patch"
219 other_str
= "%d non-patches" % other_score
222 return ", ".join((patch_str
, other_str
))
228 def __cmp__(self
, other
):
229 if self
.is_full_committer
and not other
.is_full_committer
:
231 if other
.is_full_committer
and not self
.is_full_committer
:
233 result
= cmp(self
.score(), other
.score())
235 return cmp(self
.big_name(), other
.big_name())
240 """See LogMessage.__hash__() for why this exists."""
241 return self
.hash_value
244 """Parse NAME, which can be
246 - A committer username, or
247 - A space-separated real name, or
248 - A space-separated real name followed by an email address in
250 - Just an email address in angle brackets.
252 (The email address may have '@' disguised as '{_AT_}'.)
254 Return a tuple of (committer_username, real_name, email_address)
255 any of which can be None if not available in NAME."""
259 name_components
= name
.split()
260 if len(name_components
) == 1:
261 name
= name_components
[0] # Effectively, name = name.strip()
262 if name
[0] == '<' and name
[-1] == '>':
264 elif name
.find('@') != -1 or name
.find('{_AT_}') != -1:
268 elif name_components
[-1][0] == '<' and name_components
[-1][-1] == '>':
269 real_name
= ' '.join(name_components
[0:-1])
270 email
= name_components
[-1][1:-1]
272 real_name
= ' '.join(name_components
)
274 if email
is not None:
275 # We unobfuscate here and work with the '@' internally, since
276 # we'll obfuscate it again (differently) before writing it out.
277 email
= email
.replace('{_AT_}', '@')
279 return username
, real_name
, email
280 parse
= staticmethod(parse
)
282 def canonical_name(self
):
283 """Return a canonical name for this contributor. The canonical
284 name may or may not be based on the contributor's actual email
287 The canonical name will not contain filename-unsafe characters.
289 This method is guaranteed to return the same canonical name every
290 time only if no further contributions are recorded from this
291 contributor after the first call. This is because a contribution
292 may bring a new form of the contributor's name, one which affects
293 the algorithm used to construct canonical names."""
296 retval
= self
.username
298 # Take some rudimentary steps to shorten the email address, to
299 # make it more manageable. If this is ever discovered to result
300 # in collisions, we can always just use the full address.
302 at_posn
= self
.email
.index('@')
303 first_dot_after_at
= self
.email
.index('.', at_posn
)
304 retval
= self
.email
[0:first_dot_after_at
]
308 # Last resort: construct canonical name based on real name.
309 retval
= ''.join(self
.real_name
.lower().split(' '))
311 complain('Unable to construct a canonical name for Contributor.', True)
312 return url_encode(retval
, safe
="!#$&'()+,;<=>@[]^`{}~")
314 def big_name(self
, html
=False, html_eo
=False):
315 """Return as complete a name as possible for this contributor.
316 If HTML, then call html_spam_guard() on email addresses.
317 If HTML_EO, then do the same, but specifying entities_only mode."""
318 html
= html
or html_eo
322 name_bits
.append(escape_html(self
.real_name
))
324 name_bits
.append(self
.real_name
)
326 if not self
.real_name
and not self
.username
:
327 name_bits
.append(self
.email
)
329 name_bits
.append("<%s>" % html_spam_guard(self
.email
, html_eo
))
331 name_bits
.append("<%s>" % self
.email
)
333 if not self
.real_name
and not self
.email
:
334 name_bits
.append(self
.username
)
336 name_bits
.append("(%s)" % self
.username
)
337 return " ".join(name_bits
)
342 s
+= "\ncanonical name: '%s'" % self
.canonical_name()
343 if len(self
.activities
) > 0:
345 for activity
in self
.activities
.keys():
346 val
= self
.activities
[activity
]
347 s
+= '[%s:' % activity
349 s
+= ' %s' % log
.revision
353 def html_out(self
, revision_url_pattern
, filename
):
354 """Create an HTML file named FILENAME, showing all the revisions in which
355 this contributor was active."""
356 out
= open(filename
, 'w')
357 out
.write(html_header(self
.big_name(html_eo
=True),
358 self
.big_name(html
=True)))
361 sorted_activities
= self
.activities
.keys()
362 sorted_activities
.sort()
364 out
.write('<div class="h2" id="activities" title="activities">\n\n')
365 out
.write('<table border="1">\n')
367 for activity
in sorted_activities
:
368 out
.write('<td>%s</td>\n\n' % activity
)
371 for activity
in sorted_activities
:
373 first_activity
= True
374 for log
in self
.activities
[activity
]:
378 first_activity
= False
379 out
.write('%s<a href="#%s">%s</a>' % (s
, log
.revision
, log
.revision
))
380 unique_logs
[log
] = True
383 out
.write('</table>\n\n')
384 out
.write('</div>\n\n')
386 sorted_logs
= unique_logs
.keys()
388 for log
in sorted_logs
:
389 out
.write('<hr />\n')
390 out
.write('<div class="h3" id="%s" title="%s">\n' % (log
.revision
,
393 if revision_url_pattern
:
394 revision_url
= revision_url_pattern
% log
.revision
[1:]
395 revision
= '<a href="%s">%s</a>' \
396 % (escape_html(revision_url
), log
.revision
)
398 revision
= log
.revision
399 out
.write('<b>%s | %s | %s</b>\n\n' % (revision
,
400 escape_html(log
.committer
),
401 escape_html(log
.date
)))
402 out
.write(spam_guard_in_html_block(escape_html(log
.message
)))
403 out
.write('</pre>\n')
404 out
.write('</div>\n\n')
405 out
.write('<hr />\n')
407 out
.write(html_footer())
412 """One field in one log message."""
413 def __init__(self
, name
, alias
= None):
414 # The name of this field (e.g., "Patch", "Review", etc).
416 # An alias for the name of this field (e.g., "Reviewed").
418 # A list of contributor objects, in the order in which they were
419 # encountered in the field.
420 self
.contributors
= [ ]
421 # Any parenthesized asides immediately following the field. The
422 # parentheses and trailing newline are left on. In theory, this
423 # supports concatenation of consecutive asides. In practice, the
424 # parser only detects the first one anyway, because additional
425 # ones are very uncommon and furthermore by that point one should
426 # probably be looking at the full log message.
428 def add_contributor(self
, contributor
):
429 self
.contributors
.append(contributor
)
430 def add_endum(self
, addendum
):
431 self
.addendum
+= addendum
433 s
= 'FIELD: %s (%d contributors)\n' % (self
.name
, len(self
.contributors
))
434 for contributor
in self
.contributors
:
435 s
+= str(contributor
) + '\n'
441 # Maps revision strings (e.g., "r12345") onto LogMessage instances,
442 # holding all the LogMessage instances ever created.
444 # Keep track of youngest rev.
446 def __init__(self
, revision
, committer
, date
):
447 """Instantiate a log message. All arguments are strings,
448 including REVISION, which should retain its leading 'r'."""
449 self
.revision
= revision
450 self
.committer
= committer
453 # Map field names (e.g., "Patch", "Review", "Suggested") onto
456 if LogMessage
.all_logs
.has_key(revision
):
457 complain("Revision '%s' seen more than once" % revision
, True)
458 LogMessage
.all_logs
[revision
] = self
459 rev_as_number
= int(revision
[1:])
460 if rev_as_number
> LogMessage
.max_revnum
:
461 LogMessage
.max_revnum
= rev_as_number
462 def add_field(self
, field
):
463 self
.fields
[field
.name
] = field
464 def accum(self
, line
):
465 """Accumulate one more line of raw message."""
468 def __cmp__(self
, other
):
469 """Compare two log messages by revision number, for sort().
470 Return -1, 0 or 1 depending on whether a > b, a == b, or a < b.
471 Note that this is reversed from normal sorting behavior, but it's
472 what we want for reverse chronological ordering of revisions."""
473 a
= int(self
.revision
[1:])
474 b
= int(other
.revision
[1:])
480 """I don't really understand why defining __cmp__() but not
481 __hash__() renders an object type unfit to be a dictionary key,
482 especially in light of the recommendation that if a class defines
483 mutable objects and implements __cmp__() or __eq__(), then it
484 should not implement __hash__(). See these for details:
485 http://mail.python.org/pipermail/python-dev/2004-February/042580.html
486 http://mail.python.org/pipermail/python-bugs-list/2003-December/021314.html
488 In the meantime, I think it's safe to use the revision as a hash value."""
489 return int(self
.revision
[1:])
493 header
= ' LOG: %s | %s ' % (self
.revision
, self
.committer
)
497 for field_name
in self
.fields
.keys():
498 s
+= str(self
.fields
[field_name
]) + '\n'
500 s
+= '-' * len(header
)
507 ### Code to parse the logs. ##
# The separator line between log entries: 72 dashes and a newline.
log_separator = '-' * 72 + '\n'

# The header line of a log entry.  Groups: 1 = revision (with its
# leading 'r'), 2 = committer, 3 = date, 4 = number of message lines.
log_header_re = re.compile(
    r'^(r[0-9]+) \| ([^|]+) \| ([^|]+) \| ([0-9]+)[^0-9]')

# The first line of a "WORD by:" field.  Group 1 is the word as written
# (possibly an alias such as "Reviewed"), group 3 the rest of the line.
field_re = re.compile(r'^(Patch|Review(ed)?|Suggested|Found) by:\s*(.*)')

# Maps alternative spellings of a field word to the field's real name.
field_aliases = { 'Reviewed' : 'Review' }

# A line that consists solely of a parenthesized remark.
parenthetical_aside_re = re.compile(r'^\s*\(.*\)\s*$')
517 just_saw_separator
= False
520 line
= input.readline()
522 if line
== log_separator
:
523 if just_saw_separator
:
524 sys
.stderr
.write('Two separators in a row.\n')
527 just_saw_separator
= True
531 if just_saw_separator
:
532 m
= log_header_re
.match(line
)
534 sys
.stderr
.write('Could not match log message header.\n')
535 sys
.stderr
.write('Line was:\n')
536 sys
.stderr
.write("'%s'\n" % line
)
539 log
= LogMessage(m
.group(1), m
.group(2), m
.group(3))
540 num_lines
= int(m
.group(4))
541 just_saw_separator
= False
542 line
= input.readline()
543 # Handle 'svn log -v' by waiting for the blank line.
545 line
= input.readline()
546 # Parse the log message.
549 line
= input.readline()
551 m
= field_re
.match(line
)
553 # We're on the first line of a field. Parse the field.
557 if field_aliases
.has_key(ident
):
558 field
= Field(field_aliases
[ident
], ident
)
561 # Each line begins either with "WORD by:", or with whitespace.
562 in_field_re
= re
.compile('^('
563 + (field
.alias
or field
.name
)
564 + ' by:\s+|\s+)([^\s(].*)')
565 m
= in_field_re
.match(line
)
566 user
, real
, email
= Contributor
.parse(m
.group(2))
569 c
= Contributor
.get(user
, real
, email
)
570 c
.add_activity(field
.name
, log
)
571 field
.add_contributor(c
)
572 line
= input.readline()
575 m
= in_field_re
.match(line
)
577 m
= field_re
.match(line
)
579 aside_match
= parenthetical_aside_re
.match(line
)
581 field
.add_endum(line
)
587 index_introduction
= '''
588 <p>The following list of contributors and their contributions is meant
589 to help us keep track of whom to consider for commit access. The list
590 was generated from "svn log" output by <a
591 href="http://svn.collab.net/repos/svn/trunk/tools/dev/contribulyze.py"
592 >contribulyze.py</a>, which looks for log messages that use the <a
593 href="http://subversion.tigris.org/hacking.html#crediting">special
594 contribution format</a>.</p>
596 <p><i>Please do not use this list as a generic guide to who has
597 contributed what to Subversion!</i> It omits existing <a
598 href="http://svn.collab.net/repos/svn/trunk/COMMITTERS"
599 >full committers</a>, for example, because they are irrelevant to our
600 search for new committers. Also, it merely counts changes, it does
601 not evaluate them. To truly understand what someone has contributed,
602 you have to read their changes in detail. This page can only assist
603 human judgement, not substitute for it.</p>
607 def drop(revision_url_pattern
):
610 # The data structures are all linked up nicely to one another. You
611 # can get all the LogMessages, and each LogMessage contains all the
612 # Contributors involved with that commit; likewise, each Contributor
613 # points back to all the LogMessages it contributed to.
615 # However, the HTML output is pretty simple right now. It doesn't take
616 # full advantage of all that cross-linking. For each contributor, we
617 # just create a file listing all the revisions contributed to; and we
618 # build a master index of all contributors, each name being a link to
619 # that contributor's individual file. Much more is possible... but
620 # let's just get this up and running first.
622 for key
in LogMessage
.all_logs
.keys():
623 # You could print out all log messages this way, if you wanted to.
625 # print LogMessage.all_logs[key]
627 detail_subdir
= "detail"
628 if not os
.path
.exists(detail_subdir
):
629 os
.mkdir(detail_subdir
)
631 index
= open('index.html', 'w')
632 index
.write(html_header('Contributors as of r%d' % LogMessage
.max_revnum
))
633 index
.write(index_introduction
)
634 index
.write('<ol>\n')
635 # The same contributor appears under multiple keys, so uniquify.
636 seen_contributors
= { }
637 # Sorting alphabetically is acceptable, but even better would be to
638 # sort by number of contributions, so the most active people appear at
639 # the top -- that way we know whom to look at first for commit access
641 sorted_contributors
= Contributor
.all_contributors
.values()
642 sorted_contributors
.sort()
643 for c
in sorted_contributors
:
644 if not seen_contributors
.has_key(c
):
646 if c
.is_full_committer
:
647 # Don't even bother to print out full committers. They are
648 # a distraction from the purposes for which we're here.
653 committerness
= ' (partial committer)'
654 urlpath
= "%s/%s.html" % (detail_subdir
, c
.canonical_name())
655 fname
= os
.path
.join(detail_subdir
, "%s.html" % c
.canonical_name())
656 index
.write('<li><p><a href="%s">%s</a> [%s]%s</p></li>\n'
657 % (url_encode(urlpath
),
658 c
.big_name(html
=True),
659 c
.score_str(), committerness
))
660 c
.html_out(revision_url_pattern
, fname
)
661 seen_contributors
[c
] = True
662 index
.write('</ol>\n')
663 index
.write(html_footer())
def process_committers(committers):
    """Read from open file handle COMMITTERS, which should be in
    the same format as the Subversion 'COMMITTERS' file.  Create
    Contributor objects based on the contents.

    Everything up to and including the 'Blanket commit access:' line is
    skipped.  After that, each line containing an '@' is taken to name
    one committer, who is registered through Contributor.get() and
    flagged as a committer.  If the 'Blanket commit access:' line never
    appears, nothing is registered."""
    # Skip ahead to the first section header.  readline() returns an
    # empty string at end of file, so stop on that as well; otherwise
    # a file without the header would keep this loop spinning forever.
    line = committers.readline()
    while line and line != 'Blanket commit access:\n':
        line = committers.readline()
    in_full_committers = True
    # Groups: 1 = username, 2 = real name plus email address,
    # 3 = optional trailing parenthesized remark.
    matcher = re.compile(r'(\S+)\s+([^\(\)]+)\s+(\([^()]+\)){0,1}')
    line = committers.readline()
    while line:
        # Every @-sign we see after this point indicates a committer line...
        if line == 'Commit access for specific areas:\n':
            in_full_committers = False
        # ...except in the "dormant committers" area, which comes last anyway.
        if line == 'Committers who have asked to be listed as dormant:\n':
            in_full_committers = True
        elif line.find('@') >= 0:
            # match() anchors at the start of the string, so surrounding
            # whitespace has to go before the username can be matched.
            m = matcher.match(line.strip())
            user = m.group(1)
            real_and_email = m.group(2).strip()
            # The username is taken from the regex match above, so the
            # first element of parse()'s result is not needed.
            ignored, real, email = Contributor.parse(real_and_email)
            c = Contributor.get(user, real, email)
            c.is_committer = True
            c.is_full_committer = in_full_committers
        line = committers.readline()
697 print 'USAGE: %s [-C COMMITTERS_FILE] < SVN_LOG_OR_LOG-V_OUTPUT' \
698 % os
.path
.basename(sys
.argv
[0])
700 print 'Create HTML files in the current directory, rooted at index.html,'
701 print 'in which you can browse to see who contributed what.'
703 print 'The log input should use the contribution-tracking format defined'
704 print 'in http://subversion.tigris.org/hacking.html#crediting.'
708 print ' -h, -H, -?, --help Print this usage message and exit'
709 print ' -C FILE Use FILE as the COMMITTERS file'
710 print ' -U URL Use URL as a Python interpolation pattern to'
711 print ' generate URLs to link revisions to some kind'
712 print ' of web-based viewer (e.g. ViewCVS). The'
713 print ' interpolation pattern should contain exactly'
714 print ' one format specifier, \'%s\', which will be'
715 print ' replaced with the revision number.'
721 opts
, args
= my_getopt(sys
.argv
[1:], 'C:U:hH?', [ 'help' ])
722 except getopt
.GetoptError
, e
:
723 complain(str(e
) + '\n\n')
728 revision_url_pattern
= None
729 for opt
, value
in opts
:
730 if opt
in ('--help', '-h', '-H', '-?'):
734 process_committers(open(value
))
736 revision_url_pattern
= value
742 drop(revision_url_pattern
)
744 if __name__
== '__main__':