gn build: Merge r372706
[llvm-complete.git] / utils / Reviewing / find_interesting_reviews.py
blob7bfbec8cfde9e5459274fc25dc3140438ef921e5
1 #!/usr/bin/env python
3 from __future__ import print_function
5 import argparse
6 import email.mime.multipart
7 import email.mime.text
8 import logging
9 import os.path
10 import pickle
11 import re
12 import smtplib
13 import subprocess
14 import sys
15 from datetime import datetime, timedelta
16 from phabricator import Phabricator
18 # Setting up a virtualenv to run this script can be done by running the
19 # following commands:
20 # $ virtualenv venv
21 # $ . ./venv/bin/activate
22 # $ pip install Phabricator
# One (name, clone URL) pair per git repository that the script keeps a local
# checkout of; the name is used as the checkout's directory name.
GIT_REPO_METADATA = (
    ("llvm", "https://llvm.org/git/llvm.git"),
)
26 # The below PhabXXX classes represent objects as modelled by Phabricator.
27 # The classes can be serialized to disk, to try and make sure that we don't
28 # needlessly have to re-fetch lots of data from Phabricator, as that would
29 # make this script unusably slow.
class PhabObject:
    """Common base of the cached Phabricator objects.

    Subclasses override OBJECT_KIND with a short string naming what they
    model; every instance remembers the identifier it was created with.
    """

    # Overridden by each concrete subclass.
    OBJECT_KIND = None

    def __init__(self, id):
        """Remember `id` as this object's identifier."""
        self.id = id
class PhabObjectCache:
    """In-memory store of PhabObjects of one class, persisted with pickle.

    Besides the id -> object mapping, the cache keeps two timestamps,
    `most_recent_info` and `oldest_info`, which start out as None and are
    maintained by the code that fills the cache.
    """

    DEFAULT_DIRECTORY = "PhabObjectCache"

    # Everything unpickling a damaged or incompatible cache file is known to
    # raise (see the pickle module documentation), plus what updating
    # __dict__ raises when the file does not hold a mapping.
    _CORRUPT_CACHE_ERRORS = (EOFError, pickle.UnpicklingError, AttributeError,
                             ImportError, IndexError, ValueError, TypeError)

    def __init__(self, PhabObjectClass):
        """Create an empty cache holding instances of `PhabObjectClass`."""
        self.PhabObjectClass = PhabObjectClass
        self.most_recent_info = None
        self.oldest_info = None
        self.id2PhabObjects = {}

    def get_name(self):
        """Return a display name for the cache, e.g. "ReviewsCache"."""
        return self.PhabObjectClass.OBJECT_KIND + "sCache"

    def get(self, id):
        """Return the object cached under `id`, creating it when missing."""
        if id not in self.id2PhabObjects:
            self.id2PhabObjects[id] = self.PhabObjectClass(id)
        return self.id2PhabObjects[id]

    def get_ids_in_cache(self):
        """Return a list with the ids of all cached objects."""
        return list(self.id2PhabObjects.keys())

    def get_objects(self):
        """Return a list with all cached objects."""
        return list(self.id2PhabObjects.values())

    def _get_pickle_name(self, directory):
        """Return the path of this cache's pickle file inside `directory`."""
        file_name = "Phab" + self.PhabObjectClass.OBJECT_KIND + "s.pickle"
        return os.path.join(directory, file_name)

    def populate_cache_from_disk(self, directory=DEFAULT_DIRECTORY):
        """Replace this cache's state with the state stored in `directory`.

        A missing or unreadable cache file is reported on stdout and leaves
        the cache as it was; it is never an error.

        FIXME: consider if serializing to JSON would bring interoperability
        advantages over serializing to pickle.
        """
        try:
            f = open(self._get_pickle_name(directory), "rb")
        except IOError as err:
            print("Could not find cache. Error message: {0}. Continuing..."
                  .format(err))
            return
        with f:
            try:
                # NOTE: unpickling can execute arbitrary code. Only point
                # this at cache files written by this script.
                d = pickle.load(f)
                self.__dict__.update(d)
            except self._CORRUPT_CACHE_ERRORS as err:
                print("Cache seems to be corrupt. " +
                      "Not using cache. Error message: {0}".format(err))

    def write_cache_to_disk(self, directory=DEFAULT_DIRECTORY):
        """Pickle this cache's state into `directory`, creating it if needed."""
        if not os.path.exists(directory):
            os.makedirs(directory)
        with open(self._get_pickle_name(directory), "wb") as f:
            pickle.dump(self.__dict__, f)
        print("wrote cache to disk, most_recent_info= {0}".format(
            datetime.fromtimestamp(self.most_recent_info)
            if self.most_recent_info is not None else None))
class PhabReview(PhabObject):
    """A review, together with the diffs that were uploaded to it."""

    OBJECT_KIND = "Review"

    def __init__(self, id):
        """Create a review that only knows its id so far."""
        PhabObject.__init__(self, id)

    def update(self, title, dateCreated, dateModified, author):
        """Store the review's title, creation/modification dates and author."""
        self.title, self.author = title, author
        self.dateCreated, self.dateModified = dateCreated, dateModified

    def setPhabDiffs(self, phabDiffs):
        """Attach the diffs belonging to this review."""
        self.phabDiffs = phabDiffs
class PhabUser(PhabObject):
    """A user, identified by PHID and carrying a real name."""

    OBJECT_KIND = "User"

    def __init__(self, id):
        """Create a user that only knows its id so far."""
        PhabObject.__init__(self, id)

    def update(self, phid, realName):
        """Store the user's PHID and real name."""
        self.phid, self.realName = phid, realName
class PhabHunk:
    """One hunk of a diff, reduced to the old-file line ranges it touches.

    After construction, `actual_lines_changed_offset` holds a list of
    (start, end) line ranges in the old version of the file. Each range
    covers a run of added/removed lines plus up to three lines of context
    on either side; ranges that touch or overlap are merged.
    """

    def __init__(self, rest_api_hunk):
        """Parse `rest_api_hunk`: "oldOffset", "oldLength" and "corpus"."""
        self.oldOffset = int(rest_api_hunk["oldOffset"])
        self.oldLength = int(rest_api_hunk["oldLength"])

        context_lines = 3
        raw_ranges = []
        old_line = self.oldOffset  # current line number in the old file
        range_start = None         # None while outside a run of changes
        for text in rest_api_hunk["corpus"].split("\n"):
            added = text.startswith("+")
            removed = not added and text.startswith("-")
            if added or removed:
                if range_start is None:
                    range_start = max(self.oldOffset,
                                      old_line - context_lines)
                # Only removed lines exist in the old file; added lines do
                # not advance the old-file position.
                if removed:
                    old_line += 1
                continue
            # A context line ends the current run of changes, if any.
            if range_start is not None:
                raw_ranges.append((range_start, old_line + context_lines))
                range_start = None
            old_line += 1
        if range_start is not None:
            raw_ranges.append((range_start, old_line + context_lines))

        # Merge ranges that are adjacent to or overlap with their
        # predecessor.
        merged = []
        for start, end in raw_ranges:
            if merged and merged[-1][1] >= start:
                merged[-1] = (merged[-1][0], end)
            else:
                merged.append((start, end))
        self.actual_lines_changed_offset = merged
class PhabChange:
    """The modification of a single file within a diff."""

    def __init__(self, rest_api_change):
        """Read "oldPath" and the list of "hunks" from `rest_api_change`."""
        self.oldPath = rest_api_change["oldPath"]
        hunks = []
        for raw_hunk in rest_api_change["hunks"]:
            hunks.append(PhabHunk(raw_hunk))
        self.hunks = hunks
class PhabDiff(PhabObject):
    """One diff of a review, holding the per-file changes it makes."""

    OBJECT_KIND = "Diff"

    def __init__(self, id):
        """Create a diff that only knows its id so far."""
        PhabObject.__init__(self, id)

    def update(self, rest_api_results):
        """Fill in revision id, timestamps and changes from the API record."""
        self.revisionID = rest_api_results["revisionID"]
        self.dateModified = int(rest_api_results["dateModified"])
        self.dateCreated = int(rest_api_results["dateCreated"])
        changes = []
        for raw_change in rest_api_results["changes"]:
            changes.append(PhabChange(raw_change))
        self.changes = changes
class ReviewsCache(PhabObjectCache):
    """A PhabObjectCache whose objects are PhabReview instances."""

    def __init__(self):
        PhabObjectCache.__init__(self, PhabReview)
class UsersCache(PhabObjectCache):
    """A PhabObjectCache whose objects are PhabUser instances."""

    def __init__(self):
        PhabObjectCache.__init__(self, PhabUser)
# The module-wide cache instances, one per kind of cached object.
reviews_cache = ReviewsCache()
users_cache = UsersCache()
def init_phab_connection():
    """Create a Phabricator client, refresh its interfaces and return it."""
    connection = Phabricator()
    connection.update_interfaces()
    return connection
def update_cached_info(phab, cache, phab_query, order, record_results,
                       max_nr_entries_per_fetch, max_nr_days_to_cache):
    """Fetch records page by page and record them into `cache`.

    Arguments:
      phab: object on which the query is looked up.
      cache: receives the records; its `most_recent_info`/`oldest_info`
        attributes are updated and it is written to disk along the way.
      phab_query: attribute names that, followed from `phab`, lead to the
        callable performing the query.
      order: passed to the query as `order`.
      record_results: callable(cache, results, phab) storing one page of
        results and returning (most_recent_info, oldest_info) for that page,
        or (None, None) when the page contained nothing to record.
      max_nr_entries_per_fetch: passed to the query as `limit`.
      max_nr_days_to_cache: paging stops once a page reaches further back
        than this many days before the newest record of the first page.

    If the first page records nothing, the cache is left untouched.
    """
    query = phab
    for query_step in phab_query:
        query = getattr(query, query_step)
    limit = max_nr_entries_per_fetch

    results = query(order=order, limit=limit)
    most_recent_info, oldest_info = record_results(cache, results, phab)
    if most_recent_info is None:
        # Nothing was recorded, so there are no timestamps to reason about.
        print("No entries were returned by the query. " +
              "Leaving the cache unchanged.")
        return
    oldest_info_to_fetch = datetime.fromtimestamp(most_recent_info) - \
        timedelta(days=max_nr_days_to_cache)
    most_recent_info_overall = most_recent_info
    cache.write_cache_to_disk()
    after = results["cursor"]["after"]
    print("after: {0!r}".format(after))
    print("most_recent_info: {0}".format(
        datetime.fromtimestamp(most_recent_info)))
    while (after is not None
           and datetime.fromtimestamp(oldest_info) > oldest_info_to_fetch):
        need_more_older_data = \
            (cache.oldest_info is None or
             datetime.fromtimestamp(cache.oldest_info) > oldest_info_to_fetch)
        print(("need_more_older_data={0} cache.oldest_info={1} " +
               "oldest_info_to_fetch={2}").format(
                   need_more_older_data,
                   datetime.fromtimestamp(cache.oldest_info)
                   if cache.oldest_info is not None else None,
                   oldest_info_to_fetch))
        need_more_newer_data = \
            (cache.most_recent_info is None or
             cache.most_recent_info < most_recent_info)
        print(("need_more_newer_data={0} cache.most_recent_info={1} " +
               "most_recent_info={2}")
              .format(need_more_newer_data, cache.most_recent_info,
                      most_recent_info))
        if not need_more_older_data and not need_more_newer_data:
            break
        results = query(order=order, after=after, limit=limit)
        page_most_recent, page_oldest = record_results(cache, results, phab)
        after = results["cursor"]["after"]
        print("after: {0!r}".format(after))
        if page_most_recent is not None:
            # A page without recordable entries keeps the previous
            # timestamps instead of replacing them with None.
            most_recent_info, oldest_info = page_most_recent, page_oldest
            print("most_recent_info: {0}".format(
                datetime.fromtimestamp(most_recent_info)))
        cache.write_cache_to_disk()
    cache.most_recent_info = most_recent_info_overall
    if after is None:
        # We did fetch all records. Mark the cache to contain all info since
        # the start of time.
        oldest_info = 0
    cache.oldest_info = oldest_info
    cache.write_cache_to_disk()
def record_reviews(cache, reviews, phab):
    """Store the "DREV" entries of `reviews["data"]` into `cache`.

    A review that is new to the cache, or whose dateModified is newer than
    the cached one, gets its diffs re-queried through
    `phab.differential.querydiffs` and is then updated.

    Returns (newest, oldest) dateModified over the recorded entries, or
    (None, None) when there were none.
    """
    newest = None
    oldest = None
    for entry in reviews["data"]:
        if entry["type"] != "DREV":
            continue
        review_id = entry["id"]
        fields = entry["fields"]
        modified = int(fields["dateModified"])
        created = int(fields["dateCreated"])
        title = fields["title"]
        author = fields["authorPHID"]
        review = cache.get(review_id)
        is_stale = ("dateModified" not in review.__dict__
                    or modified > review.dateModified)
        if is_stale:
            diff_results = phab.differential.querydiffs(
                revisionIDs=[review_id])
            diffs = []
            for diff_id in sorted(diff_results.keys()):
                diff = PhabDiff(diff_id)
                diff.update(diff_results[diff_id])
                diffs.append(diff)
            review.update(title, created, modified, author)
            review.setPhabDiffs(diffs)
            print("Updated D{0} modified on {1} ({2} diffs)".format(
                review_id, datetime.fromtimestamp(modified), len(diffs)))

        newest = modified if newest is None else max(newest, modified)
        oldest = modified if oldest is None else min(oldest, modified)
    return newest, oldest
def record_users(cache, users, phab):
    """Store the "USER" entries of `users["data"]` into `cache`.

    Returns (newest, oldest) dateModified over the recorded entries, or
    (None, None) when there were none. `phab` is not used.
    """
    newest = None
    oldest = None
    for entry in users["data"]:
        if entry["type"] != "USER":
            continue
        user_id = entry["id"]
        phid = entry["phid"]
        modified = int(entry["fields"]["dateModified"])
        real_name = entry["fields"]["realName"]
        cache.get(user_id).update(phid, real_name)
        newest = modified if newest is None else max(newest, modified)
        oldest = modified if oldest is None else min(oldest, modified)
    return newest, oldest
# One entry per cache, laid out as:
#   (cache, phab_query, order, record_results,
#    max_nr_entries_per_fetch, max_nr_days_to_cache)
PHABCACHESINFO = (
    (reviews_cache, ("differential", "revision", "search"), "updated",
     record_reviews, 5, 7),
    (users_cache, ("user", "search"), "newest",
     record_users, 100, 1000),
)
def load_cache():
    """Load every cache in PHABCACHESINFO from disk and report on each."""
    for cache, _, _, _, _, _ in PHABCACHESINFO:
        cache.populate_cache_from_disk()
        nr_entries = len(cache.get_ids_in_cache())
        print("Loaded {0} nr entries: {1}".format(
            cache.get_name(), nr_entries))
        if cache.most_recent_info is None:
            most_recent = None
        else:
            most_recent = datetime.fromtimestamp(cache.most_recent_info)
        print("Loaded {0} has most recent info: {1}".format(
            cache.get_name(), most_recent))
def update_cache(phab):
    """Load the caches from disk, refresh each through `phab`, save them."""
    load_cache()
    for cache_info in PHABCACHESINFO:
        (cache, phab_query, order, record_results,
         max_nr_entries_per_fetch, max_nr_days_to_cache) = cache_info
        update_cached_info(phab, cache, phab_query, order, record_results,
                           max_nr_entries_per_fetch, max_nr_days_to_cache)
        nr_objects = len(cache.get_ids_in_cache())
        print("{0} objects in {1}".format(nr_objects, cache.get_name()))
        cache.write_cache_to_disk()
def get_most_recent_reviews(days):
    """Return the cached reviews modified within `days` of the newest one.

    The result is ordered from most to least recently modified. The window
    is measured from the most recently modified review in the cache, not
    from the current time.
    """
    by_recency = sorted(
        reviews_cache.get_objects(), key=lambda r: -r.dateModified)
    if not by_recency:
        return by_recency
    newest_time = datetime.fromtimestamp(by_recency[0].dateModified)
    cut_off_date = newest_time - timedelta(days=days)
    recent = []
    for review in by_recency:
        if datetime.fromtimestamp(review.dateModified) < cut_off_date:
            # Sorted input: everything from here on is older still.
            break
        recent.append(review)
    return recent
376 # All of the above code is about fetching data from Phabricator and caching it
377 # on local disk. The below code contains the actual "business logic" for this
378 # script.
# Maps user PHID -> real name; built from users_cache on first lookup.
_userphid2realname = None


def get_real_name_from_author(user_phid):
    """Return the real name cached for `user_phid`, or "unknown"."""
    global _userphid2realname
    if _userphid2realname is None:
        phid2name = _userphid2realname = {}
        for user in users_cache.get_objects():
            phid2name[user.phid] = user.realName
    return _userphid2realname.get(user_phid, "unknown")
def print_most_recent_reviews(phab, days, filter_reviewers):
    """Print and return a report matching recent reviews to reviewers.

    Arguments:
      phab: not used by this function.
      days: passed to get_most_recent_reviews() to select the reviews.
      filter_reviewers: callable taking a list of (reviewer, scores) pairs
        and returning the pairs that should be reported.

    The report first lists every review that has at least one reviewer left
    after filtering, then a summary per reviewer with that reviewer's
    reviews ordered by descending scores. Each line is printed as it is
    produced; the whole report is returned as one newline-joined string.
    """
    msgs = []

    def add_msg(msg):
        msgs.append(msg)
        print(msg)

    def format_scores(scores):
        return "/".join(["{0:.1f}%".format(s) for s in scores])

    newest_reviews = get_most_recent_reviews(days)
    add_msg(u"These are the reviews that look interesting to be reviewed. " +
            u"The report below has 2 sections. The first " +
            u"section is organized per review; the second section is organized "
            + u"per potential reviewer.\n")
    oldest_datetime = None
    if newest_reviews:
        oldest_datetime = datetime.fromtimestamp(
            newest_reviews[-1].dateModified)
    add_msg((u"The report below is based on analyzing the reviews that got " +
             u"touched in the past {0} days (since {1}). " +
             u"The script found {2} such reviews.\n").format(
                 days, oldest_datetime, len(newest_reviews)))
    reviewer2reviews_and_scores = {}
    for i, review in enumerate(newest_reviews):
        # A review without any diff may yield None rather than a list of
        # candidates; treat that as "nobody matched".
        matched_reviewers = find_reviewers_for_review(review) or []
        matched_reviewers = filter_reviewers(matched_reviewers)
        if not matched_reviewers:
            continue
        add_msg((u"{0:>3}. https://reviews.llvm.org/D{1} by {2}\n {3}\n" +
                 u" Last updated on {4}").format(
                     i, review.id,
                     get_real_name_from_author(review.author), review.title,
                     datetime.fromtimestamp(review.dateModified)))
        for reviewer, scores in matched_reviewers:
            add_msg(u" potential reviewer {0}, score {1}".format(
                reviewer, "(" + format_scores(scores) + ")"))
            reviewer2reviews_and_scores.setdefault(reviewer, []).append(
                (review, scores))

    # Print out a summary per reviewer.
    for reviewer in sorted(reviewer2reviews_and_scores.keys()):
        reviews_and_scores = reviewer2reviews_and_scores[reviewer]
        reviews_and_scores.sort(key=lambda rs: rs[1], reverse=True)
        add_msg(u"\n\nSUMMARY FOR {0} (found {1} reviews):".format(
            reviewer, len(reviews_and_scores)))
        for review, scores in reviews_and_scores:
            add_msg(u"[{0}] https://reviews.llvm.org/D{1} '{2}' by {3}".format(
                format_scores(scores), review.id,
                review.title, get_real_name_from_author(review.author)))
    return "\n".join(msgs)
def get_git_cmd_output(cmd):
    """Run `cmd` and return its combined stdout/stderr as text.

    `cmd` is either a command line string, which is run through the shell,
    or a list/tuple of arguments, which is executed directly without a
    shell so that no quoting is needed for arguments containing spaces or
    shell metacharacters.

    Returns None when the command exits with a non-zero status or cannot be
    started; the failure is logged at debug level. Bytes in the output that
    are not valid UTF-8 are dropped.
    """
    logging.debug(cmd)
    run_through_shell = not isinstance(cmd, (list, tuple))
    try:
        output = subprocess.check_output(
            cmd, shell=run_through_shell, stderr=subprocess.STDOUT)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers an executable that is missing when no shell is
        # involved.
        logging.debug(str(e))
        return None
    return output.decode("utf-8", errors='ignore')
# Matches the "author-mail <address>" lines of `git blame --line-porcelain`.
reAuthorMail = re.compile("^author-mail <([^>]*)>.*$")


def parse_blame_output_line_porcelain(blame_output):
    """Count how often each author e-mail address occurs in `blame_output`.

    Returns a dict mapping e-mail address to its number of "author-mail"
    lines; the dict is empty when `blame_output` is None.
    """
    counts = {}
    if blame_output is None:
        return counts
    for blame_line in blame_output.split('\n'):
        found = reAuthorMail.match(blame_line)
        if not found:
            continue
        address = found.group(1)
        counts[address] = counts.get(address, 0) + 1
    return counts
def find_reviewers_for_diff_heuristic(diff):
    """Score potential reviewers for `diff` using `git blame`.

    Heuristic 1: assume good reviewers are the ones that touched the same
    lines before as this patch is touching.
    Heuristic 2: assume good reviewers are the ones that touched the same
    files before as this patch is touching.

    Returns a list of (author e-mail, (lines score, files score)) tuples,
    both scores being percentages, sorted by descending scores. The list is
    empty when the base revision cannot be determined.
    """
    # shlex.quote does not exist on Python 2; pipes.quote is its equivalent.
    try:
        from shlex import quote as shell_quote
    except ImportError:
        from pipes import quote as shell_quote

    reviewers2nr_lines_touched = {}
    reviewers2nr_files_touched = {}
    # Assume last revision before diff was modified is the revision the diff
    # applies to.
    git_repo = "git_repos/llvm"
    cmd = 'git -C {0} rev-list -n 1 --before="{1}" master'.format(
        git_repo,
        datetime.fromtimestamp(
            diff.dateModified).strftime("%Y-%m-%d %H:%M:%S"))
    rev_list_output = get_git_cmd_output(cmd)
    if rev_list_output is None:
        # git failed (the failure was logged by get_git_cmd_output); without
        # a base revision there is nothing to blame against.
        logging.debug("Could not determine the base revision")
        return []
    base_revision = rev_list_output.strip()
    logging.debug("Base revision={0}".format(base_revision))
    for change in diff.changes:
        path = change.oldPath
        if not path:
            # No old version of the file to blame.
            continue
        # The path comes from the review data and ends up on a shell command
        # line, so it must be quoted.
        quoted_path = shell_quote(path)
        # Compute heuristic 1: look at context of patch lines.
        for hunk in change.hunks:
            for start_line, end_line in hunk.actual_lines_changed_offset:
                # Collect git blame results for authors in those ranges.
                cmd = ("git -C {0} blame --encoding=utf-8 --date iso -f -e " +
                       "-w --line-porcelain -L {1},{2} {3} -- {4}").format(
                           git_repo, start_line, end_line, base_revision,
                           quoted_path)
                blame_output = get_git_cmd_output(cmd)
                for reviewer, nr_occurences in \
                        parse_blame_output_line_porcelain(
                            blame_output).items():
                    reviewers2nr_lines_touched[reviewer] = \
                        reviewers2nr_lines_touched.get(reviewer, 0) + \
                        nr_occurences
        # Compute heuristic 2: don't look at context, just at files touched.
        # Collect git blame results for authors of the whole file.
        cmd = ("git -C {0} blame --encoding=utf-8 --date iso -f -e -w " +
               "--line-porcelain {1} -- {2}").format(git_repo, base_revision,
                                                     quoted_path)
        blame_output = get_git_cmd_output(cmd)
        for reviewer in parse_blame_output_line_porcelain(blame_output):
            reviewers2nr_files_touched[reviewer] = \
                reviewers2nr_files_touched.get(reviewer, 0) + 1

    # Compute "match scores"
    total_nr_lines = sum(reviewers2nr_lines_touched.values())
    total_nr_files = len(diff.changes)
    reviewers_matchscores = []
    for reviewer, nr_files in reviewers2nr_files_touched.items():
        lines_score = (
            reviewers2nr_lines_touched.get(reviewer, 0) * 100.0 /
            total_nr_lines
            if total_nr_lines != 0 else 0)
        files_score = (
            nr_files * 100.0 / total_nr_files
            if total_nr_files != 0 else 0)
        reviewers_matchscores.append((reviewer, (lines_score, files_score)))
    reviewers_matchscores.sort(key=lambda i: i[1], reverse=True)
    return reviewers_matchscores
def find_reviewers_for_review(review):
    """Return the scored potential reviewers for the newest diff of `review`.

    The result is what find_reviewers_for_diff_heuristic() returns for the
    most recently modified diff in `review.phabDiffs`, or an empty list when
    the review has no diffs.
    """
    if not review.phabDiffs:
        return []
    # Only the newest diff is considered; on equal dates the first one wins.
    diff = max(review.phabDiffs, key=lambda d: d.dateModified)
    matched_reviewers = find_reviewers_for_diff_heuristic(diff)
    # Show progress, as this is a slow operation:
    sys.stdout.write('.')
    sys.stdout.flush()
    logging.debug(u"matched_reviewers: {0}".format(matched_reviewers))
    return matched_reviewers
def update_git_repos():
    """Clone any missing repository of GIT_REPO_METADATA, then pull each."""
    repos_root = "git_repos"
    for repo_name, repo_url in GIT_REPO_METADATA:
        checkout = os.path.join(repos_root, repo_name)
        if not os.path.exists(checkout):
            get_git_cmd_output(
                "git clone {0} {1}".format(repo_url, checkout))
        get_git_cmd_output("git -C {0} pull --rebase".format(checkout))
def send_emails(email_addresses, sender, msg):
    """Send `msg` as a plain-text UTF-8 e-mail to each address.

    Arguments:
      email_addresses: iterable of recipient addresses; when it is empty,
        nothing is sent and no SMTP connection is made.
      sender: address used in the 'From' header.
      msg: the body of the e-mail.

    Every recipient gets a separate message. The SMTP session is closed even
    when sending fails.
    """
    if not email_addresses:
        return
    body = msg
    if not isinstance(body, str):
        # Python 2 unicode is encoded to UTF-8 bytes, Python 3 bytes are
        # decoded to text, so that MIMEText always receives a native str.
        if hasattr(body, 'encode'):
            body = body.encode('utf-8')
        else:
            body = body.decode('utf-8')
    s = smtplib.SMTP()
    s.connect()
    try:
        for email_address in email_addresses:
            email_msg = email.mime.multipart.MIMEMultipart()
            email_msg['From'] = sender
            email_msg['To'] = email_address
            email_msg['Subject'] = 'LLVM patches you may be able to review.'
            email_msg.attach(
                email.mime.text.MIMEText(body, 'plain', 'utf-8'))
            # python 3.x: s.send_message(email_msg)
            s.sendmail(email_msg['From'], email_msg['To'],
                       email_msg.as_string())
    finally:
        s.quit()
def filter_reviewers_to_report_for(people_to_look_for):
    """Return a filter keeping only reviewers in `people_to_look_for`.

    The returned callable takes a list of entries whose first element
    identifies the reviewer and returns the entries, in their original
    order, whose reviewer is in `people_to_look_for`.
    """
    # The below is just an example filter, to only report potential reviews
    # to do for the people that will receive the report email.
    def keep_wanted_reviewers(potential_reviewers):
        wanted = []
        for candidate in potential_reviewers:
            if candidate[0] in people_to_look_for:
                wanted.append(candidate)
        return wanted

    return keep_wanted_reviewers
def main():
    """Parse the command line, refresh the data and produce the report.

    Unless --no-update-cache is given, the cached Phabricator objects are
    refreshed first. The git checkouts are then updated, the report for the
    last day is printed, and it is e-mailed when --email-report lists at
    least one address.
    """
    parser = argparse.ArgumentParser(
        description='Match open reviews to potential reviewers.')
    parser.add_argument(
        '--no-update-cache',
        dest='update_cache',
        action='store_false',
        default=True,
        help='Do not update cached Phabricator objects')
    parser.add_argument(
        '--email-report',
        dest='email_report',
        nargs='*',
        default=[],
        help="A email addresses to send the report to.")
    parser.add_argument(
        '--sender',
        dest='sender',
        default="",
        help="The email address to use in 'From' on messages emailed out.")
    parser.add_argument(
        '--email-addresses',
        dest='email_addresses',
        nargs='*',
        default=[],
        help="The email addresses (as known by LLVM git) of " +
        "the people to look for reviews for.")
    # Without a default the count is None when -v is absent, which cannot be
    # compared with an integer on Python 3.
    parser.add_argument('--verbose', '-v', action='count', default=0)

    args = parser.parse_args()

    if args.verbose >= 1:
        logging.basicConfig(level=logging.DEBUG)

    # Command line arguments are bytes on Python 2 and text on Python 3;
    # only the former need decoding.
    people_to_look_for = [
        e.decode('utf-8') if isinstance(e, bytes) else e
        for e in args.email_addresses]
    logging.debug("Will look for reviews that following contributors could " +
                  "review: {}".format(people_to_look_for))
    logging.debug("Will email a report to: {}".format(args.email_report))

    phab = init_phab_connection()

    if args.update_cache:
        update_cache(phab)

    load_cache()
    update_git_repos()
    msg = print_most_recent_reviews(
        phab,
        days=1,
        filter_reviewers=filter_reviewers_to_report_for(people_to_look_for))

    # Only talk to the mail server when there is somebody to mail to.
    if args.email_report:
        send_emails(args.email_report, args.sender, msg)


if __name__ == "__main__":
    main()