[llvm-complete.git] utils/Reviewing/find_interesting_reviews.py
#!/usr/bin/env python

import argparse
import email.mime.multipart
import email.mime.text
import logging
import os.path
import pickle
import re
import smtplib
import subprocess
import sys
from datetime import datetime, timedelta
from phabricator import Phabricator

# Setting up a virtualenv to run this script can be done by running the
# following commands:
# $ virtualenv venv
# $ . ./venv/bin/activate
# $ pip install Phabricator
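# Note: the phabricator package typically reads the Phabricator host and API
# token from ~/.arcrc, so make sure those credentials are configured (for
# example with `arc install-certificate`) before running this script.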

GIT_REPO_METADATA = (("llvm", "https://llvm.org/git/llvm.git"), )

# The below PhabXXX classes represent objects as modelled by Phabricator.
# The classes can be serialized to disk, to try and make sure that we don't
# needlessly have to re-fetch lots of data from Phabricator, as that would
# make this script unusably slow.
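# By default the serialized caches end up in ./PhabObjectCache/, e.g. in
# PhabObjectCache/PhabReviews.pickle and PhabObjectCache/PhabUsers.pickle
# (see PhabObjectCache._get_pickle_name below).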


class PhabObject:
    OBJECT_KIND = None

    def __init__(self, id):
        self.id = id


class PhabObjectCache:
    def __init__(self, PhabObjectClass):
        self.PhabObjectClass = PhabObjectClass
        self.most_recent_info = None
        self.oldest_info = None
        self.id2PhabObjects = {}

    def get_name(self):
        return self.PhabObjectClass.OBJECT_KIND + "sCache"

    def get(self, id):
        if id not in self.id2PhabObjects:
            self.id2PhabObjects[id] = self.PhabObjectClass(id)
        return self.id2PhabObjects[id]

    def get_ids_in_cache(self):
        return self.id2PhabObjects.keys()

    def get_objects(self):
        return self.id2PhabObjects.values()

    DEFAULT_DIRECTORY = "PhabObjectCache"

    def _get_pickle_name(self, directory):
        file_name = "Phab" + self.PhabObjectClass.OBJECT_KIND + "s.pickle"
        return os.path.join(directory, file_name)

    def populate_cache_from_disk(self, directory=DEFAULT_DIRECTORY):
        """
        FIXME: consider if serializing to JSON would bring interoperability
        advantages over serializing to pickle.
        """
        try:
            f = open(self._get_pickle_name(directory), "rb")
        except IOError as err:
            print("Could not find cache. Error message: {0}. Continuing..."
                  .format(err))
        else:
            with f:
                try:
                    d = pickle.load(f)
                    self.__dict__.update(d)
                except EOFError as err:
                    print("Cache seems to be corrupt. " +
                          "Not using cache. Error message: {0}".format(err))

    def write_cache_to_disk(self, directory=DEFAULT_DIRECTORY):
        if not os.path.exists(directory):
            os.makedirs(directory)
        with open(self._get_pickle_name(directory), "wb") as f:
            pickle.dump(self.__dict__, f)
        print("wrote cache to disk, most_recent_info= {0}".format(
            datetime.fromtimestamp(self.most_recent_info)
            if self.most_recent_info is not None else None))


class PhabReview(PhabObject):
    OBJECT_KIND = "Review"

    def __init__(self, id):
        PhabObject.__init__(self, id)

    def update(self, title, dateCreated, dateModified, author):
        self.title = title
        self.dateCreated = dateCreated
        self.dateModified = dateModified
        self.author = author

    def setPhabDiffs(self, phabDiffs):
        self.phabDiffs = phabDiffs


class PhabUser(PhabObject):
    OBJECT_KIND = "User"

    def __init__(self, id):
        PhabObject.__init__(self, id)

    def update(self, phid, realName):
        self.phid = phid
        self.realName = realName


class PhabHunk:
    def __init__(self, rest_api_hunk):
        self.oldOffset = int(rest_api_hunk["oldOffset"])
        self.oldLength = int(rest_api_hunk["oldLength"])
        # self.actual_lines_changed_offset will contain the offsets of the
        # lines that were changed in this hunk.
        self.actual_lines_changed_offset = []
        offset = self.oldOffset
        inHunk = False
        hunkStart = -1
        contextLines = 3
        for line in rest_api_hunk["corpus"].split("\n"):
            if line.startswith("+"):
                # line is a new line that got introduced in this patch.
                # Do not record it as a changed line.
                if inHunk is False:
                    inHunk = True
                    hunkStart = max(self.oldOffset, offset - contextLines)
                continue
            if line.startswith("-"):
                # line was changed or removed from the older version of the
                # code. Record it as a changed line.
                if inHunk is False:
                    inHunk = True
                    hunkStart = max(self.oldOffset, offset - contextLines)
                offset += 1
                continue
            # line is a context line.
            if inHunk is True:
                inHunk = False
                hunkEnd = offset + contextLines
                self.actual_lines_changed_offset.append((hunkStart, hunkEnd))
            offset += 1
        if inHunk is True:
            hunkEnd = offset + contextLines
            self.actual_lines_changed_offset.append((hunkStart, hunkEnd))

        # The above algorithm could result in adjacent or overlapping ranges
        # being recorded into self.actual_lines_changed_offset.
        # Merge the adjacent and overlapping ranges in there:
        t = []
        lastRange = None
        for start, end in self.actual_lines_changed_offset + \
                [(sys.maxsize, sys.maxsize)]:
            if lastRange is None:
                lastRange = (start, end)
            else:
                if lastRange[1] >= start:
                    lastRange = (lastRange[0], end)
                else:
                    t.append(lastRange)
                    lastRange = (start, end)
        self.actual_lines_changed_offset = t
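        # Illustrative example (made-up input, not from the Phabricator API):
        # for a hunk with oldOffset == 10 and corpus
        #     " context\n-removed\n+added\n context\n"
        # the changed line at old offset 11 gets padded with up to 3 context
        # lines on each side (clamped to oldOffset at the start), so after
        # merging, self.actual_lines_changed_offset == [(10, 15)].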


class PhabChange:
    def __init__(self, rest_api_change):
        self.oldPath = rest_api_change["oldPath"]
        self.hunks = [PhabHunk(h) for h in rest_api_change["hunks"]]


class PhabDiff(PhabObject):
    OBJECT_KIND = "Diff"

    def __init__(self, id):
        PhabObject.__init__(self, id)

    def update(self, rest_api_results):
        self.revisionID = rest_api_results["revisionID"]
        self.dateModified = int(rest_api_results["dateModified"])
        self.dateCreated = int(rest_api_results["dateCreated"])
        self.changes = [PhabChange(c) for c in rest_api_results["changes"]]


class ReviewsCache(PhabObjectCache):
    def __init__(self):
        PhabObjectCache.__init__(self, PhabReview)


class UsersCache(PhabObjectCache):
    def __init__(self):
        PhabObjectCache.__init__(self, PhabUser)


reviews_cache = ReviewsCache()
users_cache = UsersCache()


def init_phab_connection():
    phab = Phabricator()
    phab.update_interfaces()
    return phab


def update_cached_info(phab, cache, phab_query, order, record_results,
                       max_nr_entries_per_fetch, max_nr_days_to_cache):
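    """Incrementally fetch data from Phabricator into a PhabObjectCache.

    phab_query is a tuple of attribute names resolved against the phab
    connection (e.g. ("differential", "revision", "search")). record_results
    stores one page of results into the cache and returns the (most recent,
    oldest) timestamps seen on that page. Pages are fetched by following the
    result cursor until roughly max_nr_days_to_cache days of history are
    cached, and the cache is written to disk after every page.
    """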
    q = phab
    LIMIT = max_nr_entries_per_fetch
    for query_step in phab_query:
        q = getattr(q, query_step)
    results = q(order=order, limit=LIMIT)
    most_recent_info, oldest_info = record_results(cache, results, phab)
    oldest_info_to_fetch = datetime.fromtimestamp(most_recent_info) - \
        timedelta(days=max_nr_days_to_cache)
    most_recent_info_overall = most_recent_info
    cache.write_cache_to_disk()
    after = results["cursor"]["after"]
    print("after: {0!r}".format(after))
    print("most_recent_info: {0}".format(
        datetime.fromtimestamp(most_recent_info)))
    while (after is not None
           and datetime.fromtimestamp(oldest_info) > oldest_info_to_fetch):
        need_more_older_data = \
            (cache.oldest_info is None or
             datetime.fromtimestamp(cache.oldest_info) > oldest_info_to_fetch)
        print(("need_more_older_data={0} cache.oldest_info={1} " +
               "oldest_info_to_fetch={2}").format(
                   need_more_older_data,
                   datetime.fromtimestamp(cache.oldest_info)
                   if cache.oldest_info is not None else None,
                   oldest_info_to_fetch))
        need_more_newer_data = \
            (cache.most_recent_info is None or
             cache.most_recent_info < most_recent_info)
        print(("need_more_newer_data={0} cache.most_recent_info={1} " +
               "most_recent_info={2}")
              .format(need_more_newer_data, cache.most_recent_info,
                      most_recent_info))
        if not need_more_older_data and not need_more_newer_data:
            break
        results = q(order=order, after=after, limit=LIMIT)
        most_recent_info, oldest_info = record_results(cache, results, phab)
        after = results["cursor"]["after"]
        print("after: {0!r}".format(after))
        print("most_recent_info: {0}".format(
            datetime.fromtimestamp(most_recent_info)))
        cache.write_cache_to_disk()
    cache.most_recent_info = most_recent_info_overall
    if after is None:
        # We did fetch all records. Mark the cache to contain all info since
        # the start of time.
        oldest_info = 0
    cache.oldest_info = oldest_info
    cache.write_cache_to_disk()


def record_reviews(cache, reviews, phab):
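    """Record one page of differential.revision.search results into the cache.

    For each revision that is new, or whose dateModified is newer than the
    cached copy, the associated diffs are also fetched through
    differential.querydiffs. Returns the (most recent, oldest) dateModified
    values seen on this page.
    """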
    most_recent_info = None
    oldest_info = None
    for reviewInfo in reviews["data"]:
        if reviewInfo["type"] != "DREV":
            continue
        id = reviewInfo["id"]
        # phid = reviewInfo["phid"]
        dateModified = int(reviewInfo["fields"]["dateModified"])
        dateCreated = int(reviewInfo["fields"]["dateCreated"])
        title = reviewInfo["fields"]["title"]
        author = reviewInfo["fields"]["authorPHID"]
        phabReview = cache.get(id)
        if "dateModified" not in phabReview.__dict__ or \
                dateModified > phabReview.dateModified:
            diff_results = phab.differential.querydiffs(revisionIDs=[id])
            diff_ids = sorted(diff_results.keys())
            phabDiffs = []
            for diff_id in diff_ids:
                diffInfo = diff_results[diff_id]
                d = PhabDiff(diff_id)
                d.update(diffInfo)
                phabDiffs.append(d)
            phabReview.update(title, dateCreated, dateModified, author)
            phabReview.setPhabDiffs(phabDiffs)
            print("Updated D{0} modified on {1} ({2} diffs)".format(
                id, datetime.fromtimestamp(dateModified), len(phabDiffs)))

        if most_recent_info is None:
            most_recent_info = dateModified
        elif most_recent_info < dateModified:
            most_recent_info = dateModified

        if oldest_info is None:
            oldest_info = dateModified
        elif oldest_info > dateModified:
            oldest_info = dateModified
    return most_recent_info, oldest_info


def record_users(cache, users, phab):
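    """Record one page of user.search results into the cache.

    Returns the (most recent, oldest) dateModified values seen on this page.
    """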
    most_recent_info = None
    oldest_info = None
    for info in users["data"]:
        if info["type"] != "USER":
            continue
        id = info["id"]
        phid = info["phid"]
        dateModified = int(info["fields"]["dateModified"])
        # dateCreated = int(info["fields"]["dateCreated"])
        realName = info["fields"]["realName"]
        phabUser = cache.get(id)
        phabUser.update(phid, realName)
        if most_recent_info is None:
            most_recent_info = dateModified
        elif most_recent_info < dateModified:
            most_recent_info = dateModified
        if oldest_info is None:
            oldest_info = dateModified
        elif oldest_info > dateModified:
            oldest_info = dateModified
    return most_recent_info, oldest_info
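

# Each entry below is (cache, phab_query, order, record_results,
# max_nr_entries_per_fetch, max_nr_days_to_cache), i.e. the arguments that
# update_cached_info() gets called with in update_cache().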
PHABCACHESINFO = ((reviews_cache, ("differential", "revision", "search"),
                   "updated", record_reviews, 5, 7),
                  (users_cache, ("user", "search"), "newest", record_users,
                   100, 1000))


def load_cache():
    for cache, phab_query, order, record_results, _, _ in PHABCACHESINFO:
        cache.populate_cache_from_disk()
        print("Loaded {0} nr entries: {1}".format(
            cache.get_name(), len(cache.get_ids_in_cache())))
        print("Loaded {0} has most recent info: {1}".format(
            cache.get_name(),
            datetime.fromtimestamp(cache.most_recent_info)
            if cache.most_recent_info is not None else None))


def update_cache(phab):
    load_cache()
    for cache, phab_query, order, record_results, max_nr_entries_per_fetch, \
            max_nr_days_to_cache in PHABCACHESINFO:
        update_cached_info(phab, cache, phab_query, order, record_results,
                           max_nr_entries_per_fetch, max_nr_days_to_cache)
        ids_in_cache = cache.get_ids_in_cache()
        print("{0} objects in {1}".format(len(ids_in_cache), cache.get_name()))
        cache.write_cache_to_disk()


def get_most_recent_reviews(days):
    newest_reviews = sorted(
        reviews_cache.get_objects(), key=lambda r: -r.dateModified)
    if len(newest_reviews) == 0:
        return newest_reviews
    most_recent_review_time = \
        datetime.fromtimestamp(newest_reviews[0].dateModified)
    cut_off_date = most_recent_review_time - timedelta(days=days)
    result = []
    for review in newest_reviews:
        if datetime.fromtimestamp(review.dateModified) < cut_off_date:
            return result
        result.append(review)
    return result


# All of the above code is about fetching data from Phabricator and caching it
# on local disk. The below code contains the actual "business logic" for this
# script.

_userphid2realname = None


def get_real_name_from_author(user_phid):
    global _userphid2realname
    if _userphid2realname is None:
        _userphid2realname = {}
        for user in users_cache.get_objects():
            _userphid2realname[user.phid] = user.realName
    return _userphid2realname.get(user_phid, "unknown")


def print_most_recent_reviews(phab, days, filter_reviewers):
    msgs = []

    def add_msg(msg):
        msgs.append(msg)
        print(msg)

    newest_reviews = get_most_recent_reviews(days)
    add_msg(u"These are the reviews that look interesting to review. " +
            u"The report below has 2 sections. The first " +
            u"section is organized per review; the second section is " +
            u"organized per potential reviewer.\n")
    oldest_review = newest_reviews[-1] if len(newest_reviews) > 0 else None
    oldest_datetime = \
        datetime.fromtimestamp(oldest_review.dateModified) \
        if oldest_review else None
    add_msg((u"The report below is based on analyzing the reviews that got " +
             u"touched in the past {0} days (since {1}). " +
             u"The script found {2} such reviews.\n").format(
                 days, oldest_datetime, len(newest_reviews)))
    reviewer2reviews_and_scores = {}
    for i, review in enumerate(newest_reviews):
        matched_reviewers = find_reviewers_for_review(review)
        matched_reviewers = filter_reviewers(matched_reviewers)
        if len(matched_reviewers) == 0:
            continue
        add_msg((u"{0:>3}. https://reviews.llvm.org/D{1} by {2}\n {3}\n" +
                 u" Last updated on {4}").format(
                     i, review.id,
                     get_real_name_from_author(review.author), review.title,
                     datetime.fromtimestamp(review.dateModified)))
        for reviewer, scores in matched_reviewers:
            add_msg(u" potential reviewer {0}, score {1}".format(
                reviewer,
                "(" + "/".join(["{0:.1f}%".format(s) for s in scores]) + ")"))
            if reviewer not in reviewer2reviews_and_scores:
                reviewer2reviews_and_scores[reviewer] = []
            reviewer2reviews_and_scores[reviewer].append((review, scores))

    # Print out a summary per reviewer.
    for reviewer in sorted(reviewer2reviews_and_scores.keys()):
        reviews_and_scores = reviewer2reviews_and_scores[reviewer]
        reviews_and_scores.sort(key=lambda rs: rs[1], reverse=True)
        add_msg(u"\n\nSUMMARY FOR {0} (found {1} reviews):".format(
            reviewer, len(reviews_and_scores)))
        for review, scores in reviews_and_scores:
            add_msg(u"[{0}] https://reviews.llvm.org/D{1} '{2}' by {3}".format(
                "/".join(["{0:.1f}%".format(s) for s in scores]), review.id,
                review.title, get_real_name_from_author(review.author)))
    return "\n".join(msgs)


def get_git_cmd_output(cmd):
    output = None
    try:
        logging.debug(cmd)
        output = subprocess.check_output(
            cmd, shell=True, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        logging.debug(str(e))
    if output is None:
        return None
    return output.decode("utf-8", errors='ignore')


reAuthorMail = re.compile("^author-mail <([^>]*)>.*$")


def parse_blame_output_line_porcelain(blame_output):
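    """Count the number of blamed lines per author email address.

    `git blame --line-porcelain` emits one "author-mail <user@example.org>"
    header line per blamed source line; those addresses are tallied into an
    {email: nr_occurences} dict.
    """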
    email2nr_occurences = {}
    if blame_output is None:
        return email2nr_occurences
    for line in blame_output.split('\n'):
        m = reAuthorMail.match(line)
        if m:
            author_email_address = m.group(1)
            if author_email_address not in email2nr_occurences:
                email2nr_occurences[author_email_address] = 1
            else:
                email2nr_occurences[author_email_address] += 1
    return email2nr_occurences


def find_reviewers_for_diff_heuristic(diff):
    # Heuristic 1: assume good reviewers are the ones that touched the same
    # lines before as this patch is touching.
    # Heuristic 2: assume good reviewers are the ones that touched the same
    # files before as this patch is touching.
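    # The result is a list of (reviewer_email, (line_score, file_score))
    # tuples, both scores being percentages. E.g. a hypothetical entry
    # ("someone@example.org", (40.0, 66.7)) means that address last touched
    # 40% of the blamed lines and 66.7% of the files this diff touches.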
    reviewers2nr_lines_touched = {}
    reviewers2nr_files_touched = {}
    # Assume last revision before diff was modified is the revision the diff
    # applies to.
    git_repo = "git_repos/llvm"
    cmd = 'git -C {0} rev-list -n 1 --before="{1}" master'.format(
        git_repo,
        datetime.fromtimestamp(
            diff.dateModified).strftime("%Y-%m-%d %H:%M:%S"))
    base_revision = get_git_cmd_output(cmd).strip()
    logging.debug("Base revision={0}".format(base_revision))
    for change in diff.changes:
        path = change.oldPath
        # Compute heuristic 1: look at context of patch lines.
        for hunk in change.hunks:
            for start_line, end_line in hunk.actual_lines_changed_offset:
                # Collect git blame results for authors in those ranges.
                cmd = ("git -C {0} blame --encoding=utf-8 --date iso -f -e " +
                       "-w --line-porcelain -L {1},{2} {3} -- {4}").format(
                           git_repo, start_line, end_line, base_revision, path)
                blame_output = get_git_cmd_output(cmd)
                for reviewer, nr_occurences in \
                        parse_blame_output_line_porcelain(blame_output).items():
                    if reviewer not in reviewers2nr_lines_touched:
                        reviewers2nr_lines_touched[reviewer] = 0
                    reviewers2nr_lines_touched[reviewer] += nr_occurences
        # Compute heuristic 2: don't look at context, just at files touched.
        # Collect git blame results for the authors of the whole file.
506 cmd = ("git -C {0} blame --encoding=utf-8 --date iso -f -e -w " +
507 "--line-porcelain {1} -- {2}").format(git_repo, base_revision,
508 path)
509 blame_output = get_git_cmd_output(cmd)
510 for reviewer, nr_occurences in parse_blame_output_line_porcelain(
511 blame_output).items():
512 if reviewer not in reviewers2nr_files_touched:
513 reviewers2nr_files_touched[reviewer] = 0
514 reviewers2nr_files_touched[reviewer] += 1
516 # Compute "match scores"
517 total_nr_lines = sum(reviewers2nr_lines_touched.values())
518 total_nr_files = len(diff.changes)
519 reviewers_matchscores = \
520 [(reviewer,
521 (reviewers2nr_lines_touched.get(reviewer, 0)*100.0/total_nr_lines
522 if total_nr_lines != 0 else 0,
523 reviewers2nr_files_touched[reviewer]*100.0/total_nr_files
524 if total_nr_files != 0 else 0))
525 for reviewer, nr_lines
526 in reviewers2nr_files_touched.items()]
527 reviewers_matchscores.sort(key=lambda i: i[1], reverse=True)
528 return reviewers_matchscores


def find_reviewers_for_review(review):
    # Process the newest diff first.
    diffs = sorted(
        review.phabDiffs, key=lambda d: d.dateModified, reverse=True)
    if len(diffs) == 0:
        return
    diff = diffs[0]
    matched_reviewers = find_reviewers_for_diff_heuristic(diff)
    # Show progress, as this is a slow operation:
    sys.stdout.write('.')
    sys.stdout.flush()
    logging.debug(u"matched_reviewers: {0}".format(matched_reviewers))
    return matched_reviewers


def update_git_repos():
    git_repos_directory = "git_repos"
    for name, url in GIT_REPO_METADATA:
        dirname = os.path.join(git_repos_directory, name)
        if not os.path.exists(dirname):
            cmd = "git clone {0} {1}".format(url, dirname)
            output = get_git_cmd_output(cmd)
        cmd = "git -C {0} pull --rebase".format(dirname)
        output = get_git_cmd_output(cmd)


def send_emails(email_addresses, sender, msg):
    s = smtplib.SMTP()
    s.connect()
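    # Note: SMTP() followed by connect() with no arguments assumes a mail
    # server listening on localhost:25.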
    for email_address in email_addresses:
        email_msg = email.mime.multipart.MIMEMultipart()
        email_msg['From'] = sender
        email_msg['To'] = email_address
        email_msg['Subject'] = 'LLVM patches you may be able to review.'
        email_msg.attach(email.mime.text.MIMEText(msg.encode('utf-8'), 'plain'))
        # python 3.x: s.send_message(email_msg)
        s.sendmail(email_msg['From'], email_msg['To'], email_msg.as_string())
    s.quit()


def filter_reviewers_to_report_for(people_to_look_for):
    # The below is just an example filter, to only report potential reviews
    # to do for the people that will receive the report email.
    return lambda potential_reviewers: [r for r in potential_reviewers
                                        if r[0] in people_to_look_for]
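    # For instance (hypothetical data), with people_to_look_for equal to
    # ["someone@example.org"], the returned filter maps
    #   [("someone@example.org", (10.0, 20.0)), ("other@example.org", (5.0, 5.0))]
    # to [("someone@example.org", (10.0, 20.0))].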


def main():
    parser = argparse.ArgumentParser(
        description='Match open reviews to potential reviewers.')
    parser.add_argument(
        '--no-update-cache',
        dest='update_cache',
        action='store_false',
        default=True,
        help='Do not update cached Phabricator objects')
    parser.add_argument(
        '--email-report',
        dest='email_report',
        nargs='*',
        default="",
592 help="A email addresses to send the report to.")
    parser.add_argument(
        '--sender',
        dest='sender',
        default="",
        help="The email address to use in 'From' on messages emailed out.")
    parser.add_argument(
        '--email-addresses',
        dest='email_addresses',
        nargs='*',
        help="The email addresses (as known by LLVM git) of " +
             "the people to look for reviews for.")
    parser.add_argument('--verbose', '-v', action='count')

    args = parser.parse_args()

    if args.verbose >= 1:
        logging.basicConfig(level=logging.DEBUG)

    people_to_look_for = [e.decode('utf-8') for e in args.email_addresses]
612 logging.debug("Will look for reviews that following contributors could " +
613 "review: {}".format(people_to_look_for))
614 logging.debug("Will email a report to: {}".format(args.email_report))
616 phab = init_phab_connection()
618 if args.update_cache:
619 update_cache(phab)
621 load_cache()
622 update_git_repos()
623 msg = print_most_recent_reviews(
624 phab,
625 days=1,
626 filter_reviewers=filter_reviewers_to_report_for(people_to_look_for))
628 if args.email_report != []:
629 send_emails(args.email_report, args.sender, msg)
632 if __name__ == "__main__":
633 main()