from __future__ import print_function

import argparse
import email.mime.multipart
import email.mime.text
import logging
import os
import pickle
import re
import smtplib
import subprocess
import sys

from datetime import datetime, timedelta
from phabricator import Phabricator

# Setting up a virtualenv to run this script can be done by running the
# following commands:
# $ virtualenv venv
# $ . ./venv/bin/activate
# $ pip install Phabricator
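#
# A hypothetical invocation (the script file name is illustrative; the flags
# match the argparse setup in main() below):
# $ python find_interesting_reviews.py \
#     --email-addresses jane@example.com --email-report jane@example.com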

GIT_REPO_METADATA = (("llvm-monorepo", "https://github.com/llvm/llvm-project"),)

# The below PhabXXX classes represent objects as modelled by Phabricator.
# The classes can be serialized to disk, to try and make sure that we don't
# needlessly have to re-fetch lots of data from Phabricator, as that would
# make this script unusably slow.


class PhabObject:
    OBJECT_KIND = None

    def __init__(self, id):
        self.id = id


class PhabObjectCache:
    def __init__(self, PhabObjectClass):
        self.PhabObjectClass = PhabObjectClass
        self.most_recent_info = None
        self.oldest_info = None
        self.id2PhabObjects = {}

    def get_name(self):
        return self.PhabObjectClass.OBJECT_KIND + "sCache"

    def get(self, id):
        if id not in self.id2PhabObjects:
            self.id2PhabObjects[id] = self.PhabObjectClass(id)
        return self.id2PhabObjects[id]

    def get_ids_in_cache(self):
        return list(self.id2PhabObjects.keys())

    def get_objects(self):
        return list(self.id2PhabObjects.values())

    DEFAULT_DIRECTORY = "PhabObjectCache"

    def _get_pickle_name(self, directory):
        file_name = "Phab" + self.PhabObjectClass.OBJECT_KIND + "s.pickle"
        return os.path.join(directory, file_name)
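
    # For example, for the reviews cache (OBJECT_KIND == "Review") and the
    # default directory, the pickle path above resolves to
    # "PhabObjectCache/PhabReviews.pickle".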

    def populate_cache_from_disk(self, directory=DEFAULT_DIRECTORY):
        """
        FIXME: consider if serializing to JSON would bring interoperability
        advantages over serializing to pickle.
        """
        try:
            f = open(self._get_pickle_name(directory), "rb")
        except IOError as err:
            print("Could not find cache. Error message: {0}. Continuing...".format(err))
        else:
            with f:
                try:
                    d = pickle.load(f)
                    self.__dict__.update(d)
                except EOFError as err:
                    print(
                        "Cache seems to be corrupt. "
                        + "Not using cache. Error message: {0}".format(err)
                    )

    def write_cache_to_disk(self, directory=DEFAULT_DIRECTORY):
        if not os.path.exists(directory):
            os.makedirs(directory)
        with open(self._get_pickle_name(directory), "wb") as f:
            pickle.dump(self.__dict__, f)
        print(
            "wrote cache to disk, most_recent_info= {0}".format(
                datetime.fromtimestamp(self.most_recent_info)
                if self.most_recent_info is not None
                else None
            )
        )


class PhabReview(PhabObject):
    OBJECT_KIND = "Review"

    def __init__(self, id):
        PhabObject.__init__(self, id)

    def update(self, title, dateCreated, dateModified, author):
        self.title = title
        self.dateCreated = dateCreated
        self.dateModified = dateModified
        self.author = author

    def setPhabDiffs(self, phabDiffs):
        self.phabDiffs = phabDiffs


class PhabUser(PhabObject):
    OBJECT_KIND = "User"

    def __init__(self, id):
        PhabObject.__init__(self, id)

    def update(self, phid, realName):
        self.phid = phid
        self.realName = realName


class PhabHunk:
    def __init__(self, rest_api_hunk):
        self.oldOffset = int(rest_api_hunk["oldOffset"])
        self.oldLength = int(rest_api_hunk["oldLength"])
        # self.actual_lines_changed_offset will contain the offsets of the
        # lines that were changed in this hunk.
        self.actual_lines_changed_offset = []
        offset = self.oldOffset
        contextLines = 3  # nr of context lines around a change (value assumed)
        hunkStart = None
        for line in rest_api_hunk["corpus"].split("\n"):
            if line.startswith("+"):
                # line is a new line that got introduced in this patch.
                # Do not record it as a changed line.
                if hunkStart is None:
                    hunkStart = max(self.oldOffset, offset - contextLines)
                continue
            if line.startswith("-"):
                # line was changed or removed from the older version of the
                # code. Record it as a changed line.
                if hunkStart is None:
                    hunkStart = max(self.oldOffset, offset - contextLines)
                offset += 1
                continue
            # line is a context line: close off the current run of changed
            # lines, if any.
            if hunkStart is not None:
                hunkEnd = offset + contextLines
                self.actual_lines_changed_offset.append((hunkStart, hunkEnd))
                hunkStart = None
            offset += 1
        if hunkStart is not None:
            hunkEnd = offset + contextLines
            self.actual_lines_changed_offset.append((hunkStart, hunkEnd))

        # The above algorithm could result in adjacent or overlapping ranges
        # being recorded into self.actual_lines_changed_offset.
        # Merge the adjacent and overlapping ranges in there:
        t = []
        lastRange = None
        for start, end in self.actual_lines_changed_offset + [
            (sys.maxsize, sys.maxsize)
        ]:
            if lastRange is None:
                lastRange = (start, end)
            else:
                if lastRange[1] >= start:
                    lastRange = (lastRange[0], end)
                else:
                    t.append(lastRange)
                    lastRange = (start, end)
        self.actual_lines_changed_offset = t
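        # Worked example (illustrative, not from the original source): recorded
        # ranges [(10, 18), (16, 24), (40, 48)] merge to [(10, 24), (40, 48)]:
        # 18 >= 16 fuses the first two ranges, while (40, 48) starts past 24
        # and stays separate. The (sys.maxsize, sys.maxsize) sentinel exists
        # only to flush the final real range into t.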


class PhabChange:
    def __init__(self, rest_api_change):
        self.oldPath = rest_api_change["oldPath"]
        self.hunks = [PhabHunk(h) for h in rest_api_change["hunks"]]


class PhabDiff(PhabObject):
    OBJECT_KIND = "Diff"

    def __init__(self, id):
        PhabObject.__init__(self, id)

    def update(self, rest_api_results):
        self.revisionID = rest_api_results["revisionID"]
        self.dateModified = int(rest_api_results["dateModified"])
        self.dateCreated = int(rest_api_results["dateCreated"])
        self.changes = [PhabChange(c) for c in rest_api_results["changes"]]


class ReviewsCache(PhabObjectCache):
    def __init__(self):
        PhabObjectCache.__init__(self, PhabReview)


class UsersCache(PhabObjectCache):
    def __init__(self):
        PhabObjectCache.__init__(self, PhabUser)


reviews_cache = ReviewsCache()
users_cache = UsersCache()


def init_phab_connection():
    phab = Phabricator()
    phab.update_interfaces()
    return phab


def update_cached_info(
    phab,
    cache,
    phab_query,
    order,
    record_results,
    max_nr_entries_per_fetch,
    max_nr_days_to_cache,
):
    q = phab
    LIMIT = max_nr_entries_per_fetch
    for query_step in phab_query:
        q = getattr(q, query_step)
    results = q(order=order, limit=LIMIT)
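    # With phab_query == ("differential", "revision", "search"), the getattr
    # chain above resolves to phab.differential.revision.search, i.e. the
    # Conduit endpoint "differential.revision.search".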
    most_recent_info, oldest_info = record_results(cache, results, phab)
    oldest_info_to_fetch = datetime.fromtimestamp(most_recent_info) - timedelta(
        days=max_nr_days_to_cache
    )
    most_recent_info_overall = most_recent_info
    cache.write_cache_to_disk()
    after = results["cursor"]["after"]
    print("after: {0!r}".format(after))
    print("most_recent_info: {0}".format(datetime.fromtimestamp(most_recent_info)))
    while (
        after is not None
        and datetime.fromtimestamp(oldest_info) > oldest_info_to_fetch
    ):
        need_more_older_data = (
            cache.oldest_info is None
            or datetime.fromtimestamp(cache.oldest_info) > oldest_info_to_fetch
        )
        print(
            (
                "need_more_older_data={0} cache.oldest_info={1} "
                + "oldest_info_to_fetch={2}"
            ).format(
                need_more_older_data,
                datetime.fromtimestamp(cache.oldest_info)
                if cache.oldest_info is not None
                else None,
                oldest_info_to_fetch,
            )
        )
        need_more_newer_data = (
            cache.most_recent_info is None
            or cache.most_recent_info < most_recent_info
        )
        print(
            (
                "need_more_newer_data={0} cache.most_recent_info={1} "
                + "most_recent_info={2}"
            ).format(need_more_newer_data, cache.most_recent_info, most_recent_info)
        )
        if not need_more_older_data and not need_more_newer_data:
            break
        results = q(order=order, after=after, limit=LIMIT)
        most_recent_info, oldest_info = record_results(cache, results, phab)
        after = results["cursor"]["after"]
        print("after: {0!r}".format(after))
        print("most_recent_info: {0}".format(datetime.fromtimestamp(most_recent_info)))
        cache.write_cache_to_disk()
    cache.most_recent_info = most_recent_info_overall
    if after is None:
        # We did fetch all records. Mark the cache to contain all info since
        # the beginning of time.
        oldest_info = 0
        cache.oldest_info = oldest_info
    cache.write_cache_to_disk()


def record_reviews(cache, reviews, phab):
    most_recent_info = None
    oldest_info = None
    for reviewInfo in reviews["data"]:
        if reviewInfo["type"] != "DREV":
            continue
        id = reviewInfo["id"]
        # phid = reviewInfo["phid"]
        dateModified = int(reviewInfo["fields"]["dateModified"])
        dateCreated = int(reviewInfo["fields"]["dateCreated"])
        title = reviewInfo["fields"]["title"]
        author = reviewInfo["fields"]["authorPHID"]
        phabReview = cache.get(id)
        if (
            "dateModified" not in phabReview.__dict__
            or dateModified > phabReview.dateModified
        ):
            diff_results = phab.differential.querydiffs(revisionIDs=[id])
            diff_ids = sorted(diff_results.keys())
            phabDiffs = []
            for diff_id in diff_ids:
                diffInfo = diff_results[diff_id]
                d = PhabDiff(diff_id)
                d.update(diffInfo)
                phabDiffs.append(d)
            phabReview.update(title, dateCreated, dateModified, author)
            phabReview.setPhabDiffs(phabDiffs)
            print(
                "Updated D{0} modified on {1} ({2} diffs)".format(
                    id, datetime.fromtimestamp(dateModified), len(phabDiffs)
                )
            )

        if most_recent_info is None:
            most_recent_info = dateModified
        elif most_recent_info < dateModified:
            most_recent_info = dateModified

        if oldest_info is None:
            oldest_info = dateModified
        elif oldest_info > dateModified:
            oldest_info = dateModified
    return most_recent_info, oldest_info


def record_users(cache, users, phab):
    most_recent_info = None
    oldest_info = None
    for info in users["data"]:
        if info["type"] != "USER":
            continue
        id = info["id"]
        phid = info["phid"]
        dateModified = int(info["fields"]["dateModified"])
        # dateCreated = int(info["fields"]["dateCreated"])
        realName = info["fields"]["realName"]
        phabUser = cache.get(id)
        phabUser.update(phid, realName)
        if most_recent_info is None:
            most_recent_info = dateModified
        elif most_recent_info < dateModified:
            most_recent_info = dateModified
        if oldest_info is None:
            oldest_info = dateModified
        elif oldest_info > dateModified:
            oldest_info = dateModified
    return most_recent_info, oldest_info
362 ("differential", "revision", "search"),
368 (users_cache
, ("user", "search"), "newest", record_users
, 100, 1000),


def load_cache():
    for cache, phab_query, order, record_results, _, _ in PHABCACHESINFO:
        cache.populate_cache_from_disk()
        print(
            "Loaded {0} nr entries: {1}".format(
                cache.get_name(), len(cache.get_ids_in_cache())
            )
        )
        print(
            "Loaded {0} has most recent info: {1}".format(
                cache.get_name(),
                datetime.fromtimestamp(cache.most_recent_info)
                if cache.most_recent_info is not None
                else None,
            )
        )


def update_cache(phab):
    load_cache()
    for (
        cache,
        phab_query,
        order,
        record_results,
        max_nr_entries_per_fetch,
        max_nr_days_to_cache,
    ) in PHABCACHESINFO:
        update_cached_info(
            phab,
            cache,
            phab_query,
            order,
            record_results,
            max_nr_entries_per_fetch,
            max_nr_days_to_cache,
        )
        ids_in_cache = cache.get_ids_in_cache()
        print("{0} objects in {1}".format(len(ids_in_cache), cache.get_name()))
        cache.write_cache_to_disk()


def get_most_recent_reviews(days):
    newest_reviews = sorted(reviews_cache.get_objects(), key=lambda r: -r.dateModified)
    if len(newest_reviews) == 0:
        return newest_reviews
    most_recent_review_time = datetime.fromtimestamp(newest_reviews[0].dateModified)
    cut_off_date = most_recent_review_time - timedelta(days=days)
    result = []
    for review in newest_reviews:
        if datetime.fromtimestamp(review.dateModified) < cut_off_date:
            # Reviews are sorted newest-first, so everything from here on is
            # older than the cut-off date.
            return result
        result.append(review)
    return result


# All of the above code is about fetching data from Phabricator and caching it
# on local disk. The below code contains the actual "business logic" for this
# script.

_userphid2realname = None


def get_real_name_from_author(user_phid):
    global _userphid2realname
    if _userphid2realname is None:
        _userphid2realname = {}
        for user in users_cache.get_objects():
            _userphid2realname[user.phid] = user.realName
    return _userphid2realname.get(user_phid, "unknown")


def print_most_recent_reviews(phab, days, filter_reviewers):
    msgs = []

    def add_msg(msg):
        msgs.append(msg)
        print(msg.encode("utf-8"))

    newest_reviews = get_most_recent_reviews(days)
    add_msg(
        "These are the reviews that look interesting to be reviewed. "
        + "The report below has 2 sections. The first "
        + "section is organized per review; the second section is organized "
        + "per potential reviewer.\n"
    )
    oldest_review = newest_reviews[-1] if len(newest_reviews) > 0 else None
    oldest_datetime = (
        datetime.fromtimestamp(oldest_review.dateModified) if oldest_review else None
    )
    add_msg(
        (
            "The report below is based on analyzing the reviews that got "
            + "touched in the past {0} days (since {1}). "
            + "The script found {2} such reviews.\n"
        ).format(days, oldest_datetime, len(newest_reviews))
    )
    reviewer2reviews_and_scores = {}
    for i, review in enumerate(newest_reviews):
        matched_reviewers = find_reviewers_for_review(review)
        matched_reviewers = filter_reviewers(matched_reviewers)
        if len(matched_reviewers) == 0:
            continue
        add_msg(
            (
                "{0:>3}. https://reviews.llvm.org/D{1} by {2}\n     {3}\n"
                + "     Last updated on {4}"
            ).format(
                i,
                review.id,
                get_real_name_from_author(review.author),
                review.title,
                datetime.fromtimestamp(review.dateModified),
            )
        )
        for reviewer, scores in matched_reviewers:
            add_msg(
                "    potential reviewer {0}, score {1}".format(
                    reviewer,
                    "(" + "/".join(["{0:.1f}%".format(s) for s in scores]) + ")",
                )
            )
            if reviewer not in reviewer2reviews_and_scores:
                reviewer2reviews_and_scores[reviewer] = []
            reviewer2reviews_and_scores[reviewer].append((review, scores))

    # Print out a summary per reviewer.
    for reviewer in sorted(reviewer2reviews_and_scores.keys()):
        reviews_and_scores = reviewer2reviews_and_scores[reviewer]
        reviews_and_scores.sort(key=lambda rs: rs[1], reverse=True)
        add_msg(
            "\n\nSUMMARY FOR {0} (found {1} reviews):".format(
                reviewer, len(reviews_and_scores)
            )
        )
        for review, scores in reviews_and_scores:
            add_msg(
                "[{0}] https://reviews.llvm.org/D{1} '{2}' by {3}".format(
                    "/".join(["{0:.1f}%".format(s) for s in scores]),
                    review.id,
                    review.title,
                    get_real_name_from_author(review.author),
                )
            )
    return "\n".join(msgs)


def get_git_cmd_output(cmd):
    output = None
    try:
        logging.debug(cmd)
        output = subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        logging.debug(str(e))
    if output is None:
        return None
    return output.decode("utf-8", errors="ignore")


reAuthorMail = re.compile("^author-mail <([^>]*)>.*$")


def parse_blame_output_line_porcelain(blame_output_lines):
    email2nr_occurences = {}
    if blame_output_lines is None:
        return email2nr_occurences
    for line in blame_output_lines:
        m = reAuthorMail.match(line)
        if m:
            author_email_address = m.group(1)
            if author_email_address not in email2nr_occurences:
                email2nr_occurences[author_email_address] = 1
            else:
                email2nr_occurences[author_email_address] += 1
    return email2nr_occurences
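

# Example (illustrative, not from the original script): blame output containing
# two "author-mail <jane@example.com>" lines and one
# "author-mail <joe@example.com>" line parses to
# {"jane@example.com": 2, "joe@example.com": 1}.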


class BlameOutputCache:
    def __init__(self):
        self.cache = {}

    def _populate_cache_for(self, cache_key):
        assert cache_key not in self.cache
        git_repo, base_revision, path = cache_key
        cmd = (
            "git -C {0} blame --encoding=utf-8 --date iso -f -e -w "
            + "--line-porcelain {1} -- {2}"
        ).format(git_repo, base_revision, path)
        blame_output = get_git_cmd_output(cmd)
        self.cache[cache_key] = (
            blame_output.split("\n") if blame_output is not None else None
        )
        # FIXME: the blame cache could probably be made more effective still if
        # instead of storing the requested base_revision in the cache, the last
        # revision before the base revision this file/path got changed in gets
        # stored. That way multiple project revisions for which this specific
        # file/path hasn't changed would get cache hits (instead of misses in
        # the current implementation).

    def get_blame_output_for(
        self, git_repo, base_revision, path, start_line=-1, end_line=-1
    ):
        cache_key = (git_repo, base_revision, path)
        if cache_key not in self.cache:
            self._populate_cache_for(cache_key)
        assert cache_key in self.cache
        all_blame_lines = self.cache[cache_key]
        if all_blame_lines is None:
            return None
        if start_line == -1 and end_line == -1:
            return all_blame_lines
        assert start_line >= 0
        assert end_line >= 0
        assert end_line <= len(all_blame_lines)
        assert start_line <= len(all_blame_lines)
        assert start_line <= end_line
        return all_blame_lines[start_line:end_line]

    def get_parsed_git_blame_for(
        self, git_repo, base_revision, path, start_line=-1, end_line=-1
    ):
        return parse_blame_output_line_porcelain(
            self.get_blame_output_for(
                git_repo, base_revision, path, start_line, end_line
            )
        )


blameOutputCache = BlameOutputCache()


def find_reviewers_for_diff_heuristic(diff):
    # Heuristic 1: assume good reviewers are the ones that touched the same
    # lines before as this patch is touching.
    # Heuristic 2: assume good reviewers are the ones that touched the same
    # files before as this patch is touching.
    reviewers2nr_lines_touched = {}
    reviewers2nr_files_touched = {}
    # Assume last revision before diff was modified is the revision the diff
    # applies to.
    assert len(GIT_REPO_METADATA) == 1
    git_repo = os.path.join("git_repos", GIT_REPO_METADATA[0][0])
    cmd = 'git -C {0} rev-list -n 1 --before="{1}" main'.format(
        git_repo,
        datetime.fromtimestamp(diff.dateModified).strftime("%Y-%m-%d %H:%M:%S"),
    )
    base_revision = get_git_cmd_output(cmd).strip()
    logging.debug("Base revision={0}".format(base_revision))
    for change in diff.changes:
        path = change.oldPath
        # Compute heuristic 1: look at context of patch lines.
        for hunk in change.hunks:
            for start_line, end_line in hunk.actual_lines_changed_offset:
                # Collect git blame results for authors in those ranges.
                for (
                    reviewer,
                    nr_occurences,
                ) in blameOutputCache.get_parsed_git_blame_for(
                    git_repo, base_revision, path, start_line, end_line
                ).items():
                    if reviewer not in reviewers2nr_lines_touched:
                        reviewers2nr_lines_touched[reviewer] = 0
                    reviewers2nr_lines_touched[reviewer] += nr_occurences
        # Compute heuristic 2: don't look at context, just at files touched.
        # Collect git blame results for the authors of the whole file.
        for reviewer, nr_occurences in blameOutputCache.get_parsed_git_blame_for(
            git_repo, base_revision, path
        ).items():
            if reviewer not in reviewers2nr_files_touched:
                reviewers2nr_files_touched[reviewer] = 0
            reviewers2nr_files_touched[reviewer] += 1
644 # Compute "match scores"
645 total_nr_lines
= sum(reviewers2nr_lines_touched
.values())
646 total_nr_files
= len(diff
.changes
)
647 reviewers_matchscores
= [
651 reviewers2nr_lines_touched
.get(reviewer
, 0) * 100.0 / total_nr_lines
652 if total_nr_lines
!= 0
654 reviewers2nr_files_touched
[reviewer
] * 100.0 / total_nr_files
655 if total_nr_files
!= 0
659 for reviewer
, nr_lines
in reviewers2nr_files_touched
.items()
661 reviewers_matchscores
.sort(key
=lambda i
: i
[1], reverse
=True)
662 return reviewers_matchscores
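

# Example (illustrative, not from the original script): a reviewer who authored
# 50 of the 200 blamed lines touched by a patch and shows up in 2 of its 4
# changed files gets the score tuple (25.0, 50.0), rendered later as
# "(25.0%/50.0%)".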


def find_reviewers_for_review(review):
    # Process the newest diff first.
    diffs = sorted(review.phabDiffs, key=lambda d: d.dateModified, reverse=True)
    if len(diffs) == 0:
        return
    diff = diffs[0]
    matched_reviewers = find_reviewers_for_diff_heuristic(diff)
    # Show progress, as this is a slow operation:
    sys.stdout.write(".")
    sys.stdout.flush()
    logging.debug("matched_reviewers: {0}".format(matched_reviewers))
    return matched_reviewers


def update_git_repos():
    git_repos_directory = "git_repos"
    for name, url in GIT_REPO_METADATA:
        dirname = os.path.join(git_repos_directory, name)
        if not os.path.exists(dirname):
            cmd = "git clone {0} {1}".format(url, dirname)
            output = get_git_cmd_output(cmd)
        cmd = "git -C {0} pull --rebase".format(dirname)
        output = get_git_cmd_output(cmd)


def send_emails(email_addresses, sender, msg):
    # Assumes an SMTP server reachable with smtplib's default connection
    # settings (localhost).
    s = smtplib.SMTP()
    s.connect()
    for email_address in email_addresses:
        email_msg = email.mime.multipart.MIMEMultipart()
        email_msg["From"] = sender
        email_msg["To"] = email_address
        email_msg["Subject"] = "LLVM patches you may be able to review."
        email_msg.attach(email.mime.text.MIMEText(msg.encode("utf-8"), "plain"))
        # python 3.x: s.send_message(email_msg)
        s.sendmail(email_msg["From"], email_msg["To"], email_msg.as_string())


def filter_reviewers_to_report_for(people_to_look_for):
    # The below is just an example filter, to only report potential reviews
    # to do for the people that will receive the report email.
    return lambda potential_reviewers: [
        r for r in potential_reviewers if r[0] in people_to_look_for
    ]
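

# Example (illustrative, not from the original script): with
# people_to_look_for == ["jane@example.com"], the filter above reduces
# [("jane@example.com", (25.0, 50.0)), ("joe@example.com", (1.0, 2.0))]
# to just the first entry.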


def main():
    parser = argparse.ArgumentParser(
        description="Match open reviews to potential reviewers."
    )
    parser.add_argument(
        "--no-update-cache",
        dest="update_cache",
        action="store_false",
        default=True,
        help="Do not update cached Phabricator objects",
    )
    parser.add_argument(
        "--email-report",
        dest="email_report",
        nargs="*",
        default="",
        help="The email addresses to send the report to.",
    )
    parser.add_argument(
        "--sender",
        dest="sender",
        default="",
        help="The email address to use in 'From' on messages emailed out.",
    )
    parser.add_argument(
        "--email-addresses",
        dest="email_addresses",
        nargs="*",
        help="The email addresses (as known by LLVM git) of "
        + "the people to look for reviews for.",
    )
    parser.add_argument("--verbose", "-v", action="count")

    args = parser.parse_args()

    if args.verbose >= 1:
        logging.basicConfig(level=logging.DEBUG)

    people_to_look_for = [e.decode("utf-8") for e in args.email_addresses]
    logging.debug(
        "Will look for reviews that the following contributors could "
        + "review: {}".format(people_to_look_for)
    )
    logging.debug("Will email a report to: {}".format(args.email_report))

    phab = init_phab_connection()

    if args.update_cache:
        update_cache(phab)

    update_git_repos()
    msg = print_most_recent_reviews(
        phab,
        days=1,  # reporting window in days (value assumed)
        filter_reviewers=filter_reviewers_to_report_for(people_to_look_for),
    )

    if args.email_report != []:
        send_emails(args.email_report, args.sender, msg)


if __name__ == "__main__":
    main()