from __future__ import print_function

import argparse
import email.mime.multipart
import email.mime.text
import logging
import os
import pickle
import re
import smtplib
import subprocess
import sys
from datetime import datetime, timedelta

from phabricator import Phabricator

# Setting up a virtualenv to run this script can be done by running the
# following commands:
# $ virtualenv venv
# $ . ./venv/bin/activate
# $ pip install Phabricator
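
# An illustrative invocation (the script name and addresses below are
# placeholders; the flags are defined in main() at the bottom of this file):
# $ python <this_script>.py --email-addresses dev@example.com \
#       --email-report dev@example.com --sender bot@example.com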

GIT_REPO_METADATA = (("llvm-monorepo", "https://github.com/llvm/llvm-project"),
                     )

# The below PhabXXX classes represent objects as modelled by Phabricator.
# The classes can be serialized to disk, to try and make sure that we don't
# needlessly have to re-fetch lots of data from Phabricator, as that would
# make this script unusably slow.


class PhabObject:
    OBJECT_KIND = None

    def __init__(self, id):
        self.id = id


class PhabObjectCache:
    def __init__(self, PhabObjectClass):
        self.PhabObjectClass = PhabObjectClass
        self.most_recent_info = None
        self.oldest_info = None
        self.id2PhabObjects = {}

    def get_name(self):
        return self.PhabObjectClass.OBJECT_KIND + "sCache"

    def get(self, id):
        if id not in self.id2PhabObjects:
            self.id2PhabObjects[id] = self.PhabObjectClass(id)
        return self.id2PhabObjects[id]

    def get_ids_in_cache(self):
        return list(self.id2PhabObjects.keys())

    def get_objects(self):
        return list(self.id2PhabObjects.values())

    DEFAULT_DIRECTORY = "PhabObjectCache"

    def _get_pickle_name(self, directory):
        file_name = "Phab" + self.PhabObjectClass.OBJECT_KIND + "s.pickle"
        return os.path.join(directory, file_name)

    def populate_cache_from_disk(self, directory=DEFAULT_DIRECTORY):
        """
        FIXME: consider if serializing to JSON would bring interoperability
        advantages over serializing to pickle.
        """
        try:
            f = open(self._get_pickle_name(directory), "rb")
        except IOError as err:
            print("Could not find cache. Error message: {0}. Continuing..."
                  .format(err))
        else:
            with f:
                try:
                    d = pickle.load(f)
                    self.__dict__.update(d)
                except EOFError as err:
                    print("Cache seems to be corrupt. " +
                          "Not using cache. Error message: {0}".format(err))

    def write_cache_to_disk(self, directory=DEFAULT_DIRECTORY):
        if not os.path.exists(directory):
            os.makedirs(directory)
        with open(self._get_pickle_name(directory), "wb") as f:
            pickle.dump(self.__dict__, f)
        print("wrote cache to disk, most_recent_info= {0}".format(
            datetime.fromtimestamp(self.most_recent_info)
            if self.most_recent_info is not None else None))


class PhabReview(PhabObject):
    OBJECT_KIND = "Review"

    def __init__(self, id):
        PhabObject.__init__(self, id)

    def update(self, title, dateCreated, dateModified, author):
        self.title = title
        self.dateCreated = dateCreated
        self.dateModified = dateModified
        self.author = author

    def setPhabDiffs(self, phabDiffs):
        self.phabDiffs = phabDiffs


class PhabUser(PhabObject):
    OBJECT_KIND = "User"

    def __init__(self, id):
        PhabObject.__init__(self, id)

    def update(self, phid, realName):
        self.phid = phid
        self.realName = realName


class PhabHunk:
    def __init__(self, rest_api_hunk):
        self.oldOffset = int(rest_api_hunk["oldOffset"])
        self.oldLength = int(rest_api_hunk["oldLength"])
        # self.actual_lines_changed_offset will contain the offsets of the
        # lines that were changed in this hunk.
        self.actual_lines_changed_offset = []
        offset = self.oldOffset
        inHunk = False
        hunkStart = -1
        contextLines = 3
        for line in rest_api_hunk["corpus"].split("\n"):
            if line.startswith("+"):
                # line is a new line that got introduced in this patch.
                # Do not record it as a changed line.
                if inHunk is False:
                    inHunk = True
                    hunkStart = max(self.oldOffset, offset - contextLines)
                continue
            if line.startswith("-"):
                # line was changed or removed from the older version of the
                # code. Record it as a changed line.
                if inHunk is False:
                    inHunk = True
                    hunkStart = max(self.oldOffset, offset - contextLines)
                offset += 1
                continue
            # line is a context line.
            if inHunk is True:
                inHunk = False
                hunkEnd = offset + contextLines
                self.actual_lines_changed_offset.append((hunkStart, hunkEnd))
            offset += 1
        if inHunk is True:
            hunkEnd = offset + contextLines
            self.actual_lines_changed_offset.append((hunkStart, hunkEnd))

        # The above algorithm could result in adjacent or overlapping ranges
        # being recorded into self.actual_lines_changed_offset.
        # Merge the adjacent and overlapping ranges in there:
        t = []
        lastRange = None
        for start, end in self.actual_lines_changed_offset + \
                [(sys.maxsize, sys.maxsize)]:
            if lastRange is None:
                lastRange = (start, end)
            else:
                if lastRange[1] >= start:
                    lastRange = (lastRange[0], end)
                else:
                    t.append(lastRange)
                    lastRange = (start, end)
        self.actual_lines_changed_offset = t


class PhabChange:
    def __init__(self, rest_api_change):
        self.oldPath = rest_api_change["oldPath"]
        self.hunks = [PhabHunk(h) for h in rest_api_change["hunks"]]


class PhabDiff(PhabObject):
    OBJECT_KIND = "Diff"

    def __init__(self, id):
        PhabObject.__init__(self, id)

    def update(self, rest_api_results):
        self.revisionID = rest_api_results["revisionID"]
        self.dateModified = int(rest_api_results["dateModified"])
        self.dateCreated = int(rest_api_results["dateCreated"])
        self.changes = [PhabChange(c) for c in rest_api_results["changes"]]


class ReviewsCache(PhabObjectCache):
    def __init__(self):
        PhabObjectCache.__init__(self, PhabReview)


class UsersCache(PhabObjectCache):
    def __init__(self):
        PhabObjectCache.__init__(self, PhabUser)


reviews_cache = ReviewsCache()
users_cache = UsersCache()


def init_phab_connection():
    phab = Phabricator()
    phab.update_interfaces()
    return phab


def update_cached_info(phab, cache, phab_query, order, record_results,
                       max_nr_entries_per_fetch, max_nr_days_to_cache):
    q = phab
    LIMIT = max_nr_entries_per_fetch
    for query_step in phab_query:
        q = getattr(q, query_step)
    results = q(order=order, limit=LIMIT)
    most_recent_info, oldest_info = record_results(cache, results, phab)
    oldest_info_to_fetch = datetime.fromtimestamp(most_recent_info) - \
        timedelta(days=max_nr_days_to_cache)
    most_recent_info_overall = most_recent_info
    cache.write_cache_to_disk()
    after = results["cursor"]["after"]
    print("after: {0!r}".format(after))
    print("most_recent_info: {0}".format(
        datetime.fromtimestamp(most_recent_info)))
    while (after is not None
           and datetime.fromtimestamp(oldest_info) > oldest_info_to_fetch):
        need_more_older_data = \
            (cache.oldest_info is None or
             datetime.fromtimestamp(cache.oldest_info) > oldest_info_to_fetch)
        print(("need_more_older_data={0} cache.oldest_info={1} " +
               "oldest_info_to_fetch={2}").format(
                   need_more_older_data,
                   datetime.fromtimestamp(cache.oldest_info)
                   if cache.oldest_info is not None else None,
                   oldest_info_to_fetch))
        need_more_newer_data = \
            (cache.most_recent_info is None or
             cache.most_recent_info < most_recent_info)
        print(("need_more_newer_data={0} cache.most_recent_info={1} " +
               "most_recent_info={2}")
              .format(need_more_newer_data, cache.most_recent_info,
                      most_recent_info))
        if not need_more_older_data and not need_more_newer_data:
            break
        results = q(order=order, after=after, limit=LIMIT)
        most_recent_info, oldest_info = record_results(cache, results, phab)
        after = results["cursor"]["after"]
        print("after: {0!r}".format(after))
        print("most_recent_info: {0}".format(
            datetime.fromtimestamp(most_recent_info)))
        cache.write_cache_to_disk()
    cache.most_recent_info = most_recent_info_overall
    if after is None:
        # We did fetch all records. Mark the cache to contain all info since
        # the start of time.
        oldest_info = 0
    cache.oldest_info = oldest_info
    cache.write_cache_to_disk()


def record_reviews(cache, reviews, phab):
    most_recent_info = None
    oldest_info = None
    for reviewInfo in reviews["data"]:
        if reviewInfo["type"] != "DREV":
            continue
        id = reviewInfo["id"]
        # phid = reviewInfo["phid"]
        dateModified = int(reviewInfo["fields"]["dateModified"])
        dateCreated = int(reviewInfo["fields"]["dateCreated"])
        title = reviewInfo["fields"]["title"]
        author = reviewInfo["fields"]["authorPHID"]
        phabReview = cache.get(id)
        if "dateModified" not in phabReview.__dict__ or \
                dateModified > phabReview.dateModified:
            diff_results = phab.differential.querydiffs(revisionIDs=[id])
            diff_ids = sorted(diff_results.keys())
            phabDiffs = []
            for diff_id in diff_ids:
                diffInfo = diff_results[diff_id]
                d = PhabDiff(diff_id)
                d.update(diffInfo)
                phabDiffs.append(d)
            phabReview.update(title, dateCreated, dateModified, author)
            phabReview.setPhabDiffs(phabDiffs)
            print("Updated D{0} modified on {1} ({2} diffs)".format(
                id, datetime.fromtimestamp(dateModified), len(phabDiffs)))

        if most_recent_info is None:
            most_recent_info = dateModified
        elif most_recent_info < dateModified:
            most_recent_info = dateModified

        if oldest_info is None:
            oldest_info = dateModified
        elif oldest_info > dateModified:
            oldest_info = dateModified
    return most_recent_info, oldest_info


def record_users(cache, users, phab):
    most_recent_info = None
    oldest_info = None
    for info in users["data"]:
        if info["type"] != "USER":
            continue
        id = info["id"]
        phid = info["phid"]
        dateModified = int(info["fields"]["dateModified"])
        # dateCreated = int(info["fields"]["dateCreated"])
        realName = info["fields"]["realName"]
        phabUser = cache.get(id)
        phabUser.update(phid, realName)
        if most_recent_info is None:
            most_recent_info = dateModified
        elif most_recent_info < dateModified:
            most_recent_info = dateModified
        if oldest_info is None:
            oldest_info = dateModified
        elif oldest_info > dateModified:
            oldest_info = dateModified
    return most_recent_info, oldest_info


PHABCACHESINFO = ((reviews_cache, ("differential", "revision", "search"),
                   "updated", record_reviews, 5, 7),
                  (users_cache, ("user", "search"), "newest", record_users,
                   100, 1000))


def load_cache():
    for cache, phab_query, order, record_results, _, _ in PHABCACHESINFO:
        cache.populate_cache_from_disk()
        print("Loaded {0} nr entries: {1}".format(
            cache.get_name(), len(cache.get_ids_in_cache())))
        print("Loaded {0} has most recent info: {1}".format(
            cache.get_name(),
            datetime.fromtimestamp(cache.most_recent_info)
            if cache.most_recent_info is not None else None))


def update_cache(phab):
    load_cache()
    for cache, phab_query, order, record_results, max_nr_entries_per_fetch, \
            max_nr_days_to_cache in PHABCACHESINFO:
        update_cached_info(phab, cache, phab_query, order, record_results,
                           max_nr_entries_per_fetch, max_nr_days_to_cache)
        ids_in_cache = cache.get_ids_in_cache()
        print("{0} objects in {1}".format(len(ids_in_cache), cache.get_name()))
        cache.write_cache_to_disk()


def get_most_recent_reviews(days):
    newest_reviews = sorted(
        reviews_cache.get_objects(), key=lambda r: -r.dateModified)
    if len(newest_reviews) == 0:
        return newest_reviews
    most_recent_review_time = \
        datetime.fromtimestamp(newest_reviews[0].dateModified)
    cut_off_date = most_recent_review_time - timedelta(days=days)
    result = []
    for review in newest_reviews:
        if datetime.fromtimestamp(review.dateModified) < cut_off_date:
            # newest_reviews is sorted on dateModified, so nothing newer can
            # follow; stop early.
            return result
        result.append(review)
    return result


# All of the above code is about fetching data from Phabricator and caching it
# on local disk. The below code contains the actual "business logic" for this
# script.

_userphid2realname = None


def get_real_name_from_author(user_phid):
    global _userphid2realname
    if _userphid2realname is None:
        _userphid2realname = {}
        for user in users_cache.get_objects():
            _userphid2realname[user.phid] = user.realName
    return _userphid2realname.get(user_phid, "unknown")


def print_most_recent_reviews(phab, days, filter_reviewers):
    msgs = []

    def add_msg(msg):
        msgs.append(msg)
        print(msg.encode('utf-8'))

    newest_reviews = get_most_recent_reviews(days)
    add_msg(u"These are the reviews that look interesting to be reviewed. " +
            u"The report below has 2 sections. The first " +
            u"section is organized per review; the second section is " +
            u"organized per potential reviewer.\n")
    oldest_review = newest_reviews[-1] if len(newest_reviews) > 0 else None
    oldest_datetime = \
        datetime.fromtimestamp(oldest_review.dateModified) \
        if oldest_review else None
    add_msg((u"The report below is based on analyzing the reviews that got " +
             u"touched in the past {0} days (since {1}). " +
             u"The script found {2} such reviews.\n").format(
                 days, oldest_datetime, len(newest_reviews)))
    reviewer2reviews_and_scores = {}
    for i, review in enumerate(newest_reviews):
        matched_reviewers = find_reviewers_for_review(review)
        matched_reviewers = filter_reviewers(matched_reviewers)
        if len(matched_reviewers) == 0:
            continue
        add_msg((u"{0:>3}. https://reviews.llvm.org/D{1} by {2}\n     {3}\n" +
                 u"     Last updated on {4}").format(
                     i, review.id,
                     get_real_name_from_author(review.author), review.title,
                     datetime.fromtimestamp(review.dateModified)))
        for reviewer, scores in matched_reviewers:
            add_msg(u"    potential reviewer {0}, score {1}".format(
                reviewer,
                "(" + "/".join(["{0:.1f}%".format(s) for s in scores]) + ")"))
            if reviewer not in reviewer2reviews_and_scores:
                reviewer2reviews_and_scores[reviewer] = []
            reviewer2reviews_and_scores[reviewer].append((review, scores))

    # Print out a summary per reviewer.
    for reviewer in sorted(reviewer2reviews_and_scores.keys()):
        reviews_and_scores = reviewer2reviews_and_scores[reviewer]
        reviews_and_scores.sort(key=lambda rs: rs[1], reverse=True)
        add_msg(u"\n\nSUMMARY FOR {0} (found {1} reviews):".format(
            reviewer, len(reviews_and_scores)))
        for review, scores in reviews_and_scores:
            add_msg(u"[{0}] https://reviews.llvm.org/D{1} '{2}' by {3}".format(
                "/".join(["{0:.1f}%".format(s) for s in scores]), review.id,
                review.title, get_real_name_from_author(review.author)))
    return "\n".join(msgs)


def get_git_cmd_output(cmd):
    output = None
    try:
        logging.debug(cmd)
        output = subprocess.check_output(
            cmd, shell=True, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        logging.debug(str(e))
    if output is None:
        return None
    return output.decode("utf-8", errors='ignore')


reAuthorMail = re.compile("^author-mail <([^>]*)>.*$")


def parse_blame_output_line_porcelain(blame_output_lines):
    email2nr_occurences = {}
    if blame_output_lines is None:
        return email2nr_occurences
    for line in blame_output_lines:
        m = reAuthorMail.match(line)
        if m:
            author_email_address = m.group(1)
            if author_email_address not in email2nr_occurences:
                email2nr_occurences[author_email_address] = 1
            else:
                email2nr_occurences[author_email_address] += 1
    return email2nr_occurences
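
# Illustrative example (addresses hypothetical): blame output containing
# "author-mail <a@b.org>" twice and "author-mail <c@d.org>" once produces
# {"a@b.org": 2, "c@d.org": 1}.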


class BlameOutputCache:
    def __init__(self):
        self.cache = {}

    def _populate_cache_for(self, cache_key):
        assert cache_key not in self.cache
        git_repo, base_revision, path = cache_key
        cmd = ("git -C {0} blame --encoding=utf-8 --date iso -f -e -w " +
               "--line-porcelain {1} -- {2}").format(git_repo, base_revision,
                                                     path)
        blame_output = get_git_cmd_output(cmd)
        self.cache[cache_key] = \
            blame_output.split('\n') if blame_output is not None else None
        # FIXME: the blame cache could probably be made more effective still if
        # instead of storing the requested base_revision in the cache, the last
        # revision before the base revision this file/path got changed in gets
        # stored. That way multiple project revisions for which this specific
        # file/patch hasn't changed would get cache hits (instead of misses in
        # the current implementation).

    def get_blame_output_for(self, git_repo, base_revision, path,
                             start_line=-1, end_line=-1):
        cache_key = (git_repo, base_revision, path)
        if cache_key not in self.cache:
            self._populate_cache_for(cache_key)
        assert cache_key in self.cache
        all_blame_lines = self.cache[cache_key]
        if all_blame_lines is None:
            return None
        if start_line == -1 and end_line == -1:
            return all_blame_lines
        assert start_line >= 0
        assert end_line >= 0
        assert end_line <= len(all_blame_lines)
        assert start_line <= len(all_blame_lines)
        assert start_line <= end_line
        return all_blame_lines[start_line:end_line]

    def get_parsed_git_blame_for(self, git_repo, base_revision, path,
                                 start_line=-1, end_line=-1):
        return parse_blame_output_line_porcelain(
            self.get_blame_output_for(git_repo, base_revision, path,
                                      start_line, end_line))


blameOutputCache = BlameOutputCache()


def find_reviewers_for_diff_heuristic(diff):
    # Heuristic 1: assume good reviewers are the ones that touched the same
    # lines before as this patch is touching.
    # Heuristic 2: assume good reviewers are the ones that touched the same
    # files before as this patch is touching.
    reviewers2nr_lines_touched = {}
    reviewers2nr_files_touched = {}
    # Assume last revision before diff was modified is the revision the diff
    # applies to.
    assert len(GIT_REPO_METADATA) == 1
    git_repo = os.path.join("git_repos", GIT_REPO_METADATA[0][0])
    cmd = 'git -C {0} rev-list -n 1 --before="{1}" master'.format(
        git_repo,
        datetime.fromtimestamp(
            diff.dateModified).strftime("%Y-%m-%d %H:%M:%S"))
    base_revision = get_git_cmd_output(cmd).strip()
    logging.debug("Base revision={0}".format(base_revision))
    for change in diff.changes:
        path = change.oldPath
        # Compute heuristic 1: look at context of patch lines.
        for hunk in change.hunks:
            for start_line, end_line in hunk.actual_lines_changed_offset:
                # Collect git blame results for authors in those ranges.
                for reviewer, nr_occurences in \
                        blameOutputCache.get_parsed_git_blame_for(
                            git_repo, base_revision, path, start_line,
                            end_line).items():
                    if reviewer not in reviewers2nr_lines_touched:
                        reviewers2nr_lines_touched[reviewer] = 0
                    reviewers2nr_lines_touched[reviewer] += nr_occurences
        # Compute heuristic 2: don't look at context, just at files touched.
        # Collect git blame results for authors in those ranges.
        for reviewer, nr_occurences in \
                blameOutputCache.get_parsed_git_blame_for(
                    git_repo, base_revision, path).items():
            if reviewer not in reviewers2nr_files_touched:
                reviewers2nr_files_touched[reviewer] = 0
            reviewers2nr_files_touched[reviewer] += 1

    # Compute "match scores"
    total_nr_lines = sum(reviewers2nr_lines_touched.values())
    total_nr_files = len(diff.changes)
    reviewers_matchscores = \
        [(reviewer,
          (reviewers2nr_lines_touched.get(reviewer, 0)*100.0/total_nr_lines
           if total_nr_lines != 0 else 0,
           reviewers2nr_files_touched[reviewer]*100.0/total_nr_files
           if total_nr_files != 0 else 0))
         for reviewer, nr_lines
         in reviewers2nr_files_touched.items()]
    reviewers_matchscores.sort(key=lambda i: i[1], reverse=True)
    return reviewers_matchscores
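
# Illustrative result (names and numbers hypothetical):
#   [("jane@example.com", (40.0, 75.0)), ("joe@example.com", (10.0, 25.0))]
# i.e. (reviewer, (% of blamed context lines, % of files touched)), sorted by
# descending score tuple.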


def find_reviewers_for_review(review):
    # Process the newest diff first.
    diffs = sorted(
        review.phabDiffs, key=lambda d: d.dateModified, reverse=True)
    if len(diffs) == 0:
        return
    diff = diffs[0]
    matched_reviewers = find_reviewers_for_diff_heuristic(diff)
    # Show progress, as this is a slow operation:
    sys.stdout.write('.')
    sys.stdout.flush()
    logging.debug(u"matched_reviewers: {0}".format(matched_reviewers))
    return matched_reviewers


def update_git_repos():
    git_repos_directory = "git_repos"
    for name, url in GIT_REPO_METADATA:
        dirname = os.path.join(git_repos_directory, name)
        if not os.path.exists(dirname):
            cmd = "git clone {0} {1}".format(url, dirname)
            output = get_git_cmd_output(cmd)
        cmd = "git -C {0} pull --rebase".format(dirname)
        output = get_git_cmd_output(cmd)


def send_emails(email_addresses, sender, msg):
    s = smtplib.SMTP()
    s.connect()
    for email_address in email_addresses:
        email_msg = email.mime.multipart.MIMEMultipart()
        email_msg['From'] = sender
        email_msg['To'] = email_address
        email_msg['Subject'] = 'LLVM patches you may be able to review.'
        email_msg.attach(
            email.mime.text.MIMEText(msg.encode('utf-8'), 'plain'))
        # python 3.x: s.send_message(email_msg)
        s.sendmail(email_msg['From'], email_msg['To'], email_msg.as_string())
    s.quit()


def filter_reviewers_to_report_for(people_to_look_for):
    # The below is just an example filter, to only report potential reviews
    # to do for the people that will receive the report email.
    return lambda potential_reviewers: [r for r in potential_reviewers
                                        if r[0] in people_to_look_for]


def main():
    parser = argparse.ArgumentParser(
        description='Match open reviews to potential reviewers.')
    parser.add_argument(
        '--no-update-cache',
        dest='update_cache',
        action='store_false',
        default=True,
        help='Do not update cached Phabricator objects')
    parser.add_argument(
        '--email-report',
        dest='email_report',
        nargs='*',
        default="",
        help="The email addresses to send the report to.")
    parser.add_argument(
        '--sender',
        dest='sender',
        default="",
        help="The email address to use in 'From' on messages emailed out.")
    parser.add_argument(
        '--email-addresses',
        dest='email_addresses',
        nargs='*',
        help="The email addresses (as known by LLVM git) of " +
             "the people to look for reviews for.")
    parser.add_argument('--verbose', '-v', action='count')

    args = parser.parse_args()

    if args.verbose >= 1:
        logging.basicConfig(level=logging.DEBUG)

    people_to_look_for = [e.decode('utf-8') for e in args.email_addresses]
    logging.debug("Will look for reviews that the following contributors " +
                  "could review: {}".format(people_to_look_for))
    logging.debug("Will email a report to: {}".format(args.email_report))

    phab = init_phab_connection()

    if args.update_cache:
        update_cache(phab)

    load_cache()
    update_git_repos()
    msg = print_most_recent_reviews(
        phab,
        days=1,
        filter_reviewers=filter_reviewers_to_report_for(people_to_look_for))

    if args.email_report != []:
        send_emails(args.email_report, args.sender, msg)


if __name__ == "__main__":
    main()