#!/usr/bin/env python

from __future__ import print_function

import argparse
import email.mime.multipart
import email.mime.text
import logging
import os
import pickle
import re
import smtplib
import subprocess
import sys
from datetime import datetime, timedelta

from phabricator import Phabricator
# Setting up a virtualenv to run this script can be done by running the
# following commands:
# $ virtualenv venv
# $ . ./venv/bin/activate
# $ pip install Phabricator

GIT_REPO_METADATA = (("llvm", "https://llvm.org/git/llvm.git"), )

# The below PhabXXX classes represent objects as modelled by Phabricator.
# The classes can be serialized to disk, to try and make sure that we don't
# needlessly have to re-fetch lots of data from Phabricator, as that would
# make this script unusably slow.


class PhabObject:
    OBJECT_KIND = None

    def __init__(self, id):
        self.id = id


class PhabObjectCache:
    def __init__(self, PhabObjectClass):
        self.PhabObjectClass = PhabObjectClass
        self.most_recent_info = None
        self.oldest_info = None
        self.id2PhabObjects = {}

    def get_name(self):
        return self.PhabObjectClass.OBJECT_KIND + "sCache"

    def get(self, id):
        if id not in self.id2PhabObjects:
            self.id2PhabObjects[id] = self.PhabObjectClass(id)
        return self.id2PhabObjects[id]

    def get_ids_in_cache(self):
        return self.id2PhabObjects.keys()

    def get_objects(self):
        return self.id2PhabObjects.values()

    DEFAULT_DIRECTORY = "PhabObjectCache"

    def _get_pickle_name(self, directory):
        file_name = "Phab" + self.PhabObjectClass.OBJECT_KIND + "s.pickle"
        return os.path.join(directory, file_name)

    def populate_cache_from_disk(self, directory=DEFAULT_DIRECTORY):
        """
        FIXME: consider if serializing to JSON would bring interoperability
        advantages over serializing to pickle.
        """
        try:
            f = open(self._get_pickle_name(directory), "rb")
        except IOError as err:
            print("Could not find cache. Error message: {0}. Continuing..."
                  .format(err))
        else:
            with f:
                try:
                    d = pickle.load(f)
                    self.__dict__.update(d)
                except EOFError as err:
                    print("Cache seems to be corrupt. " +
                          "Not using cache. Error message: {0}".format(err))

    def write_cache_to_disk(self, directory=DEFAULT_DIRECTORY):
        if not os.path.exists(directory):
            os.makedirs(directory)
        with open(self._get_pickle_name(directory), "wb") as f:
            pickle.dump(self.__dict__, f)
        print("wrote cache to disk, most_recent_info= {0}".format(
            datetime.fromtimestamp(self.most_recent_info)
            if self.most_recent_info is not None else None))


class PhabReview(PhabObject):
    OBJECT_KIND = "Review"

    def __init__(self, id):
        PhabObject.__init__(self, id)

    def update(self, title, dateCreated, dateModified, author):
        self.title = title
        self.dateCreated = dateCreated
        self.dateModified = dateModified
        self.author = author

    def setPhabDiffs(self, phabDiffs):
        self.phabDiffs = phabDiffs


class PhabUser(PhabObject):
    OBJECT_KIND = "User"

    def __init__(self, id):
        PhabObject.__init__(self, id)

    def update(self, phid, realName):
        self.phid = phid
        self.realName = realName


class PhabHunk:
    def __init__(self, rest_api_hunk):
        self.oldOffset = int(rest_api_hunk["oldOffset"])
        self.oldLength = int(rest_api_hunk["oldLength"])
        # self.actual_lines_changed_offset will contain the offsets of the
        # lines that were changed in this hunk.
        self.actual_lines_changed_offset = []
        offset = self.oldOffset
        hunkStart = None
        hunkEnd = None
        contextLines = 3
        for line in rest_api_hunk["corpus"].split("\n"):
            if line.startswith("+"):
                # line is a new line that got introduced in this patch.
                # Do not record it as a changed line.
                if hunkStart is None:
                    hunkStart = max(self.oldOffset, offset - contextLines)
                continue
            if line.startswith("-"):
                # line was changed or removed from the older version of the
                # code. Record it as a changed line.
                if hunkStart is None:
                    hunkStart = max(self.oldOffset, offset - contextLines)
                offset += 1
                continue
            # line is a context line.
            if hunkStart is not None:
                hunkEnd = offset + contextLines
                self.actual_lines_changed_offset.append((hunkStart, hunkEnd))
                hunkStart = None
            offset += 1
        if hunkStart is not None:
            hunkEnd = offset + contextLines
            self.actual_lines_changed_offset.append((hunkStart, hunkEnd))

        # The above algorithm could result in adjacent or overlapping ranges
        # being recorded into self.actual_lines_changed_offset.
        # Merge the adjacent and overlapping ranges in there:
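        # For example (illustrative values): [(10, 18), (16, 25), (40, 44)]
        # would be merged into [(10, 25), (40, 44)].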
        t = []
        lastRange = None
        for start, end in self.actual_lines_changed_offset + \
                [(sys.maxsize, sys.maxsize)]:
            if lastRange is None:
                lastRange = (start, end)
            else:
                if lastRange[1] >= start:
                    lastRange = (lastRange[0], end)
                else:
                    t.append(lastRange)
                    lastRange = (start, end)
        self.actual_lines_changed_offset = t


class PhabChange:
    def __init__(self, rest_api_change):
        self.oldPath = rest_api_change["oldPath"]
        self.hunks = [PhabHunk(h) for h in rest_api_change["hunks"]]


class PhabDiff(PhabObject):
    OBJECT_KIND = "Diff"

    def __init__(self, id):
        PhabObject.__init__(self, id)

    def update(self, rest_api_results):
        self.revisionID = rest_api_results["revisionID"]
        self.dateModified = int(rest_api_results["dateModified"])
        self.dateCreated = int(rest_api_results["dateCreated"])
        self.changes = [PhabChange(c) for c in rest_api_results["changes"]]


class ReviewsCache(PhabObjectCache):
    def __init__(self):
        PhabObjectCache.__init__(self, PhabReview)


class UsersCache(PhabObjectCache):
    def __init__(self):
        PhabObjectCache.__init__(self, PhabUser)


reviews_cache = ReviewsCache()
users_cache = UsersCache()


def init_phab_connection():
    phab = Phabricator()
    phab.update_interfaces()
    return phab


def update_cached_info(phab, cache, phab_query, order, record_results,
                       max_nr_entries_per_fetch, max_nr_days_to_cache):
    q = phab
    LIMIT = max_nr_entries_per_fetch
    for query_step in phab_query:
        q = getattr(q, query_step)
    results = q(order=order, limit=LIMIT)
    most_recent_info, oldest_info = record_results(cache, results, phab)
    oldest_info_to_fetch = datetime.fromtimestamp(most_recent_info) - \
        timedelta(days=max_nr_days_to_cache)
    most_recent_info_overall = most_recent_info
    cache.write_cache_to_disk()
    after = results["cursor"]["after"]
    print("after: {0!r}".format(after))
    print("most_recent_info: {0}".format(
        datetime.fromtimestamp(most_recent_info)))
    while (after is not None
           and datetime.fromtimestamp(oldest_info) > oldest_info_to_fetch):
        need_more_older_data = \
            (cache.oldest_info is None or
             datetime.fromtimestamp(cache.oldest_info) > oldest_info_to_fetch)
        print(("need_more_older_data={0} cache.oldest_info={1} " +
               "oldest_info_to_fetch={2}").format(
                   need_more_older_data,
                   datetime.fromtimestamp(cache.oldest_info)
                   if cache.oldest_info is not None else None,
                   oldest_info_to_fetch))
        need_more_newer_data = \
            (cache.most_recent_info is None or
             cache.most_recent_info < most_recent_info)
        print(("need_more_newer_data={0} cache.most_recent_info={1} " +
               "most_recent_info={2}")
              .format(need_more_newer_data, cache.most_recent_info,
                      most_recent_info))
        if not need_more_older_data and not need_more_newer_data:
            break
        results = q(order=order, after=after, limit=LIMIT)
        most_recent_info, oldest_info = record_results(cache, results, phab)
        after = results["cursor"]["after"]
        print("after: {0!r}".format(after))
        print("most_recent_info: {0}".format(
            datetime.fromtimestamp(most_recent_info)))
        cache.write_cache_to_disk()
    cache.most_recent_info = most_recent_info_overall
    if after is None:
        # We did fetch all records. Mark the cache to contain all info since
        # the start of time.
        oldest_info = 0
    cache.oldest_info = oldest_info
    cache.write_cache_to_disk()


def record_reviews(cache, reviews, phab):
    most_recent_info = None
    oldest_info = None
    for reviewInfo in reviews["data"]:
        if reviewInfo["type"] != "DREV":
            continue
        id = reviewInfo["id"]
        # phid = reviewInfo["phid"]
        dateModified = int(reviewInfo["fields"]["dateModified"])
        dateCreated = int(reviewInfo["fields"]["dateCreated"])
        title = reviewInfo["fields"]["title"]
        author = reviewInfo["fields"]["authorPHID"]
        phabReview = cache.get(id)
279 if "dateModified" not in phabReview
.__dict
__ or \
280 dateModified
> phabReview
.dateModified
:
281 diff_results
= phab
.differential
.querydiffs(revisionIDs
=[id])
282 diff_ids
= sorted(diff_results
.keys())
284 for diff_id
in diff_ids
:
285 diffInfo
= diff_results
[diff_id
]
286 d
= PhabDiff(diff_id
)
289 phabReview
.update(title
, dateCreated
, dateModified
, author
)
290 phabReview
.setPhabDiffs(phabDiffs
)
291 print("Updated D{0} modified on {1} ({2} diffs)".format(
292 id, datetime
.fromtimestamp(dateModified
), len(phabDiffs
)))

        if most_recent_info is None:
            most_recent_info = dateModified
        elif most_recent_info < dateModified:
            most_recent_info = dateModified

        if oldest_info is None:
            oldest_info = dateModified
        elif oldest_info > dateModified:
            oldest_info = dateModified
    return most_recent_info, oldest_info


def record_users(cache, users, phab):
    most_recent_info = None
    oldest_info = None
    for info in users["data"]:
        if info["type"] != "USER":
            continue
        id = info["id"]
        phid = info["phid"]
        dateModified = int(info["fields"]["dateModified"])
        # dateCreated = int(info["fields"]["dateCreated"])
        realName = info["fields"]["realName"]
        phabUser = cache.get(id)
        phabUser.update(phid, realName)
        if most_recent_info is None:
            most_recent_info = dateModified
        elif most_recent_info < dateModified:
            most_recent_info = dateModified
        if oldest_info is None:
            oldest_info = dateModified
        elif oldest_info > dateModified:
            oldest_info = dateModified
    return most_recent_info, oldest_info

PHABCACHESINFO = ((reviews_cache, ("differential", "revision", "search"),
                   "updated", record_reviews, 5, 7),
                  (users_cache, ("user", "search"), "newest", record_users,
                   100, 1000))


def load_cache():
    for cache, phab_query, order, record_results, _, _ in PHABCACHESINFO:
        cache.populate_cache_from_disk()
        print("Loaded {0} nr entries: {1}".format(
            cache.get_name(), len(cache.get_ids_in_cache())))
        print("Loaded {0} has most recent info: {1}".format(
            cache.get_name(),
            datetime.fromtimestamp(cache.most_recent_info)
            if cache.most_recent_info is not None else None))


def update_cache(phab):
    load_cache()
    for cache, phab_query, order, record_results, max_nr_entries_per_fetch, \
            max_nr_days_to_cache in PHABCACHESINFO:
        update_cached_info(phab, cache, phab_query, order, record_results,
                           max_nr_entries_per_fetch, max_nr_days_to_cache)
        ids_in_cache = cache.get_ids_in_cache()
        print("{0} objects in {1}".format(len(ids_in_cache),
                                          cache.get_name()))
        cache.write_cache_to_disk()


def get_most_recent_reviews(days):
    newest_reviews = sorted(
        reviews_cache.get_objects(), key=lambda r: -r.dateModified)
    if len(newest_reviews) == 0:
        return newest_reviews
    most_recent_review_time = \
        datetime.fromtimestamp(newest_reviews[0].dateModified)
    cut_off_date = most_recent_review_time - timedelta(days=days)
    result = []
    for review in newest_reviews:
        if datetime.fromtimestamp(review.dateModified) < cut_off_date:
            return result
        result.append(review)
    return result

# All of the above code is about fetching data from Phabricator and caching it
# on local disk. The below code contains the actual "business logic" for this
# script.

_userphid2realname = None


def get_real_name_from_author(user_phid):
    global _userphid2realname
    if _userphid2realname is None:
        _userphid2realname = {}
        for user in users_cache.get_objects():
            _userphid2realname[user.phid] = user.realName
    return _userphid2realname.get(user_phid, "unknown")


def print_most_recent_reviews(phab, days, filter_reviewers):
    msgs = []

    def add_msg(msg):
        msgs.append(msg)
        print(msg.encode('utf-8'))

    newest_reviews = get_most_recent_reviews(days)
    add_msg(u"These are the reviews that look interesting to review. " +
            u"The report below has 2 sections. The first " +
            u"section is organized per review; the second section is " +
            u"organized per potential reviewer.\n")
    oldest_review = newest_reviews[-1] if len(newest_reviews) > 0 else None
    oldest_datetime = \
        datetime.fromtimestamp(oldest_review.dateModified) \
        if oldest_review else None
    add_msg((u"The report below is based on analyzing the reviews that got " +
             u"touched in the past {0} days (since {1}). " +
             u"The script found {2} such reviews.\n").format(
                 days, oldest_datetime, len(newest_reviews)))
    reviewer2reviews_and_scores = {}
    for i, review in enumerate(newest_reviews):
        matched_reviewers = find_reviewers_for_review(review)
        matched_reviewers = filter_reviewers(matched_reviewers)
        if len(matched_reviewers) == 0:
            continue
        add_msg((u"{0:>3}. https://reviews.llvm.org/D{1} by {2}\n     {3}\n" +
                 u"     Last updated on {4}").format(
                     i, review.id,
                     get_real_name_from_author(review.author), review.title,
                     datetime.fromtimestamp(review.dateModified)))
        for reviewer, scores in matched_reviewers:
            add_msg(u"    potential reviewer {0}, score {1}".format(
                reviewer,
                "(" + "/".join(["{0:.1f}%".format(s) for s in scores]) + ")"))
            if reviewer not in reviewer2reviews_and_scores:
                reviewer2reviews_and_scores[reviewer] = []
            reviewer2reviews_and_scores[reviewer].append((review, scores))

    # Print out a summary per reviewer.
    for reviewer in sorted(reviewer2reviews_and_scores.keys()):
        reviews_and_scores = reviewer2reviews_and_scores[reviewer]
        reviews_and_scores.sort(key=lambda rs: rs[1], reverse=True)
        add_msg(u"\n\nSUMMARY FOR {0} (found {1} reviews):".format(
            reviewer, len(reviews_and_scores)))
        for review, scores in reviews_and_scores:
            add_msg(u"[{0}] https://reviews.llvm.org/D{1} '{2}' by {3}".format(
                "/".join(["{0:.1f}%".format(s) for s in scores]), review.id,
                review.title, get_real_name_from_author(review.author)))
    return "\n".join(msgs)


def get_git_cmd_output(cmd):
    output = None
    try:
        logging.debug(cmd)
        output = subprocess.check_output(
            cmd, shell=True, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        logging.debug(str(e))
    if output is None:
        return None
    return output.decode("utf-8", errors='ignore')


reAuthorMail = re.compile("^author-mail <([^>]*)>.*$")


def parse_blame_output_line_porcelain(blame_output):
    email2nr_occurences = {}
    if blame_output is None:
        return email2nr_occurences
    for line in blame_output.split('\n'):
        m = reAuthorMail.match(line)
        if m:
            author_email_address = m.group(1)
            if author_email_address not in email2nr_occurences:
                email2nr_occurences[author_email_address] = 1
            else:
                email2nr_occurences[author_email_address] += 1
    return email2nr_occurences
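
# Illustrative example (hypothetical addresses): blame output containing the
# header line "author-mail <alice@example.com>" twice and
# "author-mail <bob@example.com>" once yields
# {"alice@example.com": 2, "bob@example.com": 1}.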


def find_reviewers_for_diff_heuristic(diff):
    # Heuristic 1: assume good reviewers are the ones that touched the same
    # lines before as this patch is touching.
    # Heuristic 2: assume good reviewers are the ones that touched the same
    # files before as this patch is touching.
    reviewers2nr_lines_touched = {}
    reviewers2nr_files_touched = {}
    # Assume last revision before diff was modified is the revision the diff
    # applies to.
    git_repo = "git_repos/llvm"
    cmd = 'git -C {0} rev-list -n 1 --before="{1}" master'.format(
        git_repo,
        datetime.fromtimestamp(
            diff.dateModified).strftime("%Y-%m-%d %H:%M:%S"))
    base_revision = get_git_cmd_output(cmd).strip()
    logging.debug("Base revision={0}".format(base_revision))
    for change in diff.changes:
        path = change.oldPath
        # Compute heuristic 1: look at context of patch lines.
        for hunk in change.hunks:
            for start_line, end_line in hunk.actual_lines_changed_offset:
                # Collect git blame results for authors in those ranges.
                cmd = ("git -C {0} blame --encoding=utf-8 --date iso -f -e " +
                       "-w --line-porcelain -L {1},{2} {3} -- {4}").format(
                           git_repo, start_line, end_line, base_revision,
                           path)
                blame_output = get_git_cmd_output(cmd)
                for reviewer, nr_occurences in \
                        parse_blame_output_line_porcelain(
                            blame_output).items():
                    if reviewer not in reviewers2nr_lines_touched:
                        reviewers2nr_lines_touched[reviewer] = 0
                    reviewers2nr_lines_touched[reviewer] += nr_occurences
        # Compute heuristic 2: don't look at context, just at files touched.
        # Collect git blame results for authors in those ranges.
        cmd = ("git -C {0} blame --encoding=utf-8 --date iso -f -e -w " +
               "--line-porcelain {1} -- {2}").format(git_repo, base_revision,
                                                     path)
        blame_output = get_git_cmd_output(cmd)
        for reviewer, nr_occurences in parse_blame_output_line_porcelain(
                blame_output).items():
            if reviewer not in reviewers2nr_files_touched:
                reviewers2nr_files_touched[reviewer] = 0
            reviewers2nr_files_touched[reviewer] += 1

    # Compute "match scores"
    total_nr_lines = sum(reviewers2nr_lines_touched.values())
    total_nr_files = len(diff.changes)
    reviewers_matchscores = \
        [(reviewer,
          (reviewers2nr_lines_touched.get(reviewer, 0) * 100.0 /
           total_nr_lines if total_nr_lines != 0 else 0,
           reviewers2nr_files_touched[reviewer] * 100.0 / total_nr_files
           if total_nr_files != 0 else 0))
         for reviewer, nr_files
         in reviewers2nr_files_touched.items()]
    reviewers_matchscores.sort(key=lambda i: i[1], reverse=True)
    return reviewers_matchscores


def find_reviewers_for_review(review):
    # Process the newest diff first.
    diffs = sorted(
        review.phabDiffs, key=lambda d: d.dateModified, reverse=True)
    if len(diffs) == 0:
        return
    diff = diffs[0]
    matched_reviewers = find_reviewers_for_diff_heuristic(diff)
    # Show progress, as this is a slow operation:
    sys.stdout.write('.')
    sys.stdout.flush()
    logging.debug(u"matched_reviewers: {0}".format(matched_reviewers))
    return matched_reviewers


def update_git_repos():
    git_repos_directory = "git_repos"
    for name, url in GIT_REPO_METADATA:
        dirname = os.path.join(git_repos_directory, name)
        if not os.path.exists(dirname):
            cmd = "git clone {0} {1}".format(url, dirname)
            output = get_git_cmd_output(cmd)
        cmd = "git -C {0} pull --rebase".format(dirname)
        output = get_git_cmd_output(cmd)


def send_emails(email_addresses, sender, msg):
    s = smtplib.SMTP()
    s.connect()
    for email_address in email_addresses:
        email_msg = email.mime.multipart.MIMEMultipart()
        email_msg['From'] = sender
        email_msg['To'] = email_address
        email_msg['Subject'] = 'LLVM patches you may be able to review.'
        email_msg.attach(
            email.mime.text.MIMEText(msg.encode('utf-8'), 'plain'))
        # python 3.x: s.send_message(email_msg)
        s.sendmail(email_msg['From'], email_msg['To'], email_msg.as_string())
    s.quit()


def filter_reviewers_to_report_for(people_to_look_for):
    # The below is just an example filter, to only report potential reviews
    # to do for the people that will receive the report email.
    return lambda potential_reviewers: [r for r in potential_reviewers
                                        if r[0] in people_to_look_for]


def main():
    parser = argparse.ArgumentParser(
        description='Match open reviews to potential reviewers.')
    parser.add_argument(
        '--no-update-cache',
        dest='update_cache',
        action='store_false',
        default=True,
        help='Do not update cached Phabricator objects')
    parser.add_argument(
        '--email-report',
        dest='email_report',
        nargs='*',
        default="",
        help="The email addresses to send the report to.")
    parser.add_argument(
        '--sender',
        dest='sender',
        default="",
        help="The email address to use in 'From' on messages emailed out.")
    parser.add_argument(
        '--email-addresses',
        dest='email_addresses',
        nargs='*',
        help="The email addresses (as known by LLVM git) of " +
             "the people to look for reviews for.")
    parser.add_argument('--verbose', '-v', action='count')

    args = parser.parse_args()

    if args.verbose >= 1:
        logging.basicConfig(level=logging.DEBUG)

    people_to_look_for = [e.decode('utf-8') for e in args.email_addresses]
    logging.debug("Will look for reviews that the following contributors " +
                  "could review: {}".format(people_to_look_for))
    logging.debug("Will email a report to: {}".format(args.email_report))

    phab = init_phab_connection()

    if args.update_cache:
        update_cache(phab)

    update_git_repos()

    msg = print_most_recent_reviews(
        phab,
        days=1,
        filter_reviewers=filter_reviewers_to_report_for(people_to_look_for))

    if args.email_report != []:
        send_emails(args.email_report, args.sender, msg)


if __name__ == "__main__":
    main()