#!/usr/bin/env python

from __future__ import print_function

import argparse
import email.mime.multipart
import email.mime.text
import logging
import os
import pickle
import re
import smtplib
import subprocess
import sys
from datetime import datetime, timedelta
from phabricator import Phabricator

# Setting up a virtualenv to run this script can be done by running the
# following commands:
# $ virtualenv venv
# $ . ./venv/bin/activate
# $ pip install Phabricator

GIT_REPO_METADATA = (("llvm", "https://llvm.org/git/llvm.git"), )

# The below PhabXXX classes represent objects as modelled by Phabricator.
# The classes can be serialized to disk, to try and make sure that we don't
# needlessly have to re-fetch lots of data from Phabricator, as that would
# make this script unusably slow.


class PhabObject:
    OBJECT_KIND = None

    def __init__(self, id):
        self.id = id


class PhabObjectCache:
    def __init__(self, PhabObjectClass):
        self.PhabObjectClass = PhabObjectClass
        self.most_recent_info = None
        self.oldest_info = None
        self.id2PhabObjects = {}

    def get_name(self):
        return self.PhabObjectClass.OBJECT_KIND + "sCache"

    def get(self, id):
        if id not in self.id2PhabObjects:
            self.id2PhabObjects[id] = self.PhabObjectClass(id)
        return self.id2PhabObjects[id]

    def get_ids_in_cache(self):
        return list(self.id2PhabObjects.keys())

    def get_objects(self):
        return list(self.id2PhabObjects.values())

    DEFAULT_DIRECTORY = "PhabObjectCache"

    def _get_pickle_name(self, directory):
        file_name = "Phab" + self.PhabObjectClass.OBJECT_KIND + "s.pickle"
        return os.path.join(directory, file_name)

    def populate_cache_from_disk(self, directory=DEFAULT_DIRECTORY):
        """
        FIXME: consider if serializing to JSON would bring interoperability
        advantages over serializing to pickle.
        """
        try:
            f = open(self._get_pickle_name(directory), "rb")
        except IOError as err:
            print("Could not find cache. Error message: {0}. Continuing..."
                  .format(err))
        else:
            with f:
                try:
                    d = pickle.load(f)
                    self.__dict__.update(d)
                except EOFError as err:
                    print("Cache seems to be corrupt. " +
                          "Not using cache. Error message: {0}".format(err))

    def write_cache_to_disk(self, directory=DEFAULT_DIRECTORY):
        if not os.path.exists(directory):
            os.makedirs(directory)
        with open(self._get_pickle_name(directory), "wb") as f:
            pickle.dump(self.__dict__, f)
        print("wrote cache to disk, most_recent_info= {0}".format(
            datetime.fromtimestamp(self.most_recent_info)
            if self.most_recent_info is not None else None))
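
# For illustration, a typical cache round-trip (hypothetical session):
#   cache = PhabObjectCache(PhabReview)
#   cache.populate_cache_from_disk()  # restores self.__dict__ from
#                                     # PhabObjectCache/PhabReviews.pickle
#   ... update objects via cache.get(id) ...
#   cache.write_cache_to_disk()       # pickles self.__dict__ back out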


class PhabReview(PhabObject):
    OBJECT_KIND = "Review"

    def __init__(self, id):
        PhabObject.__init__(self, id)

    def update(self, title, dateCreated, dateModified, author):
        self.title = title
        self.dateCreated = dateCreated
        self.dateModified = dateModified
        self.author = author

    def setPhabDiffs(self, phabDiffs):
        self.phabDiffs = phabDiffs


class PhabUser(PhabObject):
    OBJECT_KIND = "User"

    def __init__(self, id):
        PhabObject.__init__(self, id)

    def update(self, phid, realName):
        self.phid = phid
        self.realName = realName


class PhabHunk:
    def __init__(self, rest_api_hunk):
        self.oldOffset = int(rest_api_hunk["oldOffset"])
        self.oldLength = int(rest_api_hunk["oldLength"])
        # self.actual_lines_changed_offset will contain the offsets of the
        # lines that were changed in this hunk.
        self.actual_lines_changed_offset = []
        offset = self.oldOffset
        hunkStart = None
        hunkEnd = None
        contextLines = 3
        for line in rest_api_hunk["corpus"].split("\n"):
            if line.startswith("+"):
                # line is a new line that got introduced in this patch.
                # Do not record it as a changed line.
                if hunkStart is None:
                    hunkStart = max(self.oldOffset, offset - contextLines)
                continue
            if line.startswith("-"):
                # line was changed or removed from the older version of the
                # code. Record it as a changed line.
                if hunkStart is None:
                    hunkStart = max(self.oldOffset, offset - contextLines)
                offset += 1
                continue
            # line is a context line.
            if hunkStart is not None:
                hunkEnd = offset + contextLines
                self.actual_lines_changed_offset.append((hunkStart, hunkEnd))
                hunkStart = None
            offset += 1
        if hunkStart is not None:
            hunkEnd = offset + contextLines
            self.actual_lines_changed_offset.append((hunkStart, hunkEnd))

        # The above algorithm could result in adjacent or overlapping ranges
        # being recorded into self.actual_lines_changed_offset.
        # Merge the adjacent and overlapping ranges in there:
        t = []
        lastRange = None
        for start, end in self.actual_lines_changed_offset + \
                [(sys.maxsize, sys.maxsize)]:
            if lastRange is None:
                lastRange = (start, end)
            else:
                if lastRange[1] >= start:
                    lastRange = (lastRange[0], end)
                else:
                    t.append(lastRange)
                    lastRange = (start, end)
        self.actual_lines_changed_offset = t
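
# For illustration (hypothetical values): if the loop in PhabHunk.__init__
# recorded [(5, 12), (10, 18), (25, 31)], the merge pass collapses the
# overlapping first two ranges into (5, 18), leaving [(5, 18), (25, 31)].
# The (sys.maxsize, sys.maxsize) sentinel guarantees that the last
# accumulated range is flushed into t.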


class PhabChange:
    def __init__(self, rest_api_change):
        self.oldPath = rest_api_change["oldPath"]
        self.hunks = [PhabHunk(h) for h in rest_api_change["hunks"]]


class PhabDiff(PhabObject):
    OBJECT_KIND = "Diff"

    def __init__(self, id):
        PhabObject.__init__(self, id)

    def update(self, rest_api_results):
        self.revisionID = rest_api_results["revisionID"]
        self.dateModified = int(rest_api_results["dateModified"])
        self.dateCreated = int(rest_api_results["dateCreated"])
        self.changes = [PhabChange(c) for c in rest_api_results["changes"]]


class ReviewsCache(PhabObjectCache):
    def __init__(self):
        PhabObjectCache.__init__(self, PhabReview)


class UsersCache(PhabObjectCache):
    def __init__(self):
        PhabObjectCache.__init__(self, PhabUser)


reviews_cache = ReviewsCache()
users_cache = UsersCache()


def init_phab_connection():
    phab = Phabricator()
    phab.update_interfaces()
    return phab
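
# Note: the Phabricator() client picks up the host URI and API token from
# ~/.arcrc by default, which is why no credentials are passed explicitly
# above.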


def update_cached_info(phab, cache, phab_query, order, record_results,
                       max_nr_entries_per_fetch, max_nr_days_to_cache):
    q = phab
    LIMIT = max_nr_entries_per_fetch
    for query_step in phab_query:
        q = getattr(q, query_step)
    results = q(order=order, limit=LIMIT)
    most_recent_info, oldest_info = record_results(cache, results, phab)
    oldest_info_to_fetch = datetime.fromtimestamp(most_recent_info) - \
        timedelta(days=max_nr_days_to_cache)
    most_recent_info_overall = most_recent_info
    cache.write_cache_to_disk()
    after = results["cursor"]["after"]
    print("after: {0!r}".format(after))
    print("most_recent_info: {0}".format(
        datetime.fromtimestamp(most_recent_info)))
    while (after is not None
           and datetime.fromtimestamp(oldest_info) > oldest_info_to_fetch):
        need_more_older_data = \
            (cache.oldest_info is None or
             datetime.fromtimestamp(cache.oldest_info) > oldest_info_to_fetch)
        print(("need_more_older_data={0} cache.oldest_info={1} " +
               "oldest_info_to_fetch={2}").format(
                   need_more_older_data,
                   datetime.fromtimestamp(cache.oldest_info)
                   if cache.oldest_info is not None else None,
                   oldest_info_to_fetch))
        need_more_newer_data = \
            (cache.most_recent_info is None or
             cache.most_recent_info < most_recent_info)
        print(("need_more_newer_data={0} cache.most_recent_info={1} " +
               "most_recent_info={2}")
              .format(need_more_newer_data, cache.most_recent_info,
                      most_recent_info))
        if not need_more_older_data and not need_more_newer_data:
            break
        results = q(order=order, after=after, limit=LIMIT)
        most_recent_info, oldest_info = record_results(cache, results, phab)
        after = results["cursor"]["after"]
        print("after: {0!r}".format(after))
        print("most_recent_info: {0}".format(
            datetime.fromtimestamp(most_recent_info)))
        cache.write_cache_to_disk()
    cache.most_recent_info = most_recent_info_overall
    if after is None:
        # We did fetch all records. Mark the cache to contain all info since
        # the start of time.
        oldest_info = 0
    cache.oldest_info = oldest_info
    cache.write_cache_to_disk()
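
# For illustration: a search call such as
#   phab.differential.revision.search(order="updated", limit=5)
# returns one page of results plus results["cursor"]["after"], an opaque
# paging token. update_cached_info keeps feeding that token back via
# after=... until either the cache covers the requested time window or the
# token comes back None (all records fetched).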


def record_reviews(cache, reviews, phab):
    most_recent_info = None
    oldest_info = None
    for reviewInfo in reviews["data"]:
        if reviewInfo["type"] != "DREV":
            continue
        id = reviewInfo["id"]
        # phid = reviewInfo["phid"]
        dateModified = int(reviewInfo["fields"]["dateModified"])
        dateCreated = int(reviewInfo["fields"]["dateCreated"])
        title = reviewInfo["fields"]["title"]
        author = reviewInfo["fields"]["authorPHID"]
        phabReview = cache.get(id)
        if "dateModified" not in phabReview.__dict__ or \
                dateModified > phabReview.dateModified:
            diff_results = phab.differential.querydiffs(revisionIDs=[id])
            diff_ids = sorted(diff_results.keys())
            phabDiffs = []
            for diff_id in diff_ids:
                diffInfo = diff_results[diff_id]
                d = PhabDiff(diff_id)
                d.update(diffInfo)
                phabDiffs.append(d)
            phabReview.update(title, dateCreated, dateModified, author)
            phabReview.setPhabDiffs(phabDiffs)
            print("Updated D{0} modified on {1} ({2} diffs)".format(
                id, datetime.fromtimestamp(dateModified), len(phabDiffs)))

        if most_recent_info is None:
            most_recent_info = dateModified
        elif most_recent_info < dateModified:
            most_recent_info = dateModified

        if oldest_info is None:
            oldest_info = dateModified
        elif oldest_info > dateModified:
            oldest_info = dateModified
    return most_recent_info, oldest_info


def record_users(cache, users, phab):
    most_recent_info = None
    oldest_info = None
    for info in users["data"]:
        if info["type"] != "USER":
            continue
        id = info["id"]
        phid = info["phid"]
        dateModified = int(info["fields"]["dateModified"])
        # dateCreated = int(info["fields"]["dateCreated"])
        realName = info["fields"]["realName"]
        phabUser = cache.get(id)
        phabUser.update(phid, realName)
        if most_recent_info is None:
            most_recent_info = dateModified
        elif most_recent_info < dateModified:
            most_recent_info = dateModified
        if oldest_info is None:
            oldest_info = dateModified
        elif oldest_info > dateModified:
            oldest_info = dateModified
    return most_recent_info, oldest_info


PHABCACHESINFO = ((reviews_cache, ("differential", "revision", "search"),
                   "updated", record_reviews, 5, 7),
                  (users_cache, ("user", "search"), "newest", record_users,
                   100, 1000))
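
# Each PHABCACHESINFO entry holds, in order: (cache, Conduit API method path,
# result ordering, record callback, max nr of entries per fetch, max nr of
# days to cache); these map onto the parameters of update_cached_info above.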


def load_cache():
    for cache, phab_query, order, record_results, _, _ in PHABCACHESINFO:
        cache.populate_cache_from_disk()
        print("Loaded {0} nr entries: {1}".format(
            cache.get_name(), len(cache.get_ids_in_cache())))
        print("Loaded {0} has most recent info: {1}".format(
            cache.get_name(),
            datetime.fromtimestamp(cache.most_recent_info)
            if cache.most_recent_info is not None else None))


def update_cache(phab):
    load_cache()
    for cache, phab_query, order, record_results, max_nr_entries_per_fetch, \
            max_nr_days_to_cache in PHABCACHESINFO:
        update_cached_info(phab, cache, phab_query, order, record_results,
                           max_nr_entries_per_fetch, max_nr_days_to_cache)
        ids_in_cache = cache.get_ids_in_cache()
        print("{0} objects in {1}".format(len(ids_in_cache),
                                          cache.get_name()))
        cache.write_cache_to_disk()


def get_most_recent_reviews(days):
    newest_reviews = sorted(
        reviews_cache.get_objects(), key=lambda r: -r.dateModified)
    if len(newest_reviews) == 0:
        return newest_reviews
    most_recent_review_time = \
        datetime.fromtimestamp(newest_reviews[0].dateModified)
    cut_off_date = most_recent_review_time - timedelta(days=days)
    result = []
    for review in newest_reviews:
        if datetime.fromtimestamp(review.dateModified) < cut_off_date:
            return result
        result.append(review)
    return result
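
# For illustration: with days=7, get_most_recent_reviews returns every cached
# review whose dateModified falls within 7 days of the newest review's
# timestamp, newest first; the early return works because the list is sorted
# in descending order of modification time.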


# All of the above code is about fetching data from Phabricator and caching
# it on local disk. The below code contains the actual "business logic" for
# this script.

_userphid2realname = None


def get_real_name_from_author(user_phid):
    global _userphid2realname
    if _userphid2realname is None:
        _userphid2realname = {}
        for user in users_cache.get_objects():
            _userphid2realname[user.phid] = user.realName
    return _userphid2realname.get(user_phid, "unknown")


def print_most_recent_reviews(phab, days, filter_reviewers):
    msgs = []

    def add_msg(msg):
        msgs.append(msg)
        print(msg.encode('utf-8'))

    newest_reviews = get_most_recent_reviews(days)
    add_msg(u"These are the reviews that look interesting to review. " +
            u"The report below has 2 sections. The first " +
            u"section is organized per review; the second section is " +
            u"organized per potential reviewer.\n")
    oldest_review = newest_reviews[-1] if len(newest_reviews) > 0 else None
    oldest_datetime = \
        datetime.fromtimestamp(oldest_review.dateModified) \
        if oldest_review else None
    add_msg((u"The report below is based on analyzing the reviews that got " +
             u"touched in the past {0} days (since {1}). " +
             u"The script found {2} such reviews.\n").format(
                 days, oldest_datetime, len(newest_reviews)))
    reviewer2reviews_and_scores = {}
    for i, review in enumerate(newest_reviews):
        matched_reviewers = find_reviewers_for_review(review)
        matched_reviewers = filter_reviewers(matched_reviewers)
        if len(matched_reviewers) == 0:
            continue
        add_msg((u"{0:>3}. https://reviews.llvm.org/D{1} by {2}\n {3}\n" +
                 u"     Last updated on {4}").format(
                     i, review.id,
                     get_real_name_from_author(review.author), review.title,
                     datetime.fromtimestamp(review.dateModified)))
        for reviewer, scores in matched_reviewers:
            add_msg(u"    potential reviewer {0}, score {1}".format(
                reviewer,
                "(" + "/".join(["{0:.1f}%".format(s) for s in scores]) + ")"))
            if reviewer not in reviewer2reviews_and_scores:
                reviewer2reviews_and_scores[reviewer] = []
            reviewer2reviews_and_scores[reviewer].append((review, scores))

    # Print out a summary per reviewer.
    for reviewer in sorted(reviewer2reviews_and_scores.keys()):
        reviews_and_scores = reviewer2reviews_and_scores[reviewer]
        reviews_and_scores.sort(key=lambda rs: rs[1], reverse=True)
        add_msg(u"\n\nSUMMARY FOR {0} (found {1} reviews):".format(
            reviewer, len(reviews_and_scores)))
        for review, scores in reviews_and_scores:
            add_msg(u"[{0}] https://reviews.llvm.org/D{1} '{2}' by {3}".format(
                "/".join(["{0:.1f}%".format(s) for s in scores]), review.id,
                review.title, get_real_name_from_author(review.author)))
    return "\n".join(msgs)


def get_git_cmd_output(cmd):
    output = None
    try:
        logging.debug(cmd)
        output = subprocess.check_output(
            cmd, shell=True, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        logging.debug(str(e))
    if output is None:
        return None
    return output.decode("utf-8", errors='ignore')


reAuthorMail = re.compile("^author-mail <([^>]*)>.*$")


def parse_blame_output_line_porcelain(blame_output):
    email2nr_occurences = {}
    if blame_output is None:
        return email2nr_occurences
    for line in blame_output.split('\n'):
        m = reAuthorMail.match(line)
        if m:
            author_email_address = m.group(1)
            if author_email_address not in email2nr_occurences:
                email2nr_occurences[author_email_address] = 1
            else:
                email2nr_occurences[author_email_address] += 1
    return email2nr_occurences
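
# For illustration (hypothetical addresses): git blame's --line-porcelain
# format emits one "author-mail <...>" header per blamed line, so blame
# output containing "author-mail <alice@example.com>" three times and
# "author-mail <bob@example.com>" once yields
# {'alice@example.com': 3, 'bob@example.com': 1}.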


def find_reviewers_for_diff_heuristic(diff):
    # Heuristic 1: assume good reviewers are the ones that touched the same
    # lines before as this patch is touching.
    # Heuristic 2: assume good reviewers are the ones that touched the same
    # files before as this patch is touching.
    reviewers2nr_lines_touched = {}
    reviewers2nr_files_touched = {}
    # Assume last revision before diff was modified is the revision the diff
    # applies to.
    git_repo = "git_repos/llvm"
    cmd = 'git -C {0} rev-list -n 1 --before="{1}" master'.format(
        git_repo,
        datetime.fromtimestamp(
            diff.dateModified).strftime("%Y-%m-%d %H:%M:%S"))
    base_revision = get_git_cmd_output(cmd).strip()
    logging.debug("Base revision={0}".format(base_revision))
    for change in diff.changes:
        path = change.oldPath
        # Compute heuristic 1: look at context of patch lines.
        for hunk in change.hunks:
            for start_line, end_line in hunk.actual_lines_changed_offset:
                # Collect git blame results for authors in those ranges.
                cmd = ("git -C {0} blame --encoding=utf-8 --date iso -f -e " +
                       "-w --line-porcelain -L {1},{2} {3} -- {4}").format(
                           git_repo, start_line, end_line, base_revision,
                           path)
                blame_output = get_git_cmd_output(cmd)
                for reviewer, nr_occurences in \
                        parse_blame_output_line_porcelain(
                            blame_output).items():
                    if reviewer not in reviewers2nr_lines_touched:
                        reviewers2nr_lines_touched[reviewer] = 0
                    reviewers2nr_lines_touched[reviewer] += nr_occurences
        # Compute heuristic 2: don't look at context, just at files touched.
        # Collect git blame results for authors of the whole file.
        cmd = ("git -C {0} blame --encoding=utf-8 --date iso -f -e -w " +
               "--line-porcelain {1} -- {2}").format(git_repo, base_revision,
                                                     path)
        blame_output = get_git_cmd_output(cmd)
        for reviewer, nr_occurences in parse_blame_output_line_porcelain(
                blame_output).items():
            if reviewer not in reviewers2nr_files_touched:
                reviewers2nr_files_touched[reviewer] = 0
            reviewers2nr_files_touched[reviewer] += 1

    # Compute "match scores": for each candidate, the percentage of blamed
    # lines and of touched files they authored.
    total_nr_lines = sum(reviewers2nr_lines_touched.values())
    total_nr_files = len(diff.changes)
    reviewers_matchscores = \
        [(reviewer,
          (reviewers2nr_lines_touched.get(reviewer, 0) * 100.0 /
           total_nr_lines if total_nr_lines != 0 else 0,
           reviewers2nr_files_touched[reviewer] * 100.0 / total_nr_files
           if total_nr_files != 0 else 0))
         for reviewer, nr_files
         in reviewers2nr_files_touched.items()]
    reviewers_matchscores.sort(key=lambda i: i[1], reverse=True)
    return reviewers_matchscores
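
# For illustration (hypothetical numbers): a result entry like
#   (u'alice@example.com', (42.0, 75.0))
# means that address authored 42% of the blamed lines around the patched
# ranges (heuristic 1) and appears in the blame of 75% of the files the diff
# touches (heuristic 2).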


def find_reviewers_for_review(review):
    # Process the newest diff first.
    diffs = sorted(
        review.phabDiffs, key=lambda d: d.dateModified, reverse=True)
    if len(diffs) == 0:
        return
    diff = diffs[0]
    matched_reviewers = find_reviewers_for_diff_heuristic(diff)
    # Show progress, as this is a slow operation:
    sys.stdout.write('.')
    sys.stdout.flush()
    logging.debug(u"matched_reviewers: {0}".format(matched_reviewers))
    return matched_reviewers


def update_git_repos():
    git_repos_directory = "git_repos"
    for name, url in GIT_REPO_METADATA:
        dirname = os.path.join(git_repos_directory, name)
        if not os.path.exists(dirname):
            cmd = "git clone {0} {1}".format(url, dirname)
            output = get_git_cmd_output(cmd)
        cmd = "git -C {0} pull --rebase".format(dirname)
        output = get_git_cmd_output(cmd)


def send_emails(email_addresses, sender, msg):
    s = smtplib.SMTP()
    s.connect()
    for email_address in email_addresses:
        email_msg = email.mime.multipart.MIMEMultipart()
        email_msg['From'] = sender
        email_msg['To'] = email_address
        email_msg['Subject'] = 'LLVM patches you may be able to review.'
        email_msg.attach(
            email.mime.text.MIMEText(msg.encode('utf-8'), 'plain'))
        # python 3.x: s.send_message(email_msg)
        s.sendmail(email_msg['From'], email_msg['To'], email_msg.as_string())
    s.quit()
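
# Note: smtplib.SMTP() followed by a bare s.connect() talks to an SMTP server
# on localhost port 25; adjust the connect() arguments if the report should
# go out through a different mail host.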


def filter_reviewers_to_report_for(people_to_look_for):
    # The below is just an example filter, to only report potential reviews
    # to do for the people that will receive the report email.
    return lambda potential_reviewers: [r for r in potential_reviewers
                                        if r[0] in people_to_look_for]
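
# For illustration: with people_to_look_for = [u'alice@example.com'], the
# returned filter reduces
#   [(u'alice@example.com', (42.0, 75.0)), (u'bob@example.com', (3.0, 25.0))]
# to just the alice@example.com entry.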


def main():
    parser = argparse.ArgumentParser(
        description='Match open reviews to potential reviewers.')
    parser.add_argument(
        '--no-update-cache',
        dest='update_cache',
        action='store_false',
        default=True,
        help='Do not update cached Phabricator objects')
    parser.add_argument(
        '--email-report',
        dest='email_report',
        nargs='*',
        default="",
        help="The email addresses to send the report to.")
    parser.add_argument(
        '--sender',
        dest='sender',
        default="",
        help="The email address to use in 'From' on messages emailed out.")
    parser.add_argument(
        '--email-addresses',
        dest='email_addresses',
        nargs='*',
        help="The email addresses (as known by LLVM git) of " +
             "the people to look for reviews for.")
    parser.add_argument('--verbose', '-v', action='count')

    args = parser.parse_args()

    if args.verbose >= 1:
        logging.basicConfig(level=logging.DEBUG)

    people_to_look_for = [e.decode('utf-8') for e in args.email_addresses]
    logging.debug("Will look for reviews that the following contributors " +
                  "could review: {}".format(people_to_look_for))
    logging.debug("Will email a report to: {}".format(args.email_report))

    phab = init_phab_connection()

    if args.update_cache:
        update_cache(phab)

    load_cache()
    update_git_repos()
    msg = print_most_recent_reviews(
        phab,
        days=1,
        filter_reviewers=filter_reviewers_to_report_for(people_to_look_for))

    if args.email_report != []:
        send_emails(args.email_report, args.sender, msg)


if __name__ == "__main__":
    main()