Index: llvm/trunk/utils/Reviewing/find_interesting_reviews.py =================================================================== --- llvm/trunk/utils/Reviewing/find_interesting_reviews.py +++ llvm/trunk/utils/Reviewing/find_interesting_reviews.py @@ -0,0 +1,616 @@ +#!/usr/bin/env python + +import argparse +import email.mime.multipart +import email.mime.text +import logging +import os.path +import pickle +import re +import smtplib +import subprocess +import sys +from datetime import datetime, timedelta +from phabricator import Phabricator + +# Setting up a virtualenv to run this script can be done by running the +# following commands: +# $ virtualenv venv +# $ . ./venv/bin/activate +# $ pip install Phabricator + +GIT_REPO_METADATA = (("llvm", "https://llvm.org/git/llvm.git"), ) + +# The below PhabXXX classes represent objects as modelled by Phabricator. +# The classes can be serialized to disk, to try and make sure that we don't +# needlessly have to re-fetch lots of data from Phabricator, as that would +# make this script unusably slow. + + +class PhabObject: + OBJECT_KIND = None + + def __init__(self, id): + self.id = id + + +class PhabObjectCache: + def __init__(self, PhabObjectClass): + self.PhabObjectClass = PhabObjectClass + self.most_recent_info = None + self.oldest_info = None + self.id2PhabObjects = {} + + def get_name(self): + return self.PhabObjectClass.OBJECT_KIND + "sCache" + + def get(self, id): + if id not in self.id2PhabObjects: + self.id2PhabObjects[id] = self.PhabObjectClass(id) + return self.id2PhabObjects[id] + + def get_ids_in_cache(self): + return self.id2PhabObjects.keys() + + def get_objects(self): + return self.id2PhabObjects.values() + + DEFAULT_DIRECTORY = "PhabObjectCache" + + def _get_pickle_name(self, directory): + file_name = "Phab" + self.PhabObjectClass.OBJECT_KIND + "s.pickle" + return os.path.join(directory, file_name) + + def populate_cache_from_disk(self, directory=DEFAULT_DIRECTORY): + """ + FIXME: consider if serializing to JSON would bring interoperability + advantages over serializing to pickle. + """ + try: + f = open(self._get_pickle_name(directory), "rb") + except IOError as err: + print("Could not find cache. Error message: {0}. Continuing..." + .format(err)) + else: + with f: + try: + d = pickle.load(f) + self.__dict__.update(d) + except EOFError as err: + print("Cache seems to be corrupt. " + + "Not using cache. 
Error message: {0}".format(err)) + + def write_cache_to_disk(self, directory=DEFAULT_DIRECTORY): + if not os.path.exists(directory): + os.makedirs(directory) + with open(self._get_pickle_name(directory), "wb") as f: + pickle.dump(self.__dict__, f) + print("wrote cache to disk, most_recent_info= {0}".format( + datetime.fromtimestamp(self.most_recent_info) + if self.most_recent_info is not None else None)) + + +class PhabReview(PhabObject): + OBJECT_KIND = "Review" + + def __init__(self, id): + PhabObject.__init__(self, id) + + def update(self, title, dateCreated, dateModified, author): + self.title = title + self.dateCreated = dateCreated + self.dateModified = dateModified + self.author = author + + def setPhabDiffs(self, phabDiffs): + self.phabDiffs = phabDiffs + + +class PhabUser(PhabObject): + OBJECT_KIND = "User" + + def __init__(self, id): + PhabObject.__init__(self, id) + + def update(self, phid, realName): + self.phid = phid + self.realName = realName + + +class PhabHunk: + def __init__(self, rest_api_hunk): + self.oldOffset = int(rest_api_hunk["oldOffset"]) + self.oldLength = int(rest_api_hunk["oldLength"]) + # self.actual_lines_changed_offset will contain the offsets of the + # lines that were changed in this hunk. + self.actual_lines_changed_offset = [] + offset = self.oldOffset + inHunk = False + hunkStart = -1 + contextLines = 3 + for line in rest_api_hunk["corpus"].split("\n"): + if line.startswith("+"): + # line is a new line that got introduced in this patch. + # Do not record it as a changed line. + if inHunk is False: + inHunk = True + hunkStart = max(self.oldOffset, offset - contextLines) + continue + if line.startswith("-"): + # line was changed or removed from the older version of the + # code. Record it as a changed line. + if inHunk is False: + inHunk = True + hunkStart = max(self.oldOffset, offset - contextLines) + offset += 1 + continue + # line is a context line. + if inHunk is True: + inHunk = False + hunkEnd = offset + contextLines + self.actual_lines_changed_offset.append((hunkStart, hunkEnd)) + offset += 1 + if inHunk is True: + hunkEnd = offset + contextLines + self.actual_lines_changed_offset.append((hunkStart, hunkEnd)) + + # The above algorithm could result in adjacent or overlapping ranges + # being recorded into self.actual_lines_changed_offset. 
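+        # For example, recorded ranges like [(10, 18), (15, 25), (40, 46)]
+        # should collapse to [(10, 25), (40, 46)], so that the git blame
+        # queries done later run once per merged region instead of once per
+        # overlapping fragment (the numbers here are purely illustrative).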
+ # Merge the adjacent and overlapping ranges in there: + t = [] + lastRange = None + for start, end in self.actual_lines_changed_offset + \ + [(sys.maxsize, sys.maxsize)]: + if lastRange is None: + lastRange = (start, end) + else: + if lastRange[1] >= start: + lastRange = (lastRange[0], end) + else: + t.append(lastRange) + lastRange = (start, end) + self.actual_lines_changed_offset = t + + +class PhabChange: + def __init__(self, rest_api_change): + self.oldPath = rest_api_change["oldPath"] + self.hunks = [PhabHunk(h) for h in rest_api_change["hunks"]] + + +class PhabDiff(PhabObject): + OBJECT_KIND = "Diff" + + def __init__(self, id): + PhabObject.__init__(self, id) + + def update(self, rest_api_results): + self.revisionID = rest_api_results["revisionID"] + self.dateModified = int(rest_api_results["dateModified"]) + self.dateCreated = int(rest_api_results["dateCreated"]) + self.changes = [PhabChange(c) for c in rest_api_results["changes"]] + + +class ReviewsCache(PhabObjectCache): + def __init__(self): + PhabObjectCache.__init__(self, PhabReview) + + +class UsersCache(PhabObjectCache): + def __init__(self): + PhabObjectCache.__init__(self, PhabUser) + + +reviews_cache = ReviewsCache() +users_cache = UsersCache() + + +def init_phab_connection(): + phab = Phabricator() + phab.update_interfaces() + return phab + + +def update_cached_info(phab, cache, phab_query, order, record_results, + max_nr_entries_per_fetch, max_nr_days_to_cache): + q = phab + LIMIT = max_nr_entries_per_fetch + for query_step in phab_query: + q = getattr(q, query_step) + results = q(order=order, limit=LIMIT) + most_recent_info, oldest_info = record_results(cache, results, phab) + oldest_info_to_fetch = datetime.fromtimestamp(most_recent_info) - \ + timedelta(days=max_nr_days_to_cache) + most_recent_info_overall = most_recent_info + cache.write_cache_to_disk() + after = results["cursor"]["after"] + print("after: {0!r}".format(after)) + print("most_recent_info: {0}".format( + datetime.fromtimestamp(most_recent_info))) + while (after is not None + and datetime.fromtimestamp(oldest_info) > oldest_info_to_fetch): + need_more_older_data = \ + (cache.oldest_info is None or + datetime.fromtimestamp(cache.oldest_info) > oldest_info_to_fetch) + print(("need_more_older_data={0} cache.oldest_info={1} " + + "oldest_info_to_fetch={2}").format( + need_more_older_data, + datetime.fromtimestamp(cache.oldest_info) + if cache.oldest_info is not None else None, + oldest_info_to_fetch)) + need_more_newer_data = \ + (cache.most_recent_info is None or + cache.most_recent_info < most_recent_info) + print(("need_more_newer_data={0} cache.most_recent_info={1} " + + "most_recent_info={2}") + .format(need_more_newer_data, cache.most_recent_info, + most_recent_info)) + if not need_more_older_data and not need_more_newer_data: + break + results = q(order=order, after=after, limit=LIMIT) + most_recent_info, oldest_info = record_results(cache, results, phab) + after = results["cursor"]["after"] + print("after: {0!r}".format(after)) + print("most_recent_info: {0}".format( + datetime.fromtimestamp(most_recent_info))) + cache.write_cache_to_disk() + cache.most_recent_info = most_recent_info_overall + if after is None: + # We did fetch all records. Mark the cache to contain all info since + # the start of time. 
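+        # A timestamp of 0 is the Unix epoch (1970-01-01), which is older
+        # than any Phabricator record, so the cache is treated as covering
+        # the full history.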
+ oldest_info = 0 + cache.oldest_info = oldest_info + cache.write_cache_to_disk() + + +def record_reviews(cache, reviews, phab): + most_recent_info = None + oldest_info = None + for reviewInfo in reviews["data"]: + if reviewInfo["type"] != "DREV": + continue + id = reviewInfo["id"] + # phid = reviewInfo["phid"] + dateModified = int(reviewInfo["fields"]["dateModified"]) + dateCreated = int(reviewInfo["fields"]["dateCreated"]) + title = reviewInfo["fields"]["title"] + author = reviewInfo["fields"]["authorPHID"] + phabReview = cache.get(id) + if "dateModified" not in phabReview.__dict__ or \ + dateModified > phabReview.dateModified: + diff_results = phab.differential.querydiffs(revisionIDs=[id]) + diff_ids = sorted(diff_results.keys()) + phabDiffs = [] + for diff_id in diff_ids: + diffInfo = diff_results[diff_id] + d = PhabDiff(diff_id) + d.update(diffInfo) + phabDiffs.append(d) + phabReview.update(title, dateCreated, dateModified, author) + phabReview.setPhabDiffs(phabDiffs) + print("Updated D{0} modified on {1} ({2} diffs)".format( + id, datetime.fromtimestamp(dateModified), len(phabDiffs))) + + if most_recent_info is None: + most_recent_info = dateModified + elif most_recent_info < dateModified: + most_recent_info = dateModified + + if oldest_info is None: + oldest_info = dateModified + elif oldest_info > dateModified: + oldest_info = dateModified + return most_recent_info, oldest_info + + +def record_users(cache, users, phab): + most_recent_info = None + oldest_info = None + for info in users["data"]: + if info["type"] != "USER": + continue + id = info["id"] + phid = info["phid"] + dateModified = int(info["fields"]["dateModified"]) + # dateCreated = int(info["fields"]["dateCreated"]) + realName = info["fields"]["realName"] + phabUser = cache.get(id) + phabUser.update(phid, realName) + if most_recent_info is None: + most_recent_info = dateModified + elif most_recent_info < dateModified: + most_recent_info = dateModified + if oldest_info is None: + oldest_info = dateModified + elif oldest_info > dateModified: + oldest_info = dateModified + return most_recent_info, oldest_info + + +PHABCACHESINFO = ((reviews_cache, ("differential", "revision", "search"), + "updated", record_reviews, 5, 7), + (users_cache, ("user", "search"), "newest", record_users, + 100, 1000)) + + +def load_cache(): + for cache, phab_query, order, record_results, _, _ in PHABCACHESINFO: + cache.populate_cache_from_disk() + print("Loaded {0} nr entries: {1}".format( + cache.get_name(), len(cache.get_ids_in_cache()))) + print("Loaded {0} has most recent info: {1}".format( + cache.get_name(), + datetime.fromtimestamp(cache.most_recent_info) + if cache.most_recent_info is not None else None)) + + +def update_cache(phab): + load_cache() + for cache, phab_query, order, record_results, max_nr_entries_per_fetch, \ + max_nr_days_to_cache in PHABCACHESINFO: + update_cached_info(phab, cache, phab_query, order, record_results, + max_nr_entries_per_fetch, max_nr_days_to_cache) + ids_in_cache = cache.get_ids_in_cache() + print("{0} objects in {1}".format(len(ids_in_cache), cache.get_name())) + cache.write_cache_to_disk() + + +def get_most_recent_reviews(days): + newest_reviews = sorted( + reviews_cache.get_objects(), key=lambda r: -r.dateModified) + if len(newest_reviews) == 0: + return newest_reviews + most_recent_review_time = \ + datetime.fromtimestamp(newest_reviews[0].dateModified) + cut_off_date = most_recent_review_time - timedelta(days=days) + result = [] + for review in newest_reviews: + if 
datetime.fromtimestamp(review.dateModified) < cut_off_date: + return result + result.append(review) + return result + + +# All of the above code is about fetching data from Phabricator and caching it +# on local disk. The below code contains the actual "business logic" for this +# script. + +_userphid2realname = None + + +def get_real_name_from_author(user_phid): + global _userphid2realname + if _userphid2realname is None: + _userphid2realname = {} + for user in users_cache.get_objects(): + _userphid2realname[user.phid] = user.realName + return _userphid2realname.get(user_phid, "unknown") + + +def print_most_recent_reviews(phab, days, filter_reviewers): + msgs = [] + + def add_msg(msg): + msgs.append(msg) + print(msg) + + newest_reviews = get_most_recent_reviews(days) + add_msg("These are the reviews that look interesting to be reviewed. " + + "The report below has 2 sections. The first " + + "section is organized per review; the second section is organized " + + "per potential reviewer.\n") + oldest_review = newest_reviews[-1] if len(newest_reviews) > 0 else None + oldest_datetime = \ + datetime.fromtimestamp(oldest_review.dateModified) \ + if oldest_review else None + add_msg(("The report below is based on analyzing the reviews that got " + + "touched in the past {0} days (since {1}). " + + "The script found {2} such reviews.\n").format( + days, oldest_datetime, len(newest_reviews))) + reviewer2reviews_and_scores = {} + for i, review in enumerate(newest_reviews): + matched_reviewers = find_reviewers_for_review(review) + matched_reviewers = filter_reviewers(matched_reviewers) + if len(matched_reviewers) == 0: + continue + add_msg(("{0:>3}. https://reviews.llvm.org/D{1} by {2}\n {3}\n" + + " Last updated on {4}").format( + i, review.id, + get_real_name_from_author(review.author), review.title, + datetime.fromtimestamp(review.dateModified))) + for reviewer, scores in matched_reviewers: + add_msg(" potential reviewer {0}, score {1}".format( + reviewer, + "(" + "/".join(["{0:.1f}%".format(s) for s in scores]) + ")")) + if reviewer not in reviewer2reviews_and_scores: + reviewer2reviews_and_scores[reviewer] = [] + reviewer2reviews_and_scores[reviewer].append((review, scores)) + + # Print out a summary per reviewer. 
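+    # Reviews for each reviewer are ordered by their (line%, file%) score
+    # tuple, highest first, so the strongest matches appear at the top of
+    # each summary.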
+ for reviewer in sorted(reviewer2reviews_and_scores.keys()): + reviews_and_scores = reviewer2reviews_and_scores[reviewer] + reviews_and_scores.sort(key=lambda rs: rs[1], reverse=True) + add_msg("\n\nSUMMARY FOR {0} (found {1} reviews):".format( + reviewer, len(reviews_and_scores))) + for review, scores in reviews_and_scores: + add_msg("[{0}] https://reviews.llvm.org/D{1} '{2}' by {3}".format( + "/".join(["{0:.1f}%".format(s) for s in scores]), review.id, + review.title, get_real_name_from_author(review.author))) + return "\n".join(msgs) + + +def get_git_cmd_output(cmd): + output = None + try: + logging.debug(cmd) + output = subprocess.check_output( + cmd, shell=True, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as e: + logging.debug(str(e)) + if output is None: + return None + return output.decode("utf-8", errors='ignore') + + +reAuthorMail = re.compile("^author-mail <([^>]*)>.*$") + + +def parse_blame_output_line_porcelain(blame_output): + email2nr_occurences = {} + if blame_output is None: + return email2nr_occurences + for line in blame_output.split('\n'): + m = reAuthorMail.match(line) + if m: + author_email_address = m.group(1) + if author_email_address not in email2nr_occurences: + email2nr_occurences[author_email_address] = 1 + else: + email2nr_occurences[author_email_address] += 1 + return email2nr_occurences + + +def find_reviewers_for_diff_heuristic(diff): + # Heuristic 1: assume good reviewers are the ones that touched the same + # lines before as this patch is touching. + # Heuristic 2: assume good reviewers are the ones that touched the same + # files before as this patch is touching. + reviewers2nr_lines_touched = {} + reviewers2nr_files_touched = {} + # Assume last revision before diff was modified is the revision the diff + # applies to. + git_repo = "git_repos/llvm" + cmd = 'git -C {0} rev-list -n 1 --before="{1}" master'.format( + git_repo, + datetime.fromtimestamp( + diff.dateModified).strftime("%Y-%m-%d %H:%M:%s")) + base_revision = get_git_cmd_output(cmd).strip() + logging.debug("Base revision={0}".format(base_revision)) + for change in diff.changes: + path = change.oldPath + # Compute heuristic 1: look at context of patch lines. + for hunk in change.hunks: + for start_line, end_line in hunk.actual_lines_changed_offset: + # Collect git blame results for authors in those ranges. + cmd = ("git -C {0} blame --encoding=utf-8 --date iso -f -e " + + "-w --line-porcelain -L {1},{2} {3} -- {4}").format( + git_repo, start_line, end_line, base_revision, path) + blame_output = get_git_cmd_output(cmd) + for reviewer, nr_occurences in \ + parse_blame_output_line_porcelain(blame_output).items(): + if reviewer not in reviewers2nr_lines_touched: + reviewers2nr_lines_touched[reviewer] = 0 + reviewers2nr_lines_touched[reviewer] += nr_occurences + # Compute heuristic 2: don't look at context, just at files touched. + # Collect git blame results for authors in those ranges. 
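+        # (Unlike heuristic 1, this blames the whole file rather than the
+        # changed line ranges; only which authors touched the file matters
+        # here, not how many of its lines they wrote.)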
+ cmd = ("git -C {0} blame --encoding=utf-8 --date iso -f -e -w " + + "--line-porcelain {1} -- {2}").format(git_repo, base_revision, + path) + blame_output = get_git_cmd_output(cmd) + for reviewer, nr_occurences in parse_blame_output_line_porcelain( + blame_output).items(): + if reviewer not in reviewers2nr_files_touched: + reviewers2nr_files_touched[reviewer] = 0 + reviewers2nr_files_touched[reviewer] += 1 + + # Compute "match scores" + total_nr_lines = sum(reviewers2nr_lines_touched.values()) + total_nr_files = len(diff.changes) + reviewers_matchscores = \ + [(reviewer, + (reviewers2nr_lines_touched.get(reviewer, 0)*100.0/total_nr_lines + if total_nr_lines != 0 else 0, + reviewers2nr_files_touched[reviewer]*100.0/total_nr_files + if total_nr_files != 0 else 0)) + for reviewer, nr_lines + in reviewers2nr_files_touched.items()] + reviewers_matchscores.sort(key=lambda i: i[1], reverse=True) + return reviewers_matchscores + + +def find_reviewers_for_review(review): + # Process the newest diff first. + diffs = sorted( + review.phabDiffs, key=lambda d: d.dateModified, reverse=True) + if len(diffs) == 0: + return + diff = diffs[0] + matched_reviewers = find_reviewers_for_diff_heuristic(diff) + # Show progress, as this is a slow operation: + sys.stdout.write('.') + sys.stdout.flush() + logging.debug("matched_reviewers: {0}".format(matched_reviewers)) + return matched_reviewers + + +def update_git_repos(): + git_repos_directory = "git_repos" + for name, url in GIT_REPO_METADATA: + dirname = os.path.join(git_repos_directory, name) + if not os.path.exists(dirname): + cmd = "git clone {0} {1}".format(url, dirname) + output = get_git_cmd_output(cmd) + cmd = "git -C {0} pull --rebase".format(dirname) + output = get_git_cmd_output(cmd) + + +def send_emails(email_addresses, msg): + s = smtplib.SMTP() + s.connect() + for email_address in email_addresses: + email_msg = email.mime.multipart.MIMEMultipart() + email_msg['From'] = '' + email_msg['To'] = email_address + email_msg['Subject'] = 'LLVM patches you may be able to review.' + email_msg.attach(email.mime.text.MIMEText(msg, 'plain')) + # python 3.x: s.send_message(email_msg) + s.sendmail(email_msg['From'], email_msg['To'], msg) + s.quit() + + +def filter_reviewers_to_report_for(people_to_look_for): + # The below is just an example filter, to only report potential reviews + # to do for the people that will receive the report email. + return lambda potential_reviewers: [r for r in potential_reviewers + if r[0] in people_to_look_for] + + +def main(): + parser = argparse.ArgumentParser( + description='Match open reviews to potential reviewers.') + parser.add_argument( + '--no-update-cache', + dest='update_cache', + action='store_false', + default=True, + help='Do not update cached Phabricator objects') + parser.add_argument( + 'email_addresses', + nargs='*', + help="The email addresses (as known by LLVM git) of " + + "the people to look for reviews for.") + parser.add_argument('--verbose', '-v', action='count') + + args = parser.parse_args() + + if args.verbose >= 1: + logging.basicConfig(level=logging.DEBUG) + + people_to_look_for = [e.decode('utf-8') for e in args.email_addresses] + + phab = init_phab_connection() + + if args.update_cache: + update_cache(phab) + + load_cache() + update_git_repos() + msg = print_most_recent_reviews( + phab, + days=1, + filter_reviewers=filter_reviewers_to_report_for(people_to_look_for)) + send_emails(people_to_look_for, msg) + + +if __name__ == "__main__": + main()
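+
+# Example invocation (a sketch; "jdoe@example.com" is a placeholder address
+# and the Phabricator API credentials are assumed to be available to the
+# "phabricator" package, e.g. via ~/.arcrc):
+#
+#   $ python find_interesting_reviews.py -v jdoe@example.com
+#
+# Run with Python 2 (the shebang and the str.decode() call in main() assume
+# it): this refreshes the cached Phabricator objects and the git_repos/llvm
+# checkout, then prints and emails a report of reviews touched in the last
+# day that jdoe@example.com may be well placed to review.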