Index: utils/Reviewing/find_interesting_reviews.py
===================================================================
--- /dev/null
+++ utils/Reviewing/find_interesting_reviews.py
@@ -0,0 +1,587 @@
+from phabricator import Phabricator
+import pickle
+import os.path
+from datetime import datetime, timedelta
+import sys
+import subprocess
+import re
+import smtplib
+import email.mime.multipart
+import email.mime.text
+import argparse
+
+# Setting up a virtualenv to run this script can be done by running:
+# $ virtualenv venv
+# $ . ./venv/bin/activate
+# $ pip install Phabricator
+
+
+# The below PhabXXX classes represent objects as modelled by Phabricator.
+# The classes can be serialized to disk, to try and make sure that we don't
+# needlessly have to re-fetch lots of data from Phabricator, as that would
+# make this script unusably slow.
+
+class PhabObject:
+    object_kind = None  # overridden by subclasses with a name like "Review"
+
+    def __init__(self, id):
+        self.id = id  # identifier of the object, as supplied by the caller
+
+
+class PhabObjectCache:
+    """Map ids to PhabObjectClass instances; can be pickled to disk."""
+
+    DEFAULT_DIRECTORY = "PhabObjectCache"
+
+    def __init__(self, PhabObjectClass):
+        self.PhabObjectClass = PhabObjectClass
+        self.most_recent_info = None
+        self.oldest_info = None
+        self.id2PhabObjects = {}
+
+    def get_name(self):
+        return self.PhabObjectClass.object_kind + "sCache"
+
+    def get(self, id):
+        if id not in self.id2PhabObjects:
+            self.id2PhabObjects[id] = self.PhabObjectClass(id)
+        return self.id2PhabObjects[id]
+
+    def get_ids_in_cache(self):
+        return self.id2PhabObjects.keys()
+
+    def get_objects(self):
+        return self.id2PhabObjects.values()
+
+    def _get_pickle_name(self, directory):
+        file_name = "Phab" + self.PhabObjectClass.object_kind + "s.pickle"
+        return os.path.join(directory, file_name)
+
+    def populate_cache_from_disk(self, directory=DEFAULT_DIRECTORY):
+        """Load the pickled state, ignoring a missing or corrupt cache file.
+
+        NOTE: unpickling can run code; only load files this script wrote.
+        FIXME: consider if serializing to JSON would bring interoperability
+        advantages over serializing to pickle.
+        """
+        try:
+            with open(self._get_pickle_name(directory), "rb") as f:
+                self.__dict__.update(pickle.load(f))
+        except IOError as err:
+            print ("Could not find cache. Error message: {0}. Continuing..."
+                   .format(err))
+        except (EOFError, KeyError, ValueError, pickle.UnpicklingError) as err:
+            print ("Cache seems to be corrupt. " +
+                   "Not using cache. Error message: {0}".format(err))
+
+    def write_cache_to_disk(self, directory=DEFAULT_DIRECTORY):
+        if not os.path.exists(directory):
+            os.makedirs(directory)
+        with open(self._get_pickle_name(directory), "wb") as f:
+            pickle.dump(self.__dict__, f)
+        print ("wrote cache to disk, most_recent_info=%s"
+               % (datetime.fromtimestamp(self.most_recent_info)
+                  if self.most_recent_info is not None else None))
+
+
+class PhabReview(PhabObject):
+    """A Differential review (revision) together with its diffs.
+
+    Construction is inherited from PhabObject; the remaining attributes
+    only exist once update() and setPhabDiffs() have been called.
+    """
+    object_kind = "Review"
+
+    def update(self, title, dateCreated, dateModified, author):
+        self.title, self.author = title, author
+        self.dateCreated, self.dateModified = dateCreated, dateModified
+
+    def setPhabDiffs(self, phabDiffs):
+        self.phabDiffs = phabDiffs
+
+
+class PhabUser(PhabObject):
+    """A Phabricator user, built via the inherited PhabObject constructor.
+
+    The phid and realName attributes only exist after update() has run.
+    """
+    object_kind = "User"
+
+    def update(self, phid, realName):
+        self.phid, self.realName = phid, realName
+
+
+class PhabHunk:
+    """The ranges of old-file lines touched by one hunk of a diff.
+
+    oldOffset and oldLength are copied from the REST API result.
+    actual_lines_changed_offset ends up as a list of (start, end) line
+    ranges in the old file: one per run of added/removed lines, extended
+    by about 3 lines on either side, with adjacent or overlapping ranges
+    merged.
+    """
+
+    def __init__(self, rest_api_hunk):
+        """Parse a dict with "oldOffset", "oldLength" and "corpus" keys."""
+        self.oldOffset = int(rest_api_hunk["oldOffset"])
+        self.oldLength = int(rest_api_hunk["oldLength"])
+        context = 3
+        ranges = []
+        line_nr = self.oldOffset  # current line number in the old file
+        run_start = None  # start of the run of changes we are in, if any
+        # Walk the hunk text, tracking where runs of changes start and end.
+        for text in rest_api_hunk["corpus"].split("\n"):
+            if text.startswith(("+", "-")):
+                if run_start is None:
+                    run_start = max(self.oldOffset, line_nr - context)
+                # only removed lines take up a line in the old file
+                if text.startswith("-"):
+                    line_nr += 1
+            else:
+                # a context line ends the current run of changes
+                if run_start is not None:
+                    ranges.append((run_start, line_nr + context))
+                    run_start = None
+                line_nr += 1
+        if run_start is not None:
+            ranges.append((run_start, line_nr + context))
+
+        # Merge adjacent and overlapping ranges; the sentinel flushes the
+        # last pending range.
+        merged = []
+        pending = None
+        for start, end in ranges + [(sys.maxsize, sys.maxsize)]:
+            if pending is not None and pending[1] >= start:
+                pending = (pending[0], end)
+                continue
+            if pending is not None:
+                merged.append(pending)
+            pending = (start, end)
+        self.actual_lines_changed_offset = merged
+
+
+class PhabChange:  # one entry of a diff's "changes" list
+    def __init__(self, rest_api_change):
+        self.oldPath = rest_api_change["oldPath"]  # later fed to git blame
+        self.hunks = [PhabHunk(h) for h in rest_api_change["hunks"]]
+
+
+class PhabDiff(PhabObject):
+    """One diff of a review; construction is inherited from PhabObject."""
+
+    object_kind = "Diff"
+
+    def update(self, rest_api_results):
+        # rest_api_results: one entry of a differential.querydiffs result.
+        self.revisionID = rest_api_results["revisionID"]
+        self.dateCreated = int(rest_api_results["dateCreated"])
+        self.dateModified = int(rest_api_results["dateModified"])
+        self.changes = [PhabChange(c) for c in rest_api_results["changes"]]
+
+
+class ReviewsCache(PhabObjectCache):  # a PhabObjectCache of PhabReview
+    def __init__(self):
+        PhabObjectCache.__init__(self, PhabReview)
+
+
+class UsersCache(PhabObjectCache):  # a PhabObjectCache of PhabUser
+    def __init__(self):
+        PhabObjectCache.__init__(self, PhabUser)
+
+
+reviews_cache = ReviewsCache()  # shared by the functions below
+users_cache = UsersCache()  # shared by the functions below
+
+
+def init_phab_connection():
+    phab = Phabricator()  # NOTE(review): presumably configured via ~/.arcrc
+    phab.update_interfaces()
+    return phab
+
+
+def update_cached_info(phab, cache, phab_query, order, record_results,
+                       max_nr_entries_per_fetch, max_nr_days_to_cache):
+    """Fetch query results page by page and record them in cache.
+
+    Fetching stops once the cache covers max_nr_days_to_cache days of data.
+    """
+    q = phab
+    for query_step in phab_query:
+        q = getattr(q, query_step)
+    results = q(order=order, limit=max_nr_entries_per_fetch)
+    most_recent_info, oldest_info = record_results(cache, results, phab)
+    if most_recent_info is None:
+        return  # no usable results at all: leave the cache untouched
+    most_recent_info_overall = most_recent_info
+    oldest_info_to_fetch = datetime.fromtimestamp(most_recent_info) - \
+        timedelta(days=max_nr_days_to_cache)
+    cache.write_cache_to_disk()
+    after = results["cursor"]["after"]
+    print ("after: %s" % repr(after))
+    print ("most_recent_info: %s" % datetime.fromtimestamp(most_recent_info))
+    while (after is not None and
+           datetime.fromtimestamp(oldest_info) > oldest_info_to_fetch):
+        cached_oldest = (None if cache.oldest_info is None else
+                         datetime.fromtimestamp(cache.oldest_info))
+        need_more_older_data = (cached_oldest is None or
+                                cached_oldest > oldest_info_to_fetch)
+        print("need_more_older_data=%s cache.oldest_info=%s oldest_info_to_fetch=%s"
+              % (need_more_older_data, cached_oldest, oldest_info_to_fetch))
+        need_more_newer_data = (cache.most_recent_info is None or
+                                cache.most_recent_info < most_recent_info)
+        print("need_more_newer_data=%s cache.most_recent_info=%s most_recent_info=%s"
+              % (need_more_newer_data, cache.most_recent_info, most_recent_info))
+        if not need_more_older_data and not need_more_newer_data:
+            break
+        results = q(order=order, after=after, limit=max_nr_entries_per_fetch)
+        newest, oldest = record_results(cache, results, phab)
+        if oldest is not None:  # a page may hold no usable entries
+            most_recent_info, oldest_info = newest, oldest
+        after = results["cursor"]["after"]
+        print ("after: %s" % repr(after))
+        print ("most_recent_info: %s"
+               % datetime.fromtimestamp(most_recent_info))
+        cache.write_cache_to_disk()
+    cache.most_recent_info = most_recent_info_overall
+    cache.oldest_info = 0 if after is None else oldest_info  # 0: have it all
+    cache.write_cache_to_disk()
+
+
+def record_reviews(cache, reviews, phab):
+    """Store the reviews of one page of search results in cache.
+
+    The diffs of a review are only (re-)fetched when the review is new to
+    the cache or was modified since it was cached. Returns the newest and
+    oldest dateModified seen as (most_recent_info, oldest_info); both are
+    None when the page holds no "DREV" entries.
+    """
+    modification_times = []
+    for reviewInfo in reviews["data"]:
+        if reviewInfo["type"] != "DREV":
+            continue
+        id = reviewInfo["id"]
+        fields = reviewInfo["fields"]
+        dateModified = int(fields["dateModified"])
+        dateCreated = int(fields["dateCreated"])
+        title = fields["title"]
+        author = fields["authorPHID"]
+        modification_times.append(dateModified)
+        phabReview = cache.get(id)
+        is_stale = ("dateModified" not in vars(phabReview) or
+                    dateModified > phabReview.dateModified)
+        if not is_stale:
+            continue
+        # New or modified since it was cached: fetch all of its diffs.
+        diff_results = phab.differential.querydiffs(revisionIDs=[id])
+        phabDiffs = []
+        for diff_id in sorted(diff_results.keys()):
+            d = PhabDiff(diff_id)
+            d.update(diff_results[diff_id])
+            phabDiffs.append(d)
+        phabReview.update(title, dateCreated, dateModified, author)
+        phabReview.setPhabDiffs(phabDiffs)
+        print("Updated D%d modified on %s (%d diffs)"
+              % (id, datetime.fromtimestamp(dateModified), len(phabDiffs)))
+    if not modification_times:
+        return None, None
+    return max(modification_times), min(modification_times)
+
+
+def record_users(cache, users, phab):
+    """Store the users of one page of search results in cache.
+
+    phab is not used, but update_cached_info passes it to every record
+    function. Returns (most_recent_info, oldest_info): the newest and
+    oldest dateModified among the "USER" entries, or (None, None) when
+    there are none.
+    """
+    modification_times = []
+    for info in users["data"]:
+        if info["type"] != "USER":
+            continue
+        id = info["id"]
+        phid = info["phid"]
+        dateModified = int(info["fields"]["dateModified"])
+        realName = info["fields"]["realName"]
+        cache.get(id).update(phid, realName)
+        modification_times.append(dateModified)
+    # Without any "USER" entry there are no timestamps to report.
+    if not modification_times:
+        return None, None
+    return max(modification_times), min(modification_times)
+
+
+PHABCACHESINFO = ((reviews_cache, ("differential", "revision", "search"),
+                   "updated", record_reviews, 5, 7),  # per fetch, days kept
+                  (users_cache, ("user", "search"),
+                   "newest", record_users, 100, 1000))  # per fetch, days kept
+
+
+def load_cache():
+    """Load all caches from disk and print their size and newest info."""
+    for cache in (info[0] for info in PHABCACHESINFO):
+        cache.populate_cache_from_disk()
+        name, newest = cache.get_name(), cache.most_recent_info
+        when = None if newest is None else datetime.fromtimestamp(newest)
+        print ("Loaded %s nr entries: %d" %
+               (name, len(cache.get_ids_in_cache())))
+        print ("Loaded %s has most recent info: %s" % (name, when))
+
+
+def update_cache(phab):
+    """Load the caches, then refresh each from Phabricator and save it."""
+    load_cache()
+    for entry in PHABCACHESINFO:
+        cache = entry[0]
+        # entry is (cache, query, order, record function, fetch size, days)
+        update_cached_info(phab, *entry)
+        print ("%d objects in %s" %
+               (len(cache.get_ids_in_cache()), cache.get_name()))
+        cache.write_cache_to_disk()
+
+
+def get_most_recent_reviews(days):
+    """Return the reviews modified up to `days` days before the newest one."""
+    by_recency = sorted(reviews_cache.get_objects(),
+                        key=lambda r: r.dateModified, reverse=True)
+    if not by_recency:
+        return by_recency
+    cut_off_date = (datetime.fromtimestamp(by_recency[0].dateModified) -
+                    timedelta(days=days))
+    recent = []
+    for review in by_recency:
+        if datetime.fromtimestamp(review.dateModified) < cut_off_date:
+            break
+        recent.append(review)
+    return recent
+
+
+# All of the above code is about fetching data from Phabricator and caching it
+# on local disk. The below code contains the actual "business logic" for this
+# script
+
+_userphid2realname = None
+def get_real_name_from_author(user_phid):
+    """Return the real name of user_phid, or "unknown" if it is not cached."""
+    global _userphid2realname
+    if _userphid2realname is None:
+        _userphid2realname = {user.phid: user.realName
+                              for user in users_cache.get_objects()}
+    return _userphid2realname.get(user_phid, "unknown")
+
+
+def print_most_recent_reviews(phab, days, filter_reviewers):
+    """Print a report matching recent reviews to potential reviewers.
+
+    Covers the reviews returned by get_most_recent_reviews(days), keeping
+    only the potential reviewers that filter_reviewers lets through.
+    phab is not used. Returns the printed report as a single string.
+    """
+    msgs = []
+
+    def add_msg(msg):
+        msgs.append(msg)
+        print(msg)
+
+    newest_reviews = get_most_recent_reviews(days)
+    add_msg("These are the reviews that look interesting to be reviewed. "+
+            "The report below has 2 sections. The first " +
+            "section is organized per review; the second section is organized " +
+            "per potential reviewer.\n")
+    oldest_datetime = None
+    if newest_reviews:
+        oldest_datetime = \
+            datetime.fromtimestamp(newest_reviews[-1].dateModified)
+    add_msg(("The report below is based on analyzing the reviews that got " +
+             "touched in the past %d days (since %s). " +
+             "The script found %d such reviews.\n")
+            % (days, oldest_datetime, len(newest_reviews)))
+    # First section: every review that has at least one matching reviewer.
+    per_reviewer = {}
+    for i, review in enumerate(newest_reviews):
+        matches = filter_reviewers(find_reviewers_for_review(review))
+        if not matches:
+            continue
+        add_msg(("%3d. https://reviews.llvm.org/D%s by %s\n     %s\n"
+                 + "     Last updated on %s") %
+                (i, review.id, get_real_name_from_author(review.author),
+                 review.title, datetime.fromtimestamp(review.dateModified)))
+        for reviewer, scores in matches:
+            add_msg("    potential reviewer %s, score %s"
+                    % (reviewer,
+                       "("+"/".join(["%.1f%%" % s for s in scores])+")"))
+            per_reviewer.setdefault(reviewer, []).append((review, scores))
+
+    # Print out summary per reviewer
+    for reviewer, reviews_and_scores in sorted(per_reviewer.items()):
+        reviews_and_scores.sort(key=lambda rs: rs[1], reverse=True)
+        add_msg("\n\nSUMMARY FOR %s (found %d reviews):"
+                % (reviewer, len(reviews_and_scores)))
+        for review, scores in reviews_and_scores:
+            add_msg("[%s] https://reviews.llvm.org/D%s '%s' by %s"
+                    % ("/".join(["%2.2f%%" % s for s in scores]), review.id,
+                       review.title, get_real_name_from_author(review.author)))
+    return "\n".join(msgs)
+
+
+def get_git_cmd_output(cmd):
+    """Return cmd's stdout+stderr as text, or None if it fails.
+
+    A string is run through the shell; a list/tuple of arguments is not.
+    """
+    shell = not isinstance(cmd, (list, tuple))
+    try:
+        output = subprocess.check_output(cmd, shell=shell,
+                                         stderr=subprocess.STDOUT)
+    except (subprocess.CalledProcessError, OSError):
+        return None
+    return output.decode("utf-8", errors='ignore')
+
+
+reAuthorMail = re.compile("^author-mail <([^>]*)>.*$")
+def parse_blame_output_line_porcelain(blame_output):
+    """Count the "author-mail" lines per e-mail address in blame_output.
+
+    blame_output is the text produced by git blame --line-porcelain, or
+    None. Returns a dict mapping each address to its number of
+    occurrences; the dict is empty when blame_output is None.
+    """
+    counts = {}
+    for line in (blame_output or "").split('\n'):
+        found = reAuthorMail.match(line)
+        if found:
+            counts[found.group(1)] = counts.get(found.group(1), 0) + 1
+    return counts
+
+
+def find_reviewers_for_diff_heuristic(diff):
+    """Score potential reviewers for diff with git blame.
+
+    Authors who touched the same lines (heuristic 1) or the same files
+    (heuristic 2) before are assumed to be good reviewers. Returns
+    (email, (lines %, files %)) tuples, best match first.
+    """
+    try:
+        from shlex import quote
+    except ImportError:  # Python 2
+        from pipes import quote
+    lines_touched, files_touched = {}, {}
+    git_repo = "git_repos/llvm"
+    # Assume the diff applies to the last revision before it was modified.
+    before = datetime.fromtimestamp(diff.dateModified)
+    cmd = 'git -C %s rev-list -n 1 --before="%s" master' % \
+        (git_repo, before.strftime("%Y-%m-%d %H:%M:%S"))
+    base_revision = (get_git_cmd_output(cmd) or "").strip()
+    if not base_revision:
+        return []  # without a base revision there is nothing to blame
+    blame = ("git -C %s blame --encoding=utf-8 --date iso -f -e -w "
+             "--line-porcelain" % git_repo)
+    for change in diff.changes:
+        if not change.oldPath:
+            continue  # no old file name to run blame on
+        path = quote(change.oldPath)  # comes from the diff: untrusted
+        # compute heuristic 1: look at context of patch lines.
+        for hunk in change.hunks:
+            for start_line, end_line in hunk.actual_lines_changed_offset:
+                cmd = "%s -L %d,%d %s -- %s" % \
+                    (blame, start_line, end_line, base_revision, path)
+                counts = parse_blame_output_line_porcelain(
+                    get_git_cmd_output(cmd))
+                for reviewer, nr_occurences in counts.items():
+                    lines_touched[reviewer] = \
+                        lines_touched.get(reviewer, 0) + nr_occurences
+        # compute heuristic 2: don't look at context, just at files touched.
+        cmd = "%s %s -- %s" % (blame, base_revision, path)
+        for reviewer in parse_blame_output_line_porcelain(
+                get_git_cmd_output(cmd)):
+            files_touched[reviewer] = files_touched.get(reviewer, 0) + 1
+    # Compute "match scores"
+    total_nr_lines = sum(lines_touched.values()) or 1  # avoid dividing by 0
+    total_nr_files = len(diff.changes)
+    reviewers_matchscores = [
+        (reviewer, (lines_touched.get(reviewer, 0)*100.0/total_nr_lines,
+                    nr_files*100.0/total_nr_files))
+        for reviewer, nr_files in files_touched.items()]
+    reviewers_matchscores.sort(key=lambda i: i[1], reverse=True)
+    return reviewers_matchscores
+
+
+def find_reviewers_for_review(review):
+    """Return the potential reviewers for the newest diff of review.
+
+    The result is the list of (reviewer, scores) tuples computed by
+    find_reviewers_for_diff_heuristic; it is empty when the review has no
+    diffs, so that callers can always iterate over it.
+    """
+    # Only the most recently modified diff is analysed; older diffs of the
+    # same review are ignored.
+    if not review.phabDiffs:
+        return []
+    newest_diff = max(review.phabDiffs, key=lambda d: d.dateModified)
+    matched_reviewers = find_reviewers_for_diff_heuristic(newest_diff)
+    # show progress, as this is a slow operation:
+    sys.stdout.write('.')
+    sys.stdout.flush()
+    return matched_reviewers
+
+
+def update_git_repos():
+    """Clone the git repositories that are missing, then update them all."""
+    git_repos_directory = "git_repos"
+    git_repo_metadata = (("llvm", "https://llvm.org/git/llvm.git"),)
+    for name, url in git_repo_metadata:
+        dirname = os.path.join(git_repos_directory, name)
+        if not os.path.exists(dirname):
+            get_git_cmd_output("git clone %s %s" % (url, dirname))
+        # Failures are ignored: get_git_cmd_output returns None for them.
+        get_git_cmd_output("git -C %s pull --rebase" % (dirname))
+
+
+def send_emails(email_addresses, msg):
+    """E-mail msg, wrapped in a MIME message with headers, to each address."""
+    s = smtplib.SMTP()
+    s.connect()
+    for email_address in email_addresses:
+        email_msg = email.mime.multipart.MIMEMultipart()
+        email_msg['From'] = ''
+        email_msg['To'] = email_address
+        email_msg['Subject'] = 'LLVM patches you may be able to review.'
+        email_msg.attach(email.mime.text.MIMEText(msg, 'plain'))
+        s.sendmail(email_msg['From'], email_msg['To'], email_msg.as_string())
+    s.quit()
+
+
+def filter_reviewers_to_report_for(people_to_look_for):
+    """Return a function that keeps only the (reviewer, scores) tuples whose
+    reviewer is in people_to_look_for; meant as an example filter."""
+    return lambda potential_reviewers: [
+        m for m in potential_reviewers if m[0] in people_to_look_for]
+
+
+def main():
+    """Update the caches and e-mail the report to the given addresses."""
+    parser = argparse.ArgumentParser(
+        description='Match open reviews to potential reviewers.')
+    parser.add_argument('--no-update-cache', dest='update_cache',
+                        action='store_false', default=True,
+                        help='Do not update cached Phabricator objects')
+    parser.add_argument('email_addresses', nargs='*',
+                        help="The email addresses (as known by LLVM git) of " +
+                        "the people to look for reviews for.")
+    args = parser.parse_args()
+    # Python 2 passes arguments as bytes; Python 3 already as text.
+    people_to_look_for = [e.decode('utf-8') if isinstance(e, bytes) else e
+                          for e in args.email_addresses]
+
+    phab = init_phab_connection()
+    if args.update_cache:
+        update_cache(phab)
+    load_cache()
+    update_git_repos()
+    msg = print_most_recent_reviews(
+        phab, days=1,
+        filter_reviewers=filter_reviewers_to_report_for(people_to_look_for))
+    send_emails(people_to_look_for, msg)
+
+
+if __name__ == "__main__": main()  # do not run the script on import