diff --git a/llvm/utils/revert_checker.py b/llvm/utils/revert_checker.py
new file mode 100755
--- /dev/null
+++ b/llvm/utils/revert_checker.py
@@ -0,0 +1,317 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#===----------------------------------------------------------------------===##
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+#===----------------------------------------------------------------------===##
+"""Checks for reverts of commits across a given git commit.
+
+To clarify the meaning of 'across' with an example, if we had the following
+commit history (where `a -> b` notes that `b` is a direct child of `a`):
+
+123abc -> 223abc -> 323abc -> 423abc -> 523abc
+
+And where 423abc is a revert of 223abc, this revert is considered to be 'across'
+323abc. More generally, a revert A of a parent commit B is considered to be
+'across' a commit C if C is a parent of A and B is a parent of C.
+
+Please note that revert detection in general is really difficult, since merge
+conflicts/etc always introduce _some_ amount of fuzziness. This script just
+uses a bundle of heuristics, and is bound to ignore / incorrectly flag some
+reverts. The hope is that it'll easily catch the vast majority (>90%) of them,
+though.
+
+This is designed to be used in one of two ways: an import in Python, or run
+directly from a shell. If you want to import this, the `find_reverts`
+function is the thing to look at. If you'd rather use this from a shell, have a
+usage example:
+
+```
+./revert_checker.py c47f97169 origin/main origin/release/12.x
+```
+
+This checks for all reverts from the tip of origin/main to c47f97169, which are
+across the latter. It then does the same for origin/release/12.x to c47f97169.
+Duplicate reverts discovered when walking both roots (origin/main and
+origin/release/12.x) are deduplicated in output.
+"""
+
+import argparse
+import collections
+import logging
+import re
+import subprocess
+import sys
+from typing import Generator, List, NamedTuple, Iterable
+
+assert sys.version_info >= (3, 6), 'Only Python 3.6+ is supported.'
+
+# People are creative with their reverts, and heuristics are a bit difficult.
+# Like 90% of reverts have "This reverts commit ${full_sha}".
+# Some lack that entirely, while others have many of them specified in ad-hoc
+# ways, while others use short SHAs and whatever.
+#
+# The 90% case is trivial to handle (and 100% free + automatic). The extra 10%
+# starts involving human intervention, which is probably not worth it for now.
+
+
+def _try_parse_reverts_from_commit_message(commit_message: str) -> List[str]:
+  """Extracts the SHAs that `commit_message` claims to revert.
+
+  Recognizes (lowercase hex only) `This reverts commit <40-digit sha>` anywhere
+  in the message and a first line starting with `Revert <6+ digit sha> "`.
+
+  Returns:
+    The claimed SHAs, matches from the whole message before the first-line
+    one. Short SHAs are returned unresolved and duplicates are kept.
+  """
+  if not commit_message:
+    return []
+
+  results = re.findall(r'This reverts commit ([a-f0-9]{40})\b', commit_message)
+
+  first_line = commit_message.splitlines()[0]
+  initial_revert = re.match(r'Revert ([a-f0-9]{6,}) "', first_line)
+  if initial_revert:
+    results.append(initial_revert.group(1))
+  return results
+
+
+def _stream_stdout(command: List[str]) -> Generator[str, None, None]:
+  """Runs `command`, lazily yielding each line of its stdout.
+
+  Undecodable bytes are replaced rather than raising, and the command's exit
+  status is not checked.
+  """
+  with subprocess.Popen(
+      command, stdout=subprocess.PIPE, encoding='utf-8', errors='replace') as p:
+    assert p.stdout is not None  # for mypy's happiness.
+    yield from p.stdout
+
+
+def _resolve_sha(git_dir: str, sha: str) -> str:
+  """Expands a possibly-abbreviated `sha` using `git rev-parse`.
+
+  40-character inputs are returned as-is, without consulting git.
+
+  Raises:
+    subprocess.CalledProcessError: if `git rev-parse` fails.
+  """
+  if len(sha) == 40:
+    return sha
+
+  return subprocess.check_output(
+      ['git', '-C', git_dir, 'rev-parse', sha],
+      encoding='utf-8',
+      stderr=subprocess.DEVNULL,
+  ).strip()
+
+
+_LogEntry = NamedTuple('_LogEntry', [
+    ('sha', str),
+    ('commit_message', str),
+])
+
+
+def _log_stream(git_dir: str, root_sha: str,
+                end_at_sha: str) -> Iterable[_LogEntry]:
+  """Yields one `_LogEntry` per commit of `git log ^end_at_sha root_sha`.
+
+  Entries come in the order git prints them. Trailing whitespace is stripped
+  from every message line and from the message as a whole.
+  """
+  sep = 50 * '<>'
+  log_command = [
+      'git',
+      '-C',
+      git_dir,
+      'log',
+      '^' + end_at_sha,
+      root_sha,
+      '--format=' + sep + '%n%H%n%B%n',
+  ]
+
+  stdout_stream = iter(_stream_stdout(log_command))
+
+  # Find the next separator line. If there's nothing to log, it may not exist.
+  # It might not be the first line if git feels complainy.
+  found_commit_header = False
+  for line in stdout_stream:
+    if line.rstrip() == sep:
+      found_commit_header = True
+      break
+
+  while found_commit_header:
+    sha = next(stdout_stream, None)
+    assert sha is not None, 'git died?'
+    sha = sha.rstrip()
+
+    commit_message = []
+
+    found_commit_header = False
+    for line in stdout_stream:
+      line = line.rstrip()
+      if line == sep:
+        found_commit_header = True
+        break
+      commit_message.append(line)
+
+    yield _LogEntry(sha, '\n'.join(commit_message).rstrip())
+
+
+def _shas_between(git_dir: str, base_ref: str, head_ref: str) -> Iterable[str]:
+  """Lazily yields the SHAs of `git rev-list --first-parent base..head`."""
+  rev_list = [
+      'git',
+      '-C',
+      git_dir,
+      'rev-list',
+      '--first-parent',
+      f'{base_ref}..{head_ref}',
+  ]
+  return (x.strip() for x in _stream_stdout(rev_list))
+
+
+def _rev_parse(git_dir: str, ref: str) -> str:
+  """Resolves `ref` in `git_dir` using `git rev-parse`.
+
+  Raises:
+    subprocess.CalledProcessError: if `git rev-parse` fails.
+  """
+  return subprocess.check_output(
+      ['git', '-C', git_dir, 'rev-parse', ref],
+      encoding='utf-8',
+  ).strip()
+
+
+Revert = NamedTuple('Revert', [
+    ('sha', str),
+    ('reverted_sha', str),
+])
+
+
+def _find_common_parent_commit(git_dir: str, ref_a: str, ref_b: str) -> str:
+  """Finds the closest common parent commit between `ref_a` and `ref_b`."""
+  return subprocess.check_output(
+      ['git', '-C', git_dir, 'merge-base', ref_a, ref_b],
+      encoding='utf-8',
+  ).strip()
+
+
+def find_reverts(git_dir: str, across_ref: str, root: str) -> List[Revert]:
+  """Finds reverts across `across_ref` in `git_dir`, starting from `root`.
+
+  Args:
+    git_dir: Directory to pass to `git -C`.
+    across_ref: Ref or SHA that a revert must span to be reported.
+    root: Ref or SHA to start walking history from.
+
+  Returns:
+    One `Revert` per (reverting commit, reverted commit) pair, in `git log`
+    order. Reverts of commits that `git rev-list --first-parent` places
+    between `across_ref` and `root` are skipped, as are claimed targets that
+    can't be resolved or that aren't commits.
+
+  Raises:
+    ValueError: if `across_ref` isn't an ancestor of `root`.
+    subprocess.CalledProcessError: if git fails to resolve or compare the refs.
+  """
+  across_sha = _rev_parse(git_dir, across_ref)
+  root_sha = _rev_parse(git_dir, root)
+
+  common_ancestor = _find_common_parent_commit(git_dir, across_sha, root_sha)
+  if common_ancestor != across_sha:
+    raise ValueError(f"{across_sha} isn't an ancestor of {root_sha} "
+                     f'(common ancestor: {common_ancestor})')
+
+  intermediate_commits = set(_shas_between(git_dir, across_sha, root_sha))
+  # Compare the resolved SHA: `across_ref` may be a symbolic name, which would
+  # never appear in a set of SHAs.
+  assert across_sha not in intermediate_commits
+
+  logging.debug('%d commits appear between %s and %s',
+                len(intermediate_commits), across_sha, root_sha)
+
+  all_reverts = []
+  for sha, commit_message in _log_stream(git_dir, root_sha, across_sha):
+    reverts = _try_parse_reverts_from_commit_message(commit_message)
+    if not reverts:
+      continue
+
+    # A short SHA that git can't resolve must not abort the whole scan.
+    resolved_reverts = set()
+    for claimed_sha in reverts:
+      try:
+        resolved_reverts.add(_resolve_sha(git_dir, claimed_sha))
+      except subprocess.CalledProcessError:
+        logging.warning(
+            'Failed to resolve reverted object %s (claimed to be reverted '
+            'by sha %s)', claimed_sha, sha)
+
+    for reverted_sha in sorted(resolved_reverts):
+      if reverted_sha in intermediate_commits:
+        logging.debug('Commit %s reverts %s, which happened after %s', sha,
+                      reverted_sha, across_sha)
+        continue
+
+      try:
+        object_type = subprocess.check_output(
+            ['git', '-C', git_dir, 'cat-file', '-t', reverted_sha],
+            encoding='utf-8',
+            stderr=subprocess.DEVNULL,
+        ).strip()
+      except subprocess.CalledProcessError:
+        logging.warning(
+            'Failed to resolve reverted object %s (claimed to be reverted '
+            'by sha %s)', reverted_sha, sha)
+        continue
+
+      if object_type == 'commit':
+        all_reverts.append(Revert(sha, reverted_sha))
+        continue
+
+      logging.error("%s claims to revert %s -- which isn't a commit -- %s", sha,
+                    object_type, reverted_sha)
+
+  return all_reverts
+
+
+def _main() -> None:
+  """Parses command-line arguments and prints the reverts that were found."""
+  parser = argparse.ArgumentParser(
+      description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+  parser.add_argument(
+      'base_ref', help='Git ref or sha to check for reverts around.')
+  parser.add_argument(
+      '-C', '--git_dir', default='.', help='Git directory to use.')
+  parser.add_argument(
+      'root', nargs='+', help='Root(s) to search for commits from.')
+  parser.add_argument('--debug', action='store_true')
+  opts = parser.parse_args()
+
+  logging.basicConfig(
+      format='%(asctime)s: %(levelname)s: %(filename)s:%(lineno)d: %(message)s',
+      level=logging.DEBUG if opts.debug else logging.INFO,
+  )
+
+  # `root`s can have related history, so we want to filter duplicate commits
+  # out. The overwhelmingly common case is also to have one root, and it's way
+  # easier to reason about output that comes in an order that's meaningful to
+  # git.
+  seen_reverts = set()
+  all_reverts = []
+  for root in opts.root:
+    for revert in find_reverts(opts.git_dir, opts.base_ref, root):
+      if revert not in seen_reverts:
+        seen_reverts.add(revert)
+        all_reverts.append(revert)
+
+  for revert in all_reverts:
+    print(f'{revert.sha} claims to revert {revert.reverted_sha}')
+
+
+if __name__ == '__main__':
+  _main()
diff --git a/llvm/utils/revert_checker_test.py b/llvm/utils/revert_checker_test.py
new file mode 100755
--- /dev/null
+++ b/llvm/utils/revert_checker_test.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#===----------------------------------------------------------------------===##
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+#===----------------------------------------------------------------------===##
+"""Tests for revert_checker.
+
+Note that these tests require having LLVM's git history available, since our
+repository has a few interesting instances of edge-cases.
+"""
+
+import os
+import logging
+import unittest
+from typing import List
+
+import revert_checker
+
+# pylint: disable=protected-access
+
+
+def get_llvm_project_path() -> str:
+  """Returns the path to llvm-project's root."""
+  my_dir = os.path.dirname(__file__)
+  return os.path.realpath(os.path.join(my_dir, '..', '..'))
+
+
+class _SilencingFilter(logging.Filter):
+  """Silences all log messages.
+
+  Also collects info about log messages that would've been emitted.
+  """
+
+  def __init__(self) -> None:
+    # Let `logging.Filter` set up its own state before adding ours.
+    super().__init__()
+    self.messages: List[str] = []
+
+  def filter(self, record: logging.LogRecord) -> bool:
+    self.messages.append(record.getMessage())
+    return False
+
+
+class Test(unittest.TestCase):
+  """Tests for revert_checker."""
+
+  def silence_logging(self) -> _SilencingFilter:
+    """Installs a `_SilencingFilter` on the root logger until test cleanup."""
+    root = logging.getLogger()
+    filt = _SilencingFilter()
+    root.addFilter(filt)
+    self.addCleanup(root.removeFilter, filt)
+    return filt
+
+  def test_log_stream_with_known_sha_range(self) -> None:
+    start_sha = 'e241573d5972d34a323fa5c64774c4207340beb3'
+    end_sha = 'a7a37517751ffb0f5529011b4ba96e67fcb27510'
+    # NOTE(review): these literals must match the commit messages exactly,
+    # including the leading whitespace of the ' constraints' line -- confirm.
+    commits = [
+        revert_checker._LogEntry(
+            'e241573d5972d34a323fa5c64774c4207340beb3', '\n'.join((
+                '[mlir] NFC: remove IntegerValueSet / MutableIntegerSet',
+                '',
+                'Summary:',
+                '- these are unused and really not needed now given flat '
+                'affine',
+                ' constraints',
+                '',
+                'Differential Revision: https://reviews.llvm.org/D75792',
+            ))),
+        revert_checker._LogEntry(
+            '97572fa6e9daecd648873496fd11f7d1e25a55f0',
+            '[NFC] use hasAnyOperatorName and hasAnyOverloadedOperatorName '
+            'functions in clang-tidy matchers',
+        ),
+    ]
+
+    logs = list(
+        revert_checker._log_stream(
+            get_llvm_project_path(),
+            root_sha=start_sha,
+            end_at_sha=end_sha,
+        ))
+    self.assertEqual(commits, logs)
+
+  def test_reverted_noncommit_object_is_a_nop(self) -> None:
+    log_filter = self.silence_logging()
+    # c9944df916e41b1014dff5f6f75d52297b48ecdc mentions reverting a non-commit
+    # object. It sits between the given base_ref and root.
+    reverts = revert_checker.find_reverts(
+        git_dir=get_llvm_project_path(),
+        across_ref='c9944df916e41b1014dff5f6f75d52297b48ecdc~',
+        root='c9944df916e41b1014dff5f6f75d52297b48ecdc')
+    self.assertEqual(reverts, [])
+
+    complaint = ('Failed to resolve reverted object '
+                 'edd18355be574122aaa9abf58c15d8c50fb085a1')
+    self.assertTrue(
+        any(x.startswith(complaint) for x in log_filter.messages),
+        log_filter.messages)
+
+  def test_known_reverts_across_arbitrary_llvm_rev(self) -> None:
+    reverts = revert_checker.find_reverts(
+        git_dir=get_llvm_project_path(),
+        across_ref='c47f971694be0159ffddfee8a75ae515eba91439',
+        root='9f981e9adf9c8d29bb80306daf08d2770263ade6')
+    self.assertEqual(reverts, [
+        revert_checker.Revert(
+            sha='9f981e9adf9c8d29bb80306daf08d2770263ade6',
+            reverted_sha='4060016fce3e6a0b926ee9fc59e440a612d3a2ec'),
+        revert_checker.Revert(
+            sha='4e0fe038f438ae1679eae9e156e1f248595b2373',
+            reverted_sha='65b21282c710afe9c275778820c6e3c1cf46734b'),
+    ])
+
+
+if __name__ == '__main__':
+  unittest.main()