diff --git a/llvm/utils/filecheck_lint/README.md b/llvm/utils/filecheck_lint/README.md
new file mode 100644
--- /dev/null
+++ b/llvm/utils/filecheck_lint/README.md
@@ -0,0 +1,18 @@
+# filecheck_lint
+
+## About
+
+`filecheck_lint` is a tool for detecting invalid FileCheck directives that
+are likely the result of typos in a given test file. It works by computing
+the edit distance between each directive used and the set of directives that
+are valid within the test file. An invalid directive that lies within a
+parameterizable edit distance of a valid directive is reported on standard
+output as a likely typo, together with a suggested fix.
+
+## Usage
+
+```bash
+filecheck_lint path/to/test/file/1 ... path/to/test/file/n
+# With a custom edit distance reporting threshold (default: 3)
+filecheck_lint --distance-threshold 4 path/to/test/file/1 ... path/to/test/file/n
+```
\ No newline at end of file
diff --git a/llvm/utils/filecheck_lint/filecheck_lint.py b/llvm/utils/filecheck_lint/filecheck_lint.py
new file mode 100644
--- /dev/null
+++ b/llvm/utils/filecheck_lint/filecheck_lint.py
@@ -0,0 +1,43 @@
+# ===----------------------------------------------------------------------===##
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# ===----------------------------------------------------------------------===##
+"""Main file for the FileCheck linter."""
+import argparse
+import logging
+import pathlib
+
+import filecheck_lint.filecheck_lint as fcl
+
+
+def main() -> None:
+  parser = argparse.ArgumentParser(description=__doc__)
+  parser.add_argument(
+      '--distance-threshold',
+      default=3,
+      type=int,
+      help=(
+          'The largest edit distance allowed when matching a potential typo '
+          'to a directive'
+      ),
+  )
+  parser.add_argument(
+      'files', nargs='+', type=str, help='The files to check for typos'
+  )
+  args = parser.parse_args()
+
+  for filepath in args.files:
+    logging.info('Checking %s', filepath)
+    comment_prefix = fcl.comment_prefix_from_filename(filepath)
+    for diagnostic in fcl.find_directive_typos(
+        pathlib.Path(filepath),
+        threshold=args.distance_threshold,
+        comment_prefix=comment_prefix,
+    ):
+      print(diagnostic)
+
+
+if __name__ == '__main__':
+  main()
diff --git a/llvm/utils/filecheck_lint/filecheck_lint/filecheck_lint.py b/llvm/utils/filecheck_lint/filecheck_lint/filecheck_lint.py
new file mode 100644
--- /dev/null
+++ b/llvm/utils/filecheck_lint/filecheck_lint/filecheck_lint.py
@@ -0,0 +1,317 @@
+# ===----------------------------------------------------------------------===##
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# ===----------------------------------------------------------------------===##
+"""A linter that detects potential typos in FileCheck directive names."""
+
+import itertools
+import pathlib
+import re
+from typing import Callable, List, Optional, Sequence, Set, Tuple
+
+
+_prefixes = {'CHECK'}
+_suffixes = {'DAG', 'COUNT', 'EMPTY', 'LABEL', 'NEXT', 'NOT', 'SAME'}
+# 'NOTE' and 'TODO' are not directives, but they are likely to be flagged as
+# false positives and to generate noise as a result. We filter them out to
+# avoid this.
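+# Directives belonging to lit, the LLVM test driver, rather than to FileCheck
+# itself; they appear in the same test files and are ignored rather than
+# reported as typos (see `_ignore` below).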
+_lit_directives = {
+    'RUN',
+    'REQUIRES',
+    'UNSUPPORTED',
+    'XFAIL',
+    'DEFINE',
+    'REDEFINE',
+}
+# 'COM' and 'RUN' are default comment prefixes for FileCheck.
+_comment_prefixes = {'COM', 'RUN'}
+_ignore = _lit_directives.union(_comment_prefixes).union({'NOTE', 'TODO'})
+
+
+def levenshtein(s1: str, s2: str) -> int:
+  """Computes the edit distance between two strings.
+
+  Additions, deletions, and substitutions all count as a single operation.
+
+  Args:
+    s1: a string
+    s2: another string
+
+  Returns:
+    The edit distance between the two input strings.
+  """
+  if not s1:
+    return len(s2)
+  if not s2:
+    return len(s1)
+
+  distances = range(len(s2) + 1)
+  for i in range(len(s1)):
+    new_distances = [i + 1]
+    for j in range(len(s2)):
+      cost = min(distances[j] + int(s1[i] != s2[j]),
+                 distances[j + 1] + 1,
+                 new_distances[-1] + 1)
+      new_distances.append(cost)
+    distances = new_distances
+  return distances[-1]
+
+
+class FilePos:
+  """Stores the coordinates of a span on a single line within a file.
+
+  Attributes:
+    line: the line number
+    start_column: the (inclusive) column where the span starts
+    end_column: the (inclusive) column where the span ends
+  """
+  line: int
+  start_column: int
+  end_column: int
+
+  def __init__(self, line: int, start_column: int, end_column: int):
+    self.line = line
+    self.start_column = start_column
+    self.end_column = end_column
+
+  def __str__(self) -> str:
+    return f'{self.line}:{self.start_column}-{self.end_column}'
+
+
+class Diagnostic:
+  """Stores information about a typo and emits error diagnostics.
+
+  A diagnostic stores the information relevant for a given potential typo,
+  and provides facilities for emitting either findings or textual error
+  messages.
+
+  Attributes:
+    filepath: the path to the file in which the typo was found
+    filepos: the position at which the typo was found in the file
+    typo: the typo
+    fix: a suggested fix
+  """
+
+  filepath: pathlib.Path
+  filepos: FilePos
+  typo: str
+  fix: str
+
+  def __init__(
+      self, filepath: pathlib.Path, filepos: FilePos, typo: str, fix: str
+  ):
+    self.filepath = filepath
+    self.filepos = filepos
+    self.typo = typo
+    self.fix = fix
+
+  def __str__(self) -> str:
+    return f'{self.filepath}:{self.filepos}: {self.summary()}'
+
+  def summary(self) -> str:
+    return (
+        f'Found potentially misspelt directive "{self.typo}". Did you mean '
+        f'"{self.fix}"?'
+    )
+
+
+def find_potential_directives(
+    lines: Sequence[str], directive_prefix: Optional[str] = None
+) -> List[Tuple[FilePos, str]]:
+  """Filters a list for strings that could be potential FileCheck directives.
+
+  Finds all the strings that could be potential directives in a sequence of
+  strings. This assumes that lines containing a directive always start with
+  either whitespace or the specified comment prefix. What constitutes a
+  potential directive is loosely defined; we err on the side of capturing
+  more strings than necessary rather than missing any.
+
+  Args:
+    lines: a sequence of strings
+    directive_prefix: an optional prefix associated with directives, e.g. '//'
+      or ';'. If not provided, the function does not attempt to match any
+      prefix before directives.
+
+  Returns:
+    A list of tuples (p, d) where p is the position of the potential directive
+    within the file and d is the potential directive.
+  """
+  matches = []
+  # TODO(bchetioui): the regexes can be further improved to capture typoed
+  # directives more holistically. In practice, they seem to perform well
+  # as is.
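+  # With directive_prefix='//', for example, the prefixed pattern below
+  # matches '// CHECK-NEXT:' but also typoed variants such as
+  # '// CHEK-NEXT:' or '//CHECK -NEXT:'.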
+  if directive_prefix is None:
+    directive_pattern = re.compile(r'([\d\w\_][\s\d\w\-_]*):')
+  else:
+    directive_pattern = re.compile(
+        r'{}[^\d\w\-_]*([\d\w\-_][\s\d\w\-_]*):'.format(directive_prefix)
+    )
+  for lineno, line in enumerate(lines, start=1):
+    match = re.search(directive_pattern, line)
+    if match is None:
+      continue
+    potential_directive, span = match.group(1), match.span(1)
+    matches.append(
+        (FilePos(lineno, span[0] + 1, span[1]), potential_directive)
+    )
+  return matches
+
+
+def comment_prefix_from_filename(filename: str) -> Optional[str]:
+  """Returns the inline comment prefix corresponding to a file type.
+
+  Args:
+    filename: the name of a file including its extension
+
+  Returns:
+    The inline comment prefix corresponding to the file's extension, e.g.
+    '//' for '.cc' and '.mlir', or ';' for '.ll'.
+  """
+  language_to_comment_prefix = {
+      'cc': '//',
+      'cpp': '//',
+      'll': ';',
+      'mlir': '//',
+      'py': '#',
+  }
+
+  comment_prefix = None
+  if '.' in filename:
+    comment_prefix = language_to_comment_prefix.get(filename.split('.')[-1])
+
+  return comment_prefix
+
+
+# TODO(bchetioui): make parse_additional_prefixes cross line boundaries, or
+# have it take only one line as a parameter.
+# TODO(bchetioui): convert this code to use regexes as well.
+# TODO(bchetioui): factorize this to parse comment prefixes to ignore as well.
+# TODO(bchetioui): check that the argument is actually part of a CHECK command.
+def parse_additional_prefixes(lines: Sequence[str]) -> Set[str]:
+  """Parses custom prefixes defined in the list of strings provided.
+
+  Args:
+    lines: a list of strings in which to look for prefixes
+
+  Returns:
+    A set of prefixes found in the input lines.
+  """
+
+  def parse_check_prefixes(
+      s: str, option_prefix: str, extract_prefixes: Callable[[str], List[str]]
+  ) -> List[str]:
+    prefix_groups = s.split(option_prefix)[1:]
+    join = lambda ll: [e for l in ll for e in l]  # pylint: disable=g-complex-comprehension
+    return join(
+        extract_prefixes(prefix_group) for prefix_group in prefix_groups
+    )
+
+  def extract_one_prefix(g: str) -> List[str]:
+    return [g.strip().split(' ')[0].replace('"', '').replace("'", '').strip()]
+
+  def extract_many_prefixes(g: str) -> List[str]:
+    prefixes = g.strip().split(' ')[0].replace('"', '').replace("'", '').strip()
+    return [prefix.strip() for prefix in prefixes.split(',')]
+
+  additional_prefixes = []
+
+  for line in lines:
+    line = line.strip()
+    # If the line ends with a backslash for line continuation, remove it. This
+    # avoids potentially parsing a prefix incorrectly if there is no space
+    # between the prefix and a line continuation, e.g. in
+    # '-check-prefix=CHECK-PREFIX\'.
+    if line.endswith('\\'):
+      line = line[:-1]
+    additional_prefixes.extend(
+        parse_check_prefixes(line, '-check-prefix=', extract_one_prefix)
+    )
+    additional_prefixes.extend(
+        parse_check_prefixes(line, '-check-prefix ', extract_one_prefix)
+    )
+    additional_prefixes.extend(
+        parse_check_prefixes(line, '-check-prefixes=', extract_many_prefixes)
+    )
+    additional_prefixes.extend(
+        parse_check_prefixes(line, '-check-prefixes ', extract_many_prefixes)
+    )
+
+  return set(additional_prefixes)
+
+
+def find_directive_typos(
+    filepath: pathlib.Path,
+    comment_prefix: Optional[str] = None,
+    threshold: int = 3,
+) -> Sequence[Diagnostic]:
+  """Detects potential typos in FileCheck directives.
+
+  Args:
+    filepath: the path to the file to check for typos in directives
+    comment_prefix: the prefix associated with directives, e.g. '//' or ';'
+      depending on the file type (optional; defaults to '//' if not provided)
+    threshold: the (inclusive) maximum Levenshtein distance between a
+      potential directive and an actual directive, such that the potential
+      directive is classified as a typo
+
+  Returns:
+    A list of diagnostics, in the order in which the typos appear in the
+    file.
+  """
+  with open(filepath, 'rt') as f:
+    lines = f.readlines()
+  return find_directive_typos_impl(lines, filepath, comment_prefix, threshold)
+
+
+def find_directive_typos_impl(
+    lines: Sequence[str],
+    filepath: pathlib.Path,
+    comment_prefix: Optional[str] = None,
+    threshold: int = 3,
+) -> Sequence[Diagnostic]:
+  """Underlying implementation for `find_directive_typos`."""
+  all_prefixes = _prefixes.union(parse_additional_prefixes(lines))
+  all_directives = (
+      [
+          f'{prefix}-{suffix}'
+          for prefix, suffix in itertools.product(all_prefixes, _suffixes)
+      ]
+      + list(_ignore)
+      + list(all_prefixes)
+  )
+
+  def find_best_match(typo):
+    return min(
+        [(threshold + 1, typo)]
+        + [
+            (levenshtein(typo, d), d)
+            for d in all_directives
+            if abs(len(d) - len(typo)) <= threshold
+        ],
+        key=lambda tup: tup[0],
+    )
+
+  comment_prefix = comment_prefix or '//'
+  potential_directives = find_potential_directives(lines, comment_prefix)
+  diagnostics = []
+
+  for filepos, potential_directive in potential_directives:
+    # TODO(bchetioui): match count directives more finely. We skip directives
+    # starting with 'CHECK-COUNT-' for the moment, as they require more
+    # complex logic to be handled correctly.
+    if any(
+        potential_directive.startswith(f'{prefix}-COUNT-')
+        for prefix in all_prefixes
+    ):
+      continue
+
+    # Skip potential directives that are too long to lie within the distance
+    # threshold of any actual directive; this avoids needlessly expensive
+    # edit distance computations.
+    if len(potential_directive) > max(map(len, all_directives)) + threshold:
+      continue
+
+    score, best_match = find_best_match(potential_directive)
+    if score == 0:  # This is an actual directive; ignore it.
+      continue
+    elif score <= threshold and best_match not in _ignore:
+      diagnostics.append(
+          Diagnostic(filepath, filepos, potential_directive, best_match)
+      )
+
+  return diagnostics
diff --git a/llvm/utils/filecheck_lint/tests/filecheck_lint_test.py b/llvm/utils/filecheck_lint/tests/filecheck_lint_test.py
new file mode 100644
--- /dev/null
+++ b/llvm/utils/filecheck_lint/tests/filecheck_lint_test.py
@@ -0,0 +1,80 @@
+# ===----------------------------------------------------------------------===##
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# ===----------------------------------------------------------------------===##
+import unittest
+
+import filecheck_lint.filecheck_lint as fcl
+
+
+class TestParser(unittest.TestCase):
+
+  def test_parse_all_additional_prefixes(self):
+
+    def run(lines, expected_prefixes):
+      prefixes = fcl.parse_additional_prefixes(lines)
+      for prefix in expected_prefixes:
+        self.assertIn(prefix, prefixes)
+
+    for lines, expected_prefixes in [
+        (['-check-prefix=PREFIX'], {'PREFIX'}),
+        (['-check-prefix PREFIX'], {'PREFIX'}),
+        (['-check-prefixes=PREFIX1,PREFIX2'], {'PREFIX1', 'PREFIX2'}),
+        (['-check-prefixes PREFIX1,PREFIX2'], {'PREFIX1', 'PREFIX2'}),
+        ([
+            '-check-prefix=PREFIX1 -check-prefix PREFIX2',
+            '-check-prefixes=PREFIX3,PREFIX4 -check-prefix=PREFIX5',
+            '-check-prefixes PREFIX6,PREFIX7 -check-prefixes=PREFIX8',
+        ],  # pylint: disable=bad-continuation
+         {f'PREFIX{i}' for i in range(1, 9)}),
+    ]:
+      run(lines, expected_prefixes)
+
+  def test_additional_prefixes_uniquely(self):
+    lines = ['--check-prefix=SOME-PREFIX', '--check-prefix=SOME-PREFIX']
+    prefixes = fcl.parse_additional_prefixes(lines)
+    assert len(prefixes) == 1
+
+  def test_additional_prefix_line_continuation(self):
+    lines = ['--check-prefix=SOME-PREFIX\\']
+    assert fcl.parse_additional_prefixes(lines) == {'SOME-PREFIX'}
+
+
+class TestTypoDetection(unittest.TestCase):
+
+  def test_find_potential_directives_comment_prefix(self):
+    lines = ['junk; CHCK1:', 'junk// CHCK2:', 'SOME CHCK3:']
+
+    semi_prefix_results = fcl.find_potential_directives(lines, ';')
+    assert len(semi_prefix_results) == 1
+    pos, match = semi_prefix_results[0]
+    assert (pos.line == 1 and
+            pos.start_column == len('junk; ') + 1 and
+            pos.end_column == len(lines[0]) - 1)
+    assert match == 'CHCK1'
+
+    doubleslash_prefix_results = fcl.find_potential_directives(lines, '//')
+    assert len(doubleslash_prefix_results) == 1
+    pos, match = doubleslash_prefix_results[0]
+    assert (pos.line == 2 and
+            pos.start_column == len('junk// ') + 1 and
+            pos.end_column == len(lines[1]) - 1)
+    assert match == 'CHCK2'
+
+    no_prefix_results = fcl.find_potential_directives(lines, None)
+    assert len(no_prefix_results) == 3
+    assert no_prefix_results[0][1] == 'CHCK1'
+    assert no_prefix_results[1][1] == 'CHCK2'
+    assert no_prefix_results[2][1] == 'SOME CHCK3'
+
+  def test_levenshtein(self):
+    for s1, s2, distance in [
+        ('Levenshtein', 'Levenstin', 2),  # 2 deletions
+        ('Levenshtein', 'Levenstherin', 3),  # 1 deletion, 2 insertions
+        ('Levenshtein', 'Lenvinshtein', 2),  # 1 insertion, 1 substitution
+        ('Levenshtein', 'Levenshtein', 0),  # identical strings
+    ]:
+      assert fcl.levenshtein(s1, s2) == distance
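
As a quick illustration of how the library half of the patch fits together, here is a minimal sketch (not part of the patch) that drives `find_directive_typos_impl` on in-memory lines. The test contents and the `example.mlir` path are invented for the example, and the sketch assumes it is run from `llvm/utils/filecheck_lint`, where the `filecheck_lint` package is importable.

```python
import pathlib

import filecheck_lint.filecheck_lint as fcl

# A hypothetical MLIR-style test containing one deliberate typo: 'CHEK-NEXT'.
lines = [
    '// RUN: some-tool %s | FileCheck %s',
    '// CHECK-LABEL: @main',
    '// CHEK-NEXT: return',
]

# Lint the in-memory lines as if they had been read from 'example.mlir'.
# 'RUN' and 'CHECK-LABEL' match known directives exactly and are skipped;
# 'CHEK-NEXT' is within edit distance 1 of 'CHECK-NEXT' and is reported.
for diagnostic in fcl.find_directive_typos_impl(
    lines,
    pathlib.Path('example.mlir'),  # hypothetical path, used only in messages
    comment_prefix='//',
    threshold=3,
):
  print(diagnostic)
```

Given the `Diagnostic.__str__` format defined above, this should print something like `example.mlir:3:4-12: Found potentially misspelt directive "CHEK-NEXT". Did you mean "CHECK-NEXT"?`.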