2 # ===----------------------------------------------------------------------===##
4 # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5 # See https://llvm.org/LICENSE.txt for license information.
6 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8 # ===----------------------------------------------------------------------===##
9 """A linter that detects potential typos in FileCheck directive names.
11 Consider a broken test foo.cpp:
13 // RUN: clang -cc1 -ast-dump %s | FileCheck %s --check-prefix=NEW
14 // RUN: clang -cc1 -ast-dump %s -std=c++98 | FileCheck %s --check-prefix=OLD
16 // NEWW: auto is a c++11 extension
17 // ODL-NOT: auto is a c++11 extension
19 We first detect the locally valid FileCheck directive prefixes by parsing the
20 --check-prefix flags. Here we get {CHECK, NEW, OLD}, so our directive names are
21 {CHECK, NEW, OLD, CHECK-NOT, NEW-NOT, ...}.
23 Then we look for lines that look like directives. These are of the form 'FOO:',
24 usually at the beginning of a line or a comment. If any of these are a
25 "near-miss" for a directive name, then we suspect this is a typo and report it.
27 Usage: filecheck_lint path/to/test/file/1 ... path/to/test/file/n
import itertools
import logging
import pathlib
import re
import sys
from typing import Generator, Sequence, Tuple
37 _distance_threshold
= 3
39 _suffixes
= {"-DAG", "-COUNT", "-EMPTY", "-LABEL", "-NEXT", "-NOT", "-SAME"}
40 # 'NOTE' and 'TODO' are not directives, but are likely to be false positives
41 # if encountered and to generate noise as a result. We filter them out also to
51 # 'COM' and 'RUN' are default comment prefixes for FileCheck.
52 _comment_prefixes
= {"COM", "RUN"}
53 _ignore
= _lit_directives
.union(_comment_prefixes
).union({"NOTE", "TODO"})
def levenshtein(s1: str, s2: str) -> int:  # pylint: disable=g-doc-args
    """Computes the edit distance between two strings.

    Additions, deletions, and substitutions all count as a single operation.
    Runs in O(len(s1) * len(s2)) time with two rolling rows of the classic
    dynamic-programming table.
    """
    # distances[j] holds the edit distance between s1[:i] and s2[:j] for the
    # previous row i; the initial row is the distance from "" to each prefix.
    distances = range(len(s2) + 1)
    for i in range(len(s1)):
        new_distances = [i + 1]
        for j in range(len(s2)):
            cost = min(
                distances[j] + int(s1[i] != s2[j]),  # substitution (free on match)
                new_distances[-1] + 1,  # insertion
                distances[j + 1] + 1,  # deletion
            )
            new_distances.append(cost)
        distances = new_distances
    return distances[-1]
81 """Stores the coordinates of a span on a single line within a file.
85 start_byte: the (inclusive) byte offset the span starts
86 end_byte: the (inclusive) byte offset the span ends
94 self
, content
: str, start_byte
: int, end_byte
: int
95 ): # pylint: disable=g-doc-args
97 Stores the coordinates of a span based on a string and start/end bytes.
99 `start_byte` and `end_byte` are assumed to be on the same line.
101 self
.content
= content
102 self
.start_byte
= start_byte
103 self
.end_byte
= end_byte
107 Derives span from line and coordinates.
109 start_column: the (inclusive) column where the span starts
110 end_column: the (inclusive) column where the span ends
112 content_before_span
= self
.content
[: self
.start_byte
]
113 line
= content_before_span
.count("\n") + 1
114 start_column
= self
.start_byte
- content_before_span
.rfind("\n")
115 end_column
= start_column
+ (self
.end_byte
- self
.start_byte
- 1)
117 return f
"{line}:{start_column}-{end_column}"
121 """Stores information about one typo and a suggested fix.
124 filepath: the path to the file in which the typo was found
125 filerange: the position at which the typo was found in the file
130 filepath
: pathlib
.Path
137 filepath
: pathlib
.Path
,
138 filerange
: FileRange
,
140 fix
: str, # pylint: disable=redefined-outer-name
142 self
.filepath
= filepath
143 self
.filerange
= filerange
147 def __str__(self
) -> str:
148 return f
"{self.filepath}:" + self
.filerange
.as_str() + f
": {self.summary()}"
150 def summary(self
) -> str:
152 f
'Found potentially misspelled directive "{self.typo}". Did you mean '
def find_potential_directives(
    content: str,
) -> Generator[Tuple["FileRange", str], None, None]:
    """Extracts all the potential FileCheck directives from a string.

    What constitutes a potential directive is loosely defined---we err on the side
    of capturing more strings than is necessary, rather than missing any.

    Args:
      content: the string in which to look for directives

    Yields:
      Tuples (p, d) where p is the span where the potential directive occurs
      within the string and d is the potential directive.
    """
    # A potential directive is a name (word chars, dashes, underscores) followed
    # by a colon, at the start of a line or of a //, ;, or # comment.
    directive_pattern = re.compile(
        r"(?:^|//|;|#)[^\d\w\-_]*([\d\w\-_][\s\d\w\-_]*):", re.MULTILINE
    )
    for match in re.finditer(directive_pattern, content):
        potential_directive, span = match.group(1), match.span(1)
        yield (FileRange(content, span[0], span[1]), potential_directive)
# TODO(bchetioui): also parse comment prefixes to ignore.
def parse_custom_prefixes(
    content: str,
) -> Generator[str, None, None]:  # pylint: disable=g-doc-args
    """Parses custom prefixes defined in the string provided.

    For example, given the following file content:
      RUN: something | FileCheck %s -check-prefixes CHECK1,CHECK2
      RUN: something_else | FileCheck %s -check-prefix 'CHECK3'
    the custom prefixes are CHECK1, CHECK2, and CHECK3.
    """
    # A parameter value may be single-quoted, double-quoted, or a bare token.
    param_re = r"|".join([r"'[^']*'", r'"[^"]*"', r'[^\'"\s]+'])
    for m in re.finditer(
        r"-check-prefix(?:es)?(?:\s+|=)({})".format(param_re), content
    ):
        prefixes = m.group(1)
        # Strip quotes before splitting the comma-separated prefix list.
        if prefixes.startswith("'") or prefixes.startswith('"'):
            prefixes = prefixes[1:-1]
        for prefix in prefixes.split(","):
            yield prefix
def find_directive_typos(
    content: str,
    filepath: pathlib.Path,
    threshold: int = 3,
) -> Generator["Diagnostic", None, None]:
    """Detects potential typos in FileCheck directives.

    Args:
      content: the content of the file
      filepath: the path to the file to check for typos in directives
      threshold: the (inclusive) maximum edit distance between a potential
        directive and an actual directive, such that the potential directive is
        classified as a typo

    Yields:
      Diagnostics, in order from the top of the file.
    """
    all_prefixes = _prefixes.union(set(parse_custom_prefixes(content)))
    # NOTE(review): reconstructed assembly — all valid directive names are each
    # prefix with each suffix, plus the bare prefixes and the ignore list.
    all_directives = (
        [
            f"{prefix}{suffix}"
            for prefix, suffix in itertools.product(all_prefixes, _suffixes)
        ]
        + list(all_prefixes)
        + list(_ignore)
    )

    def find_best_match(typo):
        # Seed with (threshold + 1, typo) so "no close directive" loses to any
        # directive within the threshold; length pre-filter skips hopeless pairs.
        return min(
            [(threshold + 1, typo)]
            + [
                (levenshtein(typo, d), d)
                for d in all_directives
                if abs(len(d) - len(typo)) <= threshold
            ],
            key=lambda tup: tup[0],
        )

    potential_directives = find_potential_directives(content)
    # Cache score and best_match to skip recalculating.
    score_and_best_match_for_potential_directive = dict()
    for filerange, potential_directive in potential_directives:
        # TODO(bchetioui): match count directives more finely. We skip directives
        # starting with 'CHECK-COUNT-' for the moment as they require more complex
        # logic to be handled correctly.
        if any(
            potential_directive.startswith(f"{prefix}-COUNT-")
            for prefix in all_prefixes
        ):
            continue

        # Ignoring potential typos that will not be matched later due to a too low
        # threshold, in order to avoid potentially long computation times.
        if len(potential_directive) > max(map(len, all_directives)) + threshold:
            continue

        if potential_directive not in score_and_best_match_for_potential_directive:
            score, best_match = find_best_match(potential_directive)
            score_and_best_match_for_potential_directive[potential_directive] = (
                score,
                best_match,
            )
        else:
            score, best_match = score_and_best_match_for_potential_directive[
                potential_directive
            ]
        if score == 0:  # This is an actual directive, ignore.
            continue
        elif score <= threshold and best_match not in _ignore:
            yield Diagnostic(filepath, filerange, potential_directive, best_match)
def main(argv: Sequence[str]):
    """Runs the linter over every file path given on the command line."""
    if len(argv) < 2:
        print(f"Usage: {argv[0]} path/to/file/1 ... path/to/file/n")
        # sys.exit rather than the site-injected `exit` builtin, which is only
        # guaranteed to exist in interactive sessions.
        sys.exit(1)

    for filepath in argv[1:]:
        logging.info("Checking %s", filepath)
        with open(filepath, "rt") as f:
            content = f.read()
        for diagnostic in find_directive_typos(
            content,
            pathlib.Path(filepath),
            threshold=_distance_threshold,
        ):
            print(diagnostic)
292 if __name__ == "__main__
":