[clang-tidy][modernize-use-starts-ends-with] Fix operator rewriting false negative...
[llvm-project.git] / llvm / utils / revert_checker.py
blobb1c6e228e4d4199801e1d425c27e1f81803b5dd5
1 #!/usr/bin/env python3
2 # -*- coding: utf-8 -*-
3 # ===----------------------------------------------------------------------===##
5 # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
6 # See https://llvm.org/LICENSE.txt for license information.
7 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
9 # ===----------------------------------------------------------------------===##
10 """Checks for reverts of commits across a given git commit.
12 To clarify the meaning of 'across' with an example, if we had the following
13 commit history (where `a -> b` notes that `b` is a direct child of `a`):
15 123abc -> 223abc -> 323abc -> 423abc -> 523abc
17 And where 423abc is a revert of 223abc, this revert is considered to be 'across'
18 323abc. More generally, a revert A of a parent commit B is considered to be
19 'across' a commit C if C is a parent of A and B is a parent of C.
21 Please note that revert detection in general is really difficult, since merge
22 conflicts/etc always introduce _some_ amount of fuzziness. This script just
23 uses a bundle of heuristics, and is bound to ignore / incorrectly flag some
24 reverts. The hope is that it'll easily catch the vast majority (>90%) of them,
25 though.
27 This is designed to be used in one of two ways: an import in Python, or run
28 directly from a shell. If you want to import this, the `find_reverts`
29 function is the thing to look at. If you'd rather use this from a shell, have a
30 usage example:
32 ```
33 ./revert_checker.py c47f97169 origin/main origin/release/12.x
34 ```
36 This checks for all reverts from the tip of origin/main to c47f97169, which are
37 across the latter. It then does the same for origin/release/12.x to c47f97169.
38 Duplicate reverts discovered when walking both roots (origin/main and
39 origin/release/12.x) are deduplicated in output.
40 """
42 import argparse
43 import collections
44 import logging
45 import re
46 import subprocess
47 import sys
48 from typing import Dict, Generator, Iterable, List, NamedTuple, Optional, Tuple
50 assert sys.version_info >= (3, 6), "Only Python 3.6+ is supported."
52 # People are creative with their reverts, and heuristics are a bit difficult.
53 # At a glance, most reverts have "This reverts commit ${full_sha}". Many others
54 # have `Reverts llvm/llvm-project#${PR_NUMBER}`.
56 # By their powers combined, we should be able to automatically catch something
57 # like 80% of reverts with reasonable confidence. At some point, human
58 # intervention will always be required (e.g., I saw
59 # ```
60 # This reverts commit ${commit_sha_1} and
61 # also ${commit_sha_2_shorthand}
62 # ```
63 # during my sample)
65 _CommitMessageReverts = NamedTuple(
66 "_CommitMessageReverts",
68 ("potential_shas", List[str]),
69 ("potential_pr_numbers", List[int]),
74 def _try_parse_reverts_from_commit_message(
75 commit_message: str,
76 ) -> _CommitMessageReverts:
77 """Tries to parse revert SHAs and LLVM PR numbers form the commit message.
79 Returns:
80 A namedtuple containing:
81 - A list of potentially reverted SHAs
82 - A list of potentially reverted LLVM PR numbers
83 """
84 if not commit_message:
85 return _CommitMessageReverts([], [])
87 sha_reverts = re.findall(
88 r"This reverts commit ([a-f0-9]{40})\b",
89 commit_message,
92 first_line = commit_message.splitlines()[0]
93 initial_revert = re.match(r'Revert ([a-f0-9]{6,}) "', first_line)
94 if initial_revert:
95 sha_reverts.append(initial_revert.group(1))
97 pr_numbers = [
98 int(x)
99 for x in re.findall(
100 r"Reverts llvm/llvm-project#(\d+)",
101 commit_message,
105 return _CommitMessageReverts(
106 potential_shas=sha_reverts,
107 potential_pr_numbers=pr_numbers,
111 def _stream_stdout(
112 command: List[str], cwd: Optional[str] = None
113 ) -> Generator[str, None, None]:
114 with subprocess.Popen(
115 command,
116 cwd=cwd,
117 stdout=subprocess.PIPE,
118 encoding="utf-8",
119 errors="replace",
120 ) as p:
121 assert p.stdout is not None # for mypy's happiness.
122 yield from p.stdout
125 def _resolve_sha(git_dir: str, sha: str) -> str:
126 if len(sha) == 40:
127 return sha
129 return subprocess.check_output(
130 ["git", "-C", git_dir, "rev-parse", sha],
131 encoding="utf-8",
132 stderr=subprocess.DEVNULL,
133 ).strip()
136 _LogEntry = NamedTuple(
137 "_LogEntry",
139 ("sha", str),
140 ("commit_message", str),
145 def _log_stream(git_dir: str, root_sha: str, end_at_sha: str) -> Iterable[_LogEntry]:
146 sep = 50 * "<>"
147 log_command = [
148 "git",
149 "-C",
150 git_dir,
151 "log",
152 "^" + end_at_sha,
153 root_sha,
154 "--format=" + sep + "%n%H%n%B%n",
157 stdout_stream = iter(_stream_stdout(log_command))
159 # Find the next separator line. If there's nothing to log, it may not exist.
160 # It might not be the first line if git feels complainy.
161 found_commit_header = False
162 for line in stdout_stream:
163 if line.rstrip() == sep:
164 found_commit_header = True
165 break
167 while found_commit_header:
168 sha = next(stdout_stream, None)
169 assert sha is not None, "git died?"
170 sha = sha.rstrip()
172 commit_message = []
174 found_commit_header = False
175 for line in stdout_stream:
176 line = line.rstrip()
177 if line.rstrip() == sep:
178 found_commit_header = True
179 break
180 commit_message.append(line)
182 yield _LogEntry(sha, "\n".join(commit_message).rstrip())
185 def _shas_between(git_dir: str, base_ref: str, head_ref: str) -> Iterable[str]:
186 rev_list = [
187 "git",
188 "-C",
189 git_dir,
190 "rev-list",
191 "--first-parent",
192 f"{base_ref}..{head_ref}",
194 return (x.strip() for x in _stream_stdout(rev_list))
197 def _rev_parse(git_dir: str, ref: str) -> str:
198 return subprocess.check_output(
199 ["git", "-C", git_dir, "rev-parse", ref],
200 encoding="utf-8",
201 ).strip()
204 Revert = NamedTuple(
205 "Revert",
207 ("sha", str),
208 ("reverted_sha", str),
213 def _find_common_parent_commit(git_dir: str, ref_a: str, ref_b: str) -> str:
214 """Finds the closest common parent commit between `ref_a` and `ref_b`."""
215 return subprocess.check_output(
216 ["git", "-C", git_dir, "merge-base", ref_a, ref_b],
217 encoding="utf-8",
218 ).strip()
221 def _load_pr_commit_mappings(
222 git_dir: str, root: str, min_ref: str
223 ) -> Dict[int, List[str]]:
224 git_log = ["git", "log", "--format=%H %s", f"{min_ref}..{root}"]
225 results = collections.defaultdict(list)
226 pr_regex = re.compile(r"\s\(#(\d+)\)$")
227 for line in _stream_stdout(git_log, cwd=git_dir):
228 m = pr_regex.search(line)
229 if not m:
230 continue
232 pr_number = int(m.group(1))
233 sha = line.split(None, 1)[0]
234 # N.B., these are kept in log (read: reverse chronological) order,
235 # which is what's expected by `find_reverts`.
236 results[pr_number].append(sha)
237 return results
240 # N.B., max_pr_lookback's default of 20K commits is arbitrary, but should be
241 # enough for the 99% case of reverts: rarely should someone land a cleanish
242 # revert of a >6 month old change...
243 def find_reverts(
244 git_dir: str, across_ref: str, root: str, max_pr_lookback: int = 20000
245 ) -> List[Revert]:
246 """Finds reverts across `across_ref` in `git_dir`, starting from `root`.
248 These reverts are returned in order of oldest reverts first.
250 Args:
251 git_dir: git directory to find reverts in.
252 across_ref: the ref to find reverts across.
253 root: the 'main' ref to look for reverts on.
254 max_pr_lookback: this function uses heuristics to map PR numbers to
255 SHAs. These heuristics require that commit history from `root` to
256 `some_parent_of_root` is loaded in memory. `max_pr_lookback` is how
257 many commits behind `across_ref` should be loaded in memory.
259 across_sha = _rev_parse(git_dir, across_ref)
260 root_sha = _rev_parse(git_dir, root)
262 common_ancestor = _find_common_parent_commit(git_dir, across_sha, root_sha)
263 if common_ancestor != across_sha:
264 raise ValueError(
265 f"{across_sha} isn't an ancestor of {root_sha} "
266 "(common ancestor: {common_ancestor})"
269 intermediate_commits = set(_shas_between(git_dir, across_sha, root_sha))
270 assert across_sha not in intermediate_commits
272 logging.debug(
273 "%d commits appear between %s and %s",
274 len(intermediate_commits),
275 across_sha,
276 root_sha,
279 all_reverts = []
280 # Lazily load PR <-> commit mappings, since it can be expensive.
281 pr_commit_mappings = None
282 for sha, commit_message in _log_stream(git_dir, root_sha, across_sha):
283 reverts, pr_reverts = _try_parse_reverts_from_commit_message(
284 commit_message,
286 if pr_reverts:
287 if pr_commit_mappings is None:
288 logging.info(
289 "Loading PR <-> commit mappings. This may take a moment..."
291 pr_commit_mappings = _load_pr_commit_mappings(
292 git_dir, root_sha, f"{across_sha}~{max_pr_lookback}"
294 logging.info(
295 "Loaded %d PR <-> commit mappings", len(pr_commit_mappings)
298 for reverted_pr_number in pr_reverts:
299 reverted_shas = pr_commit_mappings.get(reverted_pr_number)
300 if not reverted_shas:
301 logging.warning(
302 "No SHAs for reverted PR %d (commit %s)",
303 reverted_pr_number,
304 sha,
306 continue
307 logging.debug(
308 "Inferred SHAs %s for reverted PR %d (commit %s)",
309 reverted_shas,
310 reverted_pr_number,
311 sha,
313 reverts.extend(reverted_shas)
315 if not reverts:
316 continue
318 resolved_reverts = sorted(set(_resolve_sha(git_dir, x) for x in reverts))
319 for reverted_sha in resolved_reverts:
320 if reverted_sha in intermediate_commits:
321 logging.debug(
322 "Commit %s reverts %s, which happened after %s",
323 sha,
324 reverted_sha,
325 across_sha,
327 continue
329 try:
330 object_type = subprocess.check_output(
331 ["git", "-C", git_dir, "cat-file", "-t", reverted_sha],
332 encoding="utf-8",
333 stderr=subprocess.DEVNULL,
334 ).strip()
335 except subprocess.CalledProcessError:
336 logging.warning(
337 "Failed to resolve reverted object %s (claimed to be reverted "
338 "by sha %s)",
339 reverted_sha,
340 sha,
342 continue
344 if object_type == "commit":
345 all_reverts.append(Revert(sha, reverted_sha))
346 continue
348 logging.error(
349 "%s claims to revert %s -- which isn't a commit -- %s",
350 sha,
351 object_type,
352 reverted_sha,
355 # Since `all_reverts` contains reverts in log order (e.g., newer comes before
356 # older), we need to reverse this to keep with our guarantee of older =
357 # earlier in the result.
358 all_reverts.reverse()
359 return all_reverts
362 def _main() -> None:
363 parser = argparse.ArgumentParser(
364 description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
366 parser.add_argument("base_ref", help="Git ref or sha to check for reverts around.")
367 parser.add_argument("-C", "--git_dir", default=".", help="Git directory to use.")
368 parser.add_argument("root", nargs="+", help="Root(s) to search for commits from.")
369 parser.add_argument("--debug", action="store_true")
370 parser.add_argument(
371 "-u",
372 "--review_url",
373 action="store_true",
374 help="Format SHAs as llvm review URLs",
376 opts = parser.parse_args()
378 logging.basicConfig(
379 format="%(asctime)s: %(levelname)s: %(filename)s:%(lineno)d: %(message)s",
380 level=logging.DEBUG if opts.debug else logging.INFO,
383 # `root`s can have related history, so we want to filter duplicate commits
384 # out. The overwhelmingly common case is also to have one root, and it's way
385 # easier to reason about output that comes in an order that's meaningful to
386 # git.
387 seen_reverts = set()
388 all_reverts = []
389 for root in opts.root:
390 for revert in find_reverts(opts.git_dir, opts.base_ref, root):
391 if revert not in seen_reverts:
392 seen_reverts.add(revert)
393 all_reverts.append(revert)
395 sha_prefix = (
396 "https://github.com/llvm/llvm-project/commit/" if opts.review_url else ""
398 for revert in all_reverts:
399 sha_fmt = f"{sha_prefix}{revert.sha}"
400 reverted_sha_fmt = f"{sha_prefix}{revert.reverted_sha}"
401 print(f"{sha_fmt} claims to revert {reverted_sha_fmt}")
404 if __name__ == "__main__":
405 _main()