2 # -*- coding: utf-8 -*-
3 # ===----------------------------------------------------------------------===##
5 # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
6 # See https://llvm.org/LICENSE.txt for license information.
7 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
9 # ===----------------------------------------------------------------------===##
10 """Checks for reverts of commits across a given git commit.
12 To clarify the meaning of 'across' with an example, if we had the following
13 commit history (where `a -> b` notes that `b` is a direct child of `a`):
15 123abc -> 223abc -> 323abc -> 423abc -> 523abc
17 And where 423abc is a revert of 223abc, this revert is considered to be 'across'
18 323abc. More generally, a revert A of a parent commit B is considered to be
19 'across' a commit C if C is a parent of A and B is a parent of C.
21 Please note that revert detection in general is really difficult, since merge
22 conflicts/etc always introduce _some_ amount of fuzziness. This script just
23 uses a bundle of heuristics, and is bound to ignore / incorrectly flag some
24 reverts. The hope is that it'll easily catch the vast majority (>90%) of them,
27 This is designed to be used in one of two ways: an import in Python, or run
28 directly from a shell. If you want to import this, the `find_reverts`
29 function is the thing to look at. If you'd rather use this from a shell, have a
33 ./revert_checker.py c47f97169 origin/main origin/release/12.x
36 This checks for all reverts from the tip of origin/main to c47f97169, which are
37 across the latter. It then does the same for origin/release/12.x to c47f97169.
38 Duplicate reverts discovered when walking both roots (origin/main and
39 origin/release/12.x) are deduplicated in output.
48 from typing
import Dict
, Generator
, Iterable
, List
, NamedTuple
, Optional
, Tuple
50 assert sys
.version_info
>= (3, 6), "Only Python 3.6+ is supported."
52 # People are creative with their reverts, and heuristics are a bit difficult.
53 # At a glance, most reverts have "This reverts commit ${full_sha}". Many others
54 # have `Reverts llvm/llvm-project#${PR_NUMBER}`.
56 # By their powers combined, we should be able to automatically catch something
57 # like 80% of reverts with reasonable confidence. At some point, human
58 # intervention will always be required (e.g., I saw
60 # This reverts commit ${commit_sha_1} and
61 # also ${commit_sha_2_shorthand}
65 _CommitMessageReverts
= NamedTuple(
66 "_CommitMessageReverts",
68 ("potential_shas", List
[str]),
69 ("potential_pr_numbers", List
[int]),
74 def _try_parse_reverts_from_commit_message(
76 ) -> _CommitMessageReverts
:
77 """Tries to parse revert SHAs and LLVM PR numbers form the commit message.
80 A namedtuple containing:
81 - A list of potentially reverted SHAs
82 - A list of potentially reverted LLVM PR numbers
84 if not commit_message
:
85 return _CommitMessageReverts([], [])
87 sha_reverts
= re
.findall(
88 r
"This reverts commit ([a-f0-9]{40})\b",
92 first_line
= commit_message
.splitlines()[0]
93 initial_revert
= re
.match(r
'Revert ([a-f0-9]{6,}) "', first_line
)
95 sha_reverts
.append(initial_revert
.group(1))
100 r
"Reverts llvm/llvm-project#(\d+)",
105 return _CommitMessageReverts(
106 potential_shas
=sha_reverts
,
107 potential_pr_numbers
=pr_numbers
,
112 command
: List
[str], cwd
: Optional
[str] = None
113 ) -> Generator
[str, None, None]:
114 with subprocess
.Popen(
117 stdout
=subprocess
.PIPE
,
121 assert p
.stdout
is not None # for mypy's happiness.
125 def _resolve_sha(git_dir
: str, sha
: str) -> str:
129 return subprocess
.check_output(
130 ["git", "-C", git_dir
, "rev-parse", sha
],
132 stderr
=subprocess
.DEVNULL
,
136 _LogEntry
= NamedTuple(
140 ("commit_message", str),
145 def _log_stream(git_dir
: str, root_sha
: str, end_at_sha
: str) -> Iterable
[_LogEntry
]:
154 "--format=" + sep
+ "%n%H%n%B%n",
157 stdout_stream
= iter(_stream_stdout(log_command
))
159 # Find the next separator line. If there's nothing to log, it may not exist.
160 # It might not be the first line if git feels complainy.
161 found_commit_header
= False
162 for line
in stdout_stream
:
163 if line
.rstrip() == sep
:
164 found_commit_header
= True
167 while found_commit_header
:
168 sha
= next(stdout_stream
, None)
169 assert sha
is not None, "git died?"
174 found_commit_header
= False
175 for line
in stdout_stream
:
177 if line
.rstrip() == sep
:
178 found_commit_header
= True
180 commit_message
.append(line
)
182 yield _LogEntry(sha
, "\n".join(commit_message
).rstrip())
185 def _shas_between(git_dir
: str, base_ref
: str, head_ref
: str) -> Iterable
[str]:
192 f
"{base_ref}..{head_ref}",
194 return (x
.strip() for x
in _stream_stdout(rev_list
))
197 def _rev_parse(git_dir
: str, ref
: str) -> str:
198 return subprocess
.check_output(
199 ["git", "-C", git_dir
, "rev-parse", ref
],
208 ("reverted_sha", str),
213 def _find_common_parent_commit(git_dir
: str, ref_a
: str, ref_b
: str) -> str:
214 """Finds the closest common parent commit between `ref_a` and `ref_b`."""
215 return subprocess
.check_output(
216 ["git", "-C", git_dir
, "merge-base", ref_a
, ref_b
],
221 def _load_pr_commit_mappings(
222 git_dir
: str, root
: str, min_ref
: str
223 ) -> Dict
[int, List
[str]]:
224 git_log
= ["git", "log", "--format=%H %s", f
"{min_ref}..{root}"]
225 results
= collections
.defaultdict(list)
226 pr_regex
= re
.compile(r
"\s\(#(\d+)\)$")
227 for line
in _stream_stdout(git_log
, cwd
=git_dir
):
228 m
= pr_regex
.search(line
)
232 pr_number
= int(m
.group(1))
233 sha
= line
.split(None, 1)[0]
234 # N.B., these are kept in log (read: reverse chronological) order,
235 # which is what's expected by `find_reverts`.
236 results
[pr_number
].append(sha
)
240 # N.B., max_pr_lookback's default of 20K commits is arbitrary, but should be
241 # enough for the 99% case of reverts: rarely should someone land a cleanish
242 # revert of a >6 month old change...
244 git_dir
: str, across_ref
: str, root
: str, max_pr_lookback
: int = 20000
246 """Finds reverts across `across_ref` in `git_dir`, starting from `root`.
248 These reverts are returned in order of oldest reverts first.
251 git_dir: git directory to find reverts in.
252 across_ref: the ref to find reverts across.
253 root: the 'main' ref to look for reverts on.
254 max_pr_lookback: this function uses heuristics to map PR numbers to
255 SHAs. These heuristics require that commit history from `root` to
256 `some_parent_of_root` is loaded in memory. `max_pr_lookback` is how
257 many commits behind `across_ref` should be loaded in memory.
259 across_sha
= _rev_parse(git_dir
, across_ref
)
260 root_sha
= _rev_parse(git_dir
, root
)
262 common_ancestor
= _find_common_parent_commit(git_dir
, across_sha
, root_sha
)
263 if common_ancestor
!= across_sha
:
265 f
"{across_sha} isn't an ancestor of {root_sha} "
266 "(common ancestor: {common_ancestor})"
269 intermediate_commits
= set(_shas_between(git_dir
, across_sha
, root_sha
))
270 assert across_sha
not in intermediate_commits
273 "%d commits appear between %s and %s",
274 len(intermediate_commits
),
280 # Lazily load PR <-> commit mappings, since it can be expensive.
281 pr_commit_mappings
= None
282 for sha
, commit_message
in _log_stream(git_dir
, root_sha
, across_sha
):
283 reverts
, pr_reverts
= _try_parse_reverts_from_commit_message(
287 if pr_commit_mappings
is None:
289 "Loading PR <-> commit mappings. This may take a moment..."
291 pr_commit_mappings
= _load_pr_commit_mappings(
292 git_dir
, root_sha
, f
"{across_sha}~{max_pr_lookback}"
295 "Loaded %d PR <-> commit mappings", len(pr_commit_mappings
)
298 for reverted_pr_number
in pr_reverts
:
299 reverted_shas
= pr_commit_mappings
.get(reverted_pr_number
)
300 if not reverted_shas
:
302 "No SHAs for reverted PR %d (commit %s)",
308 "Inferred SHAs %s for reverted PR %d (commit %s)",
313 reverts
.extend(reverted_shas
)
318 resolved_reverts
= sorted(set(_resolve_sha(git_dir
, x
) for x
in reverts
))
319 for reverted_sha
in resolved_reverts
:
320 if reverted_sha
in intermediate_commits
:
322 "Commit %s reverts %s, which happened after %s",
330 object_type
= subprocess
.check_output(
331 ["git", "-C", git_dir
, "cat-file", "-t", reverted_sha
],
333 stderr
=subprocess
.DEVNULL
,
335 except subprocess
.CalledProcessError
:
337 "Failed to resolve reverted object %s (claimed to be reverted "
344 if object_type
== "commit":
345 all_reverts
.append(Revert(sha
, reverted_sha
))
349 "%s claims to revert %s -- which isn't a commit -- %s",
355 # Since `all_reverts` contains reverts in log order (e.g., newer comes before
356 # older), we need to reverse this to keep with our guarantee of older =
357 # earlier in the result.
358 all_reverts
.reverse()
363 parser
= argparse
.ArgumentParser(
364 description
=__doc__
, formatter_class
=argparse
.RawDescriptionHelpFormatter
366 parser
.add_argument("base_ref", help="Git ref or sha to check for reverts around.")
367 parser
.add_argument("-C", "--git_dir", default
=".", help="Git directory to use.")
368 parser
.add_argument("root", nargs
="+", help="Root(s) to search for commits from.")
369 parser
.add_argument("--debug", action
="store_true")
374 help="Format SHAs as llvm review URLs",
376 opts
= parser
.parse_args()
379 format
="%(asctime)s: %(levelname)s: %(filename)s:%(lineno)d: %(message)s",
380 level
=logging
.DEBUG
if opts
.debug
else logging
.INFO
,
383 # `root`s can have related history, so we want to filter duplicate commits
384 # out. The overwhelmingly common case is also to have one root, and it's way
385 # easier to reason about output that comes in an order that's meaningful to
389 for root
in opts
.root
:
390 for revert
in find_reverts(opts
.git_dir
, opts
.base_ref
, root
):
391 if revert
not in seen_reverts
:
392 seen_reverts
.add(revert
)
393 all_reverts
.append(revert
)
396 "https://github.com/llvm/llvm-project/commit/" if opts
.review_url
else ""
398 for revert
in all_reverts
:
399 sha_fmt
= f
"{sha_prefix}{revert.sha}"
400 reverted_sha_fmt
= f
"{sha_prefix}{revert.reverted_sha}"
401 print(f
"{sha_fmt} claims to revert {reverted_sha_fmt}")
404 if __name__
== "__main__":