1 # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
2 # See https://llvm.org/LICENSE.txt for license information.
3 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4 """Extract IR for training.
6 Extract IR for training, either from a compile_commands.json file produced by
7 cmake, or a linker parameter list file.
10 'python compiler_opt/tools/extract_ir.py ...'
12 The compilation is assumed to have been performed with clang, using
13 -fembed-bitcode=all passed to cc1 (i.e. pass clang -Xclang=-fembed-bitcode=all)
15 In a distributed ThinLTO case, the compilation is assumed to have been performed
16 specifying -mllvm -lto-embed-bitcode=post-merge-pre-opt.
18 In a local ThinLTO case, the compilation is assumed to have been performed
19 specifying -Wl,--save-temps=import -Wl,--thinlto-emit-index-files
21 To change the logging verbosity, set the --verbosity flag to the desired level.
22 Setting it to a specific level will enable all messages at that level and
23 higher. Exact values can be found by invoking the script with --help.
30 from mlgo
.corpus
import extract_ir_lib
33 def parse_args_and_run():
# Command-line entry point, as far as this view shows: builds an argparse
# parser, parses the arguments, configures logging, loads the list of objects
# with the extract_ir_lib loader matching args.input_type, runs the extraction
# and writes the corpus manifest.
# NOTE(review): several statements here appear only as fragments (for example,
# help strings without their add_argument call); the comments below describe
# only what is visible - confirm against the complete file.
34 parser
= argparse
.ArgumentParser(
35 description
="A tool for making a corpus from build artifacts"
40 help="Input file or directory - either compile_commands.json, a linker "
41 "parameter list, or a path to a directory containing object files.",
46 help="Input file type - JSON, LLD params, directory, or bazel aquery.",
47 choices
=["json", "params", "directory", "bazel_aquery"],
51 parser
.add_argument("--output_dir", type=str, help="Output directory")
55 help="Number of parallel works for objcopy. `None` for maximum available.",
60 "--llvm_objcopy_path",
62 help="Path to llvm-objcopy",
63 default
="llvm-objcopy",
69 help="Base directory for object files. Defaults to current working dir.",
# NOTE(review): this help text mentions thinlto_build=lld, but the only
# thinlto_build choices visible below are "distributed" and "local" - confirm
# which is intended.
76 help="Include only those modules with a command line matching this regular "
77 "expression. Set it to None to not perform any filtering. Note that the "
78 "regular expression is applied independently for each separate command line "
79 "option. For example, ^-Oz$ will match Oz built binaries. This does not work "
80 "with thinlto_build=lld.",
87 help="Set if the build was performed with either 'distributed' or 'local' "
88 "ThinLTO. This ensures the thinlto.bc files are also copied. The build is "
89 "assumed to have had -mllvm -lto-embed-bitcode=post-merge-pre-opt passed in "
90 "the distributed case or -Wl,--save-temps=import and "
91 "-Wl,--thinlto-emit-index-files passed in the local case",
92 choices
=["distributed", "local"],
99 help="The section name passed to llvm-objcopy. For ELF object files, the "
100 "default .llvmcmd is correct. For Mach-O object files, one should use "
101 "something like __LLVM,__cmdline",
106 "--bitcode_section_name",
108 help="The section name passed to llvm-objcopy. For ELF object files, the "
109 "default .llvmbc is correct. For Mach-O object files, one should use "
114 # TODO(#107898): Refactor this into a common location.
118 help="The verbosity level to use for logging",
121 choices
=["DEBUG", "INFO", "WARNING", "ERROR"],
# Parse the command line.
123 args
= parser
.parse_args()
# Use the requested verbosity as the logging level.
128 logging
.basicConfig(level
=args
.verbosity
)
# An input path and local ThinLTO are mutually exclusive; with no input path,
# local ThinLTO is the only accepted mode (see the two ValueError messages).
131 if args
.input is not None and args
.thinlto_build
== "local":
132 raise ValueError("--thinlto_build=local cannot be run with --input")
133 if args
.input is None:
134 if args
.thinlto_build
!= "local":
135 raise ValueError("--input or --thinlto_build=local must be provided")
# Local ThinLTO: the loader receives the object base dir and the output dir.
136 objs
= extract_ir_lib
.load_for_lld_thinlto(args
.obj_base_dir
, args
.output_dir
)
# JSON input: parse the file and hand the result to load_from_compile_commands.
137 elif args
.input_type
== "json":
138 with
open(args
.input, encoding
="utf-8") as f
:
139 objs
= extract_ir_lib
.load_from_compile_commands(
140 json
.load(f
), args
.output_dir
# Params input: each line of the file is stripped of surrounding whitespace
# before being passed to load_from_lld_params.
142 elif args
.input_type
== "params":
143 if not args
.obj_base_dir
:
145 "-obj_base_dir is unspecified, assuming current directory. "
146 "If no objects are found, use this option to specify the root "
147 "directory for the object file paths in the input file."
149 with
open(args
.input, encoding
="utf-8") as f
:
150 objs
= extract_ir_lib
.load_from_lld_params(
151 [l
.strip() for l
in f
.readlines()], args
.obj_base_dir
, args
.output_dir
# Directory input: the input path itself is passed to load_from_directory.
153 elif args
.input_type
== "directory":
155 "Using the directory input is only recommended if the build system "
156 "your project uses does not support any structured output that "
157 "ml-compiler-opt understands. If your build system provides a "
158 "structured compilation database, use that instead"
160 objs
= extract_ir_lib
.load_from_directory(args
.input, args
.output_dir
)
# Bazel aquery input: parse the JSON and hand it to load_bazel_aquery.
161 elif args
.input_type
== "bazel_aquery":
162 with
open(args
.input, encoding
="utf-8") as aquery_json_handle
:
163 objs
= extract_ir_lib
.load_bazel_aquery(
164 json
.load(aquery_json_handle
), args
.obj_base_dir
, args
.output_dir
# Logs an unrecognised input type (the branch guarding this call is not
# visible in this view).
167 logging
.error("Unknown input type: %s", args
.input_type
)
# Run the extraction; the result is kept as relative_output_paths.
169 relative_output_paths
= extract_ir_lib
.run_extraction(
172 args
.llvm_objcopy_path
,
175 args
.cmd_section_name
,
176 args
.bitcode_section_name
,
# Pass the ThinLTO mode, the extraction results and the output dir to
# write_corpus_manifest.
179 extract_ir_lib
.write_corpus_manifest(
180 args
.thinlto_build
, relative_output_paths
, args
.output_dir
# Summary message: the converted count is len(objs) minus the number of None
# entries in relative_output_paths.
184 "Converted %d files out of %d",
185 len(objs
) - relative_output_paths
.count(None),
190 if __name__
== "__main__":