[NFC][MLIR][Linalg] Refactor linalg.matmul tablegen ODS and related C++ code. (#116377)
[llvm-project.git] / llvm / utils / mlgo-utils / mlgo / corpus / extract_ir.py
blob5edb429241d0c00d272e18fe0bf0879170e7ff69
1 # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
2 # See https://llvm.org/LICENSE.txt for license information.
3 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4 """Extract IR for training.
6 Extract IR for training, either from a compile_commands.json file produced by
7 cmake, or a linker parameter list file.
9 Only run with
10 'python compiler_opt/tools/extract_ir.py ...'
12 The compilation is assumed to have been performed with clang, using
13 -fembed-bitcode=all passed to cc1 (i.e. pass clang -Xclang=-fembed-bitcode=all)
15 In a distributed ThinLTO case, the compilation is assumed to have been performed
16 specifying -mllvm -lto-embed-bitcode=post-merge-pre-opt.
In a local ThinLTO case, the compilation is assumed to have been performed
specifying -Wl,--save-temps=import -Wl,--thinlto-emit-index-files
21 To change the logging verbosity, set the --verbosity flag to the desired level.
22 Setting it to a specific level will enable all messages at that level and
23 higher. Exact values can be found by invoking the script with --help.
24 """
26 import argparse
27 import json
28 import logging
30 from mlgo.corpus import extract_ir_lib
def parse_args_and_run(argv=None):
    """Parse the command line flags and pass the result to `main`.

    Args:
      argv: Optional list of argument strings to parse. When left as None,
        argparse reads the process command line (`sys.argv[1:]`), which is
        the behaviour of calling this function with no arguments.
    """
    parser = argparse.ArgumentParser(
        description="A tool for making a corpus from build artifacts"
    )
    parser.add_argument(
        "--input",
        type=str,
        help="Input file or directory - either compile_commands.json, a linker "
        "parameter list, or a path to a directory containing object files.",
    )
    parser.add_argument(
        "--input_type",
        type=str,
        help="Input file type - JSON, LLD params, directory, or bazel aquery.",
        choices=["json", "params", "directory", "bazel_aquery"],
        default="json",
        nargs="?",
    )
    parser.add_argument("--output_dir", type=str, help="Output directory")
    parser.add_argument(
        "--num_workers",
        type=int,
        help="Number of parallel workers for objcopy. `None` for maximum "
        "available.",
        default=None,
        nargs="?",
    )
    parser.add_argument(
        "--llvm_objcopy_path",
        type=str,
        help="Path to llvm-objcopy",
        default="llvm-objcopy",
        nargs="?",
    )
    parser.add_argument(
        "--obj_base_dir",
        type=str,
        help="Base directory for object files. Defaults to current working dir.",
        default="",
        nargs="?",
    )
    # The help text names `local` because that is the only ThinLTO mode
    # accepted by --thinlto_build besides `distributed`; there is no `lld`
    # choice.
    parser.add_argument(
        "--cmd_filter",
        type=str,
        help="Include only those modules with a command line matching this regular "
        "expression. Set it to None to not perform any filtering. Note that the "
        "regular expression is applied independently for each separate command line "
        "option. For example, ^-Oz$ will match Oz built binaries. This does not work "
        "with thinlto_build=local.",
        default=None,
        nargs="?",
    )
    parser.add_argument(
        "--thinlto_build",
        type=str,
        help="Set if the build was performed with either 'distributed' or 'local' "
        "ThinLTO. This ensures the thinlto.bc files are also copied. The build is "
        "assumed to have had -mllvm -lto-embed-bitcode=post-merge-pre-opt passed in "
        "the distributed case or -Wl,--save-temps=import and "
        "-Wl,--thinlto-emit-index-files passed in the local case",
        choices=["distributed", "local"],
        default=None,
        nargs="?",
    )
    parser.add_argument(
        "--cmd_section_name",
        type=str,
        help="The section name passed to llvm-objcopy. For ELF object files, the "
        "default .llvmcmd is correct. For Mach-O object files, one should use "
        "something like __LLVM,__cmdline",
        default=".llvmcmd",
        nargs="?",
    )
    parser.add_argument(
        "--bitcode_section_name",
        type=str,
        help="The section name passed to llvm-objcopy. For ELF object files, the "
        "default .llvmbc is correct. For Mach-O object files, one should use "
        "__LLVM,__bitcode",
        default=".llvmbc",
        nargs="?",
    )
    # TODO(#107898): Refactor this into a common location.
    parser.add_argument(
        "--verbosity",
        type=str,
        help="The verbosity level to use for logging",
        default="INFO",
        nargs="?",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
    )
    args = parser.parse_args(argv)
    main(args)
def main(args):
    """Collect the objects named by `args`, run extraction and write a manifest.

    Args:
      args: Parsed command line namespace. This function reads `verbosity`,
        `input`, `input_type`, `thinlto_build`, `obj_base_dir`, `output_dir`,
        `num_workers`, `llvm_objcopy_path`, `cmd_filter`, `cmd_section_name`
        and `bitcode_section_name` from it.

    Raises:
      ValueError: If `--input` is combined with `--thinlto_build=local`, or
        if neither `--input` nor `--thinlto_build=local` is given.
    """
    logging.basicConfig(level=args.verbosity)

    # Reject inconsistent flag combinations before touching any file.
    is_local_thinlto = args.thinlto_build == "local"
    if args.input is not None and is_local_thinlto:
        raise ValueError("--thinlto_build=local cannot be run with --input")
    if args.input is None and not is_local_thinlto:
        raise ValueError("--input or --thinlto_build=local must be provided")

    objs = []
    if args.input is None:
        # Only reachable for a local ThinLTO build (checked above).
        objs = extract_ir_lib.load_for_lld_thinlto(
            args.obj_base_dir, args.output_dir
        )
    elif args.input_type == "json":
        with open(args.input, encoding="utf-8") as f:
            objs = extract_ir_lib.load_from_compile_commands(
                json.load(f), args.output_dir
            )
    elif args.input_type == "params":
        if not args.obj_base_dir:
            logging.info(
                "--obj_base_dir is unspecified, assuming current directory. "
                "If no objects are found, use this option to specify the root "
                "directory for the object file paths in the input file."
            )
        with open(args.input, encoding="utf-8") as f:
            # One parameter per line, with surrounding whitespace removed.
            params = [line.strip() for line in f]
        objs = extract_ir_lib.load_from_lld_params(
            params, args.obj_base_dir, args.output_dir
        )
    elif args.input_type == "directory":
        logging.warning(
            "Using the directory input is only recommended if the build system "
            "your project uses does not support any structured output that "
            "ml-compiler-opt understands. If your build system provides a "
            "structured compilation database, use that instead"
        )
        objs = extract_ir_lib.load_from_directory(args.input, args.output_dir)
    elif args.input_type == "bazel_aquery":
        with open(args.input, encoding="utf-8") as aquery_json_handle:
            objs = extract_ir_lib.load_bazel_aquery(
                json.load(aquery_json_handle), args.obj_base_dir, args.output_dir
            )
    else:
        # Execution deliberately continues with the empty `objs` list.
        logging.error("Unknown input type: %s", args.input_type)

    relative_output_paths = extract_ir_lib.run_extraction(
        objs,
        args.num_workers,
        args.llvm_objcopy_path,
        args.cmd_filter,
        args.thinlto_build,
        args.cmd_section_name,
        args.bitcode_section_name,
    )

    extract_ir_lib.write_corpus_manifest(
        args.thinlto_build, relative_output_paths, args.output_dir
    )

    # `None` entries in the result are reported as files that were not
    # converted.
    logging.info(
        "Converted %d files out of %d",
        len(objs) - relative_output_paths.count(None),
        len(objs),
    )
# Script entry point: parse the command line only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    parse_args_and_run()