[NFC][MLIR][Linalg] Refactor linalg.matmul tablegen ODS and related C++ code. (#116377)
[llvm-project.git] / llvm / utils / mlgo-utils / mlgo / corpus / extract_ir_lib.py
blobf434e59524bbf674eaeca3ab8e30e918c119601f
1 # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
2 # See https://llvm.org/LICENSE.txt for license information.
3 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4 """Library functions for IR extraction."""
6 import os
7 import pathlib
8 import re
9 import shutil
10 import subprocess
11 import multiprocessing
12 import functools
13 import json
14 import logging
16 from typing import Dict, List, Optional
# Placeholder value stored under "global_command_override" in the corpus
# manifest when the corpus comes from a local ThinLTO build (see
# write_corpus_manifest).
_UNSPECIFIED_OVERRIDE = ["<UNSPECIFIED>"]
# TODO(ml-compiler-opt): maybe we can also convert the cmdline file here, from
# a \0-separated list of strings to a \n-separated one.
def should_include_module(cmdline: str, match_regexp: Optional[str]) -> bool:
    """Decide whether a module passes the command line filter.

    Args:
      cmdline: the module's compile command line, as a "\\0"-separated string
        of arguments.
      match_regexp: regular expression searched for in each argument, or None
        to accept every module.

    Returns:
      True if no filter is given or at least one argument contains a match for
      match_regexp, False otherwise.
    """
    if match_regexp is None:
        return True
    for argument in cmdline.split("\0"):
        if re.search(match_regexp, argument) is not None:
            return True
    return False
def get_thinlto_index(cmdline: str, basedir: str) -> Optional[str]:
    """Find the ThinLTO index file named on a compile command line.

    Args:
      cmdline: the compile command line, as a "\\0"-separated string of
        arguments.
      basedir: directory the index path given on the command line is joined to.

    Returns:
      basedir joined with the value of the first "-fthinlto-index=<path>"
      argument, or None if the command line has no such argument.
    """
    prefix = "-fthinlto-index="
    for option in cmdline.split("\0"):
        if option.startswith(prefix):
            # Take everything after the first '=' so that index paths which
            # themselves contain '=' are not truncated.
            return os.path.join(basedir, option[len(prefix):])
    return None
class TrainingIRExtractor:
    """IR and command line extraction from an object file.

    An instance pairs one input object file (obj_base_dir + relative path) with
    the location under output_base_dir where its extracted artifacts go: a
    ".cmd" file, a ".bc" file and, for ThinLTO builds, a ".thinlto.bc" index.
    """

    def __init__(self, obj_relative_path, output_base_dir, obj_base_dir=None):
        """Set up a TrainingIRExtractor.

        Args:
          obj_relative_path: relative path to the input object file. It will be
            also used to construct the absolute path of the output IR and cmd
            files, by appending it to output_base_dir.
          output_base_dir: the directory under which the output will be
            produced.
          obj_base_dir: the base directory for all the input object files. None
            is stored as the empty string.
        """
        self._obj_relative_path = obj_relative_path
        self._output_base_dir = output_base_dir
        self._obj_base_dir = obj_base_dir if obj_base_dir is not None else ""

    def obj_base_dir(self):
        """Return the base directory of the input object files."""
        return self._obj_base_dir

    def output_base_dir(self):
        """Return the directory under which outputs are produced."""
        return self._output_base_dir

    def relative_output_path(self):
        """Return the output path relative to the output base directory."""
        return self._obj_relative_path

    def input_obj(self):
        """Return the path of the input object file."""
        return os.path.join(self.obj_base_dir(), self._obj_relative_path)

    def lld_src_bc(self):
        """Return the path of the post-import bitcode file saved by lld."""
        # .3.import.bc is the suffix attached to post-merge-pre-opt
        # ('postimport') IR bitcode saved by lld. It is hardcoded into lld.
        return os.path.join(
            self._obj_base_dir, self._obj_relative_path + ".3.import.bc"
        )

    def lld_src_thinlto(self):
        """Return the path of the ThinLTO index file saved by lld."""
        return os.path.join(
            self._obj_base_dir, self._obj_relative_path + ".thinlto.bc"
        )

    def dest_dir(self):
        """Return the directory the extracted artifacts are written to."""
        return os.path.join(
            self.output_base_dir(), os.path.dirname(self._obj_relative_path)
        )

    def module_name(self):
        """Return the base name of the object file, without its directory."""
        return os.path.basename(self._obj_relative_path)

    def cmd_file(self):
        """Return the path of the extracted command line (.cmd) file."""
        return os.path.join(self.dest_dir(), self.module_name() + ".cmd")

    def bc_file(self):
        """Return the path of the extracted bitcode (.bc) file."""
        return os.path.join(self.dest_dir(), self.module_name() + ".bc")

    def thinlto_index_file(self):
        """Return the path of the output ThinLTO index (.thinlto.bc) file."""
        return os.path.join(self.dest_dir(), self.module_name() + ".thinlto.bc")

    def _get_extraction_cmd_command(
        self, llvm_objcopy_path: str, cmd_section_name: str
    ) -> List[str]:
        """Get the llvm-objcopy argument list that, when invoked, will extract
        the cmd section into the self.cmd_file() file.
        """
        return [
            llvm_objcopy_path,
            "--dump-section=" + cmd_section_name + "=" + self.cmd_file(),
            self.input_obj(),
            # objcopy's own output is discarded; only the dumped section is
            # wanted. os.devnull is "/dev/null" on POSIX.
            os.devnull,
        ]

    def _get_extraction_bc_command(
        self, llvm_objcopy_path: str, bitcode_section_name: str
    ) -> List[str]:
        """Get the llvm-objcopy argument list that, when invoked, will extract
        the bitcode section into the self.bc_file() file.
        """
        return [
            llvm_objcopy_path,
            "--dump-section=" + bitcode_section_name + "=" + self.bc_file(),
            self.input_obj(),
            # See _get_extraction_cmd_command: objcopy's output is discarded.
            os.devnull,
        ]

    def _extract_clang_artifacts(
        self,
        llvm_objcopy_path: str,
        cmd_filter: Optional[str],
        is_thinlto: bool,
        cmd_section_name: str,
        bitcode_section_name: str,
    ) -> Optional[str]:
        """Run llvm-objcopy to extract the .bc and command line.

        Args:
          llvm_objcopy_path: path of the llvm-objcopy binary to run.
          cmd_filter: regular expression a module's command line must match to
            be kept, or None to keep every module.
          is_thinlto: whether to also copy the ThinLTO index file named on the
            module's command line.
          cmd_section_name: name of the section holding the command line.
          bitcode_section_name: name of the section holding the bitcode.

        Returns:
          The relative output path on success. None if the input object does
          not exist, llvm-objcopy fails, the module is rejected by cmd_filter,
          or a ThinLTO index is required but not named on the command line.
        """
        if not os.path.exists(self.input_obj()):
            logging.info("%s does not exist.", self.input_obj())
            return None
        os.makedirs(self.dest_dir(), exist_ok=True)
        try:
            subprocess.check_output(
                self._get_extraction_cmd_command(
                    llvm_objcopy_path, cmd_section_name
                ),
                stderr=subprocess.STDOUT,
                encoding="utf-8",
            )
            # The command line only needs to be read back when it is used for
            # filtering or for locating the ThinLTO index.
            if cmd_filter is not None or is_thinlto:
                with open(self.cmd_file(), encoding="utf-8") as f:
                    lines = f.readlines()
                assert len(lines) == 1
                cmdline = lines[0]
                if not should_include_module(cmdline, cmd_filter):
                    logging.info(
                        "Excluding module %s because it does not match the filter",
                        self.input_obj(),
                    )
                    os.remove(self.cmd_file())
                    return None
                if is_thinlto:
                    index_file = get_thinlto_index(cmdline, self.obj_base_dir())
                    if index_file is None:
                        # Without this check shutil.copy(None, ...) raises a
                        # TypeError that would abort the whole extraction.
                        logging.warning(
                            "%s was not processed: no ThinLTO index on its"
                            " command line",
                            self.input_obj(),
                        )
                        # As for filtered-out modules, do not leave a .cmd
                        # file behind for a module that is being skipped.
                        os.remove(self.cmd_file())
                        return None
                    shutil.copy(index_file, self.thinlto_index_file())

            subprocess.check_output(
                self._get_extraction_bc_command(
                    llvm_objcopy_path, bitcode_section_name
                ),
                stderr=subprocess.STDOUT,
                encoding="utf-8",
            )
        except subprocess.CalledProcessError as e:
            # This may happen if .o file was build from asm (.S source).
            logging.warning("%s was not processed: %s", self.input_obj(), e)
            logging.info(e.output)
            return None
        assert (
            os.path.exists(self.cmd_file())
            and os.path.exists(self.bc_file())
            and (not is_thinlto or os.path.exists(self.thinlto_index_file()))
        )
        return self.relative_output_path()

    def _extract_lld_artifacts(self) -> Optional[str]:
        """Extract the .bc file with ThinLTO index from an lld ThinLTO invocation.

        Returns:
          The relative object path once both files have been copied, or None if
          either the post-import bitcode or the ThinLTO index is missing.
        """
        if not os.path.exists(self.lld_src_bc()):
            logging.info("%s does not exist.", self.lld_src_bc())
            return None
        if not os.path.exists(self.lld_src_thinlto()):
            logging.info("%s does not exist.", self.lld_src_thinlto())
            return None
        os.makedirs(self.dest_dir(), exist_ok=True)

        # Copy over the files
        shutil.copy(self.lld_src_bc(), self.bc_file())
        shutil.copy(self.lld_src_thinlto(), self.thinlto_index_file())

        assert os.path.exists(self.bc_file())
        assert os.path.exists(self.thinlto_index_file())
        return self._obj_relative_path

    def extract(
        self,
        llvm_objcopy_path: Optional[str] = None,
        cmd_filter: Optional[str] = None,
        thinlto_build: Optional[str] = None,
        cmd_section_name: Optional[str] = ".llvmcmd",
        bitcode_section_name: Optional[str] = ".llvmbc",
    ) -> Optional[str]:
        """Extract this object's artifacts into the output directory.

        Args:
          llvm_objcopy_path: path of the llvm-objcopy binary; not used when
            thinlto_build is "local".
          cmd_filter: regular expression the command line must match, or None.
          thinlto_build: "local" copies the files saved by lld; "distributed"
            extracts with llvm-objcopy and also copies the ThinLTO index; any
            other value extracts with llvm-objcopy only.
          cmd_section_name: name of the section holding the command line.
          bitcode_section_name: name of the section holding the bitcode.

        Returns:
          The relative output path on success, None if nothing was extracted.
        """
        if thinlto_build == "local":
            return self._extract_lld_artifacts()
        return self._extract_clang_artifacts(
            llvm_objcopy_path=llvm_objcopy_path,
            cmd_filter=cmd_filter,
            is_thinlto=thinlto_build == "distributed",
            cmd_section_name=cmd_section_name,
            bitcode_section_name=bitcode_section_name,
        )
def convert_compile_command_to_objectfile(
    command: Dict[str, str], output_dir: str
) -> Optional[TrainingIRExtractor]:
    """Build an extractor for the object file produced by one compile command.

    Args:
      command: one element of a compile_commands.json array. "directory" is
        required; the command line is taken from "arguments" if present,
        otherwise from "command" split on whitespace.
      output_dir: the directory under which extracted files will be placed.

    Returns:
      A TrainingIRExtractor for the argument following "-o", or None if the
      element has no command line or no usable "-o <output>" pair.
    """
    obj_base_dir = command["directory"]
    if "arguments" in command:
        cmd_parts = command["arguments"]
    elif "command" in command:
        cmd_parts = command["command"].split()
    else:
        logging.info("compile_commands element has no command and arguments")
        return None

    try:
        obj_index = cmd_parts.index("-o") + 1
    except ValueError:
        # This could happen if there are non-clang commands in
        # compile_commands.json
        logging.info("Command has no -o option: %s", " ".join(cmd_parts))
        return None
    if obj_index >= len(cmd_parts):
        # A trailing "-o" names no output; skip it like any other unusable
        # command rather than failing with an IndexError.
        logging.info(
            "Command has -o without an output path: %s", " ".join(cmd_parts)
        )
        return None
    obj_rel_path = cmd_parts[obj_index]
    # TODO(mtrofin): is the obj_base_dir correct for thinlto index bc files?
    return TrainingIRExtractor(
        obj_relative_path=obj_rel_path,
        output_base_dir=output_dir,
        obj_base_dir=obj_base_dir,
    )
def load_from_compile_commands(
    json_array: List[Dict[str, str]], output_dir: str
) -> List[TrainingIRExtractor]:
    """Create extractors for every usable entry of a compile_commands array.

    Args:
      json_array: the parsed contents of a compile_commands.json file.
      output_dir: the directory under which extracted files will be placed.

    Returns:
      One TrainingIRExtractor per entry that could be converted, in input
      order.
    """
    extractors = []
    for compile_command in json_array:
        extractor = convert_compile_command_to_objectfile(
            compile_command, output_dir
        )
        # Entries that are not clang commands convert to None; leave them out.
        if extractor is not None:
            extractors.append(extractor)
    return extractors
def load_from_lld_params(
    params_array: List[str], obj_base_dir: str, output_dir: str
) -> List[TrainingIRExtractor]:
    """Create a TrainingIRExtractor array based on lld's parameters.

    Args:
      params_array: the arguments of an lld invocation. The list is not
        modified.
      obj_base_dir: the base directory the object paths are relative to.
      output_dir: the directory under which extracted files will be placed.

    Returns:
      One TrainingIRExtractor per object file found in params_array. When no
      "-o" is present, every parameter is treated as an object path.
    """
    try:
        minus_o_idx = params_array.index("-o")
    except ValueError:
        logging.info("This params file does not have an explicit -o option.")
        just_obj_paths = params_array
    else:
        # Drop -o and the output that follows it, working on a copy so the
        # caller's list is left untouched. After that, anything not starting
        # with '-', and ending in a '.o', is an object file.
        remaining = params_array[:minus_o_idx] + params_array[minus_o_idx + 2 :]
        just_obj_paths = [
            o for o in remaining if not o.startswith("-") and o.endswith(".o")
        ]

    def make_obj(obj_file: str) -> TrainingIRExtractor:
        return TrainingIRExtractor(
            obj_relative_path=obj_file,
            output_base_dir=output_dir,
            obj_base_dir=obj_base_dir,
        )

    return [make_obj(obj_file) for obj_file in just_obj_paths]
def load_from_directory(
    obj_base_dir: str, output_dir: str
) -> List[TrainingIRExtractor]:
    """Create an object file array by globbing an entire directory.

    Args:
      obj_base_dir: The base build directory that all object files will be
        written out as being relative to.
      output_dir: The output directory where extracted .bc and .cmd files should
        be placed.

    Returns:
      One TrainingIRExtractor for every "*.o" file found, recursively, under
      obj_base_dir.
    """
    object_paths = [str(found) for found in pathlib.Path(obj_base_dir).glob("**/*.o")]
    return [
        TrainingIRExtractor(
            obj_relative_path=os.path.relpath(object_path, start=obj_base_dir),
            output_base_dir=output_dir,
            obj_base_dir=obj_base_dir,
        )
        for object_path in object_paths
    ]
def load_for_lld_thinlto(
    obj_base_dir: str, output_dir: str
) -> List[TrainingIRExtractor]:
    """Create an object file array from the bitcode files saved by lld.

    Args:
      obj_base_dir: The base build directory that is searched, recursively, for
        post-import bitcode files.
      output_dir: The output directory where extracted files should be placed.

    Returns:
      One TrainingIRExtractor per "*.3.import.bc" file found, with that suffix
      removed from its relative path.
    """
    # .3.import.bc is the suffix attached to post-merge-pre-opt ('postimport')
    # IR bitcode saved by lld. It is hardcoded into lld. ThinLTO index files
    # are also emitted next to the postimport bitcode, with the suffix
    # .thinlto.bc instead
    suffix_length = len(".3.import.bc")
    bitcode_paths = [
        str(found) for found in pathlib.Path(obj_base_dir).glob("**/*.3.import.bc")
    ]

    extractors = []
    for bitcode_path in bitcode_paths:
        relative_path = os.path.relpath(bitcode_path, start=obj_base_dir)
        extractors.append(
            TrainingIRExtractor(
                # Cut away .3.import.bc
                obj_relative_path=relative_path[:-suffix_length],
                output_base_dir=output_dir,
                obj_base_dir=obj_base_dir,
            )
        )
    return extractors
def load_bazel_aquery(aquery_json, obj_base_dir: str, output_dir: str):
    """Creates an object file array by looking at the JSON output of bazel aquery.

    Args:
      aquery_json: The JSON-formatted output of the bazel aquery command for
        the target of interest. The bazel aquery JSON should be a JSON
        serialized version of the analysis.ActionGraphContainer proto.
        https://github.com/bazelbuild/bazel/blob/master/src/main/protobuf/analysis_v2.proto
      obj_base_dir: The base build directory that all object files will be
        written out as relative to.
      output_dir: The output directory where extracted .bc and .cmd files should
        be placed.

    Returns:
      The result of load_from_lld_params for the arguments of the last
      "CppLink" action, or for an empty list if there is no such action.
    """
    link_argument_lists = [
        action["arguments"]
        for action in aquery_json["actions"]
        if action["mnemonic"] == "CppLink"
    ]
    # When several link actions are present, the last one wins.
    linker_params = link_argument_lists[-1] if link_argument_lists else []
    return load_from_lld_params(linker_params, obj_base_dir, output_dir)
def run_extraction(
    objs: List[TrainingIRExtractor],
    num_workers: int,
    llvm_objcopy_path: str,
    cmd_filter: str,
    thinlto_build: str,
    cmd_section_name: str,
    bitcode_section_name: str,
) -> List[Optional[str]]:
    """Extracts all specified object files into the corpus directory.

    Args:
      objs: A list of TrainingIRExtractor Objects that represent the object files
        to extract bitcode/commands from.
      num_workers: The number of parallel processes to spawn to run the
        extraction.
      llvm_objcopy_path: The path to the llvm-objcopy to use for dumping sections.
      cmd_filter: A regular expression that is used to select for compilations
        performed with specific flags. If you want to include all compilations,
        set this to None.
      thinlto_build: Whether or not this is a ThinLTO build, and if so, the type.
        Set this to None if the build was not done with ThinLTO.
      cmd_section_name: The name of the command line section created by the
        bitcode embedding.
      bitcode_section_name: The name of the bitcode section created by the
        bitcode embedding.

    Returns:
      The value returned by TrainingIRExtractor.extract for each element of
      objs, in the same order.
    """
    # The same settings apply to every object, so bind them once.
    extraction_settings = {
        "llvm_objcopy_path": llvm_objcopy_path,
        "cmd_filter": cmd_filter,
        "thinlto_build": thinlto_build,
        "cmd_section_name": cmd_section_name,
        "bitcode_section_name": bitcode_section_name,
    }
    extract_one = functools.partial(
        TrainingIRExtractor.extract, **extraction_settings
    )

    with multiprocessing.Pool(num_workers) as worker_pool:
        relative_output_paths = worker_pool.map(extract_one, objs)
        worker_pool.close()
        worker_pool.join()
    return relative_output_paths
def write_corpus_manifest(
    thinlto_build: str, relative_output_paths: List[str], output_dir: str
):
    """Writes a corpus_description.json containing all necessary information
    about the corpus.

    Args:
      thinlto_build: Whether or not the build was done with ThinLTO and if so,
        what kind of ThinLTO. Set this to None if the build was not performed
        with ThinLTO.
      relative_output_paths: The relative (to the corpus directory) output paths
        of all the bitcode files that should be placed in the corpus manifest.
        None entries are left out.
      output_dir: The corpus directory where the corpus manifest should be
        placed.
    """
    manifest = {}
    # Insert the override first so global_command_override is at the top of the
    # .json after being written.
    if thinlto_build == "local":
        manifest["global_command_override"] = _UNSPECIFIED_OVERRIDE
    manifest["has_thinlto"] = thinlto_build is not None
    manifest["modules"] = [
        module_path
        for module_path in relative_output_paths
        if module_path is not None
    ]

    manifest_path = os.path.join(output_dir, "corpus_description.json")
    with open(manifest_path, "w", encoding="utf-8") as manifest_file:
        json.dump(manifest, manifest_file, indent=2)