1 # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
2 # See https://llvm.org/LICENSE.txt for license information.
3 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4 """Library functions for IR extraction."""
import functools
import json
import logging
import multiprocessing
import os
import pathlib
import re
import shutil
import subprocess

from typing import Dict, List, Optional

# Placeholder command line stored under "global_command_override" in the
# corpus description when the corpus comes from a local ThinLTO build.
_UNSPECIFIED_OVERRIDE = ["<UNSPECIFIED>"]


# TODO(ml-compiler-opt): maybe we can also convert the cmdline file here, from
# a \0-separated list of strings to a \n-separated one.
def should_include_module(cmdline: str, match_regexp: Optional[str]) -> bool:
    """Determine if the module should be included.

    Args:
      cmdline: the module's command line, as a NUL-separated list of arguments.
      match_regexp: regular expression searched for in each argument, or None
        to accept every module.

    Returns:
      True if match_regexp is None, or if it matches somewhere in at least one
      argument of cmdline; False otherwise.
    """
    if match_regexp is None:
        # No filter was requested, so every module qualifies.
        return True
    # Compile once; a single search per argument is enough to decide.
    pattern = re.compile(match_regexp)
    return any(pattern.search(arg) for arg in cmdline.split("\0"))


def get_thinlto_index(cmdline: str, basedir: str) -> Optional[str]:
    """Find the ThinLTO index file named on a command line.

    Args:
      cmdline: the command line, as a NUL-separated list of arguments.
      basedir: directory that the index path is joined onto.

    Returns:
      basedir joined with the value of the first "-fthinlto-index...=<path>"
      argument, or None if no such argument carries a value.
    """
    for option in cmdline.split("\0"):
        if not option.startswith("-fthinlto-index"):
            continue
        # Split on the first "=" only, so a path that itself contains "=" is
        # kept whole; an option without "=" has no path and is skipped.
        _, separator, index_path = option.partition("=")
        if separator:
            return os.path.join(basedir, index_path)
    return None


class TrainingIRExtractor:
    """IR and command line extraction from an object file.

    An instance describes one object file, given relative to a base directory,
    and the locations under an output directory where its bitcode, command
    line and ThinLTO index are written.
    """

    def __init__(self, obj_relative_path, output_base_dir, obj_base_dir=None):
        """Set up a TrainingIRExtractor.

        Args:
          obj_relative_path: relative path to the input object file. It will be
            also used to construct the absolute path of the output IR and cmd
            files, by appending it to output_base_dir.
          output_base_dir: the directory under which the output will be
            produced.
          obj_base_dir: the base directory for all the input object files.
            None is treated as the empty string.
        """
        self._obj_relative_path = obj_relative_path
        self._output_base_dir = output_base_dir
        self._obj_base_dir = obj_base_dir if obj_base_dir is not None else ""

    def obj_base_dir(self):
        """Return the base directory of the input object files."""
        return self._obj_base_dir

    def output_base_dir(self):
        """Return the directory under which output is produced."""
        return self._output_base_dir

    def relative_output_path(self):
        """Return the object's relative path, which also names the output."""
        return self._obj_relative_path

    def input_obj(self):
        """Return the input object path: base dir joined with relative path."""
        return os.path.join(self.obj_base_dir(), self._obj_relative_path)

    def lld_src_bc(self):
        """Return the path of the postimport bitcode saved by lld."""
        # .3.import.bc is the suffix attached to post-merge-pre-opt
        # ('postimport') IR bitcode saved by lld. It is hardcoded into lld.
        return os.path.join(
            self._obj_base_dir, self._obj_relative_path + ".3.import.bc"
        )

    def lld_src_thinlto(self):
        """Return the path of the ThinLTO index next to the input object."""
        return os.path.join(
            self._obj_base_dir, self._obj_relative_path + ".thinlto.bc"
        )

    def dest_dir(self):
        """Return the output directory that holds this module's files."""
        return os.path.join(
            self.output_base_dir(), os.path.dirname(self._obj_relative_path)
        )

    def module_name(self):
        """Return the base name of the object's relative path."""
        return os.path.basename(self._obj_relative_path)

    def cmd_file(self):
        """Return the output path of the extracted command line."""
        return os.path.join(self.dest_dir(), self.module_name() + ".cmd")

    def bc_file(self):
        """Return the output path of the extracted bitcode."""
        return os.path.join(self.dest_dir(), self.module_name() + ".bc")

    def thinlto_index_file(self):
        """Return the output path of the ThinLTO index."""
        return os.path.join(self.dest_dir(), self.module_name() + ".thinlto.bc")

    def _get_extraction_cmd_command(
        self, llvm_objcopy_path: str, cmd_section_name: str
    ):
        """Build the llvm-objcopy invocation that dumps the command line.

        Args:
          llvm_objcopy_path: path of the llvm-objcopy binary.
          cmd_section_name: name of the section holding the command line.

        Returns:
          The argument list that, when invoked, dumps the section into the
          self.cmd_file() file.
        """
        return [
            llvm_objcopy_path,
            "--dump-section=" + cmd_section_name + "=" + self.cmd_file(),
            self.input_obj(),
            # Only the dumped section is wanted; the copied object is dropped.
            "/dev/null",
        ]

    def _get_extraction_bc_command(
        self, llvm_objcopy_path: str, bitcode_section_name: str
    ):
        """Build the llvm-objcopy invocation that dumps the bitcode.

        Args:
          llvm_objcopy_path: path of the llvm-objcopy binary.
          bitcode_section_name: name of the section holding the bitcode.

        Returns:
          The argument list that, when invoked, dumps the section into the
          self.bc_file() file.
        """
        return [
            llvm_objcopy_path,
            "--dump-section=" + bitcode_section_name + "=" + self.bc_file(),
            self.input_obj(),
            # Only the dumped section is wanted; the copied object is dropped.
            "/dev/null",
        ]

    def _extract_clang_artifacts(
        self,
        llvm_objcopy_path: str,
        cmd_filter: Optional[str],
        is_thinlto: bool,
        cmd_section_name: str,
        bitcode_section_name: str,
    ) -> Optional[str]:
        """Run llvm-objcopy to extract the .bc and command line.

        Args:
          llvm_objcopy_path: path of the llvm-objcopy binary.
          cmd_filter: regular expression the command line must match, or None.
          is_thinlto: whether the ThinLTO index named on the command line
            should be copied next to the extracted files.
          cmd_section_name: name of the section holding the command line.
          bitcode_section_name: name of the section holding the bitcode.

        Returns:
          The relative output path on success; None if the input is missing,
          the module is filtered out, no ThinLTO index is named on the command
          line, or llvm-objcopy fails.
        """
        if not os.path.exists(self.input_obj()):
            logging.info("%s does not exist.", self.input_obj())
            return None
        os.makedirs(self.dest_dir(), exist_ok=True)
        try:
            subprocess.check_output(
                self._get_extraction_cmd_command(
                    llvm_objcopy_path, cmd_section_name
                ),
                stderr=subprocess.STDOUT,
                encoding="utf-8",
            )
            # The command line only has to be read back when it is needed for
            # filtering or for locating the ThinLTO index.
            if cmd_filter is not None or is_thinlto:
                with open(self.cmd_file(), encoding="utf-8") as f:
                    lines = f.readlines()
                assert len(lines) == 1
                cmdline = lines[0]
                if not should_include_module(cmdline, cmd_filter):
                    logging.info(
                        "Excluding module %s because it does not match the filter",
                        self.input_obj(),
                    )
                    os.remove(self.cmd_file())
                    return None
                if is_thinlto:
                    index_file = get_thinlto_index(cmdline, self.obj_base_dir())
                    if index_file is None:
                        # Without an index there is nothing to copy; skip the
                        # module instead of handing None to shutil.copy.
                        logging.warning(
                            "%s was not processed: no ThinLTO index on its "
                            "command line.",
                            self.input_obj(),
                        )
                        os.remove(self.cmd_file())
                        return None
                    shutil.copy(index_file, self.thinlto_index_file())

            subprocess.check_output(
                self._get_extraction_bc_command(
                    llvm_objcopy_path, bitcode_section_name
                ),
                stderr=subprocess.STDOUT,
                encoding="utf-8",
            )
        except subprocess.CalledProcessError as e:
            # This may happen if .o file was build from asm (.S source).
            logging.warning("%s was not processed: %s", self.input_obj(), e)
            logging.info(e.output)
            return None
        assert (
            os.path.exists(self.cmd_file())
            and os.path.exists(self.bc_file())
            and (not is_thinlto or os.path.exists(self.thinlto_index_file()))
        )
        return self.relative_output_path()

    def _extract_lld_artifacts(self) -> Optional[str]:
        """Extract the .bc file with ThinLTO index from an lld ThinLTO invocation.

        Returns:
          The object's relative path once both files are copied, or None if
          either source file does not exist.
        """
        if not os.path.exists(self.lld_src_bc()):
            logging.info("%s does not exist.", self.lld_src_bc())
            return None
        if not os.path.exists(self.lld_src_thinlto()):
            logging.info("%s does not exist.", self.lld_src_thinlto())
            return None
        os.makedirs(self.dest_dir(), exist_ok=True)

        # Copy over the files
        shutil.copy(self.lld_src_bc(), self.bc_file())
        shutil.copy(self.lld_src_thinlto(), self.thinlto_index_file())

        assert os.path.exists(self.bc_file())
        assert os.path.exists(self.thinlto_index_file())
        return self._obj_relative_path

    def extract(
        self,
        llvm_objcopy_path: Optional[str] = None,
        cmd_filter: Optional[str] = None,
        thinlto_build: Optional[str] = None,
        cmd_section_name: Optional[str] = ".llvmcmd",
        bitcode_section_name: Optional[str] = ".llvmbc",
    ) -> Optional[str]:
        """Extract this module's artifacts into the output directory.

        Args:
          llvm_objcopy_path: path of the llvm-objcopy binary; not used when
            thinlto_build is "local".
          cmd_filter: regular expression the command line must match, or None.
          thinlto_build: "local" to copy the files saved by lld,
            "distributed" to also copy the ThinLTO index named on the command
            line; any other value extracts without ThinLTO handling.
          cmd_section_name: name of the section holding the command line.
          bitcode_section_name: name of the section holding the bitcode.

        Returns:
          The relative output path of the module, or None if nothing was
          extracted.
        """
        if thinlto_build == "local":
            return self._extract_lld_artifacts()
        return self._extract_clang_artifacts(
            llvm_objcopy_path=llvm_objcopy_path,
            cmd_filter=cmd_filter,
            is_thinlto=thinlto_build == "distributed",
            cmd_section_name=cmd_section_name,
            bitcode_section_name=bitcode_section_name,
        )


def convert_compile_command_to_objectfile(
    command: Dict[str, str], output_dir: str
) -> Optional[TrainingIRExtractor]:
    """Create an extractor for one compile_commands.json entry.

    Args:
      command: one entry of the compilation database. "directory" is read
        unconditionally; the command line comes from "arguments" if present,
        otherwise from whitespace-splitting "command".
      output_dir: the directory under which the output will be produced.

    Returns:
      A TrainingIRExtractor for the file named after "-o", or None if the
      entry has no command line or no usable "-o" option.
    """
    obj_base_dir = command["directory"]
    if "arguments" in command:
        cmd_parts = command["arguments"]
    elif "command" in command:
        cmd_parts = command["command"].split()
    else:
        logging.info("compile_commands element has no command and arguments")
        return None

    try:
        obj_index = cmd_parts.index("-o") + 1
    except ValueError:
        # This could happen if there are non-clang commands in
        # compile_commands.json
        logging.info("Command has no -o option: %s", " ".join(cmd_parts))
        return None
    if obj_index >= len(cmd_parts):
        # "-o" is the last argument, so there is no output path to read.
        logging.info(
            "Command has -o without an output file: %s", " ".join(cmd_parts)
        )
        return None
    obj_rel_path = cmd_parts[obj_index]
    # TODO(mtrofin): is the obj_base_dir correct for thinlto index bc files?
    return TrainingIRExtractor(
        obj_relative_path=obj_rel_path,
        output_base_dir=output_dir,
        obj_base_dir=obj_base_dir,
    )


def load_from_compile_commands(
    json_array: List[Dict[str, str]], output_dir: str
) -> List[TrainingIRExtractor]:
    """Create extractors for the entries of a compilation database.

    Args:
      json_array: the parsed entries of compile_commands.json.
      output_dir: the directory under which the output will be produced.

    Returns:
      One TrainingIRExtractor per entry that could be converted, in input
      order.
    """
    candidates = (
        convert_compile_command_to_objectfile(cmd, output_dir) for cmd in json_array
    )
    # Filter out None, in case there were non-clang commands in the .json
    return [obj for obj in candidates if obj is not None]


def load_from_lld_params(
    params_array: List[str], obj_base_dir: str, output_dir: str
) -> List[TrainingIRExtractor]:
    """Create an ObjectFile array based on lld's parameters.

    Args:
      params_array: the arguments passed to lld. The list is not modified.
      obj_base_dir: the base directory for all the input object files.
      output_dir: the directory under which the output will be produced.

    Returns:
      One TrainingIRExtractor per object file found in params_array.
    """
    # yank out -o and the output. After that, anything not starting with '-',
    # and ending in a '.o', is an object file.
    try:
        minus_o_idx = params_array.index("-o")
    except ValueError:
        logging.info("This params file does not have an explicit -o option.")
        # NOTE(review): without -o every parameter is used as-is, with no
        # filtering -- confirm this is intended.
        just_obj_paths = params_array
    else:
        # Build a new list instead of deleting in place, so the caller's
        # argument list is left untouched.
        remaining = params_array[:minus_o_idx] + params_array[minus_o_idx + 2 :]
        just_obj_paths = [
            o for o in remaining if not o.startswith("-") and o.endswith(".o")
        ]

    return [
        TrainingIRExtractor(
            obj_relative_path=obj_file,
            output_base_dir=output_dir,
            obj_base_dir=obj_base_dir,
        )
        for obj_file in just_obj_paths
    ]


def load_from_directory(
    obj_base_dir: str, output_dir: str
) -> List[TrainingIRExtractor]:
    """Create an object file array by globbing an entire directory.

    Args:
      obj_base_dir: The base build directory that all object files will be
        written out as being relative to.
      output_dir: The output directory where extracted .bc and .cmd files
        should be placed.

    Returns:
      One TrainingIRExtractor per "*.o" file found anywhere under
      obj_base_dir.
    """
    object_files = pathlib.Path(obj_base_dir).glob("**/*.o")
    return [
        TrainingIRExtractor(
            obj_relative_path=os.path.relpath(str(obj_file), start=obj_base_dir),
            output_base_dir=output_dir,
            obj_base_dir=obj_base_dir,
        )
        for obj_file in object_files
    ]


def load_for_lld_thinlto(
    obj_base_dir: str, output_dir: str
) -> List[TrainingIRExtractor]:
    """Create an object file array from the postimport bitcode saved by lld.

    Args:
      obj_base_dir: The base build directory that is searched, and that all
        object files will be written out as being relative to.
      output_dir: The output directory where extracted files should be placed.

    Returns:
      One TrainingIRExtractor per "*.3.import.bc" file found anywhere under
      obj_base_dir, named by that file's relative path without the suffix.
    """
    # .3.import.bc is the suffix attached to post-merge-pre-opt ('postimport')
    # IR bitcode saved by lld. It is hardcoded into lld. ThinLTO index files
    # are also emitted next to the postimport bitcode, with the suffix
    # .thinlto.bc instead
    postimport_suffix = ".3.import.bc"
    bitcode_files = pathlib.Path(obj_base_dir).glob("**/*" + postimport_suffix)

    def make_spec(obj_file: str):
        relative_path = os.path.relpath(obj_file, start=obj_base_dir)
        return TrainingIRExtractor(
            # Cut away .3.import.bc
            obj_relative_path=relative_path[: -len(postimport_suffix)],
            output_base_dir=output_dir,
            obj_base_dir=obj_base_dir,
        )

    return [make_spec(str(path)) for path in bitcode_files]


def load_bazel_aquery(aquery_json, obj_base_dir: str, output_dir: str):
    """Creates an object file array by looking at the JSON output of bazel aquery.

    Args:
      aquery_json: The JSON-formatted output of the bazel aquery command for
        the target of interest. The bazel aquery JSON should be a JSON
        serialized version of the analysis.ActionGraphContainer proto.
        https://github.com/bazelbuild/bazel/blob/master/src/main/protobuf/analysis_v2.proto
      obj_base_dir: The base build directory that all object files will be
        written out as relative to.
      output_dir: The output directory where extracted .bc and .cmd files
        should be placed.

    Returns:
      The result of load_from_lld_params for the arguments of the last
      "CppLink" action, or for an empty argument list if there is none.
    """
    linker_params = []

    # Every CppLink action overwrites the previous one, so the last one wins.
    for action_info in aquery_json["actions"]:
        if action_info["mnemonic"] == "CppLink":
            linker_params = action_info["arguments"]

    return load_from_lld_params(linker_params, obj_base_dir, output_dir)


def run_extraction(
    objs: List[TrainingIRExtractor],
    num_workers: int,
    llvm_objcopy_path: str,
    cmd_filter: Optional[str],
    thinlto_build: Optional[str],
    cmd_section_name: str,
    bitcode_section_name: str,
):
    """Extracts all specified object files into the corpus directory.

    Args:
      objs: A list of TrainingIRExtractor Objects that represent the object
        files to extract bitcode/commands from.
      num_workers: The number of parallel processes to spawn to run the
        extraction.
      llvm_objcopy_path: The path to the llvm-objcopy to use for dumping
        sections.
      cmd_filter: A regular expression that is used to select for compilations
        performed with specific flags. If you want to include all
        compilations, set this to None.
      thinlto_build: Whether or not this is a ThinLTO build, and if so, the
        type. Set this to None if the build was not done with ThinLTO.
      cmd_section_name: The name of the command line section created by the
        compiler.
      bitcode_section_name: The name of the bitcode section created by the
        compiler.

    Returns:
      A list with one entry per element of objs, in the same order: the value
      returned by TrainingIRExtractor.extract for that object.
    """
    extract_artifacts = functools.partial(
        TrainingIRExtractor.extract,
        llvm_objcopy_path=llvm_objcopy_path,
        cmd_filter=cmd_filter,
        thinlto_build=thinlto_build,
        cmd_section_name=cmd_section_name,
        bitcode_section_name=bitcode_section_name,
    )

    with multiprocessing.Pool(num_workers) as pool:
        relative_output_paths = pool.map(extract_artifacts, objs)
        pool.close()
        pool.join()
    return relative_output_paths


def write_corpus_manifest(
    thinlto_build: Optional[str],
    relative_output_paths: List[Optional[str]],
    output_dir: str,
):
    """Writes a corpus_description.json containing all necessary information
    about the corpus.

    Args:
      thinlto_build: Whether or not the build was done with ThinLTO and if so,
        what kind of ThinLTO. Set this to None if the build was not performed
        with ThinLTO.
      relative_output_paths: The relative (to the corpus directory) output
        paths of all the bitcode files that should be placed in the corpus
        manifest. None entries are left out.
      output_dir: The corpus directory where the corpus manifest should be
        placed.
    """
    # This comes first rather than later so global_command_override is at the
    # top of the .json after being written
    if thinlto_build == "local":
        corpus_description = {"global_command_override": _UNSPECIFIED_OVERRIDE}
    else:
        corpus_description = {}

    corpus_description.update(
        {
            "has_thinlto": thinlto_build is not None,
            "modules": [path for path in relative_output_paths if path is not None],
        }
    )

    manifest_path = os.path.join(output_dir, "corpus_description.json")
    with open(manifest_path, "w", encoding="utf-8") as f:
        json.dump(corpus_description, f, indent=2)