[NFC][Py Reformat] Reformat python files in llvm
[llvm-project.git] / llvm / utils / extract_symbols.py
blob9238828d7ce8567bb56687963a88a7b3daf5404b
1 #!/usr/bin/env python
3 """A tool for extracting a list of symbols to export
5 When exporting symbols from a dll or exe we either need to mark the symbols in
6 the source code as __declspec(dllexport) or supply a list of symbols to the
7 linker. This program automates the latter by inspecting the symbol tables of a
8 list of link inputs and deciding which of those symbols need to be exported.
We can't just export all the defined symbols, as there's a limit of 65535
exported symbols and in clang we go way over that, particularly in a debug
build. Therefore a large part of the work is pruning symbols which either can't
be imported, or which we think have definitions in public header files (i.e.
template instantiations) and so would get defined in the thing importing these
symbols anyway.
16 """
18 from __future__ import print_function
19 import sys
20 import re
21 import os
22 import subprocess
23 import multiprocessing
24 import argparse
# Define a function which extracts a list of (symbol, is_def) pairs from a
# library using llvm-nm, because it can work with both regular and bitcode
# files. We use subprocess.Popen and yield a symbol at a time instead of using
# subprocess.check_output and returning a list as, especially on Windows, waiting
# for the entire output to be ready can take a significant amount of time.
def nm_get_symbols(tool, lib):
    """Yield ``(symbol, is_def)`` pairs for the global symbols of *lib*.

    Runs *tool* (an nm-style executable) on *lib* and parses its portable
    format output one line at a time, so the caller can start consuming
    symbols before the tool has finished.

    Args:
        tool: Path to the nm executable to run.
        lib: Path of the library or object file to inspect.

    Yields:
        ``(name, True)`` for an external symbol defined in some section and
        ``(name, False)`` for an undefined (type ``U``) symbol.
    """
    # The POSIX format is:
    #   name   type   value   size
    # The -P flag displays the size field for symbols only when applicable,
    # so the last field is optional. There's no space after the value field,
    # but \s+ matches the newline too, so \s+\S* covers the optional size.
    defined_re = re.compile(r"^(\S+)\s+[BDGRSTuVW]\s+\S+\s+\S*$")
    # Undefined symbols have type U and may or may not (depending on which nm
    # is being used) have value and size.
    undefined_re = re.compile(r"^(\S+)\s+U\s+(\S+\s+\S*)?$")

    # '-P' means the output is in portable format,
    # '-g' means we only get global symbols,
    # '-Xany' enforce handling both 32- and 64-bit objects on AIX,
    # '--no-demangle' ensure that C++ symbol names are not demangled; note
    #   that llvm-nm do not demangle by default, but the system nm on AIX does
    #   that, so the behavior may change in the future,
    # '-p' do not waste time sorting the symbols.
    cmd = [tool, "-P", "-g", "-Xany", "--no-demangle", "-p"]

    # The context manager closes the pipes and waits for the child even when
    # the consumer abandons the generator before it is exhausted.
    with subprocess.Popen(
        cmd + [lib],
        bufsize=1,
        stdout=subprocess.PIPE,
        stdin=subprocess.PIPE,
        universal_newlines=True,
    ) as process:
        # The tool must never block waiting for input.
        process.stdin.close()
        for line in process.stdout:
            match = defined_re.match(line)
            if match:
                yield (match.group(1), True)
            match = undefined_re.match(line)
            if match:
                yield (match.group(1), False)
66 # Define a function which determines if the target is 32-bit Windows (as that's
67 # where calling convention name decoration happens).
def readobj_is_32bit_windows(tool, lib):
    """Return True if *tool* reports *lib* as a ``COFF-i386`` file.

    Runs ``tool --file-header lib`` and inspects the first ``Format:`` line of
    its output. Returns False when no such line is printed.

    Raises:
        subprocess.CalledProcessError: if the tool exits with a non-zero
            status (propagated from ``subprocess.check_output``).
    """
    output = subprocess.check_output(
        [tool, "--file-header", lib], universal_newlines=True
    )
    for line in output.splitlines():
        match = re.match(r"Format: (\S+)", line)
        if match:
            # Only the first Format line is considered.
            return match.group(1) == "COFF-i386"
    return False
79 # MSVC mangles names to ?<identifier_mangling>@<type_mangling>. By examining the
80 # identifier/type mangling we can decide which symbols could possibly be
81 # required and which we can discard.
def should_keep_microsoft_symbol(symbol, calling_convention_decoration):
    """Decide whether a Microsoft-mangled symbol should be exported.

    Args:
        symbol: The symbol name as reported by nm.
        calling_convention_decoration: True when unmangled names carry
            calling convention decoration that must be stripped.

    Returns:
        The name to export (with decoration removed for unmangled names), or
        None if the symbol should be discarded.
    """
    # Keep unmangled (i.e. extern "C") names
    if "?" not in symbol:
        if calling_convention_decoration:
            # Remove calling convention decoration from names
            match = re.match(r"[_@]([^@]+)", symbol)
            if match:
                symbol = match.group(1)
        # Discard floating point/SIMD constants.
        if symbol.startswith(("__xmm@", "__ymm@", "__real@")):
            return None
        return symbol
    # Deleting destructors start with ??_G or ??_E and can be discarded
    # because link.exe gives you a warning telling you they can't be exported
    # if you don't
    if symbol.startswith(("??_G", "??_E")):
        return None
    # An anonymous namespace is mangled as ?A(maybe hex number)@. Any symbol
    # that mentions an anonymous namespace can be discarded, as the anonymous
    # namespace doesn't exist outside of that translation unit.
    if re.search(r"\?A(0x\w+)?@", symbol):
        return None
    # Skip X86GenMnemonicTables functions, they are not exposed from
    # llvm/include/.
    if re.match(r"\?is[A-Z0-9]*@X86@llvm", symbol):
        return None
    # Keep mangled llvm:: and clang:: function symbols. How we detect these is a
    # bit of a mess and imprecise, but that avoids having to completely demangle
    # the symbol name. The outermost namespace is at the end of the identifier
    # mangling, and the identifier mangling is followed by the type mangling, so
    # we look for (llvm|clang)@@ followed by something that looks like a
    # function type mangling. To spot a function type we use (this is derived
    # from clang/lib/AST/MicrosoftMangle.cpp):
    # <function-type> ::= <function-class> <this-cvr-qualifiers>
    #                     <calling-convention> <return-type>
    #                     <argument-list> <throw-spec>
    # <function-class> ::= [A-Z]
    # <this-cvr-qualifiers> ::= [A-Z0-9_]*
    # <calling-convention> ::= [A-JQ]
    # <return-type> ::= .+
    # <argument-list> ::= X   (void)
    #                 ::= .+@ (list of types)
    #                 ::= .*Z (list of types, varargs)
    # <throw-spec> ::= exceptions are not allowed
    if re.search(r"(llvm|clang)@@[A-Z][A-Z0-9_]*[A-JQ].+(X|.+@|.*Z)$", symbol):
        return symbol
    return None
130 # Itanium manglings are of the form _Z<identifier_mangling><type_mangling>. We
131 # demangle the identifier mangling to identify symbols that can be safely
132 # discarded.
def should_keep_itanium_symbol(symbol, calling_convention_decoration):
    """Decide whether an Itanium-mangled symbol should be exported.

    Returns the name to export (with any calling convention decoration
    removed), or None if the symbol should be discarded.
    """
    # Calling convention decoration is expected on every symbol, including
    # mangled C++ ones, so strip it before looking at anything else.
    if calling_convention_decoration and symbol[:1] == "_":
        symbol = symbol[1:]

    # Anything that does not begin with "_" or "." is unmangled: keep it.
    if not symbol.startswith(("_", ".")):
        return symbol

    # Only nested-name manglings are candidates for export.
    nested = re.match("_Z(T[VTIS])?(N.+)", symbol)
    if nested is None:
        return None

    # A name that is too complex is dropped, but a name we simply failed to
    # demangle is kept just in case.
    try:
        components, _rest = parse_itanium_nested_name(nested.group(2))
    except TooComplexName:
        return None
    if not components:
        return symbol

    # Keep llvm:: and clang:: names, discard everything else.
    outermost = components[0][0]
    return symbol if outermost in ("4llvm", "5clang") else None
161 # Certain kinds of complex manglings we assume cannot be part of a public
162 # interface, and we handle them by raising an exception.
class TooComplexName(Exception):
    """Signals a mangling assumed too complex to be part of a public interface."""
167 # Parse an itanium mangled name from the start of a string and return a
168 # (name, rest of string) pair.
def parse_itanium_name(arg):
    """Parse one Itanium-mangled name from the start of *arg*.

    Returns:
        A ``(name, rest)`` pair, where ``rest`` is the unparsed remainder of
        the string. A length-prefixed name keeps its length prefix (for
        example ``"4llvm"``). ``name`` is None when nothing could be parsed,
        in which case ``rest`` is *arg* unchanged.
    """
    # Check for a normal name
    match = re.match(r"(\d+)(.+)", arg)
    if match:
        n = int(match.group(1))
        name = match.group(1) + match.group(2)[:n]
        rest = match.group(2)[n:]
        return name, rest
    # Check for constructor/destructor names
    match = re.match(r"([CD][123])(.+)", arg)
    if match:
        return match.group(1), match.group(2)
    # Assume that a sequence of characters that doesn't end a nesting is an
    # operator (this is very imprecise, but appears to be good enough)
    match = re.match(r"([^E]+)(.+)", arg)
    if match:
        return match.group(1), match.group(2)
    # Anything else: we can't handle it
    return None, arg
190 # Parse an itanium mangled template argument list from the start of a string
191 # and throw it away, returning the rest of the string.
def skip_itanium_template(arg):
    """Skip an Itanium-mangled template argument list at the start of *arg*.

    Returns:
        The remainder of the string after the closing ``E``, or None if the
        string runs out before the argument list is closed.

    Raises:
        TooComplexName: if the argument list contains an expression
            (something starting with ``L`` or ``X``).
    """
    # A template argument list starts with I
    assert arg.startswith("I"), arg
    tmp = arg[1:]
    while tmp:
        # Check for names
        match = re.match(r"(\d+)(.+)", tmp)
        if match:
            n = int(match.group(1))
            tmp = match.group(2)[n:]
            continue
        # Check for substitutions
        match = re.match(r"S[A-Z0-9]*_(.+)", tmp)
        if match:
            tmp = match.group(1)
        # Start of a template
        elif tmp.startswith("I"):
            tmp = skip_itanium_template(tmp)
        # Start of a nested name
        elif tmp.startswith("N"):
            _, tmp = parse_itanium_nested_name(tmp)
        # Start of an expression: assume that it's too complicated
        elif tmp.startswith(("L", "X")):
            raise TooComplexName
        # End of the template
        elif tmp.startswith("E"):
            return tmp[1:]
        # Something else: probably a type, skip it
        else:
            tmp = tmp[1:]
    # Ran out of input without finding the closing E.
    return None
225 # Parse an itanium mangled nested name and transform it into a list of pairs of
226 # (name, is_template), returning (list, rest of string).
def parse_itanium_nested_name(arg):
    """Split an Itanium nested name into ``(name, is_template)`` components.

    Returns ``(components, rest_of_string)``, or ``(None, None)`` when the
    nested name cannot be demangled.
    """
    # A nested name starts with N
    assert arg.startswith("N"), arg
    components = []

    # Drop the leading N together with a substitution, if one follows it.
    substitution = re.match("NS[A-Z0-9]*_(.+)", arg)
    remaining = substitution.group(1) if substitution else arg[1:]

    # Drop CV-qualifiers and ref-qualifiers.
    qualified = re.match("[rVKRO]*(.+)", remaining)
    if qualified:
        remaining = qualified.group(1)

    # Consume one name at a time until the terminating E is reached.
    while remaining:
        if remaining[0] == "E":
            return components, remaining[1:]
        part, remaining = parse_itanium_name(remaining)
        if not part:
            # No idea how to demangle whatever comes next.
            return None, None
        # A following I marks this component as a template; its arguments are
        # skipped rather than recorded.
        templated = remaining.startswith("I")
        if templated:
            remaining = skip_itanium_template(remaining)
        components.append((part, templated))

    # Input ended without a terminating E: something went wrong.
    return None, None
268 # Parse a microsoft mangled symbol and return a list of pairs of
269 # (name, is_template). This is very rudimentary and does just enough
270 # in order to determine if the first or second component is a template.
def parse_microsoft_mangling(arg):
    """Split a Microsoft-mangled symbol into ``(name, is_template)`` pairs.

    This is very rudimentary and does just enough to tell whether a leading
    component is a template.

    Returns:
        A list of ``(name, is_template)`` tuples. An unmangled name (one not
        starting with ``?``) is returned as a single non-template component.
    """
    # If the name doesn't start with ? this isn't a mangled name
    if not arg.startswith("?"):
        return [(arg, False)]
    arg = arg[1:]
    components = []
    while arg:
        # If we see an empty component we've reached the end
        if arg.startswith("@"):
            return components
        # Check for a simple name
        match = re.match(r"(\w+)@(.+)", arg)
        if match:
            components.append((match.group(1), False))
            arg = match.group(2)
            continue
        # Check for a special function name
        match = re.match(r"(\?_?\w)(.+)", arg)
        if match:
            components.append((match.group(1), False))
            arg = match.group(2)
            continue
        # Check for a template name
        match = re.match(r"\?\$(\w+)@[^@]+@(.+)", arg)
        if match:
            components.append((match.group(1), True))
            arg = match.group(2)
            continue
        # Some other kind of name that we can't handle
        components.append((arg, False))
        return components
    return components
def extract_symbols(arg):
    """Collect the kept symbols of one library.

    *arg* is a single tuple ``(llvm_nm_path, should_keep_symbol,
    calling_convention_decoration, lib)``, so that the function can be used
    with a one-argument mapping call.

    Returns ``(symbol_defs, symbol_refs)``: a dict mapping each kept defined
    symbol to the number of times it was seen, and a set of kept undefined
    symbols.
    """
    nm_path, keep, decorated, lib = arg
    def_counts = {}
    refs = set()
    for raw_name, is_def in nm_get_symbols(nm_path, lib):
        kept = keep(raw_name, decorated)
        if not kept:
            # The filter discarded this symbol.
            continue
        if is_def:
            def_counts[kept] = def_counts.get(kept, 0) + 1
        else:
            refs.add(kept)
    return (def_counts, refs)
def get_template_name(sym, mangling):
    """Return the name of the first template component of *sym*, if any.

    *mangling* selects the parser: ``"microsoft"`` uses the Microsoft scheme,
    anything else is treated as Itanium. Returns None when the symbol has no
    template component, cannot be parsed, or is too complex.
    """
    # Break the mangling down into (name, is_template) components.
    try:
        if mangling == "microsoft":
            components = parse_microsoft_mangling(sym)
        else:
            nested = re.match("_Z(T[VTIS])?(N.+)", sym)
            if nested:
                components = parse_itanium_nested_name(nested.group(2))[0]
            else:
                components = None
    except TooComplexName:
        return None

    if not components:
        return None

    # The first templated component wins; None means "not a template".
    return next(
        (name for name, is_template in components if is_template), None
    )
def parse_tool_path(parser, tool, val):
    """Validate that *val* can be launched and return it unchanged.

    Used as an argparse ``type=`` callback. If launching *val* fails for any
    reason, the failure is reported through ``parser.error`` with *tool* as
    the name shown in the message.
    """
    try:
        probe = subprocess.Popen(
            [val],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            stdin=subprocess.PIPE,
            universal_newlines=True,
        )
        # We want no output from the probe, and it must not sit waiting for
        # something on stdin, so every standard stream is closed right away.
        for stream in (probe.stdout, probe.stderr, probe.stdin):
            stream.close()
        probe.wait()
    except Exception:
        parser.error(f"Invalid path for {tool}")
    else:
        return val
if __name__ == "__main__":
    # Command-line driver: parse the arguments, extract the symbols of every
    # input library in parallel, then print the ones that should be exported.
    parser = argparse.ArgumentParser(
        description="Extract symbols to export from libraries"
    )
    parser.add_argument(
        "--mangling",
        choices=["itanium", "microsoft"],
        required=True,
        help="expected symbol mangling scheme",
    )
    # Both tools are used unconditionally below, so make argparse report a
    # missing one instead of failing later with a TypeError.
    parser.add_argument(
        "--nm",
        metavar="path",
        type=lambda x: parse_tool_path(parser, "nm", x),
        required=True,
        help="path to the llvm-nm executable",
    )
    parser.add_argument(
        "--readobj",
        metavar="path",
        type=lambda x: parse_tool_path(parser, "readobj", x),
        required=True,
        help="path to the llvm-readobj executable",
    )
    parser.add_argument(
        "libs",
        metavar="lib",
        type=str,
        nargs="+",
        help="libraries to extract symbols from",
    )
    parser.add_argument("-o", metavar="file", type=str, help="output to file")
    args = parser.parse_args()

    # How we determine which symbols to keep and which to discard depends on
    # the mangling scheme
    if args.mangling == "microsoft":
        should_keep_symbol = should_keep_microsoft_symbol
    else:
        should_keep_symbol = should_keep_itanium_symbol

    # Get the list of libraries to extract symbols from
    suffixes = (".lib", ".a", ".obj", ".o")
    libs = []
    for lib in args.libs:
        # When invoked by cmake the arguments are the cmake target names of the
        # libraries, so we need to add .lib/.a to the end and maybe lib to the
        # start to get the filename. Also allow objects.
        if not lib.endswith(suffixes):
            for s in suffixes:
                if os.path.exists(lib + s):
                    lib = lib + s
                    break
                if os.path.exists("lib" + lib + s):
                    lib = "lib" + lib + s
                    break
        if not lib.endswith(suffixes):
            print("Don't know what to do with argument " + lib, file=sys.stderr)
            sys.exit(1)
        libs.append(lib)

    # Check if calling convention decoration is used by inspecting the first
    # library in the list
    calling_convention_decoration = readobj_is_32bit_windows(args.readobj, libs[0])

    # Extract symbols from libraries in parallel. This is a huge time saver when
    # doing a debug build, as there are hundreds of thousands of symbols in each
    # library.
    pool = multiprocessing.Pool()
    try:
        # Only one argument can be passed to the mapping function, and we can't
        # use a lambda or local function definition as that doesn't work on
        # windows, so create a list of tuples which duplicates the arguments
        # that are the same in all calls.
        vals = [
            (args.nm, should_keep_symbol, calling_convention_decoration, x)
            for x in libs
        ]
        # Do an async map then wait for the result to make sure that
        # KeyboardInterrupt gets caught correctly (see
        # http://bugs.python.org/issue8296)
        result = pool.map_async(extract_symbols, vals)
        pool.close()
        libs_symbols = result.get(3600)
    except KeyboardInterrupt:
        # On Ctrl-C terminate everything and exit
        pool.terminate()
        pool.join()
        sys.exit(1)

    # Merge everything into a single dict
    symbol_defs = {}
    symbol_refs = set()
    for this_lib_defs, this_lib_refs in libs_symbols:
        for k, v in this_lib_defs.items():
            symbol_defs[k] = symbol_defs.get(k, 0) + v
        symbol_refs.update(this_lib_refs)

    # Find which template instantiations are referenced at least once.
    template_instantiation_refs = set()
    for sym in symbol_refs:
        template = get_template_name(sym, args.mangling)
        if template:
            template_instantiation_refs.add(template)

    # Print symbols which both:
    #  * Appear in exactly one input, as symbols defined in multiple
    #    objects/libraries are assumed to have public definitions.
    #  * Are not a template instantiation that isn't referenced anywhere. This
    #    is because we need to export any explicitly instantiated templates,
    #    and we expect those to be referenced in some object.
    outfile = open(args.o, "w") if args.o else sys.stdout
    try:
        for k, v in symbol_defs.items():
            # Skip the template parsing for symbols already ruled out.
            if v != 1:
                continue
            template = get_template_name(k, args.mangling)
            if not template or template in template_instantiation_refs:
                print(k, file=outfile)
    finally:
        # Make sure a file we opened is flushed and closed; leave stdout alone.
        if outfile is not sys.stdout:
            outfile.close()