[ORC] Add std::tuple support to SimplePackedSerialization.
[llvm-project.git] / llvm / utils / extract_symbols.py
blob6f01cd12fcd81f082aa4041af220ee7935e40168
1 #!/usr/bin/env python
3 """A tool for extracting a list of symbols to export
5 When exporting symbols from a dll or exe we either need to mark the symbols in
6 the source code as __declspec(dllexport) or supply a list of symbols to the
7 linker. This program automates the latter by inspecting the symbol tables of a
8 list of link inputs and deciding which of those symbols need to be exported.
10 We can't just export all the defined symbols, as there's a limit of 65535
11 exported symbols and in clang we go way over that, particularly in a debug
12 build. Therefore a large part of the work is pruning symbols which either can't
13 be imported, or which we think are things that have definitions in public header
14 files (i.e. template instantiations) and so would get defined in the thing
15 importing these symbols anyway.
16 """
18 from __future__ import print_function
19 import sys
20 import re
21 import os
22 import subprocess
23 import multiprocessing
24 import argparse
26 # Define functions which extract a list of symbols from a library using several
27 # different tools. We use subprocess.Popen and yield a symbol at a time instead
28 # of using subprocess.check_output and returning a list as, especially on
29 # Windows, waiting for the entire output to be ready can take a significant
30 # amount of time.
def dumpbin_get_symbols(lib):
    """Yield the defined external symbols of `lib` using `dumpbin /symbols`.

    This is a generator: symbols are yielded as the tool's output is read line
    by line rather than after the whole output has been collected.
    """
    process = subprocess.Popen(['dumpbin', '/symbols', lib], bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    # We never send anything to the tool, so close its stdin straight away.
    process.stdin.close()
    # Look for external symbols that are defined in some section. A raw string
    # is used so that the regex escapes are not treated as (invalid) string
    # escapes.
    pattern = re.compile(r"^.+SECT.+External\s+\|\s+(\S+).*$")
    for line in process.stdout:
        match = pattern.match(line)
        if match:
            yield match.group(1)
    process.wait()
def nm_get_symbols(lib):
    """Yield the defined external symbols of `lib` using `nm -P`.

    This is a generator: symbols are yielded as the tool's output is read line
    by line. On AIX (sys.platform starting with 'aix') nm is invoked with
    additional flags.
    """
    if sys.platform.startswith('aix'):
        command = ['nm', '-P', '-Xany', '-C', '-p', lib]
    else:
        command = ['nm', '-P', lib]
    process = subprocess.Popen(command, bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    # We never send anything to the tool, so close its stdin straight away.
    process.stdin.close()
    # Look for external symbols that are defined in some section.
    # The POSIX format is:
    #   name   type   value   size
    # The -P flag displays the size field for symbols only when applicable,
    # so the last field is optional. There's no space after the value field,
    # but \s+ matches newline also, so \s+\S* will match the optional size
    # field.
    pattern = re.compile(r"^(\S+)\s+[BDGRSTVW]\s+\S+\s+\S*$")
    for line in process.stdout:
        match = pattern.match(line)
        if match:
            yield match.group(1)
    process.wait()
def readobj_get_symbols(lib):
    """Yield the defined external symbols of `lib` using `llvm-readobj -symbols`.

    This is a generator: symbols are yielded as the tool's output is read line
    by line.
    """
    process = subprocess.Popen(['llvm-readobj', '-symbols', lib], bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    # We never send anything to the tool, so close its stdin straight away.
    process.stdin.close()
    # Most recently seen Name and Section values. They start out as None so
    # that a StorageClass line appearing before them is ignored instead of
    # raising UnboundLocalError.
    name = None
    section = None
    for line in process.stdout:
        # When looking through the output of llvm-readobj we expect to see
        # Name, Section, then StorageClass, so record Name and Section when we
        # see them and decide if this is a defined external symbol when we see
        # StorageClass.
        match = re.search(r'Name: (\S+)', line)
        if match:
            name = match.group(1)
        match = re.search(r'Section: (\S+)', line)
        if match:
            section = match.group(1)
        match = re.search(r'StorageClass: (\S+)', line)
        if match:
            storageclass = match.group(1)
            defined = section not in (None,
                                      'IMAGE_SYM_ABSOLUTE',
                                      'IMAGE_SYM_UNDEFINED')
            if name is not None and defined and storageclass == 'External':
                yield name
    process.wait()
91 # Define functions which determine if the target is 32-bit Windows (as that's
92 # where calling convention name decoration happens).
def dumpbin_is_32bit_windows(lib):
    """Return True if `dumpbin /headers` reports the machine of `lib` as x86.

    Returns False if the machine is anything else, or if no 'machine' line is
    found in the output.
    """
    # dumpbin /headers can output a huge amount of data (>100MB in a debug
    # build) so we read only up to the 'machine' line then close the output.
    process = subprocess.Popen(['dumpbin', '/headers', lib], bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    process.stdin.close()
    retval = False
    # Raw string so that \( and \S reach the regex engine unchanged.
    pattern = re.compile(r'.+machine \((\S+)\)')
    for line in process.stdout:
        match = pattern.match(line)
        if match:
            retval = (match.group(1) == 'x86')
            break
    process.stdout.close()
    process.wait()
    return retval
def objdump_is_32bit_windows(lib):
    """Return True if `objdump -f` reports the file format of `lib` as pe-i386.

    Only the first 'file format' line is considered. Returns False if no such
    line is found.
    """
    output = subprocess.check_output(['objdump', '-f', lib],
                                     universal_newlines=True)
    # check_output returns a single string; it must be split into lines, as
    # iterating over the string itself would yield one character at a time and
    # the pattern below could never match.
    for line in output.splitlines():
        match = re.match(r'.+file format (\S+)', line)
        if match:
            return (match.group(1) == 'pe-i386')
    return False
def readobj_is_32bit_windows(lib):
    """Return True if `llvm-readobj -file-headers` reports `lib` as COFF-i386.

    Only the first line starting with 'Format:' is considered. Returns False
    if no such line is found.
    """
    output = subprocess.check_output(['llvm-readobj', '-file-headers', lib],
                                     universal_newlines=True)
    # check_output returns a single string; it must be split into lines, as
    # iterating over the string itself would yield one character at a time and
    # the pattern below could never match.
    for line in output.splitlines():
        match = re.match(r'Format: (\S+)', line)
        if match:
            return (match.group(1) == 'COFF-i386')
    return False
# MSVC mangles names to ?<identifier_mangling>@<type_mangling>. By examining the
# identifier/type mangling we can decide which symbols could possibly be
# required and which we can discard.
def should_keep_microsoft_symbol(symbol, calling_convention_decoration):
    """Decide whether a Microsoft-mangled symbol should be exported.

    symbol is the symbol name as read from the object file;
    calling_convention_decoration is true when unmangled names carry calling
    convention decoration that must be stripped.

    Returns the name to export (which differs from `symbol` only when
    decoration was stripped), or None if the symbol should be discarded.
    """
    # Keep unmangled (i.e. extern "C") names
    if '?' not in symbol:
        if calling_convention_decoration:
            # Remove calling convention decoration from names
            match = re.match(r'[_@]([^@]+)', symbol)
            if match:
                return match.group(1)
        return symbol
    # Function template instantiations start with ?$; keep the instantiations of
    # clang::Type::getAs, as some of them are explicit specializations that are
    # defined in clang's lib/AST/Type.cpp; discard the rest as it's assumed that
    # the definition is public
    if re.match(r'\?\?\$getAs@.+@Type@clang@@', symbol):
        return symbol
    if symbol.startswith('??$'):
        return None
    # Deleting destructors start with ?_G or ?_E and can be discarded because
    # link.exe gives you a warning telling you they can't be exported if you
    # don't
    if symbol.startswith('??_G') or symbol.startswith('??_E'):
        return None
    # Constructors (?0) and destructors (?1) of templates (?$) are assumed to be
    # defined in headers and not required to be kept
    if symbol.startswith('??0?$') or symbol.startswith('??1?$'):
        return None
    # An anonymous namespace is mangled as ?A(maybe hex number)@. Any symbol
    # that mentions an anonymous namespace can be discarded, as the anonymous
    # namespace doesn't exist outside of that translation unit.
    if re.search(r'\?A(0x\w+)?@', symbol):
        return None
    # Keep mangled llvm:: and clang:: function symbols. How we detect these is a
    # bit of a mess and imprecise, but that avoids having to completely demangle
    # the symbol name. The outermost namespace is at the end of the identifier
    # mangling, and the identifier mangling is followed by the type mangling, so
    # we look for (llvm|clang)@@ followed by something that looks like a
    # function type mangling. To spot a function type we use (this is derived
    # from clang/lib/AST/MicrosoftMangle.cpp):
    # <function-type> ::= <function-class> <this-cvr-qualifiers>
    #                     <calling-convention> <return-type>
    #                     <argument-list> <throw-spec>
    # <function-class> ::= [A-Z]
    # <this-cvr-qualifiers> ::= [A-Z0-9_]*
    # <calling-convention> ::= [A-JQ]
    # <return-type> ::= .+
    # <argument-list> ::= X   (void)
    #                 ::= .+@ (list of types)
    #                 ::= .*Z (list of types, varargs)
    # <throw-spec> ::= exceptions are not allowed
    if re.search(r'(llvm|clang)@@[A-Z][A-Z0-9_]*[A-JQ].+(X|.+@|.*Z)$', symbol):
        return symbol
    return None
# Itanium manglings are of the form _Z<identifier_mangling><type_mangling>. We
# demangle the identifier mangling to identify symbols that can be safely
# discarded.
def should_keep_itanium_symbol(symbol, calling_convention_decoration):
    """Decide whether an Itanium-mangled symbol should be exported.

    symbol is the symbol name as read from the object file;
    calling_convention_decoration is true when a leading underscore has been
    added to every symbol and must be stripped.

    Returns the name to export (with any decoration stripped), or None if the
    symbol should be discarded.
    """
    # Start by removing any calling convention decoration (which we expect to
    # see on all symbols, even mangled C++ symbols)
    if calling_convention_decoration and symbol.startswith('_'):
        symbol = symbol[1:]
    # Keep unmangled names
    if not symbol.startswith(('_', '.')):
        return symbol
    # Discard manglings that aren't nested names
    match = re.match('_Z(T[VTIS])?(N.+)', symbol)
    if not match:
        return None
    # Demangle the name. If the name is too complex then we don't need to keep
    # it, but if the demangling fails then keep the symbol just in case.
    try:
        names, _ = parse_itanium_nested_name(match.group(2))
    except TooComplexName:
        return None
    if not names:
        return symbol
    # Constructors and destructors of template classes are assumed to be
    # defined in headers and not required to be kept. The length check guards
    # against a nested name with a single component, for which there is no
    # enclosing class to inspect (names[-2] would raise IndexError).
    if (len(names) > 1 and re.match('[CD][123]', names[-1][0])
            and names[-2][1]):
        return None
    # Keep the instantiations of clang::Type::getAs, as some of them are
    # explicit specializations that are defined in clang's lib/AST/Type.cpp;
    # discard any other function template instantiations as it's assumed that
    # the definition is public
    if symbol.startswith('_ZNK5clang4Type5getAs'):
        return symbol
    if names[-1][1]:
        return None
    # Keep llvm:: and clang:: names
    if names[0][0] == '4llvm' or names[0][0] == '5clang':
        return symbol
    # Discard everything else
    return None
# Some manglings are complex enough that we assume they cannot belong to a
# public interface; the parsers signal those by raising this exception.
class TooComplexName(Exception):
    """Raised when a mangled name is too complex to be worth demangling."""
# Parse an itanium mangled name from the start of a string and return a
# (name, rest of string) pair.
def parse_itanium_name(arg):
    """Split one mangled name component off the front of `arg`.

    Returns (name, rest). For a length-prefixed name the returned name still
    includes its length prefix (e.g. '4llvm'). Returns (None, arg) if nothing
    could be parsed.
    """
    # Check for a normal name: a decimal length followed by that many
    # characters. (Raw string so that \d reaches the regex engine unchanged.)
    match = re.match(r'(\d+)(.+)', arg)
    if match:
        n = int(match.group(1))
        name = match.group(1) + match.group(2)[:n]
        rest = match.group(2)[n:]
        return name, rest
    # Check for constructor/destructor names
    match = re.match('([CD][123])(.+)', arg)
    if match:
        return match.group(1), match.group(2)
    # Assume that a sequence of characters that doesn't end a nesting is an
    # operator (this is very imprecise, but appears to be good enough)
    match = re.match('([^E]+)(.+)', arg)
    if match:
        return match.group(1), match.group(2)
    # Anything else: we can't handle it
    return None, arg
# Parse an itanium mangled template argument list from the start of a string
# and throw it away, returning the rest of the string.
def skip_itanium_template(arg):
    """Skip the template argument list at the start of `arg`.

    `arg` must start with 'I'. Returns the text following the 'E' that closes
    the argument list, or None if the string runs out before the list is
    closed. Raises TooComplexName if an argument looks like an expression.
    """
    # A template argument list starts with I
    assert arg.startswith('I'), arg
    tmp = arg[1:]
    while tmp:
        # Check for names. (Raw string so that \d reaches the regex engine
        # unchanged.)
        match = re.match(r'(\d+)(.+)', tmp)
        if match:
            n = int(match.group(1))
            tmp = match.group(2)[n:]
            continue
        # Check for substitutions
        match = re.match('S[A-Z0-9]*_(.+)', tmp)
        if match:
            tmp = match.group(1)
        # Start of a template
        elif tmp.startswith('I'):
            tmp = skip_itanium_template(tmp)
        # Start of a nested name
        elif tmp.startswith('N'):
            _, tmp = parse_itanium_nested_name(tmp)
        # Start of an expression: assume that it's too complicated
        elif tmp.startswith('L') or tmp.startswith('X'):
            raise TooComplexName
        # End of the template
        elif tmp.startswith('E'):
            return tmp[1:]
        # Something else: probably a type, skip it
        else:
            tmp = tmp[1:]
    # Ran out of input without finding the closing E
    return None
# Parse an itanium mangled nested name and transform it into a list of pairs of
# (name, is_template), returning (list, rest of string).
def parse_itanium_nested_name(arg):
    """Break the nested name at the start of `arg` into its components.

    `arg` must start with 'N'. Returns (components, rest), where components is
    a list of (name, is_template) pairs and rest is the text after the closing
    'E'. Returns (None, None) if the name could not be parsed.
    """
    # A nested name starts with N
    assert arg.startswith('N'), arg
    components = []

    # Drop the leading N, together with a substitution if one follows it
    substitution = re.match('NS[A-Z0-9]*_(.+)', arg)
    remaining = substitution.group(1) if substitution else arg[1:]

    # Drop any CV-qualifiers and ref qualifiers
    qualifiers = re.match('[rVKRO]*(.+)', remaining)
    if qualifiers:
        remaining = qualifiers.group(1)

    # Consume one component at a time until the nested name is closed
    while remaining:
        # An E closes the nested name
        if remaining.startswith('E'):
            return components, remaining[1:]
        name_part, remaining = parse_itanium_name(remaining)
        if not name_part:
            # We don't know how to demangle this
            return None, None
        # A component followed by I is a template; note that and discard its
        # template arguments
        is_template = remaining.startswith('I')
        if is_template:
            remaining = skip_itanium_template(remaining)
        components.append((name_part, is_template))

    # The input ran out before the nested name was closed
    return None, None
def extract_symbols(arg):
    """Count the kept symbols of one library.

    arg is a (get_symbols, should_keep_symbol, calling_convention_decoration,
    lib) tuple, packed into a single argument so that this function can be
    used with a pool's map. Returns a dict mapping each kept symbol name to
    the number of times it was seen.
    """
    get_symbols, should_keep_symbol, calling_convention_decoration, lib = arg
    counts = {}
    for raw_symbol in get_symbols(lib):
        kept = should_keep_symbol(raw_symbol, calling_convention_decoration)
        # A falsy result means the symbol is to be discarded
        if not kept:
            continue
        counts[kept] = counts.get(kept, 0) + 1
    return counts
if __name__ == '__main__':
    tool_exes = ['dumpbin', 'nm', 'objdump', 'llvm-readobj']
    parser = argparse.ArgumentParser(
        description='Extract symbols to export from libraries')
    parser.add_argument('--mangling', choices=['itanium', 'microsoft'],
                        required=True, help='expected symbol mangling scheme')
    parser.add_argument('--tools', choices=tool_exes, nargs='*',
                        help='tools to use to extract symbols and determine the'
                        ' target')
    parser.add_argument('libs', metavar='lib', type=str, nargs='+',
                        help='libraries to extract symbols from')
    parser.add_argument('-o', metavar='file', type=str, help='output to file')
    args = parser.parse_args()

    # Determine the function to use to get the list of symbols from the inputs,
    # and the function to use to determine if the target is 32-bit windows.
    tools = { 'dumpbin' : (dumpbin_get_symbols, dumpbin_is_32bit_windows),
              'nm' : (nm_get_symbols, None),
              'objdump' : (None, objdump_is_32bit_windows),
              'llvm-readobj' : (readobj_get_symbols, readobj_is_32bit_windows) }
    get_symbols = None
    is_32bit_windows = None
    # If we have a tools argument then use that for the list of tools to check
    if args.tools:
        tool_exes = args.tools
    # Find a tool to use by trying each in turn until we find one that exists
    # (subprocess.Popen will throw OSError when the program does not exist)
    for exe in tool_exes:
        try:
            # Close std streams as we don't want any output and we don't
            # want the process to wait for something on stdin.
            p = subprocess.Popen([exe], stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 stdin=subprocess.PIPE,
                                 universal_newlines=True)
            p.stdout.close()
            p.stderr.close()
            p.stdin.close()
            p.wait()
            # Keep going until we have a tool to use for both get_symbols and
            # is_32bit_windows
            if not get_symbols:
                get_symbols = tools[exe][0]
            if not is_32bit_windows:
                is_32bit_windows = tools[exe][1]
            if get_symbols and is_32bit_windows:
                break
        except OSError:
            continue
    if not get_symbols:
        print("Couldn't find a program to read symbols with", file=sys.stderr)
        sys.exit(1)
    if not is_32bit_windows:
        print("Couldn't find a program to determining the target", file=sys.stderr)
        sys.exit(1)

    # How we determine which symbols to keep and which to discard depends on
    # the mangling scheme
    if args.mangling == 'microsoft':
        should_keep_symbol = should_keep_microsoft_symbol
    else:
        should_keep_symbol = should_keep_itanium_symbol

    # Get the list of libraries to extract symbols from
    libs = list()
    # When invoked by cmake the arguments are the cmake target names of the
    # libraries, so we need to add .lib/.a to the end and maybe lib to the
    # start to get the filename. Also allow objects.
    suffixes = ['.lib', '.a', '.obj', '.o']
    for lib in args.libs:
        if not lib.endswith(tuple(suffixes)):
            for s in suffixes:
                if os.path.exists(lib+s):
                    lib = lib+s
                    break
                if os.path.exists('lib'+lib+s):
                    lib = 'lib'+lib+s
                    break
        if not lib.endswith(tuple(suffixes)):
            print("Don't know what to do with argument "+lib, file=sys.stderr)
            sys.exit(1)
        libs.append(lib)

    # Check if calling convention decoration is used by inspecting the first
    # library in the list
    calling_convention_decoration = is_32bit_windows(libs[0])

    # Extract symbols from libraries in parallel. This is a huge time saver when
    # doing a debug build, as there are hundreds of thousands of symbols in each
    # library.
    pool = multiprocessing.Pool()
    try:
        # Only one argument can be passed to the mapping function, and we can't
        # use a lambda or local function definition as that doesn't work on
        # windows, so create a list of tuples which duplicates the arguments
        # that are the same in all calls.
        vals = [(get_symbols, should_keep_symbol, calling_convention_decoration, x) for x in libs]
        # Do an async map then wait for the result to make sure that
        # KeyboardInterrupt gets caught correctly (see
        # http://bugs.python.org/issue8296)
        result = pool.map_async(extract_symbols, vals)
        pool.close()
        libs_symbols = result.get(3600)
    except KeyboardInterrupt:
        # On Ctrl-C terminate everything and exit
        pool.terminate()
        pool.join()
        sys.exit(1)

    # Merge everything into a single dict
    symbols = dict()
    for this_lib_symbols in libs_symbols:
        for k, v in this_lib_symbols.items():
            symbols[k] = symbols.get(k, 0) + v

    # Count instances of member functions of template classes, and map the
    # symbol name to the function+class. We do this under the assumption that if
    # a member function of a template class is instantiated many times it's
    # probably declared in a public header file.
    template_function_count = dict()
    template_function_mapping = dict()
    # Symbols that aren't template member functions map to "", whose count
    # stays at zero.
    template_function_count[""] = 0
    for k in symbols:
        name = None
        if args.mangling == 'microsoft':
            # Member functions of templates start with
            # ?<fn_name>@?$<class_name>@, so we map to <fn_name>@?$<class_name>.
            # As manglings go from the innermost scope to the outermost scope
            # this means:
            #  * When we have a function member of a subclass of a template
            #    class then <fn_name> will actually contain the mangling of
            #    both the subclass and the function member. This is fine.
            #  * When we have a function member of a template subclass of a
            #    (possibly template) class then it's the innermost template
            #    subclass that becomes <class_name>. This should be OK so long
            #    as we don't have multiple classes with a template subclass of
            #    the same name.
            match = re.search(r"^\?(\??\w+\@\?\$\w+)\@", k)
            if match:
                name = match.group(1)
        else:
            # Find member functions of templates by demangling the name and
            # checking if the second-to-last name in the list is a template.
            match = re.match('_Z(T[VTIS])?(N.+)', k)
            if match:
                try:
                    names, _ = parse_itanium_nested_name(match.group(2))
                    # A name with a single component has no enclosing class,
                    # so names[-2] must not be accessed for it.
                    if names and len(names) > 1 and names[-2][1]:
                        name = ''.join([x for x, _ in names])
                except TooComplexName:
                    # Manglings that are too complex should already have been
                    # filtered out, but if we happen to somehow see one here
                    # just leave it as-is.
                    pass
        if name:
            old_count = template_function_count.setdefault(name, 0)
            template_function_count[name] = old_count + 1
            template_function_mapping[k] = name
        else:
            template_function_mapping[k] = ""

    # Print symbols which both:
    #  * Appear in exactly one input, as symbols defined in multiple
    #    objects/libraries are assumed to have public definitions.
    #  * Aren't instances of member functions of templates which have been
    #    instantiated 100 times or more, which are assumed to have public
    #    definitions. (100 is an arbitrary guess here.)
    if args.o:
        outfile = open(args.o, 'w')
    else:
        outfile = sys.stdout
    try:
        for k, v in symbols.items():
            template_count = template_function_count[template_function_mapping[k]]
            if v == 1 and template_count < 100:
                print(k, file=outfile)
    finally:
        # Make sure a file we opened is flushed and closed; leave stdout alone.
        if outfile is not sys.stdout:
            outfile.close()