"""A tool for extracting a list of symbols to export

When exporting symbols from a dll or exe we either need to mark the symbols in
the source code as __declspec(dllexport) or supply a list of symbols to the
linker. This program automates the latter by inspecting the symbol tables of a
list of link inputs and deciding which of those symbols need to be exported.

We can't just export all the defined symbols, as there's a limit of 65535
exported symbols and in clang we go way over that, particularly in a debug
build. Therefore a large part of the work is pruning symbols either which can't
be imported, or which we think are things that have definitions in public header
files (i.e. template instantiations) and we would get defined in the thing
importing these symbols anyway.
"""
from __future__ import print_function

import argparse
import multiprocessing
import os
import re
import subprocess
import sys
# Define functions which extract a list of symbols from a library using several
# different tools. We use subprocess.Popen and yield a symbol at a time instead
# of using subprocess.check_output and returning a list as, especially on
# Windows, waiting for the entire output to be ready can take a significant
# amount of time.
def dumpbin_get_symbols(lib):
    """Yield defined external symbol names reported by ``dumpbin /symbols``.

    lib is passed straight to dumpbin as the file to inspect. Symbols are
    yielded one at a time as the tool's output is read line by line.
    """
    process = subprocess.Popen(['dumpbin', '/symbols', lib], bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    # We never feed the tool any input, so make sure it can't wait for some
    process.stdin.close()
    try:
        for line in process.stdout:
            # Look for external symbols that are defined in some section
            match = re.match(r"^.+SECT.+External\s+\|\s+(\S+).*$", line)
            if match:
                yield match.group(1)
    finally:
        # Also reached when the consumer stops iterating early, so the child
        # process is always reaped
        process.stdout.close()
        process.wait()
def nm_get_symbols(lib):
    """Yield defined external symbol names reported by ``nm -P``.

    lib is passed straight to nm as the file to inspect. When sys.platform
    starts with 'aix' the extra flags -Xany -C -p are passed as well.
    """
    if sys.platform.startswith('aix'):
        command = ['nm', '-P', '-Xany', '-C', '-p', lib]
    else:
        command = ['nm', '-P', lib]
    process = subprocess.Popen(command, bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    # We never feed the tool any input, so make sure it can't wait for some
    process.stdin.close()
    try:
        for line in process.stdout:
            # Look for external symbols that are defined in some section
            # The POSIX format is:
            #   name   type   value   size
            # The -P flag displays the size field for symbols only when
            # applicable, so the last field is optional. There's no space after
            # the value field, but \s+ match newline also, so \s+\S* will match
            # the optional size field.
            match = re.match(r"^(\S+)\s+[BDGRSTVW]\s+\S+\s+\S*$", line)
            if match:
                yield match.group(1)
    finally:
        # Also reached when the consumer stops iterating early, so the child
        # process is always reaped
        process.stdout.close()
        process.wait()
def readobj_get_symbols(lib):
    """Yield defined external symbol names reported by ``llvm-readobj --symbols``.

    lib is passed straight to llvm-readobj as the file to inspect. A symbol is
    yielded when its StorageClass is 'External' and its Section is neither
    'IMAGE_SYM_ABSOLUTE' nor 'IMAGE_SYM_UNDEFINED'.
    """
    process = subprocess.Popen(['llvm-readobj', '--symbols', lib], bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    # We never feed the tool any input, so make sure it can't wait for some
    process.stdin.close()
    # Start with nothing recorded so that a StorageClass line seen before any
    # Name/Section line is skipped instead of hitting an unbound local
    name = None
    section = None
    try:
        for line in process.stdout:
            # When looking through the output of llvm-readobj we expect to see
            # Name, Section, then StorageClass, so record Name and Section when
            # we see them and decide if this is a defined external symbol when
            # we see StorageClass.
            match = re.search(r'Name: (\S+)', line)
            if match:
                name = match.group(1)
            match = re.search(r'Section: (\S+)', line)
            if match:
                section = match.group(1)
            match = re.search(r'StorageClass: (\S+)', line)
            if match:
                storageclass = match.group(1)
                if name is not None and section is not None and \
                   section != 'IMAGE_SYM_ABSOLUTE' and \
                   section != 'IMAGE_SYM_UNDEFINED' and \
                   storageclass == 'External':
                    yield name
    finally:
        # Also reached when the consumer stops iterating early, so the child
        # process is always reaped
        process.stdout.close()
        process.wait()
# Define functions which determine if the target is 32-bit Windows (as that's
# where calling convention name decoration happens).
def dumpbin_is_32bit_windows(lib):
    """Return True if ``dumpbin /headers`` reports lib's machine as 'x86'.

    Returns False when the machine is something else or when no 'machine'
    line is found in the output.
    """
    # dumpbin /headers can output a huge amount of data (>100MB in a debug
    # build) so we read only up to the 'machine' line then close the output.
    process = subprocess.Popen(['dumpbin', '/headers', lib], bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    process.stdin.close()
    retval = False
    for line in process.stdout:
        match = re.match(r'.+machine \((\S+)\)', line)
        if match:
            retval = (match.group(1) == 'x86')
            break
    process.stdout.close()
    process.wait()
    return retval
def objdump_is_32bit_windows(lib):
    """Return True if ``objdump -f`` reports lib's file format as 'pe-i386'.

    The first line containing 'file format' decides the answer; False is
    returned when no such line is present.
    """
    output = subprocess.check_output(['objdump', '-f', lib],
                                     universal_newlines=True)
    for line in output.splitlines():
        match = re.match(r'.+file format (\S+)', line)
        if match:
            return (match.group(1) == 'pe-i386')
    return False
def readobj_is_32bit_windows(lib):
    """Return True if ``llvm-readobj --file-header`` reports 'COFF-i386'.

    The first line starting with 'Format: ' decides the answer; False is
    returned when no such line is present.
    """
    output = subprocess.check_output(['llvm-readobj', '--file-header', lib],
                                     universal_newlines=True)
    for line in output.splitlines():
        match = re.match(r'Format: (\S+)', line)
        if match:
            return (match.group(1) == 'COFF-i386')
    return False
# On AIX, there isn't an easy way to detect 32-bit windows objects with the
# system toolchain, so just assume false.
def aix_is_32bit_windows(lib):
    """Always return False; lib is accepted only to match the other detectors."""
    return False
# MSVC mangles names to ?<identifier_mangling>@<type_mangling>. By examining the
# identifier/type mangling we can decide which symbols could possibly be
# required and which we can discard.
def should_keep_microsoft_symbol(symbol, calling_convention_decoration):
    """Decide whether a Microsoft-mangled symbol should be exported.

    Returns the name to export (for unmangled names this may have calling
    convention decoration stripped when calling_convention_decoration is
    true), or None when the symbol should be discarded.
    """
    # Keep unmangled (i.e. extern "C") names
    if '?' not in symbol:
        if calling_convention_decoration:
            # Remove calling convention decoration from names
            match = re.match(r'[_@]([^@]+)', symbol)
            if match:
                return match.group(1)
        return symbol
    # Function template instantiations start with ?$; keep the instantiations of
    # clang::Type::getAs, as some of them are explicit specializations that are
    # defined in clang's lib/AST/Type.cpp; discard the rest as it's assumed that
    # the definition is public
    elif re.match(r'\?\?\$getAs@.+@Type@clang@@', symbol):
        return symbol
    elif symbol.startswith('??$'):
        return None
    # Deleting destructors start with ?_G or ?_E and can be discarded because
    # link.exe gives you a warning telling you they can't be exported if you
    # don't
    elif symbol.startswith('??_G') or symbol.startswith('??_E'):
        return None
    # Constructors (?0) and destructors (?1) of templates (?$) are assumed to be
    # defined in headers and not required to be kept
    elif symbol.startswith('??0?$') or symbol.startswith('??1?$'):
        return None
    # An anonymous namespace is mangled as ?A(maybe hex number)@. Any symbol
    # that mentions an anonymous namespace can be discarded, as the anonymous
    # namespace doesn't exist outside of that translation unit.
    elif re.search(r'\?A(0x\w+)?@', symbol):
        return None
    # Keep mangled llvm:: and clang:: function symbols. How we detect these is a
    # bit of a mess and imprecise, but that avoids having to completely demangle
    # the symbol name. The outermost namespace is at the end of the identifier
    # mangling, and the identifier mangling is followed by the type mangling, so
    # we look for (llvm|clang)@@ followed by something that looks like a
    # function type mangling. To spot a function type we use (this is derived
    # from clang/lib/AST/MicrosoftMangle.cpp):
    # <function-type> ::= <function-class> <this-cvr-qualifiers>
    #                     <calling-convention> <return-type>
    #                     <argument-list> <throw-spec>
    # <function-class> ::= [A-Z]
    # <this-cvr-qualifiers> ::= [A-Z0-9_]*
    # <calling-convention> ::= [A-JQ]
    # <return-type> ::= .+
    # <argument-list> ::= X   (void)
    #                 ::= .+@ (list of types)
    #                 ::= .*Z (list of types, varargs)
    # <throw-spec> ::= exceptions are not allowed
    elif re.search(r'(llvm|clang)@@[A-Z][A-Z0-9_]*[A-JQ].+(X|.+@|.*Z)$', symbol):
        return symbol
    return None
# Itanium manglings are of the form _Z<identifier_mangling><type_mangling>. We
# demangle the identifier mangling to identify symbols that can be safely
# discarded.
def should_keep_itanium_symbol(symbol, calling_convention_decoration):
    """Decide whether an Itanium-mangled symbol should be exported.

    Returns the name to export (with one leading underscore removed when
    calling_convention_decoration is true), or None when the symbol should be
    discarded.
    """
    # Start by removing any calling convention decoration (which we expect to
    # see on all symbols, even mangled C++ symbols)
    if calling_convention_decoration and symbol.startswith('_'):
        symbol = symbol[1:]
    # Keep unmangled names
    if not symbol.startswith('_') and not symbol.startswith('.'):
        return symbol
    # Discard manglings that aren't nested names
    match = re.match(r'_Z(T[VTIS])?(N.+)', symbol)
    if not match:
        return None
    # Demangle the name. If the name is too complex then we don't need to keep
    # it, but if the demangling fails then keep the symbol just in case.
    try:
        names, _ = parse_itanium_nested_name(match.group(2))
    except TooComplexName:
        return None
    if not names:
        return symbol
    # Constructors and destructors of template classes are assumed to be
    # defined in headers and not required to be kept. The length check stops a
    # single-component name from indexing past the start of the list.
    if re.match(r'[CD][123]', names[-1][0]) and len(names) > 1 and names[-2][1]:
        return None
    # Keep the instantiations of clang::Type::getAs, as some of them are
    # explicit specializations that are defined in clang's lib/AST/Type.cpp;
    # discard any other function template instantiations as it's assumed that
    # the definition is public
    elif symbol.startswith('_ZNK5clang4Type5getAs'):
        return symbol
    elif names[-1][1]:
        return None
    # Keep llvm:: and clang:: names
    elif names[0][0] == '4llvm' or names[0][0] == '5clang':
        return symbol
    # Discard everything else
    else:
        return None
# Certain kinds of complex manglings we assume cannot be part of a public
# interface, and we handle them by raising an exception.
class TooComplexName(Exception):
    """Raised when a mangled name is too complex to be worth demangling."""
    pass
# Parse an itanium mangled name from the start of a string and return a
# (name, rest of string) pair.
def parse_itanium_name(arg):
    """Split one mangled name off the front of arg.

    Returns a (name, rest) pair. A length-prefixed name keeps its length
    digits in the returned name (e.g. '4llvm'). When nothing can be parsed
    the result is (None, arg).
    """
    # Check for a normal name
    match = re.match(r'(\d+)(.+)', arg)
    if match:
        n = int(match.group(1))
        name = match.group(1) + match.group(2)[:n]
        rest = match.group(2)[n:]
        return name, rest
    # Check for constructor/destructor names
    match = re.match(r'([CD][123])(.+)', arg)
    if match:
        return match.group(1), match.group(2)
    # Assume that a sequence of characters that doesn't end a nesting is an
    # operator (this is very imprecise, but appears to be good enough)
    match = re.match(r'([^E]+)(.+)', arg)
    if match:
        return match.group(1), match.group(2)
    # Anything else: we can't handle it
    return None, arg
# Parse an itanium mangled template argument list from the start of a string
# and throw it away, returning the rest of the string.
def skip_itanium_template(arg):
    """Skip the template argument list at the front of arg.

    arg must start with 'I'. Returns the text following the 'E' that closes
    the list, or None if the string runs out before the list is closed.
    Raises TooComplexName when an expression argument ('L' or 'X') is found.
    """
    # A template argument list starts with I
    assert arg.startswith('I'), arg
    tmp = arg[1:]
    while tmp:
        # Check for names
        match = re.match(r'(\d+)(.+)', tmp)
        if match:
            n = int(match.group(1))
            tmp = match.group(2)[n:]
            continue
        # Check for substitutions
        match = re.match(r'S[A-Z0-9]*_(.+)', tmp)
        if match:
            tmp = match.group(1)
        # Start of a template
        elif tmp.startswith('I'):
            tmp = skip_itanium_template(tmp)
        # Start of a nested name
        elif tmp.startswith('N'):
            _, tmp = parse_itanium_nested_name(tmp)
        # Start of an expression: assume that it's too complicated
        elif tmp.startswith('L') or tmp.startswith('X'):
            raise TooComplexName
        # End of the template
        elif tmp.startswith('E'):
            return tmp[1:]
        # Something else: probably a type, skip it
        else:
            tmp = tmp[1:]
    # Ran out of input (or a nested parse failed) before the closing E
    return None
# Parse an itanium mangled nested name and transform it into a list of pairs of
# (name, is_template), returning (list, rest of string).
def parse_itanium_nested_name(arg):
    """Parse the nested name at the front of arg.

    arg must start with 'N'. Returns (names, rest) where names is a list of
    (name, is_template) pairs and rest is the text after the closing 'E'.
    Returns (None, None) when the name cannot be parsed.
    """
    # A nested name starts with N
    assert arg.startswith('N'), arg
    ret = []

    # Skip past the N, and possibly a substitution
    match = re.match(r'NS[A-Z0-9]*_(.+)', arg)
    if match:
        tmp = match.group(1)
    else:
        tmp = arg[1:]

    # Skip past CV-qualifiers and ref qualifiers
    match = re.match(r'[rVKRO]*(.+)', tmp)
    if match:
        tmp = match.group(1)

    # Repeatedly parse names from the string until we reach the end of the
    # nested name
    while tmp:
        # An E ends the nested name
        if tmp.startswith('E'):
            return ret, tmp[1:]
        # Parse a name
        name_part, tmp = parse_itanium_name(tmp)
        if not name_part:
            # If we failed then we don't know how to demangle this
            return None, None
        is_template = False
        # If this name is a template record that, then skip the template
        # arguments
        if tmp.startswith('I'):
            tmp = skip_itanium_template(tmp)
            is_template = True
        # Add the name to the list
        ret.append((name_part, is_template))

    # If we get here then something went wrong
    return None, None
def extract_symbols(arg):
    """Collect the symbols worth keeping from a single library.

    arg is a (get_symbols, should_keep_symbol, calling_convention_decoration,
    lib) tuple, packed into one argument so the function can be handed to a
    pool map. Returns a dict mapping each kept symbol name to the number of
    times it was seen in lib.
    """
    get_symbols, should_keep_symbol, calling_convention_decoration, lib = arg
    symbols = dict()
    for symbol in get_symbols(lib):
        # should_keep_symbol returns the (possibly adjusted) name to keep, or
        # a false value when the symbol is to be discarded
        symbol = should_keep_symbol(symbol, calling_convention_decoration)
        if symbol:
            symbols[symbol] = 1 + symbols.setdefault(symbol, 0)
    return symbols
if __name__ == '__main__':
    # Script entry point: parse arguments, pick the tools, extract and filter
    # the symbols of every input, then print the ones to export.
    tool_exes = ['dumpbin','nm','objdump','llvm-readobj']
    parser = argparse.ArgumentParser(
        description='Extract symbols to export from libraries')
    parser.add_argument('--mangling', choices=['itanium','microsoft'],
                        required=True, help='expected symbol mangling scheme')
    parser.add_argument('--tools', choices=tool_exes, nargs='*',
                        help='tools to use to extract symbols and determine the'
                        ' target')
    parser.add_argument('libs', metavar='lib', type=str, nargs='+',
                        help='libraries to extract symbols from')
    parser.add_argument('-o', metavar='file', type=str, help='output to file')
    args = parser.parse_args()

    # Determine the function to use to get the list of symbols from the inputs,
    # and the function to use to determine if the target is 32-bit windows.
    tools = { 'dumpbin' : (dumpbin_get_symbols, dumpbin_is_32bit_windows),
              'nm' : (nm_get_symbols, None),
              'objdump' : (None, objdump_is_32bit_windows),
              'llvm-readobj' : (readobj_get_symbols, readobj_is_32bit_windows) }
    get_symbols = None
    is_32bit_windows = aix_is_32bit_windows if sys.platform.startswith('aix') else None
    # If we have a tools argument then use that for the list of tools to check
    if args.tools:
        tool_exes = args.tools
    # Find a tool to use by trying each in turn until we find one that exists
    # (subprocess.Popen will throw OSError when the program does not exist)
    for exe in tool_exes:
        try:
            # Close std streams as we don't want any output and we don't
            # want the process to wait for something on stdin.
            p = subprocess.Popen([exe], stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 stdin=subprocess.PIPE,
                                 universal_newlines=True)
            p.stdout.close()
            p.stderr.close()
            p.stdin.close()
            p.wait()
        except OSError:
            continue
        # Keep going until we have a tool to use for both get_symbols and
        # is_32bit_windows
        if not get_symbols:
            get_symbols = tools[exe][0]
        if not is_32bit_windows:
            is_32bit_windows = tools[exe][1]
        if get_symbols and is_32bit_windows:
            break
    # sys.exit is used rather than the bare exit helper, which is only
    # present when the site module has been loaded
    if not get_symbols:
        print("Couldn't find a program to read symbols with", file=sys.stderr)
        sys.exit(1)
    if not is_32bit_windows:
        print("Couldn't find a program to determining the target", file=sys.stderr)
        sys.exit(1)

    # How we determine which symbols to keep and which to discard depends on
    # the mangling scheme
    if args.mangling == 'microsoft':
        should_keep_symbol = should_keep_microsoft_symbol
    else:
        should_keep_symbol = should_keep_itanium_symbol

    # Get the list of libraries to extract symbols from
    libs = list()
    for lib in args.libs:
        # When invoked by cmake the arguments are the cmake target names of the
        # libraries, so we need to add .lib/.a to the end and maybe lib to the
        # start to get the filename. Also allow objects.
        suffixes = ['.lib','.a','.obj','.o']
        if not any([lib.endswith(s) for s in suffixes]):
            for s in suffixes:
                if os.path.exists(lib+s):
                    lib = lib+s
                    break
                if os.path.exists('lib'+lib+s):
                    lib = 'lib'+lib+s
                    break
        if not any([lib.endswith(s) for s in suffixes]):
            print("Don't know what to do with argument "+lib, file=sys.stderr)
            sys.exit(1)
        libs.append(lib)

    # Check if calling convention decoration is used by inspecting the first
    # library in the list
    calling_convention_decoration = is_32bit_windows(libs[0])

    # Extract symbols from libraries in parallel. This is a huge time saver when
    # doing a debug build, as there are hundreds of thousands of symbols in each
    # library.
    pool = multiprocessing.Pool()
    try:
        # Only one argument can be passed to the mapping function, and we can't
        # use a lambda or local function definition as that doesn't work on
        # windows, so create a list of tuples which duplicates the arguments
        # that are the same in all calls.
        vals = [(get_symbols, should_keep_symbol, calling_convention_decoration, x) for x in libs]
        # Do an async map then wait for the result to make sure that
        # KeyboardInterrupt gets caught correctly (see
        # http://bugs.python.org/issue8296)
        result = pool.map_async(extract_symbols, vals)
        pool.close()
        libs_symbols = result.get(3600)
    except KeyboardInterrupt:
        # On Ctrl-C terminate everything and exit
        pool.terminate()
        pool.join()
        sys.exit(1)

    # Merge everything into a single dict
    symbols = dict()
    for this_lib_symbols in libs_symbols:
        for k,v in list(this_lib_symbols.items()):
            symbols[k] = v + symbols.setdefault(k,0)

    # Count instances of member functions of template classes, and map the
    # symbol name to the function+class. We do this under the assumption that if
    # a member function of a template class is instantiated many times it's
    # probably declared in a public header file.
    template_function_count = dict()
    template_function_mapping = dict()
    template_function_count[""] = 0
    for k in symbols:
        name = None
        if args.mangling == 'microsoft':
            # Member functions of templates start with
            # ?<fn_name>@?$<class_name>@, so we map to <fn_name>@?$<class_name>.
            # As manglings go from the innermost scope to the outermost scope
            # this means:
            #  * When we have a function member of a subclass of a template
            #    class then <fn_name> will actually contain the mangling of
            #    both the subclass and the function member. This is fine.
            #  * When we have a function member of a template subclass of a
            #    (possibly template) class then it's the innermost template
            #    subclass that becomes <class_name>. This should be OK so long
            #    as we don't have multiple classes with a template subclass of
            #    the same name.
            match = re.search(r"^\?(\??\w+\@\?\$\w+)\@", k)
            if match:
                name = match.group(1)
        else:
            # Find member functions of templates by demangling the name and
            # checking if the second-to-last name in the list is a template.
            match = re.match(r'_Z(T[VTIS])?(N.+)', k)
            if match:
                try:
                    names, _ = parse_itanium_nested_name(match.group(2))
                    # The length check stops a single-component name from
                    # indexing past the start of the list
                    if names and len(names) > 1 and names[-2][1]:
                        name = ''.join([x for x,_ in names])
                except TooComplexName:
                    # Manglings that are too complex should already have been
                    # filtered out, but if we happen to somehow see one here
                    # just leave it as-is.
                    pass
        if name:
            old_count = template_function_count.setdefault(name,0)
            template_function_count[name] = old_count + 1
            template_function_mapping[k] = name
        else:
            template_function_mapping[k] = ""

    # Print symbols which both:
    #  * Appear in exactly one input, as symbols defined in multiple
    #    objects/libraries are assumed to have public definitions.
    #  * Aren't instances of member functions of templates which have been
    #    instantiated 100 times or more, which are assumed to have public
    #    definitions. (100 is an arbitrary guess here.)
    if args.o:
        outfile = open(args.o,'w')
    else:
        outfile = sys.stdout
    try:
        for k,v in list(symbols.items()):
            template_count = template_function_count[template_function_mapping[k]]
            if v == 1 and template_count < 100:
                print(k, file=outfile)
    finally:
        # Only close what we opened ourselves; stdout belongs to the caller
        if outfile is not sys.stdout:
            outfile.close()