3 """A tool for extracting a list of symbols to export
5 When exporting symbols from a dll or exe we either need to mark the symbols in
6 the source code as __declspec(dllexport) or supply a list of symbols to the
7 linker. This program automates the latter by inspecting the symbol tables of a
8 list of link inputs and deciding which of those symbols need to be exported.
10 We can't just export all the defined symbols, as there's a limit of 65535
11 exported symbols and in clang we go way over that, particularly in a debug
12 build. Therefore a large part of the work is pruning symbols which either can't
13 be imported, or which we think are things that have definitions in public header
14 files (i.e. template instantiations) and so would get defined in the thing
15 importing these symbols anyway.
18 from __future__
import print_function
23 import multiprocessing
27 # Define a function which extracts a list of pairs of (symbols, is_def) from a
28 # library using llvm-nm because it can work both with regular and bitcode files.
29 # We use subprocess.Popen and yield a symbol at a time instead of using
30 # subprocess.check_output and returning a list as, especially on Windows, waiting
31 # for the entire output to be ready can take a significant amount of time.
32 def nm_get_symbols(tool
, lib
):
33 # '-P' means the output is in portable format,
34 # '-g' means we only get global symbols,
35 # '-Xany' enforces handling both 32- and 64-bit objects on AIX,
36 # '--no-demangle' ensures that C++ symbol names are not demangled; note
37 # that llvm-nm does not demangle by default, but the system nm on AIX does
38 # that, so the behavior may change in the future,
39 # '-p' means we do not waste time sorting the symbols.
40 cmd
= [tool
, "-P", "-g", "-Xany", "--no-demangle", "-p"]
41 process
= subprocess
.Popen(
44 stdout
=subprocess
.PIPE
,
45 stdin
=subprocess
.PIPE
,
46 universal_newlines
=True,
49 for line
in process
.stdout
:
50 # Look for external symbols that are defined in some section
51 # The POSIX format is:
52 # name type value size
53 # The -P flag displays the size field for symbols only when applicable,
54 # so the last field is optional. There's no space after the value field,
55 # but \s+ also matches the newline, so \s+\S* will match the optional size field.
56 match
= re
.match("^(\S+)\s+[BDGRSTuVW]\s+\S+\s+\S*$", line
)
58 yield (match
.group(1), True)
59 # Look for undefined symbols, which have type U and may or may not
60 # (depending on which nm is being used) have value and size.
61 match
= re
.match("^(\S+)\s+U\s+(\S+\s+\S*)?$", line
)
63 yield (match
.group(1), False)
67 # Define a function which determines if the target is 32-bit Windows (as that's
68 # where calling convention name decoration happens).
69 def readobj_is_32bit_windows(tool
, lib
):
70 output
= subprocess
.check_output(
71 [tool
, "--file-header", lib
], universal_newlines
=True
73 for line
in output
.splitlines():
74 match
= re
.match("Format: (\S+)", line
)
76 return match
.group(1) == "COFF-i386"
80 # MSVC mangles names to ?<identifier_mangling>@<type_mangling>. By examining the
81 # identifier/type mangling we can decide which symbols could possibly be
82 # required and which we can discard.
83 def should_keep_microsoft_symbol(symbol
, calling_convention_decoration
):
84 # Keep unmangled (i.e. extern "C") names
86 if calling_convention_decoration
:
87 # Remove calling convention decoration from names
88 match
= re
.match("[_@]([^@]+)", symbol
)
90 symbol
= match
.group(1)
91 # Discard floating point/SIMD constants.
92 if symbol
.startswith(("__xmm@", "__ymm@", "__real@")):
95 # Deleting destructors start with ?_G or ?_E and can be discarded because
96 # link.exe gives you a warning telling you they can't be exported if you
98 elif symbol
.startswith("??_G") or symbol
.startswith("??_E"):
100 # An anonymous namespace is mangled as ?A(maybe hex number)@. Any symbol
101 # that mentions an anonymous namespace can be discarded, as the anonymous
102 # namespace doesn't exist outside of that translation unit.
103 elif re
.search("\?A(0x\w+)?@", symbol
):
105 # Skip X86GenMnemonicTables functions, as they are not exposed from llvm/include/.
106 elif re
.match("\?is[A-Z0-9]*@X86@llvm", symbol
):
108 # Keep mangled llvm:: and clang:: function symbols. How we detect these is a
109 # bit of a mess and imprecise, but that avoids having to completely demangle
110 # the symbol name. The outermost namespace is at the end of the identifier
111 # mangling, and the identifier mangling is followed by the type mangling, so
112 # we look for (llvm|clang)@@ followed by something that looks like a
113 # function type mangling. To spot a function type we use (this is derived
114 # from clang/lib/AST/MicrosoftMangle.cpp):
115 # <function-type> ::= <function-class> <this-cvr-qualifiers>
116 # <calling-convention> <return-type>
117 # <argument-list> <throw-spec>
118 # <function-class> ::= [A-Z]
119 # <this-cvr-qualifiers> ::= [A-Z0-9_]*
120 # <calling-convention> ::= [A-JQ]
121 # <return-type> ::= .+
122 # <argument-list> ::= X (void)
123 # ::= .+@ (list of types)
124 # ::= .*Z (list of types, varargs)
125 # <throw-spec> ::= exceptions are not allowed
126 elif re
.search("(llvm|clang)@@[A-Z][A-Z0-9_]*[A-JQ].+(X|.+@|.*Z)$", symbol
):
131 # Itanium manglings are of the form _Z<identifier_mangling><type_mangling>. We
132 # demangle the identifier mangling to identify symbols that can be safely
134 def should_keep_itanium_symbol(symbol
, calling_convention_decoration
):
135 # Start by removing any calling convention decoration (which we expect to
136 # see on all symbols, even mangled C++ symbols)
137 if calling_convention_decoration
and symbol
.startswith("_"):
139 # Keep unmangled names
140 if not symbol
.startswith("_") and not symbol
.startswith("."):
142 # Discard manglings that aren't nested names
143 match
= re
.match("_Z(T[VTIS])?(N.+)", symbol
)
146 # Demangle the name. If the name is too complex then we don't need to keep
147 # it, but if the demangling fails then keep the symbol just in case.
149 names
, _
= parse_itanium_nested_name(match
.group(2))
150 except TooComplexName
:
154 # Keep llvm:: and clang:: names
155 elif names
[0][0] == "4llvm" or names
[0][0] == "5clang":
157 # Discard everything else
162 # Certain kinds of complex manglings we assume cannot be part of a public
163 # interface, and we handle them by raising an exception.
164 class TooComplexName(Exception):
168 # Parse an itanium mangled name from the start of a string and return a
169 # (name, rest of string) pair.
170 def parse_itanium_name(arg
):
171 # Check for a normal name
172 match
= re
.match("(\d+)(.+)", arg
)
174 n
= int(match
.group(1))
175 name
= match
.group(1) + match
.group(2)[:n
]
176 rest
= match
.group(2)[n
:]
178 # Check for constructor/destructor names
179 match
= re
.match("([CD][123])(.+)", arg
)
181 return match
.group(1), match
.group(2)
182 # Assume that a sequence of characters that doesn't end a nesting is an
183 # operator (this is very imprecise, but appears to be good enough)
184 match
= re
.match("([^E]+)(.+)", arg
)
186 return match
.group(1), match
.group(2)
187 # Anything else: we can't handle it
191 # Parse an itanium mangled template argument list from the start of a string
192 # and throw it away, returning the rest of the string.
193 def skip_itanium_template(arg
):
194 # A template argument list starts with I
195 assert arg
.startswith("I"), arg
199 match
= re
.match("(\d+)(.+)", tmp
)
201 n
= int(match
.group(1))
202 tmp
= match
.group(2)[n
:]
204 # Check for substitutions
205 match
= re
.match("S[A-Z0-9]*_(.+)", tmp
)
208 # Start of a template
209 elif tmp
.startswith("I"):
210 tmp
= skip_itanium_template(tmp
)
211 # Start of a nested name
212 elif tmp
.startswith("N"):
213 _
, tmp
= parse_itanium_nested_name(tmp
)
214 # Start of an expression: assume that it's too complicated
215 elif tmp
.startswith("L") or tmp
.startswith("X"):
217 # End of the template
218 elif tmp
.startswith("E"):
220 # Something else: probably a type, skip it
226 # Parse an itanium mangled nested name and transform it into a list of pairs of
227 # (name, is_template), returning (list, rest of string).
228 def parse_itanium_nested_name(arg
):
229 # A nested name starts with N
230 assert arg
.startswith("N"), arg
233 # Skip past the N, and possibly a substitution
234 match
= re
.match("NS[A-Z0-9]*_(.+)", arg
)
240 # Skip past CV-qualifiers and ref qualifiers
241 match
= re
.match("[rVKRO]*(.+)", tmp
)
245 # Repeatedly parse names from the string until we reach the end of the
248 # An E ends the nested name
249 if tmp
.startswith("E"):
252 name_part
, tmp
= parse_itanium_name(tmp
)
254 # If we failed then we don't know how to demangle this
257 # If this name is a template, record that, then skip the template
259 if tmp
.startswith("I"):
260 tmp
= skip_itanium_template(tmp
)
262 # Add the name to the list
263 ret
.append((name_part
, is_template
))
265 # If we get here then something went wrong
269 # Parse a microsoft mangled symbol and return a list of pairs of
270 # (name, is_template). This is very rudimentary and does just enough
271 # in order to determine if the first or second component is a template.
272 def parse_microsoft_mangling(arg
):
273 # If the name doesn't start with ? this isn't a mangled name
274 if not arg
.startswith("?"):
275 return [(arg
, False)]
279 # If we see an empty component we've reached the end
280 if arg
.startswith("@"):
282 # Check for a simple name
283 match
= re
.match("(\w+)@(.+)", arg
)
285 components
.append((match
.group(1), False))
288 # Check for a special function name
289 match
= re
.match("(\?_?\w)(.+)", arg
)
291 components
.append((match
.group(1), False))
294 # Check for a template name
295 match
= re
.match("\?\$(\w+)@[^@]+@(.+)", arg
)
297 components
.append((match
.group(1), True))
300 # Some other kind of name that we can't handle
301 components
.append((arg
, False))
306 def extract_symbols(arg
):
307 llvm_nm_path
, should_keep_symbol
, calling_convention_decoration
, lib
= arg
310 for (symbol
, is_def
) in nm_get_symbols(llvm_nm_path
, lib
):
311 symbol
= should_keep_symbol(symbol
, calling_convention_decoration
)
314 symbol_defs
[symbol
] = 1 + symbol_defs
.setdefault(symbol
, 0)
316 symbol_refs
.add(symbol
)
317 return (symbol_defs
, symbol_refs
)
320 def get_template_name(sym
, mangling
):
321 # Parse the mangling into a list of (name, is_template)
323 if mangling
== "microsoft":
324 names
= parse_microsoft_mangling(sym
)
326 match
= re
.match("_Z(T[VTIS])?(N.+)", sym
)
328 names
, _
= parse_itanium_nested_name(match
.group(2))
331 except TooComplexName
:
337 # If any component is a template then return it
338 for name
, is_template
in names
:
346 def parse_tool_path(parser
, tool
, val
):
348 # Close std streams as we don't want any output and we don't
349 # want the process to wait for something on stdin.
350 p
= subprocess
.Popen(
352 stdout
=subprocess
.PIPE
,
353 stderr
=subprocess
.PIPE
,
354 stdin
=subprocess
.PIPE
,
355 universal_newlines
=True,
363 parser
.error(f
"Invalid path for {tool}")
366 if __name__
== "__main__":
367 parser
= argparse
.ArgumentParser(
368 description
="Extract symbols to export from libraries"
372 choices
=["itanium", "microsoft"],
374 help="expected symbol mangling scheme",
379 type=lambda x
: parse_tool_path(parser
, "nm", x
),
380 help="path to the llvm-nm executable",
385 type=lambda x
: parse_tool_path(parser
, "readobj", x
),
386 help="path to the llvm-readobj executable",
393 help="libraries to extract symbols from",
395 parser
.add_argument("-o", metavar
="file", type=str, help="output to file")
396 args
= parser
.parse_args()
398 # How we determine which symbols to keep and which to discard depends on
399 # the mangling scheme
400 if args
.mangling
== "microsoft":
401 should_keep_symbol
= should_keep_microsoft_symbol
403 should_keep_symbol
= should_keep_itanium_symbol
405 # Get the list of libraries to extract symbols from
407 for lib
in args
.libs
:
408 # When invoked by cmake the arguments are the cmake target names of the
409 # libraries, so we need to add .lib/.a to the end and maybe lib to the
410 # start to get the filename. Also allow objects.
411 suffixes
= [".lib", ".a", ".obj", ".o"]
412 if not any([lib
.endswith(s
) for s
in suffixes
]):
414 if os
.path
.exists(lib
+ s
):
417 if os
.path
.exists("lib" + lib
+ s
):
418 lib
= "lib" + lib
+ s
420 if not any([lib
.endswith(s
) for s
in suffixes
]):
421 print("Don't know what to do with argument " + lib
, file=sys
.stderr
)
425 # Check if calling convention decoration is used by inspecting the first
426 # library in the list
427 calling_convention_decoration
= readobj_is_32bit_windows(args
.readobj
, libs
[0])
429 # Extract symbols from libraries in parallel. This is a huge time saver when
430 # doing a debug build, as there are hundreds of thousands of symbols in each
432 # FIXME: On AIX, the default pool size can be too big for a logical
433 # partition's allocated memory, and can lead to an out of memory
434 # IO error. We are setting the pool size to 8 to avoid such
435 # errors at the moment, and will look for a graceful solution later.
436 pool
= multiprocessing
.Pool(8) if platform
.system() == "AIX" \
437 else multiprocessing
.Pool()
439 # Only one argument can be passed to the mapping function, and we can't
440 # use a lambda or local function definition as that doesn't work on
441 # windows, so create a list of tuples which duplicates the arguments
442 # that are the same in all calls.
444 (args
.nm
, should_keep_symbol
, calling_convention_decoration
, x
)
447 # Do an async map then wait for the result to make sure that
448 # KeyboardInterrupt gets caught correctly (see
449 # http://bugs.python.org/issue8296)
450 result
= pool
.map_async(extract_symbols
, vals
)
452 libs_symbols
= result
.get(3600)
453 except KeyboardInterrupt:
454 # On Ctrl-C terminate everything and exit
459 # Merge everything into a single dict
462 for (this_lib_defs
, this_lib_refs
) in libs_symbols
:
463 for k
, v
in list(this_lib_defs
.items()):
464 symbol_defs
[k
] = v
+ symbol_defs
.setdefault(k
, 0)
465 for sym
in list(this_lib_refs
):
468 # Find which template instantiations are referenced at least once.
469 template_instantiation_refs
= set()
470 for sym
in list(symbol_refs
):
471 template
= get_template_name(sym
, args
.mangling
)
473 template_instantiation_refs
.add(template
)
475 # Print symbols which both:
476 # * Appear in exactly one input, as symbols defined in multiple
477 # objects/libraries are assumed to have public definitions.
478 # * Are not a template instantiation that isn't referenced anywhere. This
479 # is because we need to export any explicitly instantiated templates,
480 # and we expect those to be referenced in some object.
482 outfile
= open(args
.o
, "w")
485 for k
, v
in list(symbol_defs
.items()):
486 template
= get_template_name(k
, args
.mangling
)
487 if v
== 1 and (not template
or template
in template_instantiation_refs
):
488 print(k
, file=outfile
)