2 # Copyright 2014 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
6 """Generate a spatial analysis against an arbitrary library.
8 To use, build the 'binary_size_tool' target. Then run this tool, passing
9 in the location of the library to be analyzed along with any other options
16 import multiprocessing
26 import binary_size_utils
# This path change is not beautiful. Temporary (I hope) measure until
29 # the chromium project has figured out a proper way to organize the
30 # library of python tools. http://crbug.com/375725
31 elf_symbolizer_path
= os
.path
.abspath(os
.path
.join(
32 os
.path
.dirname(__file__
),
38 sys
.path
.append(elf_symbolizer_path
)
39 import symbols
.elf_symbolizer
as elf_symbolizer
# pylint: disable=F0401
# TODO(andrewhayden): Only used for legacy reports. Delete.
def FormatBytes(byte_count):
  """Pretty-print a number of bytes.

  Returns e.g. '2.5m' for megabyte counts, '2.5k' for kilobyte counts,
  or the plain decimal string for small values.
  """
  if byte_count > 1e6:
    byte_count = byte_count / 1.0e6
    return '%.1fm' % byte_count
  if byte_count > 1e3:
    byte_count = byte_count / 1.0e3
    return '%.1fk' % byte_count
  return str(byte_count)
# TODO(andrewhayden): Only used for legacy reports. Delete.
def SymbolTypeToHuman(symbol_type):
  """Convert a symbol type as printed by nm into a human-readable name.

  Raises KeyError for unrecognized types.
  """
  # NOTE(review): only the 'r' and 'v' entries are directly attested in
  # the surviving text; the remaining nm type letters mirror the buckets
  # used by TreeifySymbols — confirm against upstream.
  return {'b': 'bss',
          'd': 'data',
          'r': 'read-only data',
          't': 'code',
          'u': 'unique',
          'w': 'weak symbol',
          'v': 'weak symbol'}[symbol_type]
65 def _MkChild(node
, name
):
66 child
= node
['children'].get(name
)
68 child
= {'n': name
, 'children': {}}
69 node
['children'][name
] = child
def MakeChildrenDictsIntoLists(node):
  """Recursively convert every 'children' dict into a list of child nodes.

  Returns the size of the largest 'children' collection found anywhere in
  the tree rooted at |node| (0 if |node| has no children).
  """
  largest_list_len = 0
  if 'children' in node:
    largest_list_len = len(node['children'])
    child_list = []
    # .values() (not the Python-2-only .itervalues()) works on both 2 and 3.
    for child in node['children'].values():
      child_largest_list_len = MakeChildrenDictsIntoLists(child)
      if child_largest_list_len > largest_list_len:
        largest_list_len = child_largest_list_len
      child_list.append(child)
    node['children'] = child_list
  return largest_list_len
def MakeCompactTree(symbols):
  """Build a compact tree: path components -> symbol-type bucket -> symbol.

  Each node carries 'n' (name), 'k' (kind: 'p' path, 'b' bucket, 's'
  symbol) and 'children'; leaf symbol nodes carry 'value' (size) and 't'
  (symbol type). The root additionally tracks 'maxDepth'.
  """
  result = {'n': '/', 'children': {}, 'k': 'p', 'maxDepth': 0}
  seen_symbol_with_path = False
  for symbol_name, symbol_type, symbol_size, file_path in symbols:

    if 'vtable for ' in symbol_name:
      symbol_type = '@'  # hack to categorize these separately
    # Take path like '/foo/bar/baz', convert to ['foo', 'bar', 'baz']
    if file_path:
      file_path = os.path.normpath(file_path)
      seen_symbol_with_path = True
    else:
      file_path = '(No Path)'

    if file_path.startswith('/'):
      file_path = file_path[1:]
    path_parts = file_path.split('/')

    # Find pre-existing node in tree, or update if it already exists
    node = result
    depth = 0
    while len(path_parts) > 0:
      path_part = path_parts.pop(0)
      if len(path_part) == 0:
        continue  # skip empty path components (e.g. from '//')
      depth += 1
      node = _MkChild(node, path_part)
      assert not 'k' in node or node['k'] == 'p'
      node['k'] = 'p'  # p for path

    # 'node' is now the file node. Find the symbol-type bucket.
    node['lastPathElement'] = True
    node = _MkChild(node, symbol_type)
    assert not 'k' in node or node['k'] == 'b'
    node['t'] = symbol_type
    node['k'] = 'b'  # b for bucket
    depth += 1

    # 'node' is now the symbol-type bucket. Make the child entry.
    node = _MkChild(node, symbol_name)
    if 'children' in node:
      # Duplicate symbol name: the node was already created as a container.
      logging.warning('A container node used as symbol for %s.' % symbol_name)
      # This is going to be used as a leaf so no use for child list.
      del node['children']
    node['value'] = symbol_size
    node['t'] = symbol_type
    node['k'] = 's'  # s for symbol
    depth += 1
    result['maxDepth'] = max(result['maxDepth'], depth)

  if not seen_symbol_with_path:
    logging.warning('Symbols lack paths. Data will not be structured.')

  largest_list_len = MakeChildrenDictsIntoLists(result)

  if largest_list_len > 1000:
    logging.warning('There are sections with %d nodes. '
                    'Results might be unusable.' % largest_list_len)
  return result
# TODO(andrewhayden): Only used for legacy reports. Delete.
def TreeifySymbols(symbols):
  """Convert symbols into a path-based tree, calculating size information
  along the way.

  The result is a dictionary that contains two kinds of nodes:
  1. Leaf nodes, representing source code locations (e.g., c++ files)
     These nodes have the following dictionary entries:
       sizes: a dictionary whose keys are categories (such as code, data,
              vtable, etceteras) and whose values are the size, in bytes, of
              those categories;
       size:  the total size, in bytes, of all the entries in the sizes dict
  2. Non-leaf nodes, representing directories
     These nodes have the following dictionary entries:
       children: a dictionary whose keys are names (path entries; either
                 directory or file names) and whose values are other nodes;
       size:     the total size, in bytes, of all the leaf nodes that are
                 contained within the children dict (recursively expanded)

  The result object is itself a dictionary that represents the common ancestor
  of all child nodes, e.g. a path to which all other nodes beneath it are
  relative. The 'size' attribute of this dict yields the sum of the size of all
  leaf nodes within the data structure.
  """
  dirs = {'children': {}, 'size': 0}
  for sym, symbol_type, size, path in symbols:
    dirs['size'] += size
    if path:
      path = os.path.normpath(path)
      if path.startswith('/'):
        path = path[1:]

    parts = None
    if path:
      parts = path.split('/')

    if parts:
      assert path
      file_key = parts.pop()
      tree = dirs
      try:
        # Traverse the tree to the parent of the file node, creating as needed
        for part in parts:
          assert part != ''
          if part not in tree['children']:
            tree['children'][part] = {'children': {}, 'size': 0}
          tree = tree['children'][part]
          tree['size'] += size

        # Get (creating if necessary) the node for the file
        # This node doesn't have a 'children' attribute
        if file_key not in tree['children']:
          tree['children'][file_key] = {'sizes': collections.defaultdict(int),
                                        'size': 0}
        tree = tree['children'][file_key]
        tree['size'] += size

        # Accumulate size into a bucket within the file
        symbol_type = symbol_type.lower()
        if 'vtable for ' in sym:
          tree['sizes']['[vtable]'] += size
        elif 'r' == symbol_type:
          tree['sizes']['[rodata]'] += size
        elif 'd' == symbol_type:
          tree['sizes']['[data]'] += size
        elif 'b' == symbol_type:
          tree['sizes']['[bss]'] += size
        elif 't' == symbol_type:
          # 'text' in binary parlance means 'code'.
          tree['sizes']['[code]'] += size
        elif 'w' == symbol_type:
          tree['sizes']['[weak]'] += size
        else:
          tree['sizes']['[other]'] += size
      except:
        # Log enough context to diagnose the malformed symbol, then re-raise.
        sys.stderr.write('%s %s %s\n' % (sym, parts, file_key))
        raise
    else:
      # Symbols with no path at all are grouped under a single bucket and
      # sub-categorized by a best-effort name prefix.
      key = 'symbols without paths'
      if key not in dirs['children']:
        dirs['children'][key] = {'sizes': collections.defaultdict(int),
                                 'size': 0}
      tree = dirs['children'][key]
      tree['size'] += size

      subkey = 'misc'
      if (sym.endswith('::__FUNCTION__') or
          sym.endswith('::__PRETTY_FUNCTION__')):
        subkey = '__FUNCTION__'
      elif sym.startswith('CSWTCH.'):
        subkey = 'CSWTCH'
      elif '::' in sym:
        subkey = sym[0:sym.find('::') + 2]
      tree['sizes'][subkey] = tree['sizes'].get(subkey, 0) + size
  return dirs
# TODO(andrewhayden): Only used for legacy reports. Delete.
def JsonifyTree(tree, name):
  """Convert TreeifySymbols output to a JSON treemap.

  The format is very similar, with the notable exceptions being
  lists of children instead of maps and some different attribute names."""
  children = []
  css_class_map = {
      '[vtable]': 'vtable',
      '[rodata]': 'read-only_data',
      '[data]': 'data',
      '[bss]': 'bss',
      '[code]': 'code',
      '[weak]': 'weak_symbol'
  }
  if 'children' in tree:
    # Non-leaf node. Recurse.
    # .items() (not the Python-2-only .iteritems()) works on both 2 and 3.
    for child_name, child in tree['children'].items():
      children.append(JsonifyTree(child, child_name))
  else:
    # Leaf node; dump per-file stats as entries in the treemap
    for kind, size in tree['sizes'].items():
      child_json = {'name': kind + ' (' + FormatBytes(size) + ')',
                    'data': {'$area': size}}
      css_class = css_class_map.get(kind)
      if css_class is not None:
        child_json['data']['$symbol'] = css_class
      children.append(child_json)
  # Sort children by size, largest to smallest.
  children.sort(key=lambda child: -child['data']['$area'])

  # For leaf nodes, the 'size' attribute is the size of the leaf;
  # Non-leaf nodes don't really have a size, but their 'size' attribute is
  # the sum of the sizes of all their children.
  return {'name': name + ' (' + FormatBytes(tree['size']) + ')',
          'data': {'$area': tree['size']},
          'children': children}
def DumpCompactTree(symbols, outfile):
  """Serialize the compact symbol tree to |outfile| as a JS assignment."""
  root = MakeCompactTree(symbols)
  with open(outfile, 'w') as stream:
    stream.write('var tree_data = ')
    json.dump(root, stream)
  print('Writing %d bytes json' % os.path.getsize(outfile))
# TODO(andrewhayden): Only used for legacy reports. Delete.
def DumpTreemap(symbols, outfile):
  """Write a legacy webtreemap 'var kTree = ...' JS file for |symbols|."""
  dirs = TreeifySymbols(symbols)
  # 'with' guarantees the file is closed even if serialization fails.
  with open(outfile, 'w') as out:
    out.write('var kTree = ' + json.dumps(JsonifyTree(dirs, '/')))
# TODO(andrewhayden): Only used for legacy reports. Delete.
def DumpLargestSymbols(symbols, outfile, n):
  """Write the |n| largest non-bss, non-weak symbols as a JS array.

  symbols: iterable of (name, type, size, path) tuples.
  outfile: path of the JS file to write.
  n:       maximum number of entries to emit.
  """
  # a list of (sym, symbol_type, size, path); sort by size.
  symbols = sorted(symbols, key=lambda x: -x[2])
  dumped = 0
  # 'with' guarantees the file is closed even on error.
  with open(outfile, 'w') as out:
    out.write('var largestSymbols = [\n')
    for sym, symbol_type, size, path in symbols:
      if symbol_type in ('b', 'w'):
        continue  # skip bss and weak symbols
      entry = {'size': FormatBytes(size),
               'symbol': sym,
               'type': SymbolTypeToHuman(symbol_type),
               'location': path}
      out.write(json.dumps(entry))
      out.write(',\n')
      dumped += 1
      if dumped >= n:
        break
    out.write('];\n')
def MakeSourceMap(symbols):
  """Aggregate symbols by normalized source path.

  Returns a dict mapping each normalized path to a record containing the
  original 'path', the cumulative 'size' in bytes, and 'symbol_count'.
  """
  sources = {}
  for _sym, _symbol_type, size, path in symbols:
    key = None
    if path:
      key = os.path.normpath(path)
    else:
      # NOTE(review): bucket label for path-less symbols reconstructed —
      # confirm exact string against upstream.
      key = '[no path]'
    if key not in sources:
      sources[key] = {'path': path, 'symbol_count': 0, 'size': 0}
    record = sources[key]
    record['size'] += size
    record['symbol_count'] += 1
  return sources
# TODO(andrewhayden): Only used for legacy reports. Delete.
def DumpLargestSources(symbols, outfile, n):
  """Write the |n| largest source files (by total symbol size) as a JS array."""
  source_map = MakeSourceMap(symbols)
  sources = sorted(source_map.values(), key=lambda x: -x['size'])
  # 'with' guarantees the file is closed even on error.
  with open(outfile, 'w') as out:
    out.write('var largestSources = [\n')
    for record in sources[:n]:
      entry = {'size': FormatBytes(record['size']),
               'symbol_count': str(record['symbol_count']),
               'location': record['path']}
      out.write(json.dumps(entry))
      out.write(',\n')
    out.write('];\n')
# TODO(andrewhayden): Only used for legacy reports. Delete.
def DumpLargestVTables(symbols, outfile, n):
  """Write the |n| largest vtables as a JS array."""
  vtables = []
  for symbol, _type, size, path in symbols:
    if 'vtable for ' in symbol:
      vtables.append({'symbol': symbol, 'path': path, 'size': size})
  vtables = sorted(vtables, key=lambda x: -x['size'])
  # 'with' guarantees the file is closed even on error.
  with open(outfile, 'w') as out:
    out.write('var largestVTables = [\n')
    for record in vtables[:n]:
      entry = {'size': FormatBytes(record['size']),
               'symbol': record['symbol'],
               'location': record['path']}
      out.write(json.dumps(entry))
      out.write(',\n')
    out.write('];\n')
# Regex for parsing "nm" output. A sample line looks like this:
# 0167b39c 00000018 t ACCESS_DESCRIPTION_free /path/file.c:95
#
# The fields are: address, size, type, name, source location.
# Regular expression explained ( see also: https://xkcd.com/208 ):
# ([0-9a-f]{8,})   The address
# [\s]+            Whitespace separator
# ([0-9a-f]{8,})   The size. From here on out it's all optional.
# [\s]+            Whitespace separator
# (\S?)            The symbol type, which is any non-whitespace char
# [\s*]            Whitespace separator
# ([^\t]*)         Symbol name, any non-tab character (spaces ok!)
# [\t]?            Tab separator
# (.*)             The location (filename[:linennum|?][ (discriminator n)]
sNmPattern = re.compile(
    r'([0-9a-f]{8,})[\s]+([0-9a-f]{8,})[\s]*(\S?)[\s*]([^\t]*)[\t]?(.*)')
class Progress():
  """Mutable counters used by RunElfSymbolizer to report progress."""

  def __init__(self):
    # Number of symbols whose lookup has completed.
    self.count = 0
    # Number of nm output lines skipped (no match or already located).
    self.skip_count = 0
    # Number of address collisions seen during symbolization.
    self.collisions = 0
    # Timestamp of the last progress line printed (rate-limits output).
    self.time_last_output = time.time()
    # Value of |count| at the last progress line printed.
    self.count_last_output = 0
def RunElfSymbolizer(outfile, library, addr2line_binary, nm_binary, jobs):
  """Run nm over |library|, symbolize each addressed line via addr2line,
  and write the nm output annotated with source locations to |outfile|.

  jobs: maximum number of concurrent addr2line processes.
  """
  nm_output = RunNm(library, nm_binary)
  nm_output_lines = nm_output.splitlines()
  nm_output_lines_len = len(nm_output_lines)
  address_symbol = {}  # addr -> symbol info returned by the symbolizer
  progress = Progress()

  def map_address_symbol(symbol, addr):
    # Callback invoked (possibly repeatedly) per resolved address.
    progress.count += 1
    if addr in address_symbol:
      # 'Collision between %s and %s.' % (str(symbol.name),
      #                                   str(address_symbol[addr].name))
      progress.collisions += 1
    else:
      address_symbol[addr] = symbol

    # Cheaply rate-limit progress output: check every |progress_chunk|
    # lookups, and print at most once per second.
    progress_chunk = 100
    if progress.count % progress_chunk == 0:
      time_now = time.time()
      time_spent = time_now - progress.time_last_output
      if time_spent > 1.0:
        # Only output at most once per second.
        progress.time_last_output = time_now
        chunk_size = progress.count - progress.count_last_output
        progress.count_last_output = progress.count
        speed = chunk_size / time_spent
        progress_percent = (100.0 * (progress.count + progress.skip_count) /
                            nm_output_lines_len)
        print('%.1f%%: Looked up %d symbols (%d collisions) - %.1f lookups/s.' %
              (progress_percent, progress.count, progress.collisions, speed))

  symbolizer = elf_symbolizer.ELFSymbolizer(library, addr2line_binary,
                                            map_address_symbol,
                                            max_concurrent_jobs=jobs)
  for line in nm_output_lines:
    match = sNmPattern.match(line)
    if match:
      location = match.group(5)
      if not location:
        addr = int(match.group(1), 16)
        size = int(match.group(2), 16)
        if addr in address_symbol:  # Already looked up, shortcut ELFSymbolizer.
          map_address_symbol(address_symbol[addr], addr)
        elif size == 0:
          # Save time by not looking up empty symbols (do they even exist?)
          print('Empty symbol: ' + line)
        else:
          symbolizer.SymbolizeAsync(addr, addr)
        continue
    progress.skip_count += 1

  # Wait for all outstanding addr2line lookups to finish.
  symbolizer.Join()

  # Rewrite the nm output, appending '\t<path>:<line>' to each line whose
  # address was resolved.
  with open(outfile, 'w') as out:
    for line in nm_output_lines:
      match = sNmPattern.match(line)
      if match:
        location = match.group(5)
        if not location:
          addr = int(match.group(1), 16)
          symbol = address_symbol[addr]
          path = '??'
          if symbol.source_path is not None:
            path = symbol.source_path
          line_number = 0
          if symbol.source_line is not None:
            line_number = symbol.source_line
          out.write('%s\t%s:%d\n' % (line, path, line_number))
          continue
      out.write('%s\n' % line)

  print('%d symbols in the results.' % len(address_symbol))
def RunNm(binary, nm_binary):
  """Run |nm_binary| over |binary| and return its stdout.

  Raises Exception on a nonzero exit, using stderr (or stdout when stderr
  is empty) as the message.
  """
  cmd = [nm_binary, '-C', '--print-size', binary]
  nm_process = subprocess.Popen(cmd,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
  (process_output, err_output) = nm_process.communicate()

  if nm_process.returncode != 0:
    # 'raise Exception(x)' instead of the Python-2-only 'raise Exception, x'.
    if err_output:
      raise Exception(err_output)
    raise Exception(process_output)

  return process_output
def GetNmSymbols(nm_infile, outfile, library, jobs, verbose,
                 addr2line_binary, nm_binary):
  """Return a list of parsed symbols, generating nm output if necessary.

  When |nm_infile| is None, symbols are produced by symbolizing |library|
  (dumped to |outfile|, or to a kept tempfile when |outfile| is None);
  otherwise |nm_infile| is parsed directly.
  """
  if nm_infile is None:
    if outfile is None:
      outfile = tempfile.NamedTemporaryFile(delete=False).name

    if verbose:
      print('Running parallel addr2line, dumping symbols to ' + outfile)
    RunElfSymbolizer(outfile, library, addr2line_binary, nm_binary, jobs)

    nm_infile = outfile

  elif verbose:
    print('Using nm input from ' + nm_infile)

  # open() rather than the deprecated Python 2 file() builtin.
  with open(nm_infile, 'r') as infile:
    return list(binary_size_utils.ParseNm(infile))
537 def _find_in_system_path(binary
):
538 """Locate the full path to binary in the system path or return None
540 system_path
= os
.environ
["PATH"].split(os
.pathsep
)
541 for path
in system_path
:
542 binary_path
= os
.path
.join(path
, binary
)
543 if os
.path
.isfile(binary_path
):
549 usage
= """%prog [options]
551 Runs a spatial analysis on a given library, looking up the source locations
552 of its symbols and calculating how much space each directory, source file,
553 and so on is taking. The result is a report that can be used to pinpoint
554 sources of large portions of the binary, etceteras.
556 Under normal circumstances, you only need to pass two arguments, thusly:
558 %prog --library /path/to/library --destdir /path/to/output
560 In this mode, the program will dump the symbols from the specified library
561 and map those symbols back to source locations, producing a web-based
562 report in the specified output directory.
564 Other options are available via '--help'.
566 parser
= optparse
.OptionParser(usage
=usage
)
567 parser
.add_option('--nm-in', metavar
='PATH',
568 help='if specified, use nm input from <path> instead of '
569 'generating it. Note that source locations should be '
570 'present in the file; i.e., no addr2line symbol lookups '
571 'will be performed when this option is specified. '
572 'Mutually exclusive with --library.')
573 parser
.add_option('--destdir', metavar
='PATH',
574 help='write output to the specified directory. An HTML '
575 'report is generated here along with supporting files; '
576 'any existing report will be overwritten.')
577 parser
.add_option('--library', metavar
='PATH',
578 help='if specified, process symbols in the library at '
579 'the specified path. Mutually exclusive with --nm-in.')
580 parser
.add_option('--nm-binary',
581 help='use the specified nm binary to analyze library. '
582 'This is to be used when the nm in the path is not for '
583 'the right architecture or of the right version.')
584 parser
.add_option('--addr2line-binary',
585 help='use the specified addr2line binary to analyze '
586 'library. This is to be used when the addr2line in '
587 'the path is not for the right architecture or '
588 'of the right version.')
589 parser
.add_option('--jobs',
590 help='number of jobs to use for the parallel '
591 'addr2line processing pool; defaults to 1. More '
592 'jobs greatly improve throughput but eat RAM like '
593 'popcorn, and take several gigabytes each. Start low '
594 'and ramp this number up until your machine begins to '
595 'struggle with RAM. '
596 'This argument is only valid when using --library.')
597 parser
.add_option('-v', dest
='verbose', action
='store_true',
598 help='be verbose, printing lots of status information.')
599 parser
.add_option('--nm-out', metavar
='PATH',
600 help='keep the nm output file, and store it at the '
601 'specified path. This is useful if you want to see the '
602 'fully processed nm output after the symbols have been '
603 'mapped to source locations. By default, a tempfile is '
604 'used and is deleted when the program terminates.'
605 'This argument is only valid when using --library.')
606 parser
.add_option('--legacy', action
='store_true',
607 help='emit legacy binary size report instead of modern')
608 opts
, _args
= parser
.parse_args()
610 if ((not opts
.library
) and (not opts
.nm_in
)) or (opts
.library
and opts
.nm_in
):
611 parser
.error('exactly one of --library or --nm-in is required')
614 print >> sys
.stderr
, ('WARNING: --jobs has no effect '
615 'when used with --nm-in')
617 parser
.error('--destdir is required argument')
619 # Use the number of processors but cap between 2 and 4 since raw
620 # CPU power isn't the limiting factor. It's I/O limited, memory
621 # bus limited and available-memory-limited. Too many processes and
622 # the computer will run out of memory and it will be slow.
623 opts
.jobs
= max(2, min(4, str(multiprocessing
.cpu_count())))
625 if opts
.addr2line_binary
:
626 assert os
.path
.isfile(opts
.addr2line_binary
)
627 addr2line_binary
= opts
.addr2line_binary
629 addr2line_binary
= _find_in_system_path('addr2line')
630 assert addr2line_binary
, 'Unable to find addr2line in the path. '\
631 'Use --addr2line-binary to specify location.'
634 assert os
.path
.isfile(opts
.nm_binary
)
635 nm_binary
= opts
.nm_binary
637 nm_binary
= _find_in_system_path('nm')
638 assert nm_binary
, 'Unable to find nm in the path. Use --nm-binary '\
639 'to specify location.'
641 print('nm: %s' % nm_binary
)
642 print('addr2line: %s' % addr2line_binary
)
644 symbols
= GetNmSymbols(opts
.nm_in
, opts
.nm_out
, opts
.library
,
645 opts
.jobs
, opts
.verbose
is True,
646 addr2line_binary
, nm_binary
)
647 if not os
.path
.exists(opts
.destdir
):
648 os
.makedirs(opts
.destdir
, 0755)
651 if opts
.legacy
: # legacy report
652 DumpTreemap(symbols
, os
.path
.join(opts
.destdir
, 'treemap-dump.js'))
653 DumpLargestSymbols(symbols
,
654 os
.path
.join(opts
.destdir
, 'largest-symbols.js'), 100)
655 DumpLargestSources(symbols
,
656 os
.path
.join(opts
.destdir
, 'largest-sources.js'), 100)
657 DumpLargestVTables(symbols
,
658 os
.path
.join(opts
.destdir
, 'largest-vtables.js'), 100)
659 treemap_out
= os
.path
.join(opts
.destdir
, 'webtreemap')
660 if not os
.path
.exists(treemap_out
):
661 os
.makedirs(treemap_out
, 0755)
662 treemap_src
= os
.path
.join('third_party', 'webtreemap', 'src')
663 shutil
.copy(os
.path
.join(treemap_src
, 'COPYING'), treemap_out
)
664 shutil
.copy(os
.path
.join(treemap_src
, 'webtreemap.js'), treemap_out
)
665 shutil
.copy(os
.path
.join(treemap_src
, 'webtreemap.css'), treemap_out
)
666 shutil
.copy(os
.path
.join('tools', 'binary_size', 'legacy_template',
667 'index.html'), opts
.destdir
)
668 else: # modern report
669 DumpCompactTree(symbols
, os
.path
.join(opts
.destdir
, 'data.js'))
670 d3_out
= os
.path
.join(opts
.destdir
, 'd3')
671 if not os
.path
.exists(d3_out
):
672 os
.makedirs(d3_out
, 0755)
673 d3_src
= os
.path
.join(os
.path
.dirname(__file__
),
676 'third_party', 'd3', 'src')
677 template_src
= os
.path
.join(os
.path
.dirname(__file__
),
679 shutil
.copy(os
.path
.join(d3_src
, 'LICENSE'), d3_out
)
680 shutil
.copy(os
.path
.join(d3_src
, 'd3.js'), d3_out
)
681 print('Copying index.html')
682 shutil
.copy(os
.path
.join(template_src
, 'index.html'), opts
.destdir
)
683 shutil
.copy(os
.path
.join(template_src
, 'D3SymbolTreeMap.js'), opts
.destdir
)
686 print 'Report saved to ' + opts
.destdir
+ '/index.html'
if __name__ == '__main__':
  # Propagate main()'s return value as the process exit status.
  sys.exit(main())